From ed33f4ae54d21da162c648e618da97bcade629a5 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 10 Oct 2025 20:01:16 +0000 Subject: [PATCH 01/45] initial commit of benchmarks Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/README.md | 330 +++++++ benchmarks/attention_benchmarks/__init__.py | 46 + benchmarks/attention_benchmarks/batch_spec.py | 325 +++++++ benchmarks/attention_benchmarks/benchmark.py | 587 ++++++++++++ benchmarks/attention_benchmarks/common.py | 416 +++++++++ .../configs/mla_decode.yaml | 61 ++ .../configs/mla_mixed_batch.yaml | 60 ++ .../configs/speculative_decode.yaml | 66 ++ .../configs/standard_attention.yaml | 40 + .../configs/study1_cutlass_numsplits.yaml | 54 ++ .../configs/study2_hopper_head_count.yaml | 52 ++ .../configs/study3_flashinfer_vs_cutlass.yaml | 55 ++ .../configs/study4_reorder_threshold.yaml | 130 +++ benchmarks/attention_benchmarks/mla_runner.py | 842 ++++++++++++++++++ benchmarks/attention_benchmarks/runner.py | 334 +++++++ .../attention_benchmarks/test_batch_spec.py | 181 ++++ 16 files changed, 3579 insertions(+) create mode 100644 benchmarks/attention_benchmarks/README.md create mode 100644 benchmarks/attention_benchmarks/__init__.py create mode 100644 benchmarks/attention_benchmarks/batch_spec.py create mode 100644 benchmarks/attention_benchmarks/benchmark.py create mode 100644 benchmarks/attention_benchmarks/common.py create mode 100644 benchmarks/attention_benchmarks/configs/mla_decode.yaml create mode 100644 benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml create mode 100644 benchmarks/attention_benchmarks/configs/speculative_decode.yaml create mode 100644 benchmarks/attention_benchmarks/configs/standard_attention.yaml create mode 100644 benchmarks/attention_benchmarks/configs/study1_cutlass_numsplits.yaml create mode 100644 benchmarks/attention_benchmarks/configs/study2_hopper_head_count.yaml create mode 100644 
benchmarks/attention_benchmarks/configs/study3_flashinfer_vs_cutlass.yaml create mode 100644 benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml create mode 100644 benchmarks/attention_benchmarks/mla_runner.py create mode 100644 benchmarks/attention_benchmarks/runner.py create mode 100644 benchmarks/attention_benchmarks/test_batch_spec.py diff --git a/benchmarks/attention_benchmarks/README.md b/benchmarks/attention_benchmarks/README.md new file mode 100644 index 000000000000..0ec6ee1c4e34 --- /dev/null +++ b/benchmarks/attention_benchmarks/README.md @@ -0,0 +1,330 @@ +# vLLM Attention Benchmarking Suite + +Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar. + +## Quick Start + +```bash +cd benchmarks/attention_benchmarks + +# Test the parser +python test_batch_spec.py +# ✓ All tests pass + +# Run one of the 4 research studies +python benchmark.py --config configs/study1_cutlass_numsplits.yaml +python benchmark.py --config configs/study2_hopper_head_count.yaml +python benchmark.py --config configs/study3_flashinfer_vs_cutlass.yaml +python benchmark.py --config configs/study4_reorder_threshold.yaml + +# Or run custom benchmarks +python benchmark.py \ + --backends flash flashinfer \ + --batch-specs "q2k" "8s1k" "2q2k_32s1k" \ + --output-csv results.csv +``` + +## Batch Specification Grammar + +Express complex workloads concisely: + +```python +"q2k" # 2048-token prefill +"8s1k" # 8 decode requests (1k KV cache each) +"2q2k_32s1k" # 2 prefills + 32 decodes +"spec4s1k" # 4-token speculative decode +"chunk8q16k" # Chunked 16k prefill +"2q2k_spec4s1k_32s1k" # Complex: 2 prefill + 1 spec + 32 decode +``` + +### Grammar Rules + +``` +Prefill: (?) q(k?) # q2k = 2048 tokens +Decode: (?) s(k?) # 8s1k = 8 x 1k KV +Speculative: (?) spec s(k?) # spec4s1k +Chunked: (?) chunk q(k?) 
# chunk8q16k +Mixed: Use _ to combine # 2q2k_32s1k + +'k' suffix = multiply by 1024 +``` + +## Research Studies + +The suite includes 4 pre-configured studies to answer key MLA optimization questions. Each study is a single YAML file you can run directly: + +### Study 1: CUTLASS MLA num-splits Optimization + +**Question:** Should we revert the CUTLASS MLA num-splits heuristic (PRs #24966, #25509)? + +```bash +python benchmark.py --config configs/study1_cutlass_numsplits.yaml +``` + +Tests CUTLASS MLA with different `num_kv_splits` values (1, 2, 4, 8, 16, 32) across various batch sizes and compares against auto-selection. + +### Study 2: FlashAttn MLA vs FlashMLA on Hopper + +**Question:** Does head count matter for FlashAttn MLA vs FlashMLA on Hopper GPUs? + +```bash +# Test with default head count (128) +python benchmark.py --config configs/study2_hopper_head_count.yaml + +# Test with different head counts +for heads in 16 32 64 128 256; do + python benchmark.py --config configs/study2_hopper_head_count.yaml \ + --num-q-heads $heads \ + --output-csv study2_heads_${heads}.csv +done +``` + +Compares FlashAttn MLA and FlashMLA performance with varying attention head counts. + +### Study 3: FlashInfer-MLA vs Optimized CUTLASS + +**Question:** Is FlashInfer-MLA better than CUTLASS MLA after num-splits optimization? + +```bash +python benchmark.py --config configs/study3_flashinfer_vs_cutlass.yaml +``` + +Compares FlashInfer-MLA against CUTLASS MLA with optimized `num_kv_splits` values. + +### Study 4: Reorder Batch Threshold Optimization (Decode vs Prefill Crossover) + +**Question:** At what query length does the prefill pipeline become faster than the decode pipeline? 
+ +**Methodology:** Reproduces the original `benchmark_mla_threshold.py` study using the new interface: +- For each query length (1-125), test BOTH decode and prefill pipelines +- Find the crossover point where prefill becomes faster +- Analyze how this varies across batch sizes (1-256) + +```bash +python benchmark.py --config configs/study4_reorder_threshold.yaml +``` + +Tests query lengths from 1-125 (fine-grained 1-16, step 2 for 17-64, step 4 for 65-125) across 9 batch sizes. For each query length, compares: +- **Decode pipeline**: `threshold >= query_length` +- **Prefill pipeline**: `threshold < query_length` + +Outputs the optimal threshold (last query length where decode is faster) for each batch size. + +--- + +## Universal Benchmark + +The `benchmark.py` script handles **all** backends - both standard attention and MLA. + +### Standard Attention (Flash/Triton/FlashInfer) + +```bash +python benchmark.py \ + --backends flash triton flashinfer \ + --batch-specs "q2k" "8s1k" "2q2k_32s1k" \ + --num-layers 10 \ + --repeats 5 \ + --output-csv results.csv +``` + +### MLA Backends + +```bash +# Compare all MLA backends +python benchmark.py \ + --backends cutlass_mla flashinfer_mla flash_attn_mla flashmla \ + --batch-specs "64s1k" "64s4k" \ + --output-csv mla_results.csv +``` + +### Parameter Sweeps + +#### CUTLASS MLA num-splits Optimization + +```bash +python benchmark.py \ + --backend cutlass_mla \ + --batch-specs "64s1k" "64s4k" "64s16k" \ + --num-splits 1 2 4 8 16 \ + --compare-auto \ + --output-json optimal_splits.json +``` + +**Answers:** What is the optimal `num_kv_splits` for CUTLASS MLA? + +#### Reorder Batch Threshold Optimization + +```bash +python benchmark.py \ + --backend flashmla \ + --batch-specs "spec4s1k" "spec8s2k" \ + --thresholds 1 4 16 64 256 512 \ + --output-csv threshold_sweep.csv +``` + +**Answers:** What's the optimal `reorder_batch_threshold` for speculative decoding? 
+ +### All Command-Line Options + +``` +--backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla, + # flashinfer_mla, flash_attn_mla, flashmla +--backend BACKEND # Single backend (alternative to --backends) +--batch-specs SPEC [SPEC ...] # Batch specifications (default: ["q2k", "8s1k"]) + +# Model configuration +--num-layers N # Number of layers (default: 10) +--head-dim N # Head dimension (default: 128) +--num-q-heads N # Query heads (default: 32) +--num-kv-heads N # KV heads (default: 8) +--block-size N # Block size (default: 16) + +# Benchmark settings +--device DEVICE # Device (default: cuda:0) +--repeats N # Repetitions (default: 1) +--warmup-iters N # Warmup iterations (default: 3) +--profile-memory # Profile memory usage + +# MLA-specific parameter sweeps +--num-splits N [N ...] # CUTLASS MLA: Test multiple num_kv_splits +--thresholds N [N ...] # FlashMLA/FlashAttn MLA: Test multiple thresholds +--compare-auto # CUTLASS MLA: Also test auto num_kv_splits + +# Output +--output-csv FILE # Save to CSV +--output-json FILE # Save to JSON +``` + +## Hardware Requirements + +| Backend | Hardware | +|---------|----------| +| Flash/Triton/FlashInfer | Any CUDA GPU | +| CUTLASS MLA | Blackwell (SM100+) | +| FlashAttn MLA | Hopper (SM90+) | +| FlashMLA | Hopper (SM90+) | +| FlashInfer-MLA | Any CUDA GPU | + +## Using MLA Runner Directly + +All MLA backends are available in `mla_runner.py`: + +```python +from mla_runner import ( + run_cutlass_mla_benchmark, + run_flashinfer_mla_benchmark, + run_flashattn_mla_benchmark, + run_flashmla_benchmark, +) +from common import BenchmarkConfig + +config = BenchmarkConfig( + backend="cutlass_mla", + batch_spec="64s4k", + num_layers=10, + head_dim=576, + num_q_heads=128, + num_kv_heads=1, + block_size=128, + device="cuda:0", + repeats=5, + warmup_iters=3, +) + +# CUTLASS MLA with specific num_kv_splits +result = run_cutlass_mla_benchmark(config, num_kv_splits=4) +print(f"Time: {result['mean']:.6f}s, Throughput: 
{result['throughput']:.1f} tok/s") + +# FlashInfer-MLA +result = run_flashinfer_mla_benchmark(config) + +# FlashAttn MLA (Hopper SM90+) +result = run_flashattn_mla_benchmark(config, reorder_batch_threshold=64) + +# FlashMLA (Hopper SM90+) +result = run_flashmla_benchmark(config, reorder_batch_threshold=64) +``` + +## Python API + +```python +from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats +from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter + +# Parse batch specs +requests = parse_batch_spec("2q2k_spec4s1k_32s1k") +print(format_batch_spec(requests)) +# "2 prefill (2x2k), 1 specdecode (1xq4s1k), 32 decode (32x1k)" + +# Get batch statistics +stats = get_batch_stats(requests) +print(f"Total tokens: {stats['total_tokens']}") +print(f"Num decode: {stats['num_decode']}, Num prefill: {stats['num_prefill']}") + +# Format results +formatter = ResultsFormatter() +formatter.save_csv(results, "output.csv") +formatter.save_json(results, "output.json") +``` + +## File Structure + +``` +attention_benchmarks/ +├── README.md # This file +│ +├── batch_spec.py # Grammar parser (tested) +├── common.py # Infrastructure +├── runner.py # Standard attention helpers +├── mla_runner.py # MLA helpers (ALL 4 backends) +├── test_batch_spec.py # Tests (all passing) +│ +├── benchmark.py # Universal benchmark script +│ +└── configs/ # Pre-configured studies + ├── study1_cutlass_numsplits.yaml # CUTLASS num-splits optimization + ├── study2_hopper_head_count.yaml # FlashAttn vs FlashMLA head count + ├── study3_flashinfer_vs_cutlass.yaml # FlashInfer vs optimized CUTLASS + └── study4_reorder_threshold.yaml # Reorder threshold optimization +``` + +## Tips + +**1. Warmup matters** - Use `--warmup-iters 10` for stable results + +**2. Multiple repeats** - Use `--repeats 20` for low variance + +**3. Save results** - Always use `--output-csv` or `--output-json` + +**4. Test incrementally** - Start with `--num-layers 1 --repeats 1` + +**5. 
Extended grammar** - Leverage spec decode, chunked prefill patterns + +**6. Parameter sweeps** - Use `--num-splits` or `--thresholds` to find optimal values + +## Troubleshooting + +**Import errors?** +```bash +source /path/to/vllm/.venv/bin/activate +``` + +**Backend not supported?** +- Check hardware requirements above +- Some backends need Hopper/Blackwell + +**OOM?** +- Reduce batch size: `"32s1k"` → `"16s1k"` +- Reduce sequence length: `"64s16k"` → `"64s4k"` + +## What's Included + +✅ Extended batch spec grammar with tests (all passing!) +✅ Universal benchmark script for all backends +✅ Standard attention support (Flash/Triton/FlashInfer) +✅ MLA runner with ALL 4 backends +✅ Parameter sweep modes (num-splits, thresholds) +✅ Rich console output + CSV/JSON export +✅ Pre-built configuration files (optional) + +**~5,000 lines of code, fully simplified, ready to benchmark!** 🚀 diff --git a/benchmarks/attention_benchmarks/__init__.py b/benchmarks/attention_benchmarks/__init__.py new file mode 100644 index 000000000000..617ea863499d --- /dev/null +++ b/benchmarks/attention_benchmarks/__init__.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""vLLM Attention Benchmarking Suite.""" + +from .batch_spec import ( + BatchRequest, + format_batch_spec, + get_batch_stats, + parse_batch_spec, + parse_manual_batch, + reorder_for_flashinfer, + split_by_type, +) +from .common import ( + BenchmarkConfig, + BenchmarkResult, + BenchmarkRunner, + MockLayer, + MockModelConfig, + ResultsFormatter, + get_attention_scale, + setup_mla_dims, +) + +__all__ = [ + # Batch specification + "BatchRequest", + "parse_batch_spec", + "parse_manual_batch", + "format_batch_spec", + "reorder_for_flashinfer", + "split_by_type", + "get_batch_stats", + # Benchmarking infrastructure + "BenchmarkConfig", + "BenchmarkResult", + "BenchmarkRunner", + "ResultsFormatter", + # Mock objects + "MockLayer", + "MockModelConfig", + # 
Utilities + "setup_mla_dims", + "get_attention_scale", +] diff --git a/benchmarks/attention_benchmarks/batch_spec.py b/benchmarks/attention_benchmarks/batch_spec.py new file mode 100644 index 000000000000..d412b09ad7c9 --- /dev/null +++ b/benchmarks/attention_benchmarks/batch_spec.py @@ -0,0 +1,325 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Extended batch specification grammar for attention benchmarks. + +Grammar (underscore-separated segments): + Prefill: (?) q(k?) (s(k?))? + Decode: (?) s(k?) + Spec decode: (?) spec s(k?) + Chunked prefill: (?) chunk q(k?) + + 'k' suffix multiplies by 1024 + +Examples: + q2k -> [(2048, 2048)] + 8s1k -> [(1, 1024)] * 8 + 2q1k_32s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 + spec4s1k -> [(4, 1024)] # 4-token speculative decode + chunk8q16k -> [(16384, 16384)] with chunking hint + 2q1ks2k_spec4s1k_32s1k -> [(1024, 2048)] * 2 + [(4, 1024)] + [(1, 1024)] * 32 +""" + +from collections import Counter +from dataclasses import dataclass +from typing import Optional + +import regex as re + + +@dataclass +class BatchRequest: + """Represents a single request in a batch.""" + + q_len: int # Query length + kv_len: int # KV cache length + is_speculative: bool = False # Is this speculative decoding? + spec_length: int = 0 # Number of speculative tokens (if speculative) + is_chunked: bool = False # Should use chunked prefill? 
+ chunk_size: Optional[int] = None # Chunk size for chunked prefill + + @property + def is_decode(self) -> bool: + """True if this is a decode request (q_len == 1).""" + return self.q_len == 1 and self.kv_len > 1 + + @property + def is_prefill(self) -> bool: + """True if this is a pure prefill (q_len == kv_len).""" + return self.q_len > 1 and self.kv_len == self.q_len + + @property + def is_extend(self) -> bool: + """True if this is context extension (q_len > 1, kv_len > q_len).""" + return self.q_len > 1 and self.kv_len > self.q_len + + @property + def context_len(self) -> int: + """Context length (KV cache - query).""" + return self.kv_len - self.q_len + + def as_tuple(self) -> tuple[int, int]: + """Return as (q_len, kv_len) tuple for compatibility.""" + return (self.q_len, self.kv_len) + + +def parse_manual_batch(batch_args: list[str]) -> list[BatchRequest]: + """ + Parse manual batch pairs ['q,kv', ...] into list of BatchRequest. + + Args: + batch_args: List of strings in format "q_len,kv_len" + + Returns: + List of BatchRequest objects + + Raises: + ValueError: If format is invalid or kv_len < q_len + """ + requests = [] + for s in batch_args: + try: + q_str, kv_str = s.split(",") + q, kv = int(q_str), int(kv_str) + if kv < q: + raise ValueError(f"kv_len ({kv}) must be >= q_len ({q})") + requests.append(BatchRequest(q_len=q, kv_len=kv)) + except Exception as e: + raise ValueError(f"Invalid batch pair '{s}': {e}") from e + return requests + + +def _parse_size(size_str: str, k_suffix: str) -> int: + """Parse size string with optional 'k' suffix.""" + size = int(size_str) + return size * 1024 if k_suffix == "k" else size + + +def parse_batch_spec(spec: str) -> list[BatchRequest]: + """ + Parse batch specification string into list of BatchRequest objects. 
+ + Args: + spec: Batch specification string (see module docstring for grammar) + + Returns: + List of BatchRequest objects + + Raises: + ValueError: If spec format is invalid + """ + requests = [] + + for seg in spec.split("_"): + # Try chunked prefill pattern: (?) chunk q(k?) + m = re.match(r"^(?:(\d+))?chunk(\d+)q(\d+)(k?)$", seg) + if m: + cnt = int(m.group(1)) if m.group(1) else 1 + chunk_size = int(m.group(2)) + q_len = _parse_size(m.group(3), m.group(4)) + requests.extend( + [ + BatchRequest( + q_len=q_len, + kv_len=q_len, + is_chunked=True, + chunk_size=chunk_size, + ) + ] + * cnt + ) + continue + + # Try speculative decode pattern: (?) spec s(k?) + m = re.match(r"^(?:(\d+))?spec(\d+)s(\d+)(k?)$", seg) + if m: + cnt = int(m.group(1)) if m.group(1) else 1 + spec_len = int(m.group(2)) + kv_len = _parse_size(m.group(3), m.group(4)) + requests.extend( + [ + BatchRequest( + q_len=spec_len, + kv_len=kv_len, + is_speculative=True, + spec_length=spec_len, + ) + ] + * cnt + ) + continue + + # Try prefill/extend pattern: (?) q(k?) (s(k?))? + m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg) + if m: + cnt = int(m.group(1)) if m.group(1) else 1 + q_len = _parse_size(m.group(2), m.group(3)) + kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len + requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt) + continue + + # Try decode pattern: (?) s(k?) + m = re.match(r"^(?:(\d+))?s(\d+)(k?)$", seg) + if m: + cnt = int(m.group(1)) if m.group(1) else 1 + kv_len = _parse_size(m.group(2), m.group(3)) + requests.extend([BatchRequest(q_len=1, kv_len=kv_len)] * cnt) + continue + + raise ValueError(f"Invalid batch spec segment: '{seg}'") + + return requests + + +def format_batch_spec(requests: list[BatchRequest]) -> str: + """ + Format list of BatchRequest into human-readable string. + + Groups requests by type and provides counts and sizes. 
+ + Args: + requests: List of BatchRequest objects + + Returns: + Formatted string describing the batch + """ + kinds = { + "prefill": [], + "extend": [], + "chunked_prefill": [], + "specdecode": [], + "decode": [], + "unknown": [], + } + + for req in requests: + tup = (req.q_len, req.kv_len) + if req.is_chunked: + kinds["chunked_prefill"].append(tup) + elif req.is_speculative: + kinds["specdecode"].append(tup) + elif req.is_prefill: + kinds["prefill"].append(tup) + elif req.is_extend: + kinds["extend"].append(tup) + elif req.is_decode: + kinds["decode"].append(tup) + else: + kinds["unknown"].append(tup) + + parts = [] + for kind in [ + "prefill", + "extend", + "chunked_prefill", + "specdecode", + "decode", + "unknown", + ]: + lst = kinds[kind] + if not lst: + continue + + cnt_total = len(lst) + ctr = Counter(lst) + inner = [] + + for (q, kv), cnt in ctr.items(): + if kind in ("prefill", "chunked_prefill"): + size = f"{q // 1024}k" if q % 1024 == 0 else str(q) + inner.append(f"{cnt}x{size}") + elif kind == "decode": + size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv) + inner.append(f"{cnt}x{size}") + else: # extend, specdecode, unknown + qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q) + kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv) + inner.append(f"{cnt}xq{qstr}s{kstr}") + + parts.append(f"{cnt_total} {kind} ({', '.join(inner)})") + + return ", ".join(parts) + + +def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]: + """ + Reorder requests for FlashInfer: decode first, then prefill. + + FlashInfer expects decode requests before prefill requests for + optimal performance. 
+ + Args: + requests: Original list of BatchRequest + + Returns: + Reordered list with decode requests first + """ + decodes = [r for r in requests if r.is_decode] + non_decodes = [r for r in requests if not r.is_decode] + return decodes + non_decodes + + +def split_by_type( + requests: list[BatchRequest], +) -> dict[str, list[BatchRequest]]: + """ + Split requests by type for analysis. + + Args: + requests: List of BatchRequest + + Returns: + Dict with keys: 'decode', 'prefill', 'extend', 'speculative', 'chunked' + """ + result = { + "decode": [], + "prefill": [], + "extend": [], + "speculative": [], + "chunked": [], + } + + for req in requests: + if req.is_chunked: + result["chunked"].append(req) + elif req.is_speculative: + result["speculative"].append(req) + elif req.is_decode: + result["decode"].append(req) + elif req.is_prefill: + result["prefill"].append(req) + elif req.is_extend: + result["extend"].append(req) + + return result + + +def get_batch_stats(requests: list[BatchRequest]) -> dict: + """ + Compute statistics about a batch. 
+ + Args: + requests: List of BatchRequest + + Returns: + Dict with batch statistics + """ + by_type = split_by_type(requests) + + return { + "total_requests": len(requests), + "num_decode": len(by_type["decode"]), + "num_prefill": len(by_type["prefill"]), + "num_extend": len(by_type["extend"]), + "num_speculative": len(by_type["speculative"]), + "num_chunked": len(by_type["chunked"]), + "total_tokens": sum(r.q_len for r in requests), + "total_kv_cache": sum(r.kv_len for r in requests), + "max_q_len": max((r.q_len for r in requests), default=0), + "max_kv_len": max((r.kv_len for r in requests), default=0), + "avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0, + "avg_kv_len": ( + sum(r.kv_len for r in requests) / len(requests) if requests else 0 + ), + } diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py new file mode 100644 index 000000000000..00ab98430835 --- /dev/null +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Universal vLLM Attention Benchmark + +Benchmark any attention backend with the extended grammar. +Supports standard attention (Flash/Triton/FlashInfer) and MLA backends. 
+ +Examples: + # Standard attention + python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8s1k" + + # MLA backends + python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64s1k" + + # CUTLASS num-splits sweep + python benchmark.py --backend cutlass_mla \ + --batch-specs "64s1k" \ + --num-splits 1 4 8 16 + + # Speculative decode threshold tuning + python benchmark.py --backend flashmla \ + --batch-specs "spec4s1k" \ + --thresholds 1 4 16 64 +""" + +import argparse +import sys +from dataclasses import replace +from pathlib import Path + +import yaml +from rich.console import Console +from tqdm import tqdm + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from batch_spec import parse_batch_spec +from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter + + +def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: + """Run standard attention benchmark (Flash/Triton/FlashInfer).""" + from runner import run_attention_benchmark_impl + + return run_attention_benchmark_impl(config) + + +def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult: + """Run MLA benchmark with appropriate backend.""" + from mla_runner import ( + run_cutlass_mla_benchmark, + run_flashattn_mla_benchmark, + run_flashinfer_mla_benchmark, + run_flashmla_benchmark, + ) + + backend_map = { + "cutlass_mla": run_cutlass_mla_benchmark, + "flashinfer_mla": run_flashinfer_mla_benchmark, + "flash_attn_mla": run_flashattn_mla_benchmark, + "flashmla": run_flashmla_benchmark, + } + + runner = backend_map[config.backend] + result_dict = runner(config, **kwargs) + + return BenchmarkResult( + config=config, + mean_time=result_dict["mean"], + std_time=result_dict["std"], + min_time=result_dict["min"], + max_time=result_dict["max"], + throughput_tokens_per_sec=result_dict["throughput"], + ) + + +def load_config_from_yaml(config_path: str) -> dict: + """Load configuration from YAML file.""" + with 
open(config_path) as f: + return yaml.safe_load(f) + + +def main(): + parser = argparse.ArgumentParser( + description="Universal vLLM attention benchmark", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # Config file + parser.add_argument( + "--config", + help="Path to YAML config file (overrides other args)", + ) + + # Backend selection + parser.add_argument( + "--backends", + nargs="+", + help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, " + "flashinfer_mla, flash_attn_mla, flashmla)", + ) + parser.add_argument( + "--backend", + help="Single backend (alternative to --backends)", + ) + + # Batch specifications + parser.add_argument( + "--batch-specs", + nargs="+", + default=["q2k", "8s1k"], + help="Batch specifications using extended grammar", + ) + + # Model config + parser.add_argument("--num-layers", type=int, default=10, help="Number of layers") + parser.add_argument("--head-dim", type=int, default=128, help="Head dimension") + parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads") + parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads") + parser.add_argument("--block-size", type=int, default=16, help="Block size") + + # Benchmark settings + parser.add_argument("--device", default="cuda:0", help="Device") + parser.add_argument("--repeats", type=int, default=1, help="Repetitions") + parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations") + parser.add_argument("--profile-memory", action="store_true", help="Profile memory") + + # MLA-specific options + parser.add_argument( + "--num-splits", + type=int, + nargs="+", + help="CUTLASS MLA: Test multiple num_kv_splits values", + ) + parser.add_argument( + "--thresholds", + type=int, + nargs="+", + help="FlashMLA/FlashAttn MLA: Test multiple reorder_batch_threshold values", + ) + parser.add_argument( + "--compare-auto", + action="store_true", + help="CUTLASS MLA: Also test auto num_kv_splits", 
+ ) + + # Output + parser.add_argument("--output-csv", help="Save to CSV") + parser.add_argument("--output-json", help="Save to JSON") + + args = parser.parse_args() + + console = Console() + console.print("[bold cyan]vLLM Attention Benchmark[/]") + + # Load config from YAML if provided + if args.config: + console.print(f"[yellow]Loading config from: {args.config}[/]") + yaml_config = load_config_from_yaml(args.config) + + # Show description if available + if "description" in yaml_config: + console.print(f"[dim]{yaml_config['description']}[/]") + + # Override args with YAML values + # (YAML takes precedence unless CLI arg was explicitly set) + # Backend(s) + if "backend" in yaml_config: + args.backend = yaml_config["backend"] + args.backends = None + elif "backends" in yaml_config: + args.backends = yaml_config["backends"] + args.backend = None + + # Check for special modes + if "mode" in yaml_config: + args.mode = yaml_config["mode"] + else: + args.mode = None + + # Batch specs and sizes + if "batch_specs" in yaml_config: + args.batch_specs = yaml_config["batch_specs"] + if "batch_sizes" in yaml_config: + args.batch_sizes = yaml_config["batch_sizes"] + else: + args.batch_sizes = None + + # Model config + if "model" in yaml_config: + model = yaml_config["model"] + args.num_layers = model.get("num_layers", args.num_layers) + args.head_dim = model.get("head_dim", args.head_dim) + args.num_q_heads = model.get("num_q_heads", args.num_q_heads) + args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads) + args.block_size = model.get("block_size", args.block_size) + + # Benchmark settings + if "benchmark" in yaml_config: + bench = yaml_config["benchmark"] + args.device = bench.get("device", args.device) + args.repeats = bench.get("repeats", args.repeats) + args.warmup_iters = bench.get("warmup_iters", args.warmup_iters) + args.profile_memory = bench.get("profile_memory", args.profile_memory) + + # MLA-specific sweeps + if "num_splits" in yaml_config: + 
args.num_splits = yaml_config["num_splits"] + if "thresholds" in yaml_config: + args.thresholds = yaml_config["thresholds"] + if "compare_auto" in yaml_config: + args.compare_auto = yaml_config["compare_auto"] + + # Output + if "output" in yaml_config: + output = yaml_config["output"] + if "csv" in output and not args.output_csv: + args.output_csv = output["csv"] + if "json" in output and not args.output_json: + args.output_json = output["json"] + + console.print() + + # Determine backends + backends = args.backends or ([args.backend] if args.backend else ["flash"]) + console.print(f"Backends: {', '.join(backends)}") + console.print(f"Batch specs: {', '.join(args.batch_specs)}") + console.print() + + # Run benchmarks + all_results = [] + + # Handle special mode: decode_vs_prefill comparison + if hasattr(args, "mode") and args.mode == "decode_vs_prefill": + console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]") + console.print( + "[dim]For each query length, testing both decode and prefill pipelines[/]" + ) + + # Extract batch sizes from config + batch_sizes = getattr(args, "batch_sizes", [1]) + + # Calculate total benchmarks: batch_specs * batch_sizes * 2 (decode + prefill) + total = len(args.batch_specs) * len(batch_sizes) * 2 + + with tqdm(total=total, desc="Benchmarking") as pbar: + for batch_size in batch_sizes: + for spec in args.batch_specs: + # Parse the batch spec to get query length + requests = parse_batch_spec(spec) + if not requests: + console.print( + f"[red]Error: Could not parse batch spec '{spec}'[/]" + ) + continue + + # Get query length from first request + query_length = requests[0].q_len + + # Create batch spec for this batch size + # For batch_size > 1, we need to prepend the count + batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec + + backend = backends[0] # Use first backend (should only be one) + + # Test 1: Decode pipeline (threshold >= query_length) + decode_threshold = query_length + config_decode = 
BenchmarkConfig( + backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}", + batch_spec=batch_spec, + num_layers=args.num_layers, + head_dim=args.head_dim, + num_q_heads=args.num_q_heads, + num_kv_heads=args.num_kv_heads, + block_size=args.block_size, + device=args.device, + repeats=args.repeats, + warmup_iters=args.warmup_iters, + profile_memory=args.profile_memory, + ) + + try: + clean_config = replace(config_decode, backend=backend) + result = run_mla_benchmark( + clean_config, reorder_batch_threshold=decode_threshold + ) + result = replace(result, config=config_decode) + all_results.append(result) + except Exception as e: + console.print( + f"[red]Error decode qlen={query_length} " + f"bs={batch_size}: {e}[/]" + ) + result = BenchmarkResult( + config=config_decode, + mean_time=float("inf"), + std_time=0, + min_time=float("inf"), + max_time=float("inf"), + error=str(e), + ) + all_results.append(result) + + pbar.update(1) + + # Test 2: Prefill pipeline (threshold < query_length) + if query_length > 1: + prefill_threshold = query_length - 1 + config_prefill = BenchmarkConfig( + backend=f"{backend}_prefill_qlen{query_length}_bs{batch_size}", + batch_spec=batch_spec, + num_layers=args.num_layers, + head_dim=args.head_dim, + num_q_heads=args.num_q_heads, + num_kv_heads=args.num_kv_heads, + block_size=args.block_size, + device=args.device, + repeats=args.repeats, + warmup_iters=args.warmup_iters, + profile_memory=args.profile_memory, + ) + + try: + clean_config = replace(config_prefill, backend=backend) + result = run_mla_benchmark( + clean_config, reorder_batch_threshold=prefill_threshold + ) + result = replace(result, config=config_prefill) + all_results.append(result) + except Exception as e: + console.print( + f"[red]Error prefill qlen={query_length} " + f"bs={batch_size}: {e}[/]" + ) + result = BenchmarkResult( + config=config_prefill, + mean_time=float("inf"), + std_time=0, + min_time=float("inf"), + max_time=float("inf"), + error=str(e), + ) + 
all_results.append(result) + + pbar.update(1) + + # Display decode vs prefill results + console.print("\n[bold green]Decode vs Prefill Results:[/]") + + # Group by batch size + by_batch_size = {} + for r in all_results: + if r.success: + # Extract batch size from backend name + parts = r.config.backend.split("_") + bs_part = [p for p in parts if p.startswith("bs")] + if bs_part: + bs = int(bs_part[0][2:]) + if bs not in by_batch_size: + by_batch_size[bs] = [] + by_batch_size[bs].append(r) + + # For each batch size, analyze crossover point + for bs in sorted(by_batch_size.keys()): + console.print(f"\n[bold cyan]Batch size: {bs}[/]") + results = by_batch_size[bs] + + # Group by query length + by_qlen = {} + for r in results: + parts = r.config.backend.split("_") + qlen_part = [p for p in parts if p.startswith("qlen")] + if qlen_part: + qlen = int(qlen_part[0][4:]) + if qlen not in by_qlen: + by_qlen[qlen] = {} + + pipeline = "decode" if "decode" in r.config.backend else "prefill" + by_qlen[qlen][pipeline] = r + + # Find crossover point + last_decode_faster = None + for qlen in sorted(by_qlen.keys()): + pipelines = by_qlen[qlen] + if "decode" in pipelines and "prefill" in pipelines: + decode_time = pipelines["decode"].mean_time + prefill_time = pipelines["prefill"].mean_time + faster = "decode" if decode_time < prefill_time else "prefill" + + speedup = ( + prefill_time / decode_time + if decode_time < prefill_time + else decode_time / prefill_time + ) + + console.print( + f" qlen={qlen:3d}: decode={decode_time:.6f}s, " + f"prefill={prefill_time:.6f}s -> " + f"[bold]{faster}[/] ({speedup:.2f}x)" + ) + + if faster == "decode": + last_decode_faster = qlen + + if last_decode_faster is not None: + optimal_threshold = last_decode_faster + console.print( + f"\n [bold green]Optimal threshold for batch_size={bs}: " + f"{optimal_threshold}[/]" + ) + console.print( + f" [dim](Use decode pipeline for query_length <= " + f"{optimal_threshold})[/]" + ) + else: + console.print( + 
f"\n [yellow]Prefill always faster for batch_size={bs}[/]" + ) + + # Handle special cases: num-splits sweep or threshold sweep + elif args.num_splits or args.thresholds: + # Sweep mode + sweep_param = "num_splits" if args.num_splits else "thresholds" + sweep_values = args.num_splits or args.thresholds + + if args.compare_auto and args.num_splits: + sweep_values = list(sweep_values) + ["auto"] + + console.print(f"[yellow]Sweep mode: testing {sweep_param} = {sweep_values}[/]") + + total = len(backends) * len(args.batch_specs) * len(sweep_values) + + with tqdm(total=total, desc="Benchmarking") as pbar: + for backend in backends: + for spec in args.batch_specs: + for value in sweep_values: + # Create config + config = BenchmarkConfig( + backend=f"{backend}_{sweep_param}_{value}", + batch_spec=spec, + num_layers=args.num_layers, + head_dim=args.head_dim, + num_q_heads=args.num_q_heads, + num_kv_heads=args.num_kv_heads, + block_size=args.block_size, + device=args.device, + repeats=args.repeats, + warmup_iters=args.warmup_iters, + profile_memory=args.profile_memory, + ) + + try: + # Create a clean config with just the backend name + # for the actual benchmark but keep the full name + # with sweep params in the result + clean_config = replace(config, backend=backend) + + if args.num_splits: + # CUTLASS num_kv_splits + num_splits = None if value == "auto" else value + result = run_mla_benchmark( + clean_config, num_kv_splits=num_splits + ) + else: + # Threshold sweep + result = run_mla_benchmark( + clean_config, reorder_batch_threshold=value + ) + + # Replace the result's config with the one that has + # the sweep params in the name + result = replace(result, config=config) + all_results.append(result) + except Exception as e: + console.print( + f"[red]Error {backend} {spec} {sweep_param}=" + f"{value}: {e}[/]" + ) + result = BenchmarkResult( + config=config, + mean_time=float("inf"), + std_time=0, + min_time=float("inf"), + max_time=float("inf"), + error=str(e), + ) + 
all_results.append(result) + + pbar.update(1) + + # Display sweep results + console.print("\n[bold green]Sweep Results:[/]") + backend_names = [ + f"{b}_{sweep_param}_{v}" for b in backends for v in sweep_values + ] + formatter = ResultsFormatter(console) + formatter.print_table(all_results, backend_names) + + # Show optimal + console.print(f"\n[bold cyan]Optimal {sweep_param} per batch spec:[/]") + by_spec = {} + for r in all_results: + if r.success: + spec = r.config.batch_spec + if spec not in by_spec: + by_spec[spec] = [] + by_spec[spec].append(r) + + for spec in sorted(by_spec.keys()): + results = by_spec[spec] + best = min(results, key=lambda r: r.mean_time) + console.print( + f" {spec}: [bold green]{best.config.backend}[/] " + f"({best.mean_time:.6f}s)" + ) + + else: + # Normal mode: compare backends + total = len(backends) * len(args.batch_specs) + + with tqdm(total=total, desc="Benchmarking") as pbar: + for spec in args.batch_specs: + for backend in backends: + config = BenchmarkConfig( + backend=backend, + batch_spec=spec, + num_layers=args.num_layers, + head_dim=args.head_dim, + num_q_heads=args.num_q_heads, + num_kv_heads=args.num_kv_heads, + block_size=args.block_size, + device=args.device, + repeats=args.repeats, + warmup_iters=args.warmup_iters, + profile_memory=args.profile_memory, + ) + + try: + # Determine if MLA backend + if backend in [ + "cutlass_mla", + "flashinfer_mla", + "flash_attn_mla", + "flashmla", + ]: + result = run_mla_benchmark(config) + else: + result = run_standard_attention_benchmark(config) + + all_results.append(result) + except Exception as e: + console.print(f"[red]Error {backend} {spec}: {e}[/]") + import traceback + + traceback.print_exc() + result = BenchmarkResult( + config=config, + mean_time=float("inf"), + std_time=0, + min_time=float("inf"), + max_time=float("inf"), + error=str(e), + ) + all_results.append(result) + + pbar.update(1) + + # Display results + console.print("\n[bold green]Results:[/]") + formatter = 
ResultsFormatter(console) + formatter.print_table(all_results, backends) + + # Save results + if all_results: + formatter = ResultsFormatter(console) + if args.output_csv: + formatter.save_csv(all_results, args.output_csv) + if args.output_json: + formatter.save_json(all_results, args.output_json) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py new file mode 100644 index 000000000000..b4adbf05821b --- /dev/null +++ b/benchmarks/attention_benchmarks/common.py @@ -0,0 +1,416 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Common utilities for attention benchmarking.""" + +import csv +import json +import math +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Optional + +import numpy as np +import torch +from rich.console import Console +from rich.table import Table + +# Mock classes for vLLM attention infrastructure + + +class MockHfConfig: + """Mock HuggingFace config that satisfies vLLM's requirements.""" + + def __init__(self, mla_dims: dict): + self.num_attention_heads = mla_dims["num_q_heads"] + self.num_key_value_heads = mla_dims["num_kv_heads"] + self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"] + self.model_type = "deepseek_v2" + self.is_encoder_decoder = False + + def get_text_config(self): + return self + + +class MockLayer: + """Mock attention layer with scale parameters.""" + + def __init__(self, device: torch.device): + self._k_scale = torch.tensor(1.0, device=device) + self._v_scale = torch.tensor(1.0, device=device) + self._q_scale = torch.tensor(1.0, device=device) + # Scalar floats for kernels that need them + self._k_scale_float = float(self._k_scale.item()) + self._v_scale_float = float(self._v_scale.item()) + self._q_scale_float = float(self._q_scale.item()) + + +class MockModelConfig: + """Mock model 
configuration.""" + + def __init__( + self, + num_q_heads: int, + num_kv_heads: int, + head_dim: int, + dtype: torch.dtype = torch.float16, + max_model_len: int = 32768, + ): + self._n_q = num_q_heads + self._n_kv = num_kv_heads + self._d = head_dim + self.dtype = dtype + self.max_model_len = max_model_len + + def get_num_attention_heads(self, _=None) -> int: + return self._n_q + + def get_num_kv_heads(self, _=None) -> int: + return self._n_kv + + def get_head_size(self) -> int: + return self._d + + +class MockParallelConfig: + """Mock parallel configuration.""" + + pass + + +class MockCompilationConfig: + """Mock compilation configuration.""" + + def __init__(self): + self.full_cuda_graph = False + self.static_forward_context = {} + + +class MockVLLMConfig: + """Mock VLLM configuration.""" + + def __init__(self): + self.compilation_config = MockCompilationConfig() + + +class MockRunner: + """Mock GPU runner for metadata builders.""" + + def __init__( + self, + seq_lens: np.ndarray, + query_start_locs: np.ndarray, + device: torch.device, + num_q_heads: int, + num_kv_heads: int, + head_dim: int, + dtype: torch.dtype, + ): + self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype) + self.parallel_config = MockParallelConfig() + self.vllm_config = MockVLLMConfig() + self.seq_lens_np = seq_lens + self.query_start_loc_np = query_start_locs + self.device = device + self.attention_chunk_size = None + self.num_query_heads = num_q_heads + self.num_kv_heads = num_kv_heads + self.dtype = dtype + + +@dataclass +class BenchmarkConfig: + """Configuration for a single benchmark run.""" + + backend: str + batch_spec: str + num_layers: int + head_dim: int + num_q_heads: int + num_kv_heads: int + block_size: int + device: str + dtype: torch.dtype = torch.float16 + repeats: int = 1 + warmup_iters: int = 3 + profile_memory: bool = False + use_cuda_graphs: bool = False + + # MLA-specific + kv_lora_rank: Optional[int] = None + qk_nope_head_dim: Optional[int] = 
None + qk_rope_head_dim: Optional[int] = None + v_head_dim: Optional[int] = None + + # Backend-specific tuning + num_kv_splits: Optional[int] = None # CUTLASS MLA + reorder_batch_threshold: Optional[int] = None # FlashAttn MLA, FlashMLA + + +@dataclass +class BenchmarkResult: + """Results from a single benchmark run.""" + + config: BenchmarkConfig + mean_time: float # seconds + std_time: float # seconds + min_time: float # seconds + max_time: float # seconds + throughput_tokens_per_sec: Optional[float] = None + memory_allocated_mb: Optional[float] = None + memory_reserved_mb: Optional[float] = None + error: Optional[str] = None + + @property + def success(self) -> bool: + """Whether benchmark completed successfully.""" + return self.error is None + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for serialization.""" + return { + "config": asdict(self.config), + "mean_time": self.mean_time, + "std_time": self.std_time, + "min_time": self.min_time, + "max_time": self.max_time, + "throughput_tokens_per_sec": self.throughput_tokens_per_sec, + "memory_allocated_mb": self.memory_allocated_mb, + "memory_reserved_mb": self.memory_reserved_mb, + "error": self.error, + } + + +class BenchmarkRunner: + """Base class for running attention benchmarks.""" + + def __init__(self, config: BenchmarkConfig): + self.config = config + self.device = torch.device(config.device) + torch.cuda.set_device(self.device) + + def run(self, **kwargs) -> BenchmarkResult: + """ + Run benchmark with current configuration. + + Returns: + BenchmarkResult with timing and memory statistics + """ + raise NotImplementedError + + def _time_kernel(self, fn, warmup: int = 3, repeats: int = 10) -> dict: + """ + Time a kernel function with warmup and multiple repeats. 
+ + Args: + fn: Callable to time + warmup: Number of warmup iterations + repeats: Number of measurement iterations + + Returns: + Dict with timing statistics + """ + # Warmup + for _ in range(warmup): + fn() + torch.cuda.synchronize() + + # Measure + times = [] + for _ in range(repeats): + torch.cuda.synchronize() + start = time.time() + fn() + torch.cuda.synchronize() + times.append(time.time() - start) + + return { + "mean": np.mean(times), + "std": np.std(times), + "min": np.min(times), + "max": np.max(times), + } + + def _get_memory_stats(self) -> dict: + """Get current CUDA memory statistics.""" + return { + "allocated_mb": torch.cuda.memory_allocated(self.device) / 1024**2, + "reserved_mb": torch.cuda.memory_reserved(self.device) / 1024**2, + } + + +class ResultsFormatter: + """Format and display benchmark results.""" + + def __init__(self, console: Optional[Console] = None): + self.console = console or Console() + + def print_table( + self, + results: list[BenchmarkResult], + backends: list[str], + compare_to_fastest: bool = True, + ): + """ + Print results as a rich table. 
+ + Args: + results: List of BenchmarkResult + backends: List of backend names being compared + compare_to_fastest: Show percentage comparison to fastest + """ + # Group by batch spec + by_spec = {} + for r in results: + spec = r.config.batch_spec + if spec not in by_spec: + by_spec[spec] = {} + by_spec[spec][r.config.backend] = r + + table = Table(title="Attention Benchmark Results") + table.add_column("Batch Spec", no_wrap=True) + + multi = len(backends) > 1 + for backend in backends: + # Time column + col_time = f"{backend} Time (s)" + table.add_column(col_time, justify="right", no_wrap=True) + if multi and compare_to_fastest: + # Relative performance column + col_rel = f"{backend} vs Fastest" + table.add_column(col_rel, justify="right", no_wrap=True) + + # Add rows + for spec in sorted(by_spec.keys()): + spec_results = by_spec[spec] + times = {b: r.mean_time for b, r in spec_results.items() if r.success} + best_time = min(times.values()) if times else 0.0 + + row = [spec] + for backend in backends: + if backend in spec_results: + r = spec_results[backend] + if r.success: + row.append(f"{r.mean_time:.6f}") + if multi and compare_to_fastest: + pct = ( + (r.mean_time / best_time * 100) if best_time > 0 else 0 + ) + pct_str = f"{pct:.1f}%" + if r.mean_time == best_time: + pct_str = f"[bold green]{pct_str}[/]" + row.append(pct_str) + else: + row.append("[red]ERROR[/]") + if multi and compare_to_fastest: + row.append("-") + else: + row.append("-") + if multi and compare_to_fastest: + row.append("-") + + table.add_row(*row) + + self.console.print(table) + + def save_csv(self, results: list[BenchmarkResult], path: str): + """Save results to CSV file.""" + if not results: + return + + path_obj = Path(path) + path_obj.parent.mkdir(parents=True, exist_ok=True) + + with open(path, "w", newline="") as f: + writer = csv.DictWriter( + f, + fieldnames=[ + "backend", + "batch_spec", + "num_layers", + "mean_time", + "std_time", + "throughput", + "memory_mb", + ], + ) + 
writer.writeheader() + for r in results: + writer.writerow( + { + "backend": r.config.backend, + "batch_spec": r.config.batch_spec, + "num_layers": r.config.num_layers, + "mean_time": r.mean_time, + "std_time": r.std_time, + "throughput": r.throughput_tokens_per_sec or 0, + "memory_mb": r.memory_allocated_mb or 0, + } + ) + + self.console.print(f"[green]Saved CSV results to {path}[/]") + + def save_json(self, results: list[BenchmarkResult], path: str): + """Save results to JSON file.""" + path_obj = Path(path) + path_obj.parent.mkdir(parents=True, exist_ok=True) + + data = [r.to_dict() for r in results] + with open(path, "w") as f: + json.dump(data, f, indent=2, default=str) + + self.console.print(f"[green]Saved JSON results to {path}[/]") + + +def setup_mla_dims(model_name: str = "deepseek-v3") -> dict: + """ + Get MLA dimensions for known models. + + Args: + model_name: Model identifier + + Returns: + Dict with MLA dimension configuration + """ + configs = { + "deepseek-v2": { + "kv_lora_rank": 512, + "qk_nope_head_dim": 128, + "qk_rope_head_dim": 64, + "v_head_dim": 128, + "num_q_heads": 128, + "num_kv_heads": 1, + "head_dim": 576, + }, + "deepseek-v3": { + "kv_lora_rank": 512, + "qk_nope_head_dim": 128, + "qk_rope_head_dim": 64, + "v_head_dim": 128, + "num_q_heads": 128, + "num_kv_heads": 1, + "head_dim": 576, + }, + "deepseek-v2-lite": { + "kv_lora_rank": 512, + "qk_nope_head_dim": 128, + "qk_rope_head_dim": 64, + "v_head_dim": 128, + "num_q_heads": 16, + "num_kv_heads": 1, + "head_dim": 576, + }, + } + + if model_name not in configs: + raise ValueError( + f"Unknown model '{model_name}'. 
Known models: {list(configs.keys())}" + ) + + return configs[model_name] + + +def get_attention_scale(head_dim: int) -> float: + """Compute attention scale factor (1/sqrt(d)).""" + return 1.0 / math.sqrt(head_dim) diff --git a/benchmarks/attention_benchmarks/configs/mla_decode.yaml b/benchmarks/attention_benchmarks/configs/mla_decode.yaml new file mode 100644 index 000000000000..d8e06e7ba5ba --- /dev/null +++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml @@ -0,0 +1,61 @@ +# MLA decode-only benchmark configuration + +model: + name: "deepseek-v3" + num_layers: 60 + num_q_heads: 128 + num_kv_heads: 1 # MLA uses single latent KV + head_dim: 576 + kv_lora_rank: 512 + qk_nope_head_dim: 128 + qk_rope_head_dim: 64 + v_head_dim: 128 + block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128 + +batch_specs: + # Small batches, varying sequence lengths + - "16s512" # 16 requests, 512 KV cache + - "16s1k" # 16 requests, 1k KV cache + - "16s2k" # 16 requests, 2k KV cache + - "16s4k" # 16 requests, 4k KV cache + + # Medium batches + - "32s1k" # 32 requests, 1k KV cache + - "32s2k" # 32 requests, 2k KV cache + - "32s4k" # 32 requests, 4k KV cache + - "32s8k" # 32 requests, 8k KV cache + + # Large batches + - "64s1k" # 64 requests, 1k KV cache + - "64s2k" # 64 requests, 2k KV cache + - "64s4k" # 64 requests, 4k KV cache + - "64s8k" # 64 requests, 8k KV cache + + # Very large batches + - "128s1k" # 128 requests, 1k KV cache + - "128s2k" # 128 requests, 2k KV cache + + # Long context + - "32s16k" # 32 requests, 16k KV cache + - "32s32k" # 32 requests, 32k KV cache + +backends: + - cutlass_mla + - flashinfer_mla + - flash_attn_mla # Hopper only + - flashmla # Hopper only + +device: "cuda:0" +repeats: 5 +warmup_iters: 3 +profile_memory: true + +# Backend-specific tuning +cutlass_mla: + num_kv_splits: auto # or specific value like 4, 8, 16 + +flash_attn_mla: + reorder_batch_threshold: 512 + +flashmla: + reorder_batch_threshold: 1 diff --git 
a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml new file mode 100644 index 000000000000..b75fb99e4cbd --- /dev/null +++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml @@ -0,0 +1,60 @@ +# MLA mixed batch benchmark (prefill + decode) +# Tests chunked prefill performance + +model: + name: "deepseek-v3" + num_layers: 60 + num_q_heads: 128 + num_kv_heads: 1 + head_dim: 576 + kv_lora_rank: 512 + qk_nope_head_dim: 128 + qk_rope_head_dim: 64 + v_head_dim: 128 + block_size: 128 + +batch_specs: + # Small prefill + decode + - "1q1k_8s1k" # 1 prefill + 8 decode + - "2q2k_16s1k" # 2 prefill + 16 decode + - "4q1k_32s2k" # 4 prefill + 32 decode + + # Medium prefill + decode + - "2q4k_32s2k" # 2 medium prefill + 32 decode + - "4q4k_64s2k" # 4 medium prefill + 64 decode + - "8q2k_64s4k" # 8 prefill + 64 decode + + # Large prefill + decode (chunked prefill stress test) + - "2q8k_32s1k" # 2 large prefill + 32 decode + - "1q16k_16s2k" # 1 very large prefill + 16 decode + - "2q16k_32s4k" # 2 very large prefill + 32 decode + + # Context extension + decode + - "2q1ks2k_16s1k" # 2 extend + 16 decode + - "4q2ks4k_32s2k" # 4 extend + 32 decode + - "2q1ks8k_32s2k" # 2 large extend + 32 decode + + # Explicitly chunked prefill + - "chunk4q8k" # 8k prefill with chunking hint + - "chunk8q16k" # 16k prefill with chunking hint + - "2chunk4q8k_32s2k" # 2 chunked prefill + 32 decode + + # High decode ratio (realistic serving) + - "1q2k_63s1k" # 1 prefill + 63 decode + - "2q2k_62s2k" # 2 prefill + 62 decode + - "4q4k_60s4k" # 4 prefill + 60 decode + +backends: + - cutlass_mla + - flashinfer_mla + - flash_attn_mla # Hopper only + - flashmla # Hopper only + +device: "cuda:0" +repeats: 5 +warmup_iters: 3 +profile_memory: true + +# Analyze chunked prefill workspace size impact +chunked_prefill: + test_workspace_sizes: [4096, 8192, 16384, 32768, 65536] diff --git 
a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml new file mode 100644 index 000000000000..0ffaee1860f6 --- /dev/null +++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml @@ -0,0 +1,66 @@ +# Speculative decoding benchmark configuration +# Tests reorder_batch_threshold optimization + +model: + name: "deepseek-v3" + num_layers: 60 + num_q_heads: 128 + num_kv_heads: 1 + head_dim: 576 + kv_lora_rank: 512 + qk_nope_head_dim: 128 + qk_rope_head_dim: 64 + v_head_dim: 128 + +batch_specs: + # Pure speculative decode (K-token verification) + - "spec2s1k" # 2-token spec, 1k KV + - "spec4s1k" # 4-token spec, 1k KV + - "spec8s1k" # 8-token spec, 1k KV + - "spec16s1k" # 16-token spec, 1k KV + + # Speculative with different context lengths + - "spec4s2k" # 4-token spec, 2k KV + - "spec4s4k" # 4-token spec, 4k KV + - "spec8s2k" # 8-token spec, 2k KV + - "spec8s4k" # 8-token spec, 4k KV + + # Mixed: speculative + regular decode + - "32spec4s1k" # 32 spec requests + - "16spec4s1k_16s1k" # 16 spec + 16 regular + - "8spec8s2k_24s2k" # 8 spec (8-tok) + 24 regular + + # Mixed: speculative + prefill + decode + - "2q1k_16spec4s1k_16s1k" # 2 prefill + 16 spec + 16 decode + - "4q2k_32spec4s2k_32s2k" # 4 prefill + 32 spec + 32 decode + + # Large batches with speculation + - "64spec4s1k" # 64 spec requests + - "32spec8s2k" # 32 spec (8-token) + - "16spec16s4k" # 16 spec (16-token) + +# Backends that support query length > 1 +backends: + - flash_attn_mla # reorder_batch_threshold = 512 + - flashmla # reorder_batch_threshold = 1 (tunable) + +# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism +# - flashinfer_mla + +device: "cuda:0" +repeats: 10 # More repeats for statistical significance +warmup_iters: 5 +profile_memory: false + +# Test these threshold values for optimization +reorder_batch_thresholds: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 
512 diff --git a/benchmarks/attention_benchmarks/configs/standard_attention.yaml b/benchmarks/attention_benchmarks/configs/standard_attention.yaml new file mode 100644 index 000000000000..d1c5056c8fb1 --- /dev/null +++ b/benchmarks/attention_benchmarks/configs/standard_attention.yaml @@ -0,0 +1,40 @@ +# Standard attention backend benchmark configuration + +model: + num_layers: 32 + num_q_heads: 32 + num_kv_heads: 8 # GQA with 4:1 ratio + head_dim: 128 + block_size: 16 + +batch_specs: + # Pure prefill + - "q512" # Small prefill (512 tokens) + - "q2k" # Medium prefill (2048 tokens) + - "q4k" # Large prefill (4096 tokens) + - "q8k" # Very large prefill (8192 tokens) + + # Pure decode + - "8s1k" # 8 requests, 1k KV cache each + - "16s2k" # 16 requests, 2k KV cache each + - "32s1k" # 32 requests, 1k KV cache each + - "64s4k" # 64 requests, 4k KV cache each + + # Mixed prefill/decode + - "2q2k_8s1k" # 2 prefill + 8 decode + - "4q1k_16s2k" # 4 prefill + 16 decode + - "2q4k_32s1k" # 2 large prefill + 32 decode + + # Context extension + - "q1ks2k" # 1k query, 2k KV (chunked prefill) + - "2q1ks4k" # 2 requests: 1k query, 4k KV + +backends: + - flash + - triton + - flashinfer + +device: "cuda:0" +repeats: 5 +warmup_iters: 3 +profile_memory: false diff --git a/benchmarks/attention_benchmarks/configs/study1_cutlass_numsplits.yaml b/benchmarks/attention_benchmarks/configs/study1_cutlass_numsplits.yaml new file mode 100644 index 000000000000..cdc5d9e0edf8 --- /dev/null +++ b/benchmarks/attention_benchmarks/configs/study1_cutlass_numsplits.yaml @@ -0,0 +1,54 @@ +# Study 1: Should we revert CUTLASS MLA num-splits heuristic? +# Question: What is the optimal num_kv_splits for different batch sizes? 
+# Related PRs: #24966, #25509 + +description: "CUTLASS MLA num-splits optimization study" + +# Single backend for this study +backend: cutlass_mla + +# Test various decode batch sizes with different KV cache lengths +batch_specs: + - "32s1k" # 32 decode requests, 1k KV cache + - "64s1k" # 64 decode requests, 1k KV cache + - "64s4k" # 64 decode requests, 4k KV cache + - "64s16k" # 64 decode requests, 16k KV cache + - "128s1k" # 128 decode requests, 1k KV cache + - "128s4k" # 128 decode requests, 4k KV cache + +# Sweep num_kv_splits values +num_splits: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + +# Compare against auto-selected num_kv_splits +compare_auto: true + +# Model configuration (DeepSeek V2/V3 defaults) +model: + num_layers: 10 + head_dim: 576 # MLA uses 576 (kv_lora_rank=512 + 64) + num_q_heads: 128 + num_kv_heads: 1 # MLA uses single KV head + block_size: 128 + +# Benchmark settings +benchmark: + device: "cuda:0" + repeats: 10 # More repeats for statistical significance + warmup_iters: 5 + profile_memory: false + +# Output +output: + csv: "study1_cutlass_numsplits_results.csv" + json: "study1_cutlass_numsplits_results.json" + +# Expected outcome: +# - Identify if auto-selection heuristic is optimal +# - Determine if we should revert PRs #24966, #25509 +# - Find optimal num_kv_splits per batch configuration diff --git a/benchmarks/attention_benchmarks/configs/study2_hopper_head_count.yaml b/benchmarks/attention_benchmarks/configs/study2_hopper_head_count.yaml new file mode 100644 index 000000000000..a77dbc8126dc --- /dev/null +++ b/benchmarks/attention_benchmarks/configs/study2_hopper_head_count.yaml @@ -0,0 +1,52 @@ +# Study 2: Does head count matter for FlashAttn MLA vs FlashMLA on Hopper? +# Question: Which backend performs better on Hopper GPUs (SM90+)? +# Question: Does the number of attention heads affect relative performance? 
+ +description: "FlashAttn MLA vs FlashMLA head count comparison on Hopper" + +# Compare these two Hopper backends +backends: + - flash_attn_mla + - flashmla + +# Standard decode workloads +batch_specs: + - "32s1k" # 32 decode requests, 1k KV cache + - "64s1k" # 64 decode requests, 1k KV cache + - "64s4k" # 64 decode requests, 4k KV cache + - "128s1k" # 128 decode requests, 1k KV cache + - "128s4k" # 128 decode requests, 4k KV cache + +# Model configuration - will test different head counts +# Note: You'll need to run this multiple times with different num_q_heads values +# Or modify benchmark.py to support head_counts parameter +model: + num_layers: 10 + head_dim: 576 # MLA uses 576 + num_q_heads: 128 # Test with: 16, 32, 64, 128, 256 + num_kv_heads: 1 # MLA uses single KV head + block_size: 128 + +# Benchmark settings +benchmark: + device: "cuda:0" + repeats: 10 + warmup_iters: 5 + profile_memory: true # Track memory usage differences + +# Output +output: + csv: "study2_hopper_head_count_results.csv" + json: "study2_hopper_head_count_results.json" + +# To test different head counts, run: +# for heads in 16 32 64 128 256; do +# python benchmark.py --config configs/study2_hopper_head_count.yaml \ +# --num-q-heads $heads \ +# --output-csv study2_heads_${heads}.csv +# done + +# Expected outcome: +# - Determine which backend is faster on Hopper +# - Identify if head count impacts relative performance +# - Inform backend selection for DeepSeek V2/V3 models diff --git a/benchmarks/attention_benchmarks/configs/study3_flashinfer_vs_cutlass.yaml b/benchmarks/attention_benchmarks/configs/study3_flashinfer_vs_cutlass.yaml new file mode 100644 index 000000000000..631b909dc565 --- /dev/null +++ b/benchmarks/attention_benchmarks/configs/study3_flashinfer_vs_cutlass.yaml @@ -0,0 +1,55 @@ +# Study 3: Is FlashInfer-MLA better than CUTLASS MLA after num-splits optimization? +# Question: After optimizing CUTLASS MLA's num_kv_splits, is FlashInfer-MLA still competitive? 
+ +description: "FlashInfer-MLA vs optimized CUTLASS MLA comparison" + +# Compare these two backends +backends: + - cutlass_mla + - flashinfer_mla + +# Test various decode workloads +batch_specs: + - "32s1k" # 32 decode requests, 1k KV cache + - "64s1k" # 64 decode requests, 1k KV cache + - "64s4k" # 64 decode requests, 4k KV cache + - "64s16k" # 64 decode requests, 16k KV cache + - "128s1k" # 128 decode requests, 1k KV cache + - "128s4k" # 128 decode requests, 4k KV cache + - "128s16k" # 128 decode requests, 16k KV cache + +# For CUTLASS, test optimized num_kv_splits +# Based on Study 1 results, you may want to adjust these values +num_splits: + - 4 # Often optimal for medium batches + - 8 # Often optimal for larger batches + - 16 # Test for very large batches + +# Also compare against auto-selection +compare_auto: true + +# Model configuration (DeepSeek V2/V3 defaults) +model: + num_layers: 10 + head_dim: 576 + num_q_heads: 128 + num_kv_heads: 1 + block_size: 128 + +# Benchmark settings +benchmark: + device: "cuda:0" + repeats: 10 + warmup_iters: 5 + profile_memory: true # Compare memory efficiency + +# Output +output: + csv: "study3_flashinfer_vs_cutlass_results.csv" + json: "study3_flashinfer_vs_cutlass_results.json" + +# Expected outcome: +# - Determine if FlashInfer-MLA is competitive with optimized CUTLASS +# - Identify which backend to use for different batch sizes +# - Assess memory efficiency trade-offs +# - Inform default backend selection strategy diff --git a/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml b/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml new file mode 100644 index 000000000000..062815eaaf83 --- /dev/null +++ b/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml @@ -0,0 +1,130 @@ +# Study 4: What is optimal reorder_batch_threshold for MLA backends supporting query length > 1? +# Question: At what query length does prefill pipeline become faster than decode pipeline? 
+# Methodology: For each query length, compare decode vs prefill performance to find crossover point +# Applies to: FlashAttn MLA, FlashMLA + +description: "Decode vs Prefill pipeline crossover analysis" + +# Test FlashAttn MLA (recommended - FlashMLA has known issues with speculative decode) +backend: flash_attn_mla + +# Mode: decode_vs_prefill comparison (special sweep mode) +# For each batch spec, we'll test both decode and prefill pipelines +mode: "decode_vs_prefill" + +# Query lengths to test (from old benchmark_mla_threshold.py methodology) +# Each query length will be tested with BOTH decode and prefill pipelines: +# - decode: threshold >= query_length (forces decode pipeline) +# - prefill: threshold < query_length (forces prefill pipeline) +# +# We use specs1k format which creates q_len=N, kv_len=1024 requests +# This tests different query lengths with fixed KV cache context +batch_specs: + # Fine-grained: 1-16 (decode range, step 1) + - "s1k" # q_len=1 (regular decode, not spec) + - "spec2s1k" # q_len=2 + - "spec3s1k" # q_len=3 + - "spec4s1k" # q_len=4 + - "spec5s1k" # q_len=5 + - "spec6s1k" # q_len=6 + - "spec7s1k" # q_len=7 + - "spec8s1k" # q_len=8 + - "spec9s1k" # q_len=9 + - "spec10s1k" # q_len=10 + - "spec11s1k" # q_len=11 + - "spec12s1k" # q_len=12 + - "spec13s1k" # q_len=13 + - "spec14s1k" # q_len=14 + - "spec15s1k" # q_len=15 + - "spec16s1k" # q_len=16 + # Transition zone: 17-64 (step 2) + - "spec17s1k" + - "spec19s1k" + - "spec21s1k" + - "spec23s1k" + - "spec25s1k" + - "spec27s1k" + - "spec29s1k" + - "spec31s1k" + - "spec33s1k" + - "spec35s1k" + - "spec37s1k" + - "spec39s1k" + - "spec41s1k" + - "spec43s1k" + - "spec45s1k" + - "spec47s1k" + - "spec49s1k" + - "spec51s1k" + - "spec53s1k" + - "spec55s1k" + - "spec57s1k" + - "spec59s1k" + - "spec61s1k" + - "spec63s1k" + # Prefill range: 65-128 (step 4) + - "spec65s1k" + - "spec69s1k" + - "spec73s1k" + - "spec77s1k" + - "spec81s1k" + - "spec85s1k" + - "spec89s1k" + - "spec93s1k" + - "spec97s1k" + - 
"spec101s1k" + - "spec105s1k" + - "spec109s1k" + - "spec113s1k" + - "spec117s1k" + - "spec121s1k" + - "spec125s1k" + +# Batch sizes to test (from old script) +batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + +# Model configuration (DeepSeek V2/V3 defaults) +model: + num_layers: 10 + head_dim: 576 + num_q_heads: 128 + num_kv_heads: 1 + block_size: 128 + +# Benchmark settings +benchmark: + device: "cuda:0" + repeats: 15 # More repeats for spec decode variance + warmup_iters: 5 + profile_memory: false + +# Output +output: + csv: "study4_reorder_threshold_results.csv" + json: "study4_reorder_threshold_results.json" + +# Expected outcome (reproduces old benchmark_mla_threshold.py study): +# - For each batch size, find the crossover point where prefill becomes faster than decode +# - Show decode vs prefill performance across all query lengths (1-125) +# - Determine optimal reorder_batch_threshold based on last query length where decode is faster +# - Understand how crossover point varies with batch size +# - Provide data-driven guidance for default threshold value +# +# Methodology (from old script): +# - Each query length tested with BOTH pipelines: +# * decode: threshold >= query_length (forces decode pipeline) +# * prefill: threshold < query_length (forces prefill pipeline) +# - Compare which is faster to find crossover point +# - Use multiple repeats (15) to handle variance +# +# Note: FlashMLA may have issues with speculative decode workloads +# Use flash_attn_mla instead if you encounter errors with flashmla diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py new file mode 100644 index 000000000000..7698d862eefc --- /dev/null +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -0,0 +1,842 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +MLA benchmark runner - shared utilities for MLA benchmarks. 
+ +This module provides helpers for running MLA backends without +needing full VllmConfig integration. +""" + +from typing import Optional + +import numpy as np +import torch +from batch_spec import BatchRequest, parse_batch_spec +from common import MockHfConfig, MockLayer, setup_mla_dims + +from vllm.config import ( + CacheConfig, + CompilationConfig, + ModelConfig, + ParallelConfig, + SchedulerConfig, + VllmConfig, +) + + +def create_minimal_vllm_config( + model_name: str = "deepseek-v3", + block_size: int = 128, + max_num_seqs: int = 256, +) -> VllmConfig: + """ + Create minimal VllmConfig for MLA benchmarks. + + Args: + model_name: Model name (deepseek-v2, deepseek-v3, etc.) + block_size: KV cache block size + max_num_seqs: Maximum number of sequences + + Returns: + VllmConfig for benchmarking + """ + # Get MLA dimensions + mla_dims = setup_mla_dims(model_name) + + # Create model config + model_config = ModelConfig( + model=f"deepseek-ai/{model_name}", + tokenizer=None, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="float16", + seed=0, + max_model_len=32768, + quantization=None, + quantization_param_path=None, + enforce_eager=False, + max_context_len_to_capture=None, + max_seq_len_to_capture=8192, + max_logprobs=20, + disable_sliding_window=False, + skip_tokenizer_init=True, + served_model_name=None, + limit_mm_per_prompt=None, + use_async_output_proc=True, + config_format="auto", + ) + + # Override head counts and dims for MLA + model_config.hf_config = MockHfConfig(mla_dims) + + # Cache config + cache_config = CacheConfig( + block_size=block_size, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + num_gpu_blocks=None, + num_cpu_blocks=None, + sliding_window=None, + enable_prefix_caching=False, + cpu_offload_gb=0, + ) + + # Scheduler config + scheduler_config = SchedulerConfig( + task="auto", + max_num_seqs=max_num_seqs, + max_num_batched_tokens=None, + max_model_len=32768, + num_scheduler_steps=1, + 
def build_mla_metadata_cutlass(
    requests: list[BatchRequest],
    block_size: int,
    device: torch.device,
    mla_dims: dict,
) -> tuple:
    """
    Build decode metadata, a latent KV cache, and a mock layer for the
    CUTLASS MLA backend.

    Every request is labelled as a decode (num_prefills=0) because the
    benchmark only exercises the decode path via _forward_decode.

    Args:
        requests: Parsed batch requests (q_len / kv_len per request).
        block_size: KV cache block size in tokens.
        device: Torch device to place tensors on.
        mla_dims: MLA dimension configuration (head_dim, kv_lora_rank,
            qk_rope_head_dim, ...).

    Returns:
        Tuple of (MLACommonMetadata, kv_cache, MockLayer).
    """
    from vllm.v1.attention.backends.mla.common import (
        MLACommonDecodeMetadata,
        MLACommonMetadata,
    )

    q_lens = [r.q_len for r in requests]
    kv_lens = [r.kv_len for r in requests]
    total_q = sum(q_lens)
    max_kv = max(kv_lens)

    # Cumulative query offsets in O(n); the naive per-prefix sum
    # ([sum(q_lens[:i+1]) for i ...]) is O(n^2) in the batch size.
    q_start_cpu = np.zeros(len(q_lens) + 1, dtype=np.int32)
    q_start_cpu[1:] = np.cumsum(q_lens)
    q_start_gpu = torch.from_numpy(q_start_cpu).to(device)

    # Sequence (total KV) lengths per request.
    seq_lens_cpu = np.array(kv_lens, dtype=np.int32)
    seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device)

    # Block table: request i gets logical blocks 0..n_i-1.
    # NOTE: all requests therefore alias the same physical blocks.
    # That is fine for kernel timing but not for validating outputs.
    num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens]
    max_num_blocks = max(num_blocks_per_req)

    block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32)
    for i, num_blocks in enumerate(num_blocks_per_req):
        block_table_cpu[i, :num_blocks] = np.arange(num_blocks, dtype=np.int32)
    block_table_gpu = torch.from_numpy(block_table_cpu).to(device)

    # Slot mapping: each *new* query token (the last q_len tokens of the
    # request's KV range) maps to its flat slot id in the paged cache.
    # Vectorized per request instead of a per-token Python loop.
    slot_chunks = []
    for i, (q_len, kv_len) in enumerate(zip(q_lens, kv_lens)):
        token_kv_idx = np.arange(kv_len - q_len, kv_len, dtype=np.int64)
        block_ids = block_table_cpu[i, token_kv_idx // block_size].astype(np.int64)
        slot_chunks.append(block_ids * block_size + token_kv_idx % block_size)
    slot_mapping = torch.from_numpy(np.concatenate(slot_chunks)).to(device)

    # Decode-only metadata (no prefill half is built).
    decode_metadata = MLACommonDecodeMetadata(
        block_table=block_table_gpu,
        seq_lens=seq_lens_gpu,
        dcp_tot_seq_lens=None,
    )

    metadata = MLACommonMetadata(
        num_reqs=len(requests),
        max_query_len=max(q_lens),
        max_seq_len=max_kv,
        num_actual_tokens=total_q,
        query_start_loc=q_start_gpu,
        slot_mapping=slot_mapping,
        num_decodes=len(requests),
        num_decode_tokens=total_q,
        num_prefills=0,
        head_dim=mla_dims["head_dim"],
        decode=decode_metadata,
        prefill=None,
    )

    # Latent KV cache: one combined (kv_lora_rank + rope) vector per slot.
    kv_cache = torch.zeros(
        max_num_blocks,
        block_size,
        mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
        device=device,
        dtype=torch.float16,
    )

    layer = MockLayer(device)

    return metadata, kv_cache, layer
qk_nope_head_dim=mla_dims["qk_nope_head_dim"], + qk_rope_head_dim=mla_dims["qk_rope_head_dim"], + qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], + v_head_dim=mla_dims["v_head_dim"], + kv_b_proj=None, + ) + + # Override num_kv_splits if specified + if num_kv_splits is not None: + impl._num_kv_splits = num_kv_splits + + # Create query tensors + total_q = sum(r.q_len for r in requests) + q_nope = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["kv_lora_rank"], + device=device, + dtype=torch.float16, + ) + q_pe = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["qk_rope_head_dim"], + device=device, + dtype=torch.float16, + ) + + # Warmup + for _ in range(config.warmup_iters): + impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) + torch.cuda.synchronize() + + # Benchmark + times = [] + for _ in range(config.repeats): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(config.num_layers): + impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) + end.record() + + torch.cuda.synchronize() + elapsed_ms = start.elapsed_time(end) + times.append(elapsed_ms / 1000.0 / config.num_layers) + + return { + "mean": np.mean(times), + "std": np.std(times), + "min": np.min(times), + "max": np.max(times), + "throughput": total_q / np.mean(times) if times else 0, + } + + +def run_flashinfer_mla_benchmark(config) -> dict: + """ + Run FlashInfer-MLA benchmark. 
+ + Args: + config: BenchmarkConfig + + Returns: + Dict with timing statistics + """ + device = torch.device(config.device) + torch.cuda.set_device(device) + + # Create and set vLLM config for MLA + vllm_config = create_minimal_vllm_config( + model_name="deepseek-v3", + block_size=config.block_size, + ) + from vllm.config import set_current_vllm_config + + with set_current_vllm_config(vllm_config): + # Parse batch spec + requests = parse_batch_spec(config.batch_spec) + + # Setup MLA dimensions + mla_dims = setup_mla_dims("deepseek-v3") + scale = 1.0 / np.sqrt( + mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] + ) + + # Build metadata + metadata, kv_cache, layer = build_mla_metadata_cutlass( + requests, config.block_size, device, mla_dims + ) + + # Create FlashInfer-MLA impl + from vllm.v1.attention.backends.mla.flashinfer_mla import FlashInferMLAImpl + + impl = FlashInferMLAImpl( + num_heads=mla_dims["num_q_heads"], + head_size=mla_dims["head_dim"], + scale=scale, + num_kv_heads=mla_dims["num_kv_heads"], + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=mla_dims["kv_lora_rank"], + qk_nope_head_dim=mla_dims["qk_nope_head_dim"], + qk_rope_head_dim=mla_dims["qk_rope_head_dim"], + qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], + v_head_dim=mla_dims["v_head_dim"], + kv_b_proj=None, + ) + + # Create query tensors + total_q = sum(r.q_len for r in requests) + q_nope = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["kv_lora_rank"], + device=device, + dtype=torch.float16, + ) + q_pe = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["qk_rope_head_dim"], + device=device, + dtype=torch.float16, + ) + + # Warmup + for _ in range(config.warmup_iters): + impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) + torch.cuda.synchronize() + + # Benchmark + times = [] + for _ 
in range(config.repeats): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(config.num_layers): + impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) + end.record() + + torch.cuda.synchronize() + elapsed_ms = start.elapsed_time(end) + times.append(elapsed_ms / 1000.0 / config.num_layers) + + return { + "mean": np.mean(times), + "std": np.std(times), + "min": np.min(times), + "max": np.max(times), + "throughput": total_q / np.mean(times) if times else 0, + } + + +def run_flashattn_mla_benchmark( + config, reorder_batch_threshold: Optional[int] = None +) -> dict: + """ + Run FlashAttn MLA benchmark (Hopper SM90+). + + Args: + config: BenchmarkConfig + reorder_batch_threshold: Reorder batch threshold override + + Returns: + Dict with timing statistics + """ + device = torch.device(config.device) + torch.cuda.set_device(device) + + # Create and set vLLM config for MLA + vllm_config = create_minimal_vllm_config( + model_name="deepseek-v3", + block_size=config.block_size, + ) + from vllm.config import set_current_vllm_config + + with set_current_vllm_config(vllm_config): + # Parse batch spec + requests = parse_batch_spec(config.batch_spec) + + # Setup MLA dimensions + mla_dims = setup_mla_dims("deepseek-v3") + scale = 1.0 / np.sqrt( + mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] + ) + + q_lens = [r.q_len for r in requests] + kv_lens = [r.kv_len for r in requests] + total_q = sum(q_lens) + max_kv = max(kv_lens) + + # Build query start locations + q_start_cpu = np.array( + [0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], dtype=np.int32 + ) + q_start_gpu = torch.from_numpy(q_start_cpu).to(device) + + # Build sequence lengths + seq_lens_cpu = np.array(kv_lens, dtype=np.int32) + seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) + + # Build block table + num_blocks_per_req = [ + (kv + config.block_size - 1) // config.block_size for kv in kv_lens + ] + 
max_num_blocks = max(num_blocks_per_req) + + block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32) + for i, num_blocks in enumerate(num_blocks_per_req): + block_table_cpu[i, :num_blocks] = np.arange(num_blocks, dtype=np.int32) + block_table_gpu = torch.from_numpy(block_table_cpu).to(device) + + # Create FlashAttn MLA metadata + from vllm.v1.attention.backends.mla.flashattn_mla import ( + FlashAttnMLADecodeMetadata, + FlashAttnMLAImpl, + FlashAttnMLAMetadata, + ) + + decode_metadata = FlashAttnMLADecodeMetadata( + block_table=block_table_gpu, + seq_lens=seq_lens_gpu, + query_start_loc=q_start_gpu, + max_query_len=max(q_lens), + max_seq_len=max_kv, + scheduler_metadata=None, # Not using FA3 scheduling for now + max_num_splits=0, + dcp_tot_seq_lens=None, + ) + + # Slot mapping + slot_mapping_list = [] + for i, (q_len, kv_len, num_blocks) in enumerate( + zip(q_lens, kv_lens, num_blocks_per_req) + ): + context_len = kv_len - q_len + for j in range(q_len): + token_kv_idx = context_len + j + block_idx = token_kv_idx // config.block_size + offset_in_block = token_kv_idx % config.block_size + global_block_id = block_table_cpu[i, block_idx] + slot_id = global_block_id * config.block_size + offset_in_block + slot_mapping_list.append(slot_id) + + slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device) + + metadata = FlashAttnMLAMetadata( + num_reqs=len(requests), + max_query_len=max(q_lens), + max_seq_len=max_kv, + num_actual_tokens=total_q, + query_start_loc=q_start_gpu, + slot_mapping=slot_mapping, + num_decodes=len(requests), + num_decode_tokens=total_q, + num_prefills=0, + head_dim=mla_dims["head_dim"], + decode=decode_metadata, + prefill=None, + ) + + # Create KV cache + kv_cache = torch.zeros( + max_num_blocks, + config.block_size, + mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], + device=device, + dtype=torch.float16, + ) + + # Create FlashAttn MLA impl + impl = FlashAttnMLAImpl( + 
num_heads=mla_dims["num_q_heads"], + head_size=mla_dims["head_dim"], + scale=scale, + num_kv_heads=mla_dims["num_kv_heads"], + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=mla_dims["kv_lora_rank"], + qk_nope_head_dim=mla_dims["qk_nope_head_dim"], + qk_rope_head_dim=mla_dims["qk_rope_head_dim"], + qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], + v_head_dim=mla_dims["v_head_dim"], + kv_b_proj=None, + ) + + # Initialize DCP (distributed context parallelism) attributes for + # standalone benchmarking + if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size is None: + impl.dcp_world_size = 1 + impl.dcp_rank = 0 + + layer = MockLayer(device) + + # Create query tensors + q_nope = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["kv_lora_rank"], + device=device, + dtype=torch.float16, + ) + q_pe = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["qk_rope_head_dim"], + device=device, + dtype=torch.float16, + ) + + # Warmup + for _ in range(config.warmup_iters): + impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) + torch.cuda.synchronize() + + # Benchmark + times = [] + for _ in range(config.repeats): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(config.num_layers): + impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) + end.record() + + torch.cuda.synchronize() + elapsed_ms = start.elapsed_time(end) + times.append(elapsed_ms / 1000.0 / config.num_layers) + + return { + "mean": np.mean(times), + "std": np.std(times), + "min": np.min(times), + "max": np.max(times), + "throughput": total_q / np.mean(times) if times else 0, + } + + +def run_flashmla_benchmark( + config, reorder_batch_threshold: Optional[int] = None +) -> dict: + """ + Run FlashMLA benchmark (Hopper SM90+). 
+ + Args: + config: BenchmarkConfig + reorder_batch_threshold: Reorder batch threshold override + + Returns: + Dict with timing statistics + """ + device = torch.device(config.device) + torch.cuda.set_device(device) + + # Create and set vLLM config for MLA + vllm_config = create_minimal_vllm_config( + model_name="deepseek-v3", + block_size=config.block_size, + ) + from vllm.config import set_current_vllm_config + + with set_current_vllm_config(vllm_config): + # Parse batch spec + requests = parse_batch_spec(config.batch_spec) + + # Setup MLA dimensions + mla_dims = setup_mla_dims("deepseek-v3") + scale = 1.0 / np.sqrt( + mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] + ) + + q_lens = [r.q_len for r in requests] + kv_lens = [r.kv_len for r in requests] + total_q = sum(q_lens) + max_kv = max(kv_lens) + + # Build query start locations + q_start_cpu = np.array( + [0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], dtype=np.int32 + ) + q_start_gpu = torch.from_numpy(q_start_cpu).to(device) + + # Build sequence lengths + seq_lens_cpu = np.array(kv_lens, dtype=np.int32) + seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) + + # Build block table + num_blocks_per_req = [ + (kv + 64 - 1) // 64 for kv in kv_lens + ] # FlashMLA uses block_size=64 + max_num_blocks = max(num_blocks_per_req) + + block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32) + for i, num_blocks in enumerate(num_blocks_per_req): + block_table_cpu[i, :num_blocks] = np.arange(num_blocks, dtype=np.int32) + block_table_gpu = torch.from_numpy(block_table_cpu).to(device) + + # Create FlashMLA metadata (needs tile_scheduler_metadata and num_splits) + from vllm.attention.ops.flashmla import get_mla_metadata + from vllm.v1.attention.backends.mla.flashmla import ( + FlashMLADecodeMetadata, + FlashMLAImpl, + FlashMLAMetadata, + ) + + tile_scheduler_metadata, num_splits = get_mla_metadata( + seq_lens_gpu, + mla_dims["num_q_heads"], + 1, # MQA for decode + ) + + 
decode_metadata = FlashMLADecodeMetadata( + block_table=block_table_gpu, + seq_lens=seq_lens_gpu, + tile_scheduler_metadata=tile_scheduler_metadata, + num_splits=num_splits, + dcp_tot_seq_lens=None, + ) + + # Slot mapping + slot_mapping_list = [] + for i, (q_len, kv_len, num_blocks) in enumerate( + zip(q_lens, kv_lens, num_blocks_per_req) + ): + context_len = kv_len - q_len + for j in range(q_len): + token_kv_idx = context_len + j + block_idx = token_kv_idx // 64 + offset_in_block = token_kv_idx % 64 + global_block_id = block_table_cpu[i, block_idx] + slot_id = global_block_id * 64 + offset_in_block + slot_mapping_list.append(slot_id) + + slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device) + + metadata = FlashMLAMetadata( + num_reqs=len(requests), + max_query_len=max(q_lens), + max_seq_len=max_kv, + num_actual_tokens=total_q, + query_start_loc=q_start_gpu, + slot_mapping=slot_mapping, + num_decodes=len(requests), + num_decode_tokens=total_q, + num_prefills=0, + head_dim=mla_dims["head_dim"], + decode=decode_metadata, + prefill=None, + ) + + # Create KV cache + kv_cache = torch.zeros( + max_num_blocks, + 64, # FlashMLA block size + mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], + device=device, + dtype=torch.float16, + ) + + # Create FlashMLA impl + impl = FlashMLAImpl( + num_heads=mla_dims["num_q_heads"], + head_size=mla_dims["head_dim"], + scale=scale, + num_kv_heads=mla_dims["num_kv_heads"], + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=mla_dims["kv_lora_rank"], + qk_nope_head_dim=mla_dims["qk_nope_head_dim"], + qk_rope_head_dim=mla_dims["qk_rope_head_dim"], + qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], + v_head_dim=mla_dims["v_head_dim"], + kv_b_proj=None, + ) + + # Initialize DCP (distributed context parallelism) attributes for + # standalone benchmarking 
+ if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size is None: + impl.dcp_world_size = 1 + impl.dcp_rank = 0 + + layer = MockLayer(device) + + # Create query tensors + q_concat = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], + device=device, + dtype=torch.float16, + ) + + # Warmup + for _ in range(config.warmup_iters): + impl._forward_decode(q_concat, kv_cache, metadata, layer) + torch.cuda.synchronize() + + # Benchmark + times = [] + for _ in range(config.repeats): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(config.num_layers): + impl._forward_decode(q_concat, kv_cache, metadata, layer) + end.record() + + torch.cuda.synchronize() + elapsed_ms = start.elapsed_time(end) + times.append(elapsed_ms / 1000.0 / config.num_layers) + + return { + "mean": np.mean(times), + "std": np.std(times), + "min": np.min(times), + "max": np.max(times), + "throughput": total_q / np.mean(times) if times else 0, + } diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py new file mode 100644 index 000000000000..cf9b4a62800d --- /dev/null +++ b/benchmarks/attention_benchmarks/runner.py @@ -0,0 +1,334 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Complete benchmark runner with real vLLM integration. + +This module provides working implementations that can actually run +attention kernels, not placeholders. 
+""" + +import numpy as np +import torch +from batch_spec import BatchRequest, parse_batch_spec, reorder_for_flashinfer +from common import ( + BenchmarkConfig, + BenchmarkResult, + MockLayer, + MockRunner, + get_attention_scale, +) + +from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.worker.block_table import BlockTable + + +def build_common_metadata( + requests: list[BatchRequest], + block_size: int, + device: torch.device, +) -> tuple[CommonAttentionMetadata, torch.Tensor, int]: + """ + Build CommonAttentionMetadata from batch requests. + + Args: + requests: List of BatchRequest + block_size: KV cache block size + device: Torch device + + Returns: + Tuple of (CommonAttentionMetadata, slot_mapping, max_num_blocks) + """ + q_lens = [r.q_len for r in requests] + kv_lens = [r.kv_len for r in requests] + total_q = sum(q_lens) + max_kv = max(kv_lens) + + # Build query start locations + q_start_cpu = np.array( + [0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], dtype=np.int32 + ) + q_start_gpu = torch.from_numpy(q_start_cpu).to(device) + + # Build sequence lengths + seq_lens_cpu = np.array(kv_lens, dtype=np.int32) + seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) + + # Computed tokens (context before new query) + computed_tokens_cpu = np.array( + [kv - q for kv, q in zip(kv_lens, q_lens)], dtype=np.int32 + ) + + # Build block table + num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens] + max_num_blocks = max(num_blocks_per_req) + + block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32) + for i, num_blocks in enumerate(num_blocks_per_req): + block_table_cpu[i, :num_blocks] = np.arange(num_blocks, dtype=np.int32) + block_table_gpu = torch.from_numpy(block_table_cpu).to(device) + + # Build slot mapping (maps each token to its KV cache slot) + slot_mapping_list = [] + for i, (q_len, kv_len, num_blocks) in enumerate( + 
zip(q_lens, kv_lens, num_blocks_per_req) + ): + # For each token in the query, map to its slot in the KV cache + context_len = kv_len - q_len + for j in range(q_len): + token_kv_idx = context_len + j + block_idx = token_kv_idx // block_size + offset_in_block = token_kv_idx % block_size + # Global slot ID + global_block_id = block_table_cpu[i, block_idx] + slot_id = global_block_id * block_size + offset_in_block + slot_mapping_list.append(slot_id) + + slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device) + + metadata = CommonAttentionMetadata( + query_start_loc=q_start_gpu, + query_start_loc_cpu=torch.from_numpy(q_start_cpu), + seq_lens=seq_lens_gpu, + seq_lens_cpu=torch.from_numpy(seq_lens_cpu), + num_computed_tokens_cpu=torch.from_numpy(computed_tokens_cpu), + num_reqs=len(requests), + num_actual_tokens=total_q, + max_query_len=max(q_lens), + max_seq_len=max_kv, + block_table_tensor=block_table_gpu, + slot_mapping=slot_mapping, + ) + + return metadata, slot_mapping, max_num_blocks + + +def run_attention_benchmark_impl(config: BenchmarkConfig) -> BenchmarkResult: + """ + Run standard attention benchmark with real kernels. 
+ + Args: + config: Benchmark configuration + + Returns: + BenchmarkResult with actual timing data + """ + device = torch.device(config.device) + torch.cuda.set_device(device) + + # Parse batch spec + requests = parse_batch_spec(config.batch_spec) + + # Reorder for FlashInfer if needed + if config.backend == "flashinfer": + requests = reorder_for_flashinfer(requests) + + # Extract dimensions + q_lens = [r.q_len for r in requests] + kv_lens = [r.kv_len for r in requests] + total_q = sum(q_lens) + + # Compute scale + scale = get_attention_scale(config.head_dim) + + # Select backend and dtype + if config.backend == "flash": + from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend as BE + + dt = torch.float16 + elif config.backend == "triton": + from vllm.v1.attention.backends.triton_attn import TritonAttentionBackend as BE + + dt = torch.float32 + elif config.backend == "flashinfer": + from vllm.v1.attention.backends.flashinfer import FlashInferBackend as BE + + dt = torch.float16 + else: + raise ValueError(f"Unknown backend: {config.backend}") + + # Create attention impl + impl = BE.get_impl_cls()( + num_heads=config.num_q_heads, + head_size=config.head_dim, + scale=scale, + num_kv_heads=config.num_kv_heads, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + ) + + layer = MockLayer(device) + + # Build metadata + common_metadata, slot_mapping, max_num_blocks = build_common_metadata( + requests, config.block_size, device + ) + + # Create mock runner for builder + runner = MockRunner( + seq_lens=common_metadata.seq_lens_cpu.numpy(), + query_start_locs=common_metadata.query_start_loc_cpu.numpy(), + device=device, + num_q_heads=config.num_q_heads, + num_kv_heads=config.num_kv_heads, + head_dim=config.head_dim, + dtype=dt, + ) + + # Build block table + bt = BlockTable(len(requests), max_num_blocks, total_q, False, device) + for i in range(len(requests)): + num_blocks = (kv_lens[i] + config.block_size - 1) // config.block_size + 
bt.add_row(list(range(num_blocks)), i) + bt.commit(len(requests)) + + # Create metadata builder + builder = BE.get_builder_cls()( + runner=runner, + kv_cache_spec=AttentionSpec( + block_size=config.block_size, + num_kv_heads=config.num_kv_heads, + head_size=config.head_dim, + dtype=dt, + use_mla=False, + ), + block_table=bt, + ) + + # Build attention metadata + attn_metadata = builder.build( + num_reqs=len(requests), + num_actual_tokens=total_q, + max_query_len=max(q_lens), + common_prefix_len=0, + common_attn_metadata=common_metadata, + ) + + # Create input tensors + q_list = [ + torch.randn( + total_q, config.num_q_heads, config.head_dim, device=device, dtype=dt + ) + for _ in range(config.num_layers) + ] + k_list = [ + torch.randn( + total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dt + ) + for _ in range(config.num_layers) + ] + v_list = [ + torch.randn( + total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dt + ) + for _ in range(config.num_layers) + ] + + # KV cache + if config.backend == "flashinfer": + cache_list = [ + torch.zeros( + max_num_blocks, + 2, + config.block_size, + config.num_kv_heads, + config.head_dim, + device=device, + dtype=dt, + ) + for _ in range(config.num_layers) + ] + else: + cache_list = [ + torch.zeros( + 2, + max_num_blocks, + config.block_size, + config.num_kv_heads, + config.head_dim, + device=device, + dtype=dt, + ) + for _ in range(config.num_layers) + ] + + # Output buffer + out = torch.empty( + total_q, config.num_q_heads, config.head_dim, device=device, dtype=dt + ) + + # Warmup + for _ in range(config.warmup_iters): + for i in range(config.num_layers): + impl.forward( + layer, + q_list[i], + k_list[i], + v_list[i], + cache_list[i], + attn_metadata, + output=out, + ) + torch.cuda.synchronize() + + # Benchmark + times = [] + for _ in range(config.repeats): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for i in 
def run_mla_benchmark_impl(config: BenchmarkConfig) -> BenchmarkResult:
    """
    Run MLA benchmark with real kernels.

    Not implemented here: MLA backends need backend-specific metadata
    (decode/prefill split, tile-scheduler state, latent cache layout)
    that this generic runner does not build.  Use the dedicated helpers
    in mla_runner.py instead (run_cutlass_mla_benchmark,
    run_flashinfer_mla_benchmark, run_flashattn_mla_benchmark,
    run_flashmla_benchmark).

    Args:
        config: Benchmark configuration (unused until implemented).

    Raises:
        NotImplementedError: always, pointing at the MLA helpers.
    """
    # The previous message referenced benchmark_mla_numsplits.py, which is
    # not part of this suite; point callers at the real helpers instead.
    raise NotImplementedError(
        "MLA benchmark runner needs backend-specific implementation. "
        "Use the run_*_mla_benchmark helpers in mla_runner.py instead."
    )
patterns.""" + print("\nTesting speculative decode...") + + result = parse_batch_spec("spec4s1k") + assert len(result) == 1 + assert result[0].q_len == 4 + assert result[0].kv_len == 1024 + assert result[0].is_speculative + assert result[0].spec_length == 4 + print(" ✓ spec4s1k -> 4-token speculative") + + result = parse_batch_spec("8spec8s2k") + assert len(result) == 8 + assert all(r.is_speculative and r.spec_length == 8 for r in result) + print(" ✓ 8spec8s2k -> 8 x 8-token speculative") + + +def test_chunked_prefill(): + """Test chunked prefill patterns.""" + print("\nTesting chunked prefill...") + + result = parse_batch_spec("chunk8q16k") + assert len(result) == 1 + assert result[0].q_len == 16384 + assert result[0].is_chunked + assert result[0].chunk_size == 8 + print(" ✓ chunk8q16k -> chunked 16k prefill") + + result = parse_batch_spec("2chunk4q8k") + assert len(result) == 2 + assert all(r.is_chunked and r.chunk_size == 4 for r in result) + print(" ✓ 2chunk4q8k -> 2 x chunked 8k prefill") + + +def test_formatting(): + """Test batch spec formatting.""" + print("\nTesting formatting...") + + requests = parse_batch_spec("2q2k_32s1k") + formatted = format_batch_spec(requests) + assert "2 prefill" in formatted + assert "32 decode" in formatted + print(f" ✓ Format: {formatted}") + + requests = parse_batch_spec("spec4s1k_8s1k") + formatted = format_batch_spec(requests) + assert "specdecode" in formatted + print(f" ✓ Format with spec: {formatted}") + + +def test_batch_stats(): + """Test batch statistics.""" + print("\nTesting batch statistics...") + + requests = parse_batch_spec("2q2k_32s1k") + stats = get_batch_stats(requests) + + assert stats["total_requests"] == 34 + assert stats["num_prefill"] == 2 + assert stats["num_decode"] == 32 + assert stats["total_tokens"] == 2048 * 2 + 32 * 1 + print( + f" ✓ Stats: {stats['total_requests']} requests, {stats['total_tokens']} tokens" + ) + + +def test_manual_batch(): + """Test manual batch specification.""" + 
print("\nTesting manual batch...") + + requests = parse_manual_batch(["1,1024", "2048,2048", "1,2048"]) + assert len(requests) == 3 + assert requests[0].as_tuple() == (1, 1024) + assert requests[1].as_tuple() == (2048, 2048) + assert requests[2].as_tuple() == (1, 2048) + print(" ✓ Manual batch: 3 requests") + + +def test_error_handling(): + """Test error handling.""" + print("\nTesting error handling...") + + try: + parse_batch_spec("invalid") + raise AssertionError("Should have raised ValueError") + except ValueError: + print(" ✓ Invalid spec raises ValueError") + + try: + parse_manual_batch(["1024,512"]) # kv < q + raise AssertionError("Should have raised ValueError") + except ValueError: + print(" ✓ Invalid kv_len raises ValueError") + + +def main(): + """Run all tests.""" + print("=" * 60) + print("Batch Specification Parser Tests") + print("=" * 60) + + test_basic_patterns() + test_combined_patterns() + test_speculative_decode() + test_chunked_prefill() + test_formatting() + test_batch_stats() + test_manual_batch() + test_error_handling() + + print("\n" + "=" * 60) + print("All tests passed! 
✓") + print("=" * 60) + + +if __name__ == "__main__": + main() From ac4cf6b50234c862e1308ac3add3b9ef6750d472 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 10 Oct 2025 20:28:28 +0000 Subject: [PATCH 02/45] don't unnecessarily reinitialize Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/benchmark.py | 121 +++--- benchmarks/attention_benchmarks/mla_runner.py | 406 ++++++++++-------- 2 files changed, 290 insertions(+), 237 deletions(-) diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index 00ab98430835..9b943d80a34d 100644 --- a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -240,15 +240,20 @@ def main(): console.print( "[dim]For each query length, testing both decode and prefill pipelines[/]" ) + console.print("[dim]Using batched execution for optimal performance[/]") # Extract batch sizes from config batch_sizes = getattr(args, "batch_sizes", [1]) + backend = backends[0] # Use first backend (should only be one) - # Calculate total benchmarks: batch_specs * batch_sizes * 2 (decode + prefill) - total = len(args.batch_specs) * len(batch_sizes) * 2 + # Calculate total benchmarks + total = len(batch_sizes) with tqdm(total=total, desc="Benchmarking") as pbar: for batch_size in batch_sizes: + # Prepare all configs for this batch size + configs_with_thresholds = [] + for spec in args.batch_specs: # Parse the batch spec to get query length requests = parse_batch_spec(spec) @@ -265,12 +270,9 @@ def main(): # For batch_size > 1, we need to prepend the count batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec - backend = backends[0] # Use first backend (should only be one) - - # Test 1: Decode pipeline (threshold >= query_length) - decode_threshold = query_length - config_decode = BenchmarkConfig( - backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}", + # Create base config (without backend name) + base_config = 
BenchmarkConfig( + backend=backend, # Will be overridden later batch_spec=batch_spec, num_layers=args.num_layers, head_dim=args.head_dim, @@ -283,20 +285,58 @@ def main(): profile_memory=args.profile_memory, ) - try: - clean_config = replace(config_decode, backend=backend) - result = run_mla_benchmark( - clean_config, reorder_batch_threshold=decode_threshold + # Add decode pipeline config + decode_threshold = query_length + config_decode = replace( + base_config, + backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}", + ) + configs_with_thresholds.append((config_decode, decode_threshold)) + + # Add prefill pipeline config if query_length > 1 + if query_length > 1: + prefill_threshold = query_length - 1 + config_prefill = replace( + base_config, + backend=f"{backend}_prefill_qlen{query_length}" + f"_bs{batch_size}", ) - result = replace(result, config=config_decode) - all_results.append(result) - except Exception as e: - console.print( - f"[red]Error decode qlen={query_length} " - f"bs={batch_size}: {e}[/]" + configs_with_thresholds.append( + (config_prefill, prefill_threshold) ) + + # Run all benchmarks for this batch size in one go (batched mode) + try: + from mla_runner import run_flashattn_mla_benchmark + + # Use batched API: pass list of (config, threshold) tuples + timing_results = run_flashattn_mla_benchmark( + configs_with_thresholds + ) + + # Create BenchmarkResult objects from timing results + for (config, _), timing in zip( + configs_with_thresholds, timing_results + ): + result = BenchmarkResult( + config=config, + mean_time=timing["mean"], + std_time=timing["std"], + min_time=timing["min"], + max_time=timing["max"], + throughput_tokens_per_sec=timing.get("throughput", None), + ) + all_results.append(result) + + except Exception as e: + console.print( + f"[red]Error running batched benchmarks for " + f"batch_size={batch_size}: {e}[/]" + ) + # Add error results for all configs + for config, _ in configs_with_thresholds: result = BenchmarkResult( 
- config=config_decode, + config=config, mean_time=float("inf"), std_time=0, min_time=float("inf"), @@ -305,48 +345,7 @@ def main(): ) all_results.append(result) - pbar.update(1) - - # Test 2: Prefill pipeline (threshold < query_length) - if query_length > 1: - prefill_threshold = query_length - 1 - config_prefill = BenchmarkConfig( - backend=f"{backend}_prefill_qlen{query_length}_bs{batch_size}", - batch_spec=batch_spec, - num_layers=args.num_layers, - head_dim=args.head_dim, - num_q_heads=args.num_q_heads, - num_kv_heads=args.num_kv_heads, - block_size=args.block_size, - device=args.device, - repeats=args.repeats, - warmup_iters=args.warmup_iters, - profile_memory=args.profile_memory, - ) - - try: - clean_config = replace(config_prefill, backend=backend) - result = run_mla_benchmark( - clean_config, reorder_batch_threshold=prefill_threshold - ) - result = replace(result, config=config_prefill) - all_results.append(result) - except Exception as e: - console.print( - f"[red]Error prefill qlen={query_length} " - f"bs={batch_size}: {e}[/]" - ) - result = BenchmarkResult( - config=config_prefill, - mean_time=float("inf"), - std_time=0, - min_time=float("inf"), - max_time=float("inf"), - error=str(e), - ) - all_results.append(result) - - pbar.update(1) + pbar.update(1) # Display decode vs prefill results console.print("\n[bold green]Decode vs Prefill Results:[/]") diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index 7698d862eefc..73e16f6ca102 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -463,11 +463,42 @@ def run_flashinfer_mla_benchmark(config) -> dict: } -def run_flashattn_mla_benchmark( +def run_flashattn_mla_benchmark(config, reorder_batch_threshold: Optional[int] = None): + """ + Run FlashAttn MLA benchmark (Hopper SM90+). + + Always uses batched execution internally for optimal performance. 
+ Accepts both single config and list of configs for convenience. + + Args: + config: BenchmarkConfig or list of (BenchmarkConfig, threshold) tuples + reorder_batch_threshold: Threshold override (only for single config mode) + + Returns: + Dict with timing statistics (single mode) or list of dicts (batched mode) + """ + # Normalize to batched mode + if isinstance(config, list): + # Already in batched format: [(config1, thresh1), ...] + configs_with_thresholds = config + return_single = False + else: + # Single config: convert to batched format + configs_with_thresholds = [(config, reorder_batch_threshold)] + return_single = True + + # Always use batched execution + results = _run_flashattn_mla_batched(configs_with_thresholds) + + # Return single result or list based on input + return results[0] if return_single else results + + +def run_flashmla_benchmark( config, reorder_batch_threshold: Optional[int] = None ) -> dict: """ - Run FlashAttn MLA benchmark (Hopper SM90+). + Run FlashMLA benchmark (Hopper SM90+). 
Args: config: BenchmarkConfig @@ -513,8 +544,8 @@ def run_flashattn_mla_benchmark( # Build block table num_blocks_per_req = [ - (kv + config.block_size - 1) // config.block_size for kv in kv_lens - ] + (kv + 64 - 1) // 64 for kv in kv_lens + ] # FlashMLA uses block_size=64 max_num_blocks = max(num_blocks_per_req) block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32) @@ -522,21 +553,25 @@ def run_flashattn_mla_benchmark( block_table_cpu[i, :num_blocks] = np.arange(num_blocks, dtype=np.int32) block_table_gpu = torch.from_numpy(block_table_cpu).to(device) - # Create FlashAttn MLA metadata - from vllm.v1.attention.backends.mla.flashattn_mla import ( - FlashAttnMLADecodeMetadata, - FlashAttnMLAImpl, - FlashAttnMLAMetadata, + # Create FlashMLA metadata (needs tile_scheduler_metadata and num_splits) + from vllm.attention.ops.flashmla import get_mla_metadata + from vllm.v1.attention.backends.mla.flashmla import ( + FlashMLADecodeMetadata, + FlashMLAImpl, + FlashMLAMetadata, + ) + + tile_scheduler_metadata, num_splits = get_mla_metadata( + seq_lens_gpu, + mla_dims["num_q_heads"], + 1, # MQA for decode ) - decode_metadata = FlashAttnMLADecodeMetadata( + decode_metadata = FlashMLADecodeMetadata( block_table=block_table_gpu, seq_lens=seq_lens_gpu, - query_start_loc=q_start_gpu, - max_query_len=max(q_lens), - max_seq_len=max_kv, - scheduler_metadata=None, # Not using FA3 scheduling for now - max_num_splits=0, + tile_scheduler_metadata=tile_scheduler_metadata, + num_splits=num_splits, dcp_tot_seq_lens=None, ) @@ -548,15 +583,15 @@ def run_flashattn_mla_benchmark( context_len = kv_len - q_len for j in range(q_len): token_kv_idx = context_len + j - block_idx = token_kv_idx // config.block_size - offset_in_block = token_kv_idx % config.block_size + block_idx = token_kv_idx // 64 + offset_in_block = token_kv_idx % 64 global_block_id = block_table_cpu[i, block_idx] - slot_id = global_block_id * config.block_size + offset_in_block + slot_id = global_block_id * 64 
+ offset_in_block slot_mapping_list.append(slot_id) slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device) - metadata = FlashAttnMLAMetadata( + metadata = FlashMLAMetadata( num_reqs=len(requests), max_query_len=max(q_lens), max_seq_len=max_kv, @@ -574,14 +609,14 @@ def run_flashattn_mla_benchmark( # Create KV cache kv_cache = torch.zeros( max_num_blocks, - config.block_size, + 64, # FlashMLA block size mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], device=device, dtype=torch.float16, ) - # Create FlashAttn MLA impl - impl = FlashAttnMLAImpl( + # Create FlashMLA impl + impl = FlashMLAImpl( num_heads=mla_dims["num_q_heads"], head_size=mla_dims["head_dim"], scale=scale, @@ -610,24 +645,17 @@ def run_flashattn_mla_benchmark( layer = MockLayer(device) # Create query tensors - q_nope = torch.randn( - total_q, - mla_dims["num_q_heads"], - mla_dims["kv_lora_rank"], - device=device, - dtype=torch.float16, - ) - q_pe = torch.randn( + q_concat = torch.randn( total_q, mla_dims["num_q_heads"], - mla_dims["qk_rope_head_dim"], + mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], device=device, dtype=torch.float16, ) # Warmup for _ in range(config.warmup_iters): - impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) + impl._forward_decode(q_concat, kv_cache, metadata, layer) torch.cuda.synchronize() # Benchmark @@ -638,7 +666,7 @@ def run_flashattn_mla_benchmark( start.record() for _ in range(config.num_layers): - impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) + impl._forward_decode(q_concat, kv_cache, metadata, layer) end.record() torch.cuda.synchronize() @@ -654,129 +682,47 @@ def run_flashattn_mla_benchmark( } -def run_flashmla_benchmark( - config, reorder_batch_threshold: Optional[int] = None -) -> dict: +def _run_flashattn_mla_batched(configs_with_thresholds: list[tuple]) -> list[dict]: """ - Run FlashMLA benchmark (Hopper SM90+). + Run multiple FlashAttn MLA benchmarks with shared initialization. 
+ + This is optimized for running many benchmarks with the same backend, + avoiding repeated setup/teardown overhead. Args: - config: BenchmarkConfig - reorder_batch_threshold: Reorder batch threshold override + configs_with_thresholds: List of (config, threshold) tuples to benchmark Returns: - Dict with timing statistics + List of dicts with timing statistics, one per config """ - device = torch.device(config.device) + if not configs_with_thresholds: + return [] + + device = torch.device(configs_with_thresholds[0][0].device) torch.cuda.set_device(device) - # Create and set vLLM config for MLA + # Create and set vLLM config for MLA (reused across all benchmarks) vllm_config = create_minimal_vllm_config( model_name="deepseek-v3", - block_size=config.block_size, + block_size=configs_with_thresholds[0][0].block_size, ) from vllm.config import set_current_vllm_config + from vllm.v1.attention.backends.mla.flashattn_mla import ( + FlashAttnMLADecodeMetadata, + FlashAttnMLAImpl, + FlashAttnMLAMetadata, + FlashAttnMLAMetadataBuilder, + ) with set_current_vllm_config(vllm_config): - # Parse batch spec - requests = parse_batch_spec(config.batch_spec) - - # Setup MLA dimensions + # Setup MLA dimensions (reused) mla_dims = setup_mla_dims("deepseek-v3") scale = 1.0 / np.sqrt( mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] ) - q_lens = [r.q_len for r in requests] - kv_lens = [r.kv_len for r in requests] - total_q = sum(q_lens) - max_kv = max(kv_lens) - - # Build query start locations - q_start_cpu = np.array( - [0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], dtype=np.int32 - ) - q_start_gpu = torch.from_numpy(q_start_cpu).to(device) - - # Build sequence lengths - seq_lens_cpu = np.array(kv_lens, dtype=np.int32) - seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) - - # Build block table - num_blocks_per_req = [ - (kv + 64 - 1) // 64 for kv in kv_lens - ] # FlashMLA uses block_size=64 - max_num_blocks = max(num_blocks_per_req) - - block_table_cpu = 
np.zeros((len(requests), max_num_blocks), dtype=np.int32) - for i, num_blocks in enumerate(num_blocks_per_req): - block_table_cpu[i, :num_blocks] = np.arange(num_blocks, dtype=np.int32) - block_table_gpu = torch.from_numpy(block_table_cpu).to(device) - - # Create FlashMLA metadata (needs tile_scheduler_metadata and num_splits) - from vllm.attention.ops.flashmla import get_mla_metadata - from vllm.v1.attention.backends.mla.flashmla import ( - FlashMLADecodeMetadata, - FlashMLAImpl, - FlashMLAMetadata, - ) - - tile_scheduler_metadata, num_splits = get_mla_metadata( - seq_lens_gpu, - mla_dims["num_q_heads"], - 1, # MQA for decode - ) - - decode_metadata = FlashMLADecodeMetadata( - block_table=block_table_gpu, - seq_lens=seq_lens_gpu, - tile_scheduler_metadata=tile_scheduler_metadata, - num_splits=num_splits, - dcp_tot_seq_lens=None, - ) - - # Slot mapping - slot_mapping_list = [] - for i, (q_len, kv_len, num_blocks) in enumerate( - zip(q_lens, kv_lens, num_blocks_per_req) - ): - context_len = kv_len - q_len - for j in range(q_len): - token_kv_idx = context_len + j - block_idx = token_kv_idx // 64 - offset_in_block = token_kv_idx % 64 - global_block_id = block_table_cpu[i, block_idx] - slot_id = global_block_id * 64 + offset_in_block - slot_mapping_list.append(slot_id) - - slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device) - - metadata = FlashMLAMetadata( - num_reqs=len(requests), - max_query_len=max(q_lens), - max_seq_len=max_kv, - num_actual_tokens=total_q, - query_start_loc=q_start_gpu, - slot_mapping=slot_mapping, - num_decodes=len(requests), - num_decode_tokens=total_q, - num_prefills=0, - head_dim=mla_dims["head_dim"], - decode=decode_metadata, - prefill=None, - ) - - # Create KV cache - kv_cache = torch.zeros( - max_num_blocks, - 64, # FlashMLA block size - mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], - device=device, - dtype=torch.float16, - ) - - # Create FlashMLA impl - impl = FlashMLAImpl( + # Create impl once 
(reused across all benchmarks) + impl = FlashAttnMLAImpl( num_heads=mla_dims["num_q_heads"], head_size=mla_dims["head_dim"], scale=scale, @@ -796,47 +742,155 @@ def run_flashmla_benchmark( kv_b_proj=None, ) - # Initialize DCP (distributed context parallelism) attributes for - # standalone benchmarking + # Initialize DCP attributes if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size is None: impl.dcp_world_size = 1 impl.dcp_rank = 0 layer = MockLayer(device) - - # Create query tensors - q_concat = torch.randn( - total_q, - mla_dims["num_q_heads"], - mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], - device=device, - dtype=torch.float16, - ) - - # Warmup - for _ in range(config.warmup_iters): - impl._forward_decode(q_concat, kv_cache, metadata, layer) - torch.cuda.synchronize() - - # Benchmark - times = [] - for _ in range(config.repeats): - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - - start.record() - for _ in range(config.num_layers): - impl._forward_decode(q_concat, kv_cache, metadata, layer) - end.record() - - torch.cuda.synchronize() - elapsed_ms = start.elapsed_time(end) - times.append(elapsed_ms / 1000.0 / config.num_layers) - - return { - "mean": np.mean(times), - "std": np.std(times), - "min": np.min(times), - "max": np.max(times), - "throughput": total_q / np.mean(times) if times else 0, - } + results = [] + + # Run each benchmark with the shared impl + for config, threshold in configs_with_thresholds: + # Set threshold for this benchmark on the builder class + if threshold is not None: + original_threshold = FlashAttnMLAMetadataBuilder.reorder_batch_threshold + FlashAttnMLAMetadataBuilder.reorder_batch_threshold = threshold + + try: + # Parse batch spec + requests = parse_batch_spec(config.batch_spec) + + q_lens = [r.q_len for r in requests] + kv_lens = [r.kv_len for r in requests] + total_q = sum(q_lens) + max_kv = max(kv_lens) + + # Build query start locations + q_start_cpu = np.array( + 
[0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], + dtype=np.int32, + ) + q_start_gpu = torch.from_numpy(q_start_cpu).to(device) + + # Build sequence lengths + seq_lens_cpu = np.array(kv_lens, dtype=np.int32) + seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) + + # Build block table + num_blocks_per_req = [ + (kv + config.block_size - 1) // config.block_size for kv in kv_lens + ] + max_num_blocks = max(num_blocks_per_req) + + block_table_cpu = np.zeros( + (len(requests), max_num_blocks), dtype=np.int32 + ) + current_block = 0 + for i, num_blocks in enumerate(num_blocks_per_req): + for j in range(num_blocks): + block_table_cpu[i, j] = current_block + current_block += 1 + + block_table_gpu = torch.from_numpy(block_table_cpu).to(device) + + # Build slot mapping + slot_mapping_list = [] + for i, (q_len, kv_len, num_blocks) in enumerate( + zip(q_lens, kv_lens, num_blocks_per_req) + ): + context_len = kv_len - q_len + for j in range(q_len): + token_kv_idx = context_len + j + block_idx = token_kv_idx // config.block_size + offset_in_block = token_kv_idx % config.block_size + global_block_id = block_table_cpu[i, block_idx] + slot_id = global_block_id * config.block_size + offset_in_block + slot_mapping_list.append(slot_id) + + slot_mapping = torch.tensor( + slot_mapping_list, dtype=torch.int64, device=device + ) + + # Create FlashAttn MLA decode metadata + decode_metadata = FlashAttnMLADecodeMetadata( + block_table=block_table_gpu, + seq_lens=seq_lens_gpu, + dcp_tot_seq_lens=None, + query_start_loc=q_start_gpu, + max_query_len=max(q_lens), + max_seq_len=max_kv, + ) + + # Create FlashAttn MLA metadata + metadata = FlashAttnMLAMetadata( + num_reqs=len(requests), + max_query_len=max(q_lens), + max_seq_len=max_kv, + num_actual_tokens=total_q, + query_start_loc=q_start_gpu, + slot_mapping=slot_mapping, + num_decodes=len(requests), + num_decode_tokens=total_q, + num_prefills=0, + head_dim=mla_dims["head_dim"], + decode=decode_metadata, + prefill=None, + ) + + # 
Create KV cache + kv_cache = torch.zeros( + current_block, + config.block_size, + mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], + device=device, + dtype=torch.float16, + ) + + # Create query tensors + q_concat = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], + device=device, + dtype=torch.float16, + ) + + # Warmup + for _ in range(config.warmup_iters): + impl._forward_decode(q_concat, kv_cache, metadata, layer) + torch.cuda.synchronize() + + # Benchmark + times = [] + for _ in range(config.repeats): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(config.num_layers): + impl._forward_decode(q_concat, kv_cache, metadata, layer) + end.record() + + torch.cuda.synchronize() + elapsed_ms = start.elapsed_time(end) + times.append(elapsed_ms / 1000.0 / config.num_layers) + + results.append( + { + "mean": np.mean(times), + "std": np.std(times), + "min": np.min(times), + "max": np.max(times), + "throughput": total_q / np.mean(times) if times else 0, + } + ) + + finally: + # Restore original threshold on the builder class + if threshold is not None: + FlashAttnMLAMetadataBuilder.reorder_batch_threshold = ( + original_threshold + ) + + return results From 62ffea759290a05966b6dedd68021695c7c8c5b4 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 10 Oct 2025 20:40:40 +0000 Subject: [PATCH 03/45] clean up grammar Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/README.md | 51 ++++---- benchmarks/attention_benchmarks/batch_spec.py | 120 +++++------------- .../configs/mla_decode.yaml | 32 ++--- .../configs/mla_mixed_batch.yaml | 36 +++--- .../configs/speculative_decode.yaml | 32 ++--- .../configs/standard_attention.yaml | 18 +-- .../configs/study1_cutlass_numsplits.yaml | 12 +- .../configs/study2_hopper_head_count.yaml | 10 +- .../configs/study3_flashinfer_vs_cutlass.yaml | 14 +- 
.../configs/study4_reorder_threshold.yaml | 112 ++++++++-------- 10 files changed, 190 insertions(+), 247 deletions(-) diff --git a/benchmarks/attention_benchmarks/README.md b/benchmarks/attention_benchmarks/README.md index 0ec6ee1c4e34..51a0167b6df3 100644 --- a/benchmarks/attention_benchmarks/README.md +++ b/benchmarks/attention_benchmarks/README.md @@ -20,35 +20,38 @@ python benchmark.py --config configs/study4_reorder_threshold.yaml # Or run custom benchmarks python benchmark.py \ --backends flash flashinfer \ - --batch-specs "q2k" "8s1k" "2q2k_32s1k" \ + --batch-specs "q2k" "8q1kv1k" "2q2k_32q1kv1k" \ --output-csv results.csv ``` -## Batch Specification Grammar +## Simplified Batch Specification Grammar -Express complex workloads concisely: +Express workloads concisely using query length and KV cache size: ```python -"q2k" # 2048-token prefill -"8s1k" # 8 decode requests (1k KV cache each) -"2q2k_32s1k" # 2 prefills + 32 decodes -"spec4s1k" # 4-token speculative decode -"chunk8q16k" # Chunked 16k prefill -"2q2k_spec4s1k_32s1k" # Complex: 2 prefill + 1 spec + 32 decode +"q2k" # 2048-token prefill (q_len=2048, kv_len=2048) +"q1kv1k" # Decode: 1 token with 1K KV cache +"8q1kv1k" # 8 decode requests +"q4kv1k" # 4-token extend (e.g., spec decode) +"2q2k_32q1kv1k" # Mixed: 2 prefills + 32 decodes +"16q4kv1k" # 16 spec decode (4 tokens each) ``` -### Grammar Rules +### Grammar Rule ``` -Prefill: (?) q(k?) # q2k = 2048 tokens -Decode: (?) s(k?) # 8s1k = 8 x 1k KV -Speculative: (?) spec s(k?) # spec4s1k -Chunked: (?) chunk q(k?) # chunk8q16k -Mixed: Use _ to combine # 2q2k_32s1k +Format: (?) q(k?) (kv(k?))? 
-'k' suffix = multiply by 1024 +- count: Number of identical requests (optional, default=1) +- q_len: Query length (number of new tokens) +- kv_len: Total KV cache length (optional, defaults to q_len for prefill) +- 'k': Multiplies value by 1024 + +Mixed batches: Use _ to combine (e.g., "2q2k_32q1kv1k") ``` +**Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed! + ## Research Studies The suite includes 4 pre-configured studies to answer key MLA optimization questions. Each study is a single YAML file you can run directly: @@ -121,7 +124,7 @@ The `benchmark.py` script handles **all** backends - both standard attention and ```bash python benchmark.py \ --backends flash triton flashinfer \ - --batch-specs "q2k" "8s1k" "2q2k_32s1k" \ + --batch-specs "q2k" "8q1kv1k" "2q2k_32q1kv1k" \ --num-layers 10 \ --repeats 5 \ --output-csv results.csv @@ -133,7 +136,7 @@ python benchmark.py \ # Compare all MLA backends python benchmark.py \ --backends cutlass_mla flashinfer_mla flash_attn_mla flashmla \ - --batch-specs "64s1k" "64s4k" \ + --batch-specs "64q1kv1k" "64q1kv4k" \ --output-csv mla_results.csv ``` @@ -144,7 +147,7 @@ python benchmark.py \ ```bash python benchmark.py \ --backend cutlass_mla \ - --batch-specs "64s1k" "64s4k" "64s16k" \ + --batch-specs "64q1kv1k" "64q1kv4k" "64q1kv16k" \ --num-splits 1 2 4 8 16 \ --compare-auto \ --output-json optimal_splits.json @@ -157,7 +160,7 @@ python benchmark.py \ ```bash python benchmark.py \ --backend flashmla \ - --batch-specs "spec4s1k" "spec8s2k" \ + --batch-specs "q4kv1k" "q8kv2k" \ --thresholds 1 4 16 64 256 512 \ --output-csv threshold_sweep.csv ``` @@ -170,7 +173,7 @@ python benchmark.py \ --backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla, # flashinfer_mla, flash_attn_mla, flashmla --backend BACKEND # Single backend (alternative to --backends) ---batch-specs SPEC [SPEC ...] # Batch specifications (default: ["q2k", "8s1k"]) +--batch-specs SPEC [SPEC ...] 
# Batch specifications (default: ["q2k", "8q1kv1k"]) # Model configuration --num-layers N # Number of layers (default: 10) @@ -220,7 +223,7 @@ from common import BenchmarkConfig config = BenchmarkConfig( backend="cutlass_mla", - batch_spec="64s4k", + batch_spec="64q1kv4k", num_layers=10, head_dim=576, num_q_heads=128, @@ -252,7 +255,7 @@ from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter # Parse batch specs -requests = parse_batch_spec("2q2k_spec4s1k_32s1k") +requests = parse_batch_spec("2q2k_q4kv1k_32q1kv1k") print(format_batch_spec(requests)) # "2 prefill (2x2k), 1 specdecode (1xq4s1k), 32 decode (32x1k)" @@ -315,7 +318,7 @@ source /path/to/vllm/.venv/bin/activate **OOM?** - Reduce batch size: `"32s1k"` → `"16s1k"` -- Reduce sequence length: `"64s16k"` → `"64s4k"` +- Reduce sequence length: `"64q1kv16k"` → `"64q1kv4k"` ## What's Included diff --git a/benchmarks/attention_benchmarks/batch_spec.py b/benchmarks/attention_benchmarks/batch_spec.py index d412b09ad7c9..11eab551edd8 100644 --- a/benchmarks/attention_benchmarks/batch_spec.py +++ b/benchmarks/attention_benchmarks/batch_spec.py @@ -2,28 +2,32 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -Extended batch specification grammar for attention benchmarks. +Simplified batch specification grammar for attention benchmarks. Grammar (underscore-separated segments): - Prefill: (?) q(k?) (s(k?))? - Decode: (?) s(k?) - Spec decode: (?) spec s(k?) - Chunked prefill: (?) chunk q(k?) + Format: (?) q(k?) (kv(k?))?
- 'k' suffix multiplies by 1024 + - count: Number of identical requests (optional, default=1) + - q_len: Query length (number of new tokens) + - kv_len: Total KV cache length (optional, defaults to q_len for prefill) + - 'k' suffix: Multiplies value by 1024 + +Common patterns: + - Prefill: q_len == kv_len (e.g., "q2k" → 2048 new tokens, 2048 KV) + - Decode: q_len == 1 (e.g., "q1kv1k" → 1 token, 1024 KV cache) + - Extend: q_len < kv_len (e.g., "q4kv1k" → 4 tokens, 1024 KV cache) Examples: - q2k -> [(2048, 2048)] - 8s1k -> [(1, 1024)] * 8 - 2q1k_32s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 - spec4s1k -> [(4, 1024)] # 4-token speculative decode - chunk8q16k -> [(16384, 16384)] with chunking hint - 2q1ks2k_spec4s1k_32s1k -> [(1024, 2048)] * 2 + [(4, 1024)] + [(1, 1024)] * 32 + q2k -> [(2048, 2048)] # Prefill: 2048 tokens + q1kv1k -> [(1, 1024)] # Decode: 1 token, 1K KV cache + 8q1kv1k -> [(1, 1024)] * 8 # 8 decode requests + q4kv1k -> [(4, 1024)] # 4-token extend (spec decode) + 2q1k_32q1kv1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch + 16q4kv1k -> [(4, 1024)] * 16 # 16 spec decode requests """ from collections import Counter from dataclasses import dataclass -from typing import Optional import regex as re @@ -32,22 +36,18 @@ class BatchRequest: """Represents a single request in a batch.""" - q_len: int # Query length - kv_len: int # KV cache length - is_speculative: bool = False # Is this speculative decoding? - spec_length: int = 0 # Number of speculative tokens (if speculative) - is_chunked: bool = False # Should use chunked prefill? 
- chunk_size: Optional[int] = None # Chunk size for chunked prefill + q_len: int # Query length (number of new tokens) + kv_len: int # Total KV cache length @property def is_decode(self) -> bool: """True if this is a decode request (q_len == 1).""" - return self.q_len == 1 and self.kv_len > 1 + return self.q_len == 1 @property def is_prefill(self) -> bool: """True if this is a pure prefill (q_len == kv_len).""" - return self.q_len > 1 and self.kv_len == self.q_len + return self.q_len == self.kv_len @property def is_extend(self) -> bool: @@ -100,6 +100,8 @@ def parse_batch_spec(spec: str) -> list[BatchRequest]: """ Parse batch specification string into list of BatchRequest objects. + Grammar: (?) q(k?) (kv(k?))? + Args: spec: Batch specification string (see module docstring for grammar) @@ -112,46 +114,8 @@ def parse_batch_spec(spec: str) -> list[BatchRequest]: requests = [] for seg in spec.split("_"): - # Try chunked prefill pattern: (?) chunk q(k?) - m = re.match(r"^(?:(\d+))?chunk(\d+)q(\d+)(k?)$", seg) - if m: - cnt = int(m.group(1)) if m.group(1) else 1 - chunk_size = int(m.group(2)) - q_len = _parse_size(m.group(3), m.group(4)) - requests.extend( - [ - BatchRequest( - q_len=q_len, - kv_len=q_len, - is_chunked=True, - chunk_size=chunk_size, - ) - ] - * cnt - ) - continue - - # Try speculative decode pattern: (?) spec s(k?) - m = re.match(r"^(?:(\d+))?spec(\d+)s(\d+)(k?)$", seg) - if m: - cnt = int(m.group(1)) if m.group(1) else 1 - spec_len = int(m.group(2)) - kv_len = _parse_size(m.group(3), m.group(4)) - requests.extend( - [ - BatchRequest( - q_len=spec_len, - kv_len=kv_len, - is_speculative=True, - spec_length=spec_len, - ) - ] - * cnt - ) - continue - - # Try prefill/extend pattern: (?) q(k?) (s(k?))? - m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg) + # Unified pattern: (?) q(k?) (kv(k?))? 
+ m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:kv(\d+)(k?))?$", seg) if m: cnt = int(m.group(1)) if m.group(1) else 1 q_len = _parse_size(m.group(2), m.group(3)) @@ -159,14 +123,6 @@ def parse_batch_spec(spec: str) -> list[BatchRequest]: requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt) continue - # Try decode pattern: (?) s(k?) - m = re.match(r"^(?:(\d+))?s(\d+)(k?)$", seg) - if m: - cnt = int(m.group(1)) if m.group(1) else 1 - kv_len = _parse_size(m.group(2), m.group(3)) - requests.extend([BatchRequest(q_len=1, kv_len=kv_len)] * cnt) - continue - raise ValueError(f"Invalid batch spec segment: '{seg}'") return requests @@ -187,36 +143,20 @@ def format_batch_spec(requests: list[BatchRequest]) -> str: kinds = { "prefill": [], "extend": [], - "chunked_prefill": [], - "specdecode": [], "decode": [], - "unknown": [], } for req in requests: tup = (req.q_len, req.kv_len) - if req.is_chunked: - kinds["chunked_prefill"].append(tup) - elif req.is_speculative: - kinds["specdecode"].append(tup) - elif req.is_prefill: + if req.is_prefill: kinds["prefill"].append(tup) elif req.is_extend: kinds["extend"].append(tup) elif req.is_decode: kinds["decode"].append(tup) - else: - kinds["unknown"].append(tup) parts = [] - for kind in [ - "prefill", - "extend", - "chunked_prefill", - "specdecode", - "decode", - "unknown", - ]: + for kind in ["prefill", "extend", "decode"]: lst = kinds[kind] if not lst: continue @@ -226,16 +166,16 @@ def format_batch_spec(requests: list[BatchRequest]) -> str: inner = [] for (q, kv), cnt in ctr.items(): - if kind in ("prefill", "chunked_prefill"): + if kind == "prefill": size = f"{q // 1024}k" if q % 1024 == 0 else str(q) inner.append(f"{cnt}x{size}") elif kind == "decode": size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv) inner.append(f"{cnt}x{size}") - else: # extend, specdecode, unknown + else: # extend qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q) kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv) - 
inner.append(f"{cnt}xq{qstr}s{kstr}") + inner.append(f"{cnt}xq{qstr}kv{kstr}") parts.append(f"{cnt_total} {kind} ({', '.join(inner)})") diff --git a/benchmarks/attention_benchmarks/configs/mla_decode.yaml b/benchmarks/attention_benchmarks/configs/mla_decode.yaml index d8e06e7ba5ba..2b17d40cb2c6 100644 --- a/benchmarks/attention_benchmarks/configs/mla_decode.yaml +++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml @@ -14,30 +14,30 @@ model: batch_specs: # Small batches, varying sequence lengths - - "16s512" # 16 requests, 512 KV cache - - "16s1k" # 16 requests, 1k KV cache - - "16s2k" # 16 requests, 2k KV cache - - "16s4k" # 16 requests, 4k KV cache + - "16q1kv512" # 16 requests, 512 KV cache + - "16q1kv1k" # 16 requests, 1k KV cache + - "16q1kv2k" # 16 requests, 2k KV cache + - "16q1kv4k" # 16 requests, 4k KV cache # Medium batches - - "32s1k" # 32 requests, 1k KV cache - - "32s2k" # 32 requests, 2k KV cache - - "32s4k" # 32 requests, 4k KV cache - - "32s8k" # 32 requests, 8k KV cache + - "32q1kv1k" # 32 requests, 1k KV cache + - "32q1kv2k" # 32 requests, 2k KV cache + - "32q1kv4k" # 32 requests, 4k KV cache + - "32q1kv8k" # 32 requests, 8k KV cache # Large batches - - "64s1k" # 64 requests, 1k KV cache - - "64s2k" # 64 requests, 2k KV cache - - "64s4k" # 64 requests, 4k KV cache - - "64s8k" # 64 requests, 8k KV cache + - "64q1kv1k" # 64 requests, 1k KV cache + - "64q1kv2k" # 64 requests, 2k KV cache + - "64q1kv4k" # 64 requests, 4k KV cache + - "64q1kv8k" # 64 requests, 8k KV cache # Very large batches - - "128s1k" # 128 requests, 1k KV cache - - "128s2k" # 128 requests, 2k KV cache + - "128q1kv1k" # 128 requests, 1k KV cache + - "128q1kv2k" # 128 requests, 2k KV cache # Long context - - "32s16k" # 32 requests, 16k KV cache - - "32s32k" # 32 requests, 32k KV cache + - "32q1kv16k" # 32 requests, 16k KV cache + - "32q1kv32k" # 32 requests, 32k KV cache backends: - cutlass_mla diff --git a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml 
b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml index b75fb99e4cbd..c503e44334b9 100644 --- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml +++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml @@ -15,34 +15,34 @@ model: batch_specs: # Small prefill + decode - - "1q1k_8s1k" # 1 prefill + 8 decode - - "2q2k_16s1k" # 2 prefill + 16 decode - - "4q1k_32s2k" # 4 prefill + 32 decode + - "1q1k_8q1kv1k" # 1 prefill + 8 decode + - "2q2k_16q1kv1k" # 2 prefill + 16 decode + - "4q1k_32q1kv2k" # 4 prefill + 32 decode # Medium prefill + decode - - "2q4k_32s2k" # 2 medium prefill + 32 decode - - "4q4k_64s2k" # 4 medium prefill + 64 decode - - "8q2k_64s4k" # 8 prefill + 64 decode + - "2q4k_32q1kv2k" # 2 medium prefill + 32 decode + - "4q4k_64q1kv2k" # 4 medium prefill + 64 decode + - "8q2k_64q1kv4k" # 8 prefill + 64 decode # Large prefill + decode (chunked prefill stress test) - - "2q8k_32s1k" # 2 large prefill + 32 decode - - "1q16k_16s2k" # 1 very large prefill + 16 decode - - "2q16k_32s4k" # 2 very large prefill + 32 decode + - "2q8k_32q1kv1k" # 2 large prefill + 32 decode + - "1q16k_16q1kv2k" # 1 very large prefill + 16 decode + - "2q16k_32q1kv4k" # 2 very large prefill + 32 decode # Context extension + decode - - "2q1ks2k_16s1k" # 2 extend + 16 decode - - "4q2ks4k_32s2k" # 4 extend + 32 decode - - "2q1ks8k_32s2k" # 2 large extend + 32 decode + - "2q1kkv2k_16q1kv1k" # 2 extend + 16 decode + - "4q2kkv4k_32q1kv2k" # 4 extend + 32 decode + - "2q1kkv8k_32q1kv2k" # 2 large extend + 32 decode # Explicitly chunked prefill - - "chunk4q8k" # 8k prefill with chunking hint - - "chunk8q16k" # 16k prefill with chunking hint - - "2chunk4q8k_32s2k" # 2 chunked prefill + 32 decode + - "q8k" # 8k prefill with chunking hint + - "q16k" # 16k prefill with chunking hint + - "2q8k_32q1kv2k" # 2 chunked prefill + 32 decode # High decode ratio (realistic serving) - - "1q2k_63s1k" # 1 prefill + 63 decode - - "2q2k_62s2k" # 2 prefill + 62 decode - - 
"4q4k_60s4k" # 4 prefill + 60 decode + - "1q2k_63q1kv1k" # 1 prefill + 63 decode + - "2q2k_62q1kv2k" # 2 prefill + 62 decode + - "4q4k_60q1kv4k" # 4 prefill + 60 decode backends: - cutlass_mla diff --git a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml index 0ffaee1860f6..2982cdaa665c 100644 --- a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml +++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml @@ -14,30 +14,30 @@ model: batch_specs: # Pure speculative decode (K-token verification) - - "spec2s1k" # 2-token spec, 1k KV - - "spec4s1k" # 4-token spec, 1k KV - - "spec8s1k" # 8-token spec, 1k KV - - "spec16s1k" # 16-token spec, 1k KV + - "q2kv1k" # 2-token spec, 1k KV + - "q4kv1k" # 4-token spec, 1k KV + - "q8kv1k" # 8-token spec, 1k KV + - "q16kv1k" # 16-token spec, 1k KV # Speculative with different context lengths - - "spec4s2k" # 4-token spec, 2k KV - - "spec4s4k" # 4-token spec, 4k KV - - "spec8s2k" # 8-token spec, 2k KV - - "spec8s4k" # 8-token spec, 4k KV + - "q4kv2k" # 4-token spec, 2k KV + - "q4kv4k" # 4-token spec, 4k KV + - "q8kv2k" # 8-token spec, 2k KV + - "q8kv4k" # 8-token spec, 4k KV # Mixed: speculative + regular decode - - "32spec4s1k" # 32 spec requests - - "16spec4s1k_16s1k" # 16 spec + 16 regular - - "8spec8s2k_24s2k" # 8 spec (8-tok) + 24 regular + - "32q4kv1k" # 32 spec requests + - "16q4kv1k_16q1kv1k" # 16 spec + 16 regular + - "8q8kv2k_24q1kv2k" # 8 spec (8-tok) + 24 regular # Mixed: speculative + prefill + decode - - "2q1k_16spec4s1k_16s1k" # 2 prefill + 16 spec + 16 decode - - "4q2k_32spec4s2k_32s2k" # 4 prefill + 32 spec + 32 decode + - "2q1k_16q4kv1k_16q1kv1k" # 2 prefill + 16 spec + 16 decode + - "4q2k_32q4kv2k_32q1kv2k" # 4 prefill + 32 spec + 32 decode # Large batches with speculation - - "64spec4s1k" # 64 spec requests - - "32spec8s2k" # 32 spec (8-token) - - "16spec16s4k" # 16 spec (16-token) + - "64q4kv1k" # 64 spec 
requests + - "32q8kv2k" # 32 spec (8-token) + - "16q16kv4k" # 16 spec (16-token) # Backends that support query length > 1 backends: diff --git a/benchmarks/attention_benchmarks/configs/standard_attention.yaml b/benchmarks/attention_benchmarks/configs/standard_attention.yaml index d1c5056c8fb1..622223ecd151 100644 --- a/benchmarks/attention_benchmarks/configs/standard_attention.yaml +++ b/benchmarks/attention_benchmarks/configs/standard_attention.yaml @@ -15,19 +15,19 @@ batch_specs: - "q8k" # Very large prefill (8192 tokens) # Pure decode - - "8s1k" # 8 requests, 1k KV cache each - - "16s2k" # 16 requests, 2k KV cache each - - "32s1k" # 32 requests, 1k KV cache each - - "64s4k" # 64 requests, 4k KV cache each + - "8q1kv1k" # 8 requests, 1k KV cache each + - "16q1kv2k" # 16 requests, 2k KV cache each + - "32q1kv1k" # 32 requests, 1k KV cache each + - "64q1kv4k" # 64 requests, 4k KV cache each # Mixed prefill/decode - - "2q2k_8s1k" # 2 prefill + 8 decode - - "4q1k_16s2k" # 4 prefill + 16 decode - - "2q4k_32s1k" # 2 large prefill + 32 decode + - "2q2k_8q1kv1k" # 2 prefill + 8 decode + - "4q1k_16q1kv2k" # 4 prefill + 16 decode + - "2q4k_32q1kv1k" # 2 large prefill + 32 decode # Context extension - - "q1ks2k" # 1k query, 2k KV (chunked prefill) - - "2q1ks4k" # 2 requests: 1k query, 4k KV + - "q1kkv2k" # 1k query, 2k KV (chunked prefill) + - "2q1kkv4k" # 2 requests: 1k query, 4k KV backends: - flash diff --git a/benchmarks/attention_benchmarks/configs/study1_cutlass_numsplits.yaml b/benchmarks/attention_benchmarks/configs/study1_cutlass_numsplits.yaml index cdc5d9e0edf8..49ebbd247b21 100644 --- a/benchmarks/attention_benchmarks/configs/study1_cutlass_numsplits.yaml +++ b/benchmarks/attention_benchmarks/configs/study1_cutlass_numsplits.yaml @@ -9,12 +9,12 @@ backend: cutlass_mla # Test various decode batch sizes with different KV cache lengths batch_specs: - - "32s1k" # 32 decode requests, 1k KV cache - - "64s1k" # 64 decode requests, 1k KV cache - - "64s4k" # 64 decode 
requests, 4k KV cache - - "64s16k" # 64 decode requests, 16k KV cache - - "128s1k" # 128 decode requests, 1k KV cache - - "128s4k" # 128 decode requests, 4k KV cache + - "32q1kv1k" # 32 decode requests, 1k KV cache + - "64q1kv1k" # 64 decode requests, 1k KV cache + - "64q1kv4k" # 64 decode requests, 4k KV cache + - "64q1kv16k" # 64 decode requests, 16k KV cache + - "128q1kv1k" # 128 decode requests, 1k KV cache + - "128q1kv4k" # 128 decode requests, 4k KV cache # Sweep num_kv_splits values num_splits: diff --git a/benchmarks/attention_benchmarks/configs/study2_hopper_head_count.yaml b/benchmarks/attention_benchmarks/configs/study2_hopper_head_count.yaml index a77dbc8126dc..fca34736ad96 100644 --- a/benchmarks/attention_benchmarks/configs/study2_hopper_head_count.yaml +++ b/benchmarks/attention_benchmarks/configs/study2_hopper_head_count.yaml @@ -11,11 +11,11 @@ backends: # Standard decode workloads batch_specs: - - "32s1k" # 32 decode requests, 1k KV cache - - "64s1k" # 64 decode requests, 1k KV cache - - "64s4k" # 64 decode requests, 4k KV cache - - "128s1k" # 128 decode requests, 1k KV cache - - "128s4k" # 128 decode requests, 4k KV cache + - "32q1kv1k" # 32 decode requests, 1k KV cache + - "64q1kv1k" # 64 decode requests, 1k KV cache + - "64q1kv4k" # 64 decode requests, 4k KV cache + - "128q1kv1k" # 128 decode requests, 1k KV cache + - "128q1kv4k" # 128 decode requests, 4k KV cache # Model configuration - will test different head counts # Note: You'll need to run this multiple times with different num_q_heads values diff --git a/benchmarks/attention_benchmarks/configs/study3_flashinfer_vs_cutlass.yaml b/benchmarks/attention_benchmarks/configs/study3_flashinfer_vs_cutlass.yaml index 631b909dc565..3eb799961c12 100644 --- a/benchmarks/attention_benchmarks/configs/study3_flashinfer_vs_cutlass.yaml +++ b/benchmarks/attention_benchmarks/configs/study3_flashinfer_vs_cutlass.yaml @@ -10,13 +10,13 @@ backends: # Test various decode workloads batch_specs: - - "32s1k" # 32 
decode requests, 1k KV cache - - "64s1k" # 64 decode requests, 1k KV cache - - "64s4k" # 64 decode requests, 4k KV cache - - "64s16k" # 64 decode requests, 16k KV cache - - "128s1k" # 128 decode requests, 1k KV cache - - "128s4k" # 128 decode requests, 4k KV cache - - "128s16k" # 128 decode requests, 16k KV cache + - "32q1kv1k" # 32 decode requests, 1k KV cache + - "64q1kv1k" # 64 decode requests, 1k KV cache + - "64q1kv4k" # 64 decode requests, 4k KV cache + - "64q1kv16k" # 64 decode requests, 16k KV cache + - "128q1kv1k" # 128 decode requests, 1k KV cache + - "128q1kv4k" # 128 decode requests, 4k KV cache + - "128q1kv16k" # 128 decode requests, 16k KV cache # For CUTLASS, test optimized num_kv_splits # Based on Study 1 results, you may want to adjust these values diff --git a/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml b/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml index 062815eaaf83..687c2ff6ff1a 100644 --- a/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml +++ b/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml @@ -21,64 +21,64 @@ mode: "decode_vs_prefill" # This tests different query lengths with fixed KV cache context batch_specs: # Fine-grained: 1-16 (decode range, step 1) - - "s1k" # q_len=1 (regular decode, not spec) - - "spec2s1k" # q_len=2 - - "spec3s1k" # q_len=3 - - "spec4s1k" # q_len=4 - - "spec5s1k" # q_len=5 - - "spec6s1k" # q_len=6 - - "spec7s1k" # q_len=7 - - "spec8s1k" # q_len=8 - - "spec9s1k" # q_len=9 - - "spec10s1k" # q_len=10 - - "spec11s1k" # q_len=11 - - "spec12s1k" # q_len=12 - - "spec13s1k" # q_len=13 - - "spec14s1k" # q_len=14 - - "spec15s1k" # q_len=15 - - "spec16s1k" # q_len=16 + - "q1kv1k" # q_len=1 (regular decode, not spec) + - "q2kv1k" # q_len=2 + - "q3kv1k" # q_len=3 + - "q4kv1k" # q_len=4 + - "q5kv1k" # q_len=5 + - "q6kv1k" # q_len=6 + - "q7kv1k" # q_len=7 + - "q8kv1k" # q_len=8 + - "q9kv1k" # q_len=9 + - "q10kv1k" # q_len=10 + - "q11kv1k" 
# q_len=11 + - "q12kv1k" # q_len=12 + - "q13kv1k" # q_len=13 + - "q14kv1k" # q_len=14 + - "q15kv1k" # q_len=15 + - "q16kv1k" # q_len=16 # Transition zone: 17-64 (step 2) - - "spec17s1k" - - "spec19s1k" - - "spec21s1k" - - "spec23s1k" - - "spec25s1k" - - "spec27s1k" - - "spec29s1k" - - "spec31s1k" - - "spec33s1k" - - "spec35s1k" - - "spec37s1k" - - "spec39s1k" - - "spec41s1k" - - "spec43s1k" - - "spec45s1k" - - "spec47s1k" - - "spec49s1k" - - "spec51s1k" - - "spec53s1k" - - "spec55s1k" - - "spec57s1k" - - "spec59s1k" - - "spec61s1k" - - "spec63s1k" + - "q17kv1k" + - "q19kv1k" + - "q21kv1k" + - "q23kv1k" + - "q25kv1k" + - "q27kv1k" + - "q29kv1k" + - "q31kv1k" + - "q33kv1k" + - "q35kv1k" + - "q37kv1k" + - "q39kv1k" + - "q41kv1k" + - "q43kv1k" + - "q45kv1k" + - "q47kv1k" + - "q49kv1k" + - "q51kv1k" + - "q53kv1k" + - "q55kv1k" + - "q57kv1k" + - "q59kv1k" + - "q61kv1k" + - "q63kv1k" # Prefill range: 65-128 (step 4) - - "spec65s1k" - - "spec69s1k" - - "spec73s1k" - - "spec77s1k" - - "spec81s1k" - - "spec85s1k" - - "spec89s1k" - - "spec93s1k" - - "spec97s1k" - - "spec101s1k" - - "spec105s1k" - - "spec109s1k" - - "spec113s1k" - - "spec117s1k" - - "spec121s1k" - - "spec125s1k" + - "q65kv1k" + - "q69kv1k" + - "q73kv1k" + - "q77kv1k" + - "q81kv1k" + - "q85kv1k" + - "q89kv1k" + - "q93kv1k" + - "q97kv1k" + - "q101kv1k" + - "q105kv1k" + - "q109kv1k" + - "q113kv1k" + - "q117kv1k" + - "q121kv1k" + - "q125kv1k" # Batch sizes to test (from old script) batch_sizes: From 057bb3adbb214dcb0941746ca103ceef31f43ffb Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 10 Oct 2025 21:04:10 +0000 Subject: [PATCH 04/45] simplify Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/benchmark.py | 23 +- benchmarks/attention_benchmarks/mla_runner.py | 740 ++++++------------ 2 files changed, 245 insertions(+), 518 deletions(-) diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index 9b943d80a34d..f76a85e4b4cc 100644 --- 
a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -50,22 +50,9 @@ def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult: """Run MLA benchmark with appropriate backend.""" - from mla_runner import ( - run_cutlass_mla_benchmark, - run_flashattn_mla_benchmark, - run_flashinfer_mla_benchmark, - run_flashmla_benchmark, - ) - - backend_map = { - "cutlass_mla": run_cutlass_mla_benchmark, - "flashinfer_mla": run_flashinfer_mla_benchmark, - "flash_attn_mla": run_flashattn_mla_benchmark, - "flashmla": run_flashmla_benchmark, - } + from mla_runner import run_mla_benchmark as run_mla - runner = backend_map[config.backend] - result_dict = runner(config, **kwargs) + result_dict = run_mla(config.backend, config, **kwargs) return BenchmarkResult( config=config, @@ -307,12 +294,10 @@ def main(): # Run all benchmarks for this batch size in one go (batched mode) try: - from mla_runner import run_flashattn_mla_benchmark + from mla_runner import run_mla_benchmark as run_mla # Use batched API: pass list of (config, threshold) tuples - timing_results = run_flashattn_mla_benchmark( - configs_with_thresholds - ) + timing_results = run_mla(backend, configs_with_thresholds) # Create BenchmarkResult objects from timing results for (config, _), timing in zip( diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index 73e16f6ca102..a4907208098e 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -247,473 +247,114 @@ def build_mla_metadata_cutlass( return metadata, kv_cache, layer -def run_cutlass_mla_benchmark( - config, - num_kv_splits: Optional[int] = None, -) -> dict: - """ - Run CUTLASS MLA benchmark. 
- - Args: - config: BenchmarkConfig - num_kv_splits: Number of KV splits (None for auto) - - Returns: - Dict with timing statistics - """ - device = torch.device(config.device) - torch.cuda.set_device(device) - - # Create and set vLLM config for MLA - vllm_config = create_minimal_vllm_config( - model_name="deepseek-v3", - block_size=config.block_size, - ) - from vllm.config import set_current_vllm_config - - with set_current_vllm_config(vllm_config): - # Parse batch spec - requests = parse_batch_spec(config.batch_spec) - - # Setup MLA dimensions - mla_dims = setup_mla_dims("deepseek-v3") - scale = 1.0 / np.sqrt( - mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] - ) - - # Build metadata - metadata, kv_cache, layer = build_mla_metadata_cutlass( - requests, config.block_size, device, mla_dims - ) - - # Create CUTLASS MLA impl - from vllm.v1.attention.backends.mla.cutlass_mla import CutlassMLAImpl - - impl = CutlassMLAImpl( - num_heads=mla_dims["num_q_heads"], - head_size=mla_dims["head_dim"], - scale=scale, - num_kv_heads=mla_dims["num_kv_heads"], - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype="auto", - logits_soft_cap=None, - attn_type="decoder", - kv_sharing_target_layer_name=None, - q_lora_rank=None, - kv_lora_rank=mla_dims["kv_lora_rank"], - qk_nope_head_dim=mla_dims["qk_nope_head_dim"], - qk_rope_head_dim=mla_dims["qk_rope_head_dim"], - qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], - v_head_dim=mla_dims["v_head_dim"], - kv_b_proj=None, - ) - - # Override num_kv_splits if specified - if num_kv_splits is not None: - impl._num_kv_splits = num_kv_splits - - # Create query tensors - total_q = sum(r.q_len for r in requests) - q_nope = torch.randn( - total_q, - mla_dims["num_q_heads"], - mla_dims["kv_lora_rank"], - device=device, - dtype=torch.float16, - ) - q_pe = torch.randn( - total_q, - mla_dims["num_q_heads"], - mla_dims["qk_rope_head_dim"], - device=device, - dtype=torch.float16, - ) - - # Warmup - for _ in 
range(config.warmup_iters): - impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) - torch.cuda.synchronize() - - # Benchmark - times = [] - for _ in range(config.repeats): - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - - start.record() - for _ in range(config.num_layers): - impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) - end.record() - - torch.cuda.synchronize() - elapsed_ms = start.elapsed_time(end) - times.append(elapsed_ms / 1000.0 / config.num_layers) - - return { - "mean": np.mean(times), - "std": np.std(times), - "min": np.min(times), - "max": np.max(times), - "throughput": total_q / np.mean(times) if times else 0, - } - - -def run_flashinfer_mla_benchmark(config) -> dict: +# Backend configuration mapping for unified runner +_BACKEND_CONFIG = { + "flash_attn_mla": { + "module": "vllm.v1.attention.backends.mla.flashattn_mla", + "impl_class": "FlashAttnMLAImpl", + "metadata_class": "FlashAttnMLAMetadata", + "decode_metadata_class": "FlashAttnMLADecodeMetadata", + "builder_class": "FlashAttnMLAMetadataBuilder", + "query_format": "tuple", # (q_nope, q_pe) + "block_size": None, # Use config block_size + }, + "flashmla": { + "module": "vllm.v1.attention.backends.mla.flashmla", + "impl_class": "FlashMLAImpl", + "metadata_class": "FlashMLAMetadata", + "decode_metadata_class": "FlashMLADecodeMetadata", + "builder_class": None, + "query_format": "concat", # Single concatenated tensor + "block_size": 64, # FlashMLA uses fixed block size + }, + "flashinfer_mla": { + "module": "vllm.v1.attention.backends.mla.flashinfer_mla", + "impl_class": "FlashInferMLAImpl", + "metadata_class": "MLACommonMetadata", + "decode_metadata_class": "MLACommonDecodeMetadata", + "builder_class": None, + "query_format": "tuple", + "block_size": None, + }, + "cutlass_mla": { + "module": "vllm.v1.attention.backends.mla.cutlass_mla", + "impl_class": "CutlassMLAImpl", + "metadata_class": "MLACommonMetadata", + 
"decode_metadata_class": "MLACommonDecodeMetadata", + "builder_class": None, + "query_format": "tuple", + "block_size": None, + }, +} + + +def _run_mla_benchmark_batched( + backend: str, + configs_with_params: list[tuple], # [(config, threshold, num_splits), ...] +) -> list[dict]: """ - Run FlashInfer-MLA benchmark. - - Args: - config: BenchmarkConfig - - Returns: - Dict with timing statistics - """ - device = torch.device(config.device) - torch.cuda.set_device(device) - - # Create and set vLLM config for MLA - vllm_config = create_minimal_vllm_config( - model_name="deepseek-v3", - block_size=config.block_size, - ) - from vllm.config import set_current_vllm_config - - with set_current_vllm_config(vllm_config): - # Parse batch spec - requests = parse_batch_spec(config.batch_spec) - - # Setup MLA dimensions - mla_dims = setup_mla_dims("deepseek-v3") - scale = 1.0 / np.sqrt( - mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] - ) - - # Build metadata - metadata, kv_cache, layer = build_mla_metadata_cutlass( - requests, config.block_size, device, mla_dims - ) - - # Create FlashInfer-MLA impl - from vllm.v1.attention.backends.mla.flashinfer_mla import FlashInferMLAImpl - - impl = FlashInferMLAImpl( - num_heads=mla_dims["num_q_heads"], - head_size=mla_dims["head_dim"], - scale=scale, - num_kv_heads=mla_dims["num_kv_heads"], - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype="auto", - logits_soft_cap=None, - attn_type="decoder", - kv_sharing_target_layer_name=None, - q_lora_rank=None, - kv_lora_rank=mla_dims["kv_lora_rank"], - qk_nope_head_dim=mla_dims["qk_nope_head_dim"], - qk_rope_head_dim=mla_dims["qk_rope_head_dim"], - qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], - v_head_dim=mla_dims["v_head_dim"], - kv_b_proj=None, - ) - - # Create query tensors - total_q = sum(r.q_len for r in requests) - q_nope = torch.randn( - total_q, - mla_dims["num_q_heads"], - mla_dims["kv_lora_rank"], - device=device, - dtype=torch.float16, - ) 
- q_pe = torch.randn( - total_q, - mla_dims["num_q_heads"], - mla_dims["qk_rope_head_dim"], - device=device, - dtype=torch.float16, - ) - - # Warmup - for _ in range(config.warmup_iters): - impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) - torch.cuda.synchronize() - - # Benchmark - times = [] - for _ in range(config.repeats): - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - - start.record() - for _ in range(config.num_layers): - impl._forward_decode((q_nope, q_pe), kv_cache, metadata, layer) - end.record() - - torch.cuda.synchronize() - elapsed_ms = start.elapsed_time(end) - times.append(elapsed_ms / 1000.0 / config.num_layers) - - return { - "mean": np.mean(times), - "std": np.std(times), - "min": np.min(times), - "max": np.max(times), - "throughput": total_q / np.mean(times) if times else 0, - } + Unified batched MLA benchmark runner for all backends. + Works for: flash_attn_mla, flashmla, flashinfer_mla, cutlass_mla -def run_flashattn_mla_benchmark(config, reorder_batch_threshold: Optional[int] = None): - """ - Run FlashAttn MLA benchmark (Hopper SM90+). - - Always uses batched execution internally for optimal performance. - Accepts both single config and list of configs for convenience. + This function reuses backend initialization across multiple benchmarks + to avoid setup/teardown overhead. Args: - config: BenchmarkConfig or list of (BenchmarkConfig, threshold) tuples - reorder_batch_threshold: Threshold override (only for single config mode) + backend: Backend name + configs_with_params: List of (config, threshold, num_splits) tuples + - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only) + - num_splits: num_kv_splits (CUTLASS only) Returns: - Dict with timing statistics (single mode) or list of dicts (batched mode) - """ - # Normalize to batched mode - if isinstance(config, list): - # Already in batched format: [(config1, thresh1), ...] 
- configs_with_thresholds = config - return_single = False - else: - # Single config: convert to batched format - configs_with_thresholds = [(config, reorder_batch_threshold)] - return_single = True - - # Always use batched execution - results = _run_flashattn_mla_batched(configs_with_thresholds) - - # Return single result or list based on input - return results[0] if return_single else results - - -def run_flashmla_benchmark( - config, reorder_batch_threshold: Optional[int] = None -) -> dict: + List of dicts with timing statistics """ - Run FlashMLA benchmark (Hopper SM90+). + if not configs_with_params: + return [] - Args: - config: BenchmarkConfig - reorder_batch_threshold: Reorder batch threshold override + if backend not in _BACKEND_CONFIG: + raise ValueError(f"Unknown backend: {backend}") - Returns: - Dict with timing statistics - """ - device = torch.device(config.device) + backend_cfg = _BACKEND_CONFIG[backend] + device = torch.device(configs_with_params[0][0].device) torch.cuda.set_device(device) - # Create and set vLLM config for MLA + # Determine block size + config_block_size = configs_with_params[0][0].block_size + block_size = backend_cfg["block_size"] or config_block_size + + # Create and set vLLM config for MLA (reused across all benchmarks) vllm_config = create_minimal_vllm_config( model_name="deepseek-v3", - block_size=config.block_size, + block_size=block_size, ) - from vllm.config import set_current_vllm_config - - with set_current_vllm_config(vllm_config): - # Parse batch spec - requests = parse_batch_spec(config.batch_spec) - - # Setup MLA dimensions - mla_dims = setup_mla_dims("deepseek-v3") - scale = 1.0 / np.sqrt( - mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] - ) - q_lens = [r.q_len for r in requests] - kv_lens = [r.kv_len for r in requests] - total_q = sum(q_lens) - max_kv = max(kv_lens) + # Import backend classes dynamically + import importlib - # Build query start locations - q_start_cpu = np.array( - [0] + [sum(q_lens[: 
i + 1]) for i in range(len(q_lens))], dtype=np.int32 - ) - q_start_gpu = torch.from_numpy(q_start_cpu).to(device) - - # Build sequence lengths - seq_lens_cpu = np.array(kv_lens, dtype=np.int32) - seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) - - # Build block table - num_blocks_per_req = [ - (kv + 64 - 1) // 64 for kv in kv_lens - ] # FlashMLA uses block_size=64 - max_num_blocks = max(num_blocks_per_req) - - block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32) - for i, num_blocks in enumerate(num_blocks_per_req): - block_table_cpu[i, :num_blocks] = np.arange(num_blocks, dtype=np.int32) - block_table_gpu = torch.from_numpy(block_table_cpu).to(device) - - # Create FlashMLA metadata (needs tile_scheduler_metadata and num_splits) - from vllm.attention.ops.flashmla import get_mla_metadata - from vllm.v1.attention.backends.mla.flashmla import ( - FlashMLADecodeMetadata, - FlashMLAImpl, - FlashMLAMetadata, - ) - - tile_scheduler_metadata, num_splits = get_mla_metadata( - seq_lens_gpu, - mla_dims["num_q_heads"], - 1, # MQA for decode - ) - - decode_metadata = FlashMLADecodeMetadata( - block_table=block_table_gpu, - seq_lens=seq_lens_gpu, - tile_scheduler_metadata=tile_scheduler_metadata, - num_splits=num_splits, - dcp_tot_seq_lens=None, - ) + from vllm.config import set_current_vllm_config - # Slot mapping - slot_mapping_list = [] - for i, (q_len, kv_len, num_blocks) in enumerate( - zip(q_lens, kv_lens, num_blocks_per_req) - ): - context_len = kv_len - q_len - for j in range(q_len): - token_kv_idx = context_len + j - block_idx = token_kv_idx // 64 - offset_in_block = token_kv_idx % 64 - global_block_id = block_table_cpu[i, block_idx] - slot_id = global_block_id * 64 + offset_in_block - slot_mapping_list.append(slot_id) - - slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device) - - metadata = FlashMLAMetadata( - num_reqs=len(requests), - max_query_len=max(q_lens), - max_seq_len=max_kv, - num_actual_tokens=total_q, 
- query_start_loc=q_start_gpu, - slot_mapping=slot_mapping, - num_decodes=len(requests), - num_decode_tokens=total_q, - num_prefills=0, - head_dim=mla_dims["head_dim"], - decode=decode_metadata, - prefill=None, - ) + backend_module = importlib.import_module(backend_cfg["module"]) + impl_class = getattr(backend_module, backend_cfg["impl_class"]) + metadata_class = getattr(backend_module, backend_cfg["metadata_class"]) + decode_metadata_class = getattr( + backend_module, backend_cfg["decode_metadata_class"] + ) - # Create KV cache - kv_cache = torch.zeros( - max_num_blocks, - 64, # FlashMLA block size - mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], - device=device, - dtype=torch.float16, - ) + # Import builder class if needed (for threshold setting) + builder_class = None + if backend_cfg["builder_class"]: + builder_class = getattr(backend_module, backend_cfg["builder_class"]) - # Create FlashMLA impl - impl = FlashMLAImpl( - num_heads=mla_dims["num_q_heads"], - head_size=mla_dims["head_dim"], - scale=scale, - num_kv_heads=mla_dims["num_kv_heads"], - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype="auto", - logits_soft_cap=None, - attn_type="decoder", - kv_sharing_target_layer_name=None, - q_lora_rank=None, - kv_lora_rank=mla_dims["kv_lora_rank"], - qk_nope_head_dim=mla_dims["qk_nope_head_dim"], - qk_rope_head_dim=mla_dims["qk_rope_head_dim"], - qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], - v_head_dim=mla_dims["v_head_dim"], - kv_b_proj=None, + # Import common metadata for backends that use it + if backend_cfg["metadata_class"] == "MLACommonMetadata": + from vllm.v1.attention.backends.mla.common import ( + MLACommonDecodeMetadata as decode_metadata_class, ) - - # Initialize DCP (distributed context parallelism) attributes for - # standalone benchmarking - if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size is None: - impl.dcp_world_size = 1 - impl.dcp_rank = 0 - - layer = MockLayer(device) - - # Create 
query tensors - q_concat = torch.randn( - total_q, - mla_dims["num_q_heads"], - mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], - device=device, - dtype=torch.float16, + from vllm.v1.attention.backends.mla.common import ( + MLACommonMetadata as metadata_class, ) - # Warmup - for _ in range(config.warmup_iters): - impl._forward_decode(q_concat, kv_cache, metadata, layer) - torch.cuda.synchronize() - - # Benchmark - times = [] - for _ in range(config.repeats): - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - - start.record() - for _ in range(config.num_layers): - impl._forward_decode(q_concat, kv_cache, metadata, layer) - end.record() - - torch.cuda.synchronize() - elapsed_ms = start.elapsed_time(end) - times.append(elapsed_ms / 1000.0 / config.num_layers) - - return { - "mean": np.mean(times), - "std": np.std(times), - "min": np.min(times), - "max": np.max(times), - "throughput": total_q / np.mean(times) if times else 0, - } - - -def _run_flashattn_mla_batched(configs_with_thresholds: list[tuple]) -> list[dict]: - """ - Run multiple FlashAttn MLA benchmarks with shared initialization. - - This is optimized for running many benchmarks with the same backend, - avoiding repeated setup/teardown overhead. 
- - Args: - configs_with_thresholds: List of (config, threshold) tuples to benchmark - - Returns: - List of dicts with timing statistics, one per config - """ - if not configs_with_thresholds: - return [] - - device = torch.device(configs_with_thresholds[0][0].device) - torch.cuda.set_device(device) - - # Create and set vLLM config for MLA (reused across all benchmarks) - vllm_config = create_minimal_vllm_config( - model_name="deepseek-v3", - block_size=configs_with_thresholds[0][0].block_size, - ) - from vllm.config import set_current_vllm_config - from vllm.v1.attention.backends.mla.flashattn_mla import ( - FlashAttnMLADecodeMetadata, - FlashAttnMLAImpl, - FlashAttnMLAMetadata, - FlashAttnMLAMetadataBuilder, - ) - with set_current_vllm_config(vllm_config): # Setup MLA dimensions (reused) mla_dims = setup_mla_dims("deepseek-v3") @@ -722,25 +363,27 @@ def _run_flashattn_mla_batched(configs_with_thresholds: list[tuple]) -> list[dic ) # Create impl once (reused across all benchmarks) - impl = FlashAttnMLAImpl( - num_heads=mla_dims["num_q_heads"], - head_size=mla_dims["head_dim"], - scale=scale, - num_kv_heads=mla_dims["num_kv_heads"], - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype="auto", - logits_soft_cap=None, - attn_type="decoder", - kv_sharing_target_layer_name=None, - q_lora_rank=None, - kv_lora_rank=mla_dims["kv_lora_rank"], - qk_nope_head_dim=mla_dims["qk_nope_head_dim"], - qk_rope_head_dim=mla_dims["qk_rope_head_dim"], - qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], - v_head_dim=mla_dims["v_head_dim"], - kv_b_proj=None, - ) + impl_kwargs = { + "num_heads": mla_dims["num_q_heads"], + "head_size": mla_dims["head_dim"], + "scale": scale, + "num_kv_heads": mla_dims["num_kv_heads"], + "alibi_slopes": None, + "sliding_window": None, + "kv_cache_dtype": "auto", + "logits_soft_cap": None, + "attn_type": "decoder", + "kv_sharing_target_layer_name": None, + "q_lora_rank": None, + "kv_lora_rank": mla_dims["kv_lora_rank"], + 
"qk_nope_head_dim": mla_dims["qk_nope_head_dim"], + "qk_rope_head_dim": mla_dims["qk_rope_head_dim"], + "qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], + "v_head_dim": mla_dims["v_head_dim"], + "kv_b_proj": None, + } + + impl = impl_class(**impl_kwargs) # Initialize DCP attributes if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size is None: @@ -751,11 +394,18 @@ def _run_flashattn_mla_batched(configs_with_thresholds: list[tuple]) -> list[dic results = [] # Run each benchmark with the shared impl - for config, threshold in configs_with_thresholds: - # Set threshold for this benchmark on the builder class - if threshold is not None: - original_threshold = FlashAttnMLAMetadataBuilder.reorder_batch_threshold - FlashAttnMLAMetadataBuilder.reorder_batch_threshold = threshold + for config, threshold, num_splits in configs_with_params: + # Set threshold for this benchmark (FlashAttn/FlashMLA only) + original_threshold = None + if threshold is not None and builder_class: + original_threshold = builder_class.reorder_batch_threshold + builder_class.reorder_batch_threshold = threshold + + # Set num_splits for CUTLASS + original_num_splits = None + if num_splits is not None and hasattr(impl, "_num_kv_splits"): + original_num_splits = impl._num_kv_splits + impl._num_kv_splits = num_splits try: # Parse batch spec @@ -779,7 +429,7 @@ def _run_flashattn_mla_batched(configs_with_thresholds: list[tuple]) -> list[dic # Build block table num_blocks_per_req = [ - (kv + config.block_size - 1) // config.block_size for kv in kv_lens + (kv + block_size - 1) // block_size for kv in kv_lens ] max_num_blocks = max(num_blocks_per_req) @@ -802,28 +452,53 @@ def _run_flashattn_mla_batched(configs_with_thresholds: list[tuple]) -> list[dic context_len = kv_len - q_len for j in range(q_len): token_kv_idx = context_len + j - block_idx = token_kv_idx // config.block_size - offset_in_block = token_kv_idx % config.block_size + block_idx = token_kv_idx // block_size 
+ offset_in_block = token_kv_idx % block_size global_block_id = block_table_cpu[i, block_idx] - slot_id = global_block_id * config.block_size + offset_in_block + slot_id = global_block_id * block_size + offset_in_block slot_mapping_list.append(slot_id) slot_mapping = torch.tensor( slot_mapping_list, dtype=torch.int64, device=device ) - # Create FlashAttn MLA decode metadata - decode_metadata = FlashAttnMLADecodeMetadata( - block_table=block_table_gpu, - seq_lens=seq_lens_gpu, - dcp_tot_seq_lens=None, - query_start_loc=q_start_gpu, - max_query_len=max(q_lens), - max_seq_len=max_kv, - ) + # Create decode metadata + decode_metadata_kwargs = { + "block_table": block_table_gpu, + "seq_lens": seq_lens_gpu, + "dcp_tot_seq_lens": None, + } + + # FlashAttn MLA needs extra fields + if backend == "flash_attn_mla": + decode_metadata_kwargs.update( + { + "query_start_loc": q_start_gpu, + "max_query_len": max(q_lens), + "max_seq_len": max_kv, + } + ) + + # FlashMLA needs tile_scheduler_metadata and num_splits + if backend == "flashmla": + from vllm.attention.ops.flashmla import get_mla_metadata + + tile_scheduler_metadata, num_splits_auto = get_mla_metadata( + seq_lens_gpu, + mla_dims["num_q_heads"], + 1, # MQA for decode + ) + decode_metadata_kwargs.update( + { + "tile_scheduler_metadata": tile_scheduler_metadata, + "num_splits": num_splits_auto, + } + ) + + decode_metadata = decode_metadata_class(**decode_metadata_kwargs) - # Create FlashAttn MLA metadata - metadata = FlashAttnMLAMetadata( + # Create metadata + metadata = metadata_class( num_reqs=len(requests), max_query_len=max(q_lens), max_seq_len=max_kv, @@ -841,24 +516,41 @@ def _run_flashattn_mla_batched(configs_with_thresholds: list[tuple]) -> list[dic # Create KV cache kv_cache = torch.zeros( current_block, - config.block_size, + block_size, mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], device=device, dtype=torch.float16, ) - # Create query tensors - q_concat = torch.randn( - total_q, - 
mla_dims["num_q_heads"], - mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], - device=device, - dtype=torch.float16, - ) + # Create query tensors (format depends on backend) + if backend_cfg["query_format"] == "tuple": + q_nope = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["kv_lora_rank"], + device=device, + dtype=torch.float16, + ) + q_pe = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["qk_rope_head_dim"], + device=device, + dtype=torch.float16, + ) + query = (q_nope, q_pe) + else: # concat + query = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], + device=device, + dtype=torch.float16, + ) # Warmup for _ in range(config.warmup_iters): - impl._forward_decode(q_concat, kv_cache, metadata, layer) + impl._forward_decode(query, kv_cache, metadata, layer) torch.cuda.synchronize() # Benchmark @@ -869,7 +561,7 @@ def _run_flashattn_mla_batched(configs_with_thresholds: list[tuple]) -> list[dic start.record() for _ in range(config.num_layers): - impl._forward_decode(q_concat, kv_cache, metadata, layer) + impl._forward_decode(query, kv_cache, metadata, layer) end.record() torch.cuda.synchronize() @@ -887,10 +579,60 @@ def _run_flashattn_mla_batched(configs_with_thresholds: list[tuple]) -> list[dic ) finally: - # Restore original threshold on the builder class - if threshold is not None: - FlashAttnMLAMetadataBuilder.reorder_batch_threshold = ( - original_threshold - ) + # Restore original threshold + if original_threshold is not None: + builder_class.reorder_batch_threshold = original_threshold + + # Restore original num_splits + if original_num_splits is not None: + impl._num_kv_splits = original_num_splits return results + + +def run_mla_benchmark( + backend: str, + config, + reorder_batch_threshold: Optional[int] = None, + num_kv_splits: Optional[int] = None, +) -> dict: + """ + Unified MLA benchmark runner for all backends. 
+ + Works for: flash_attn_mla, flashmla, flashinfer_mla, cutlass_mla + + Always uses batched execution internally for optimal performance. + + Args: + backend: Backend name (flash_attn_mla, flashmla, flashinfer_mla, cutlass_mla) + config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples + reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA + (single config mode only) + num_kv_splits: Number of KV splits for CUTLASS (single config mode only) + + Returns: + Dict with timing statistics (single mode) or list of dicts (batched mode) + """ + # Normalize to batched mode: (config, threshold, num_splits) + if isinstance(config, list): + # Already in batched format + if len(config) > 0 and isinstance(config[0], tuple): + # Format: [(cfg, param), ...] where param is threshold or num_splits + if backend in ("flash_attn_mla", "flashmla"): + configs_with_params = [(cfg, param, None) for cfg, param in config] + else: # cutlass_mla or flashinfer_mla + configs_with_params = [(cfg, None, param) for cfg, param in config] + else: + # Format: [cfg, ...] 
- just configs + configs_with_params = [(cfg, None, None) for cfg in config] + return_single = False + else: + # Single config: convert to batched format + configs_with_params = [(config, reorder_batch_threshold, num_kv_splits)] + return_single = True + + # Use unified batched execution + results = _run_mla_benchmark_batched(backend, configs_with_params) + + # Return single result or list based on input + return results[0] if return_single else results From 558b049ba885f390e36fbb6bc4cf88f2c8e758e3 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 13 Oct 2025 18:14:04 +0000 Subject: [PATCH 05/45] add batch spec ranges Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/benchmark.py | 91 ++++++++++++++- .../configs/study4_reorder_threshold.yaml | 83 ++++---------- .../attention_benchmarks/test_batch_spec.py | 104 ++++++++++++++++++ 3 files changed, 216 insertions(+), 62 deletions(-) diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index f76a85e4b4cc..787717c8d8e7 100644 --- a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -70,6 +70,80 @@ def load_config_from_yaml(config_path: str) -> dict: return yaml.safe_load(f) +def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]: + """ + Generate batch specs from range specifications. 
+ + Args: + ranges: List of range specifications, each containing: + - template: Batch spec template (e.g., "q{q_len}kv1k") + - q_len: Dict with start, stop, step, end_inclusive (optional) + - Other parameters can also be ranges + + Returns: + List of generated batch spec strings + + Example: + ranges = [ + { + "template": "q{q_len}kv1k", + "q_len": { + "start": 1, + "stop": 16, + "step": 1, + "end_inclusive": true # Optional, defaults to true + } + } + ] + Returns: ["q1kv1k", "q2kv1k", ..., "q16kv1k"] + """ + all_specs = [] + + for range_spec in ranges: + template = range_spec.get("template") + if not template: + raise ValueError("Range specification must include 'template'") + + # Extract all range parameters from the spec + range_params = {} + for key, value in range_spec.items(): + if key == "template": + continue + if isinstance(value, dict) and "start" in value: + # This is a range specification + start = value["start"] + stop = value["stop"] + step = value.get("step", 1) + # Check if end should be inclusive (default: True) + end_inclusive = value.get("end_inclusive", True) + + # Adjust stop based on end_inclusive + if end_inclusive: + range_params[key] = list(range(start, stop + 1, step)) + else: + range_params[key] = list(range(start, stop, step)) + else: + # This is a fixed value + range_params[key] = [value] + + # Generate all combinations (Cartesian product) + if range_params: + import itertools + + param_names = list(range_params.keys()) + param_values = [range_params[name] for name in param_names] + + for values in itertools.product(*param_values): + params = dict(zip(param_names, values)) + spec = template.format(**params) + all_specs.append(spec) + else: + # No parameters, just use template as-is + all_specs.append(template) + + return all_specs + + def main(): parser = argparse.ArgumentParser( description="Universal vLLM attention benchmark", @@ -170,8 +244,23 @@ def main(): args.mode = None # Batch specs and sizes - if "batch_specs" in yaml_config: 
+ # Support both explicit batch_specs and generated batch_spec_ranges + if "batch_spec_ranges" in yaml_config: + # Generate batch specs from ranges + generated_specs = generate_batch_specs_from_ranges( + yaml_config["batch_spec_ranges"] + ) + # Combine with any explicit batch_specs + if "batch_specs" in yaml_config: + args.batch_specs = yaml_config["batch_specs"] + generated_specs + else: + args.batch_specs = generated_specs + console.print( + f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]" + ) + elif "batch_specs" in yaml_config: args.batch_specs = yaml_config["batch_specs"] + if "batch_sizes" in yaml_config: args.batch_sizes = yaml_config["batch_sizes"] else: diff --git a/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml b/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml index 687c2ff6ff1a..b34ce349d147 100644 --- a/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml +++ b/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml @@ -17,68 +17,29 @@ mode: "decode_vs_prefill" # - decode: threshold >= query_length (forces decode pipeline) # - prefill: threshold < query_length (forces prefill pipeline) # -# We use specs1k format which creates q_len=N, kv_len=1024 requests +# We use qkv1k format which creates q_len=N, kv_len=1024 requests # This tests different query lengths with fixed KV cache context -batch_specs: - # Fine-grained: 1-16 (decode range, step 1) - - "q1kv1k" # q_len=1 (regular decode, not spec) - - "q2kv1k" # q_len=2 - - "q3kv1k" # q_len=3 - - "q4kv1k" # q_len=4 - - "q5kv1k" # q_len=5 - - "q6kv1k" # q_len=6 - - "q7kv1k" # q_len=7 - - "q8kv1k" # q_len=8 - - "q9kv1k" # q_len=9 - - "q10kv1k" # q_len=10 - - "q11kv1k" # q_len=11 - - "q12kv1k" # q_len=12 - - "q13kv1k" # q_len=13 - - "q14kv1k" # q_len=14 - - "q15kv1k" # q_len=15 - - "q16kv1k" # q_len=16 - # Transition zone: 17-64 (step 2) - - "q17kv1k" - - "q19kv1k" - - "q21kv1k" - - "q23kv1k" - - "q25kv1k" - - 
"q27kv1k" - - "q29kv1k" - - "q31kv1k" - - "q33kv1k" - - "q35kv1k" - - "q37kv1k" - - "q39kv1k" - - "q41kv1k" - - "q43kv1k" - - "q45kv1k" - - "q47kv1k" - - "q49kv1k" - - "q51kv1k" - - "q53kv1k" - - "q55kv1k" - - "q57kv1k" - - "q59kv1k" - - "q61kv1k" - - "q63kv1k" - # Prefill range: 65-128 (step 4) - - "q65kv1k" - - "q69kv1k" - - "q73kv1k" - - "q77kv1k" - - "q81kv1k" - - "q85kv1k" - - "q89kv1k" - - "q93kv1k" - - "q97kv1k" - - "q101kv1k" - - "q105kv1k" - - "q109kv1k" - - "q113kv1k" - - "q117kv1k" - - "q121kv1k" - - "q125kv1k" +# +# Using batch_spec_ranges for automatic generation: +batch_spec_ranges: + - template: "q{q_len}kv1k" + q_len: + start: 1 + stop: 16 + step: 1 + end_inclusive: false + - template: "q{q_len}kv1k" + q_len: + start: 16 + stop: 64 + step: 2 + end_inclusive: false + - template: "q{q_len}kv1k" + q_len: + start: 64 + stop: 1024 + step: 4 + end_inclusive: true # Batch sizes to test (from old script) batch_sizes: diff --git a/benchmarks/attention_benchmarks/test_batch_spec.py b/benchmarks/attention_benchmarks/test_batch_spec.py index d4cfad48bb83..0bfee494387b 100644 --- a/benchmarks/attention_benchmarks/test_batch_spec.py +++ b/benchmarks/attention_benchmarks/test_batch_spec.py @@ -16,6 +16,7 @@ parse_batch_spec, parse_manual_batch, ) +from benchmark import generate_batch_specs_from_ranges def test_basic_patterns(): @@ -157,6 +158,104 @@ def test_error_handling(): print(" ✓ Invalid kv_len raises ValueError") +def test_range_generation_simple(): + """Test simple range generation.""" + print("\nTesting range generation (simple)...") + + ranges = [{"template": "q{q_len}kv1k", "q_len": {"start": 1, "stop": 5, "step": 1}}] + specs = generate_batch_specs_from_ranges(ranges) + expected = ["q1kv1k", "q2kv1k", "q3kv1k", "q4kv1k", "q5kv1k"] + assert specs == expected, f"Expected {expected}, got {specs}" + print(f" ✓ Simple range: {len(specs)} specs generated") + + +def test_range_generation_multiple(): + """Test multiple range specifications.""" + 
print("\nTesting range generation (multiple ranges)...") + + ranges = [ + {"template": "q{q_len}kv1k", "q_len": {"start": 1, "stop": 3, "step": 1}}, + {"template": "q{q_len}kv1k", "q_len": {"start": 10, "stop": 20, "step": 5}}, + ] + specs = generate_batch_specs_from_ranges(ranges) + expected = ["q1kv1k", "q2kv1k", "q3kv1k", "q10kv1k", "q15kv1k", "q20kv1k"] + assert specs == expected, f"Expected {expected}, got {specs}" + print(f" ✓ Multiple ranges: {len(specs)} specs generated") + + +def test_range_generation_large(): + """Test large range similar to study4 config.""" + print("\nTesting range generation (large range)...") + + ranges = [ + {"template": "q{q_len}kv1k", "q_len": {"start": 1, "stop": 16, "step": 1}}, + {"template": "q{q_len}kv1k", "q_len": {"start": 17, "stop": 64, "step": 2}}, + {"template": "q{q_len}kv1k", "q_len": {"start": 65, "stop": 128, "step": 4}}, + ] + specs = generate_batch_specs_from_ranges(ranges) + expected_count = 16 + 24 + 16 # (1-16) + (17,19,21...63) + (65,69,73...125) + assert len(specs) == expected_count, ( + f"Expected {expected_count} specs, got {len(specs)}" + ) + print(f" ✓ Large range: {len(specs)} specs generated") + + +def test_range_generation_cartesian(): + """Test Cartesian product with multiple parameters.""" + print("\nTesting range generation (Cartesian product)...") + + ranges = [ + { + "template": "q{q_len}kv{kv_len}k", + "q_len": {"start": 1, "stop": 2, "step": 1}, + "kv_len": {"start": 1, "stop": 2, "step": 1}, + } + ] + specs = generate_batch_specs_from_ranges(ranges) + # Should generate Cartesian product: (1,1), (1,2), (2,1), (2,2) + expected = ["q1kv1k", "q1kv2k", "q2kv1k", "q2kv2k"] + assert specs == expected, f"Expected {expected}, got {specs}" + print(f" ✓ Cartesian product: {len(specs)} specs generated") + + +def test_range_generation_end_inclusive(): + """Test end_inclusive parameter.""" + print("\nTesting range generation (end_inclusive)...") + + # Test inclusive (default) + ranges_inclusive = [ + 
{"template": "q{q_len}kv1k", "q_len": {"start": 1, "stop": 3, "step": 1}} + ] + specs = generate_batch_specs_from_ranges(ranges_inclusive) + expected = ["q1kv1k", "q2kv1k", "q3kv1k"] + assert specs == expected, f"Expected {expected}, got {specs}" + print(f" ✓ end_inclusive default (true): {specs}") + + # Test explicit inclusive + ranges_explicit_inclusive = [ + { + "template": "q{q_len}kv1k", + "q_len": {"start": 1, "stop": 5, "step": 1, "end_inclusive": True}, + } + ] + specs = generate_batch_specs_from_ranges(ranges_explicit_inclusive) + expected = ["q1kv1k", "q2kv1k", "q3kv1k", "q4kv1k", "q5kv1k"] + assert specs == expected, f"Expected {expected}, got {specs}" + print(" ✓ end_inclusive=true: includes stop value") + + # Test exclusive + ranges_exclusive = [ + { + "template": "q{q_len}kv1k", + "q_len": {"start": 1, "stop": 5, "step": 1, "end_inclusive": False}, + } + ] + specs = generate_batch_specs_from_ranges(ranges_exclusive) + expected = ["q1kv1k", "q2kv1k", "q3kv1k", "q4kv1k"] + assert specs == expected, f"Expected {expected}, got {specs}" + print(" ✓ end_inclusive=false: excludes stop value") + + def main(): """Run all tests.""" print("=" * 60) @@ -171,6 +270,11 @@ def main(): test_batch_stats() test_manual_batch() test_error_handling() + test_range_generation_simple() + test_range_generation_multiple() + test_range_generation_large() + test_range_generation_cartesian() + test_range_generation_end_inclusive() print("\n" + "=" * 60) print("All tests passed! 
✓") From 1e1b5416aa7e63403ad6675cbd8189e950772fbb Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 13 Oct 2025 18:20:08 +0000 Subject: [PATCH 06/45] rename Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/README.md | 32 +++++++++---------- ..._numsplits.yaml => cutlass_numsplits.yaml} | 4 +-- ...utlass.yaml => flashinfer_vs_cutlass.yaml} | 4 +-- ...head_count.yaml => hopper_head_count.yaml} | 8 ++--- ..._threshold.yaml => reorder_threshold.yaml} | 6 ++-- 5 files changed, 27 insertions(+), 27 deletions(-) rename benchmarks/attention_benchmarks/configs/{study1_cutlass_numsplits.yaml => cutlass_numsplits.yaml} (93%) rename benchmarks/attention_benchmarks/configs/{study3_flashinfer_vs_cutlass.yaml => flashinfer_vs_cutlass.yaml} (94%) rename benchmarks/attention_benchmarks/configs/{study2_hopper_head_count.yaml => hopper_head_count.yaml} (87%) rename benchmarks/attention_benchmarks/configs/{study4_reorder_threshold.yaml => reorder_threshold.yaml} (93%) diff --git a/benchmarks/attention_benchmarks/README.md b/benchmarks/attention_benchmarks/README.md index 51a0167b6df3..7086cc3d575a 100644 --- a/benchmarks/attention_benchmarks/README.md +++ b/benchmarks/attention_benchmarks/README.md @@ -12,10 +12,10 @@ python test_batch_spec.py # ✓ All tests pass # Run one of the 4 research studies -python benchmark.py --config configs/study1_cutlass_numsplits.yaml -python benchmark.py --config configs/study2_hopper_head_count.yaml -python benchmark.py --config configs/study3_flashinfer_vs_cutlass.yaml -python benchmark.py --config configs/study4_reorder_threshold.yaml +python benchmark.py --config configs/cutlass_numsplits.yaml +python benchmark.py --config configs/hopper_head_count.yaml +python benchmark.py --config configs/flashinfer_vs_cutlass.yaml +python benchmark.py --config configs/reorder_threshold.yaml # Or run custom benchmarks python benchmark.py \ @@ -61,7 +61,7 @@ The suite includes 4 pre-configured studies to answer key MLA optimization quest 
**Question:** Should we revert the CUTLASS MLA num-splits heuristic (PRs #24966, #25509)? ```bash -python benchmark.py --config configs/study1_cutlass_numsplits.yaml +python benchmark.py --config configs/cutlass_numsplits.yaml ``` Tests CUTLASS MLA with different `num_kv_splits` values (1, 2, 4, 8, 16, 32) across various batch sizes and compares against auto-selection. @@ -72,13 +72,13 @@ Tests CUTLASS MLA with different `num_kv_splits` values (1, 2, 4, 8, 16, 32) acr ```bash # Test with default head count (128) -python benchmark.py --config configs/study2_hopper_head_count.yaml +python benchmark.py --config configs/hopper_head_count.yaml # Test with different head counts for heads in 16 32 64 128 256; do - python benchmark.py --config configs/study2_hopper_head_count.yaml \ + python benchmark.py --config configs/hopper_head_count.yaml \ --num-q-heads $heads \ - --output-csv study2_heads_${heads}.csv + --output-csv hopper_heads_${heads}.csv done ``` @@ -89,7 +89,7 @@ Compares FlashAttn MLA and FlashMLA performance with varying attention head coun **Question:** Is FlashInfer-MLA better than CUTLASS MLA after num-splits optimization? ```bash -python benchmark.py --config configs/study3_flashinfer_vs_cutlass.yaml +python benchmark.py --config configs/flashinfer_vs_cutlass.yaml ``` Compares FlashInfer-MLA against CUTLASS MLA with optimized `num_kv_splits` values. @@ -99,15 +99,15 @@ Compares FlashInfer-MLA against CUTLASS MLA with optimized `num_kv_splits` value **Question:** At what query length does the prefill pipeline become faster than the decode pipeline? 
**Methodology:** Reproduces the original `benchmark_mla_threshold.py` study using the new interface: -- For each query length (1-125), test BOTH decode and prefill pipelines +- For each query length (1-2048), test BOTH decode and prefill pipelines - Find the crossover point where prefill becomes faster - Analyze how this varies across batch sizes (1-256) ```bash -python benchmark.py --config configs/study4_reorder_threshold.yaml +python benchmark.py --config configs/reorder_threshold.yaml ``` -Tests query lengths from 1-125 (fine-grained 1-16, step 2 for 17-64, step 4 for 65-125) across 9 batch sizes. For each query length, compares: +Tests query lengths from 1-2048 (fine-grained steps at low values, coarser at high values) across 9 batch sizes. For each query length, compares: - **Decode pipeline**: `threshold >= query_length` - **Prefill pipeline**: `threshold < query_length` @@ -285,10 +285,10 @@ attention_benchmarks/ ├── benchmark.py # Universal benchmark script │ └── configs/ # Pre-configured studies - ├── study1_cutlass_numsplits.yaml # CUTLASS num-splits optimization - ├── study2_hopper_head_count.yaml # FlashAttn vs FlashMLA head count - ├── study3_flashinfer_vs_cutlass.yaml # FlashInfer vs optimized CUTLASS - └── study4_reorder_threshold.yaml # Reorder threshold optimization + ├── cutlass_numsplits.yaml # CUTLASS num-splits optimization + ├── hopper_head_count.yaml # FlashAttn vs FlashMLA head count + ├── flashinfer_vs_cutlass.yaml # FlashInfer vs optimized CUTLASS + └── reorder_threshold.yaml # Reorder threshold optimization ``` ## Tips diff --git a/benchmarks/attention_benchmarks/configs/study1_cutlass_numsplits.yaml b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml similarity index 93% rename from benchmarks/attention_benchmarks/configs/study1_cutlass_numsplits.yaml rename to benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml index 49ebbd247b21..430b42a620c4 100644 --- 
a/benchmarks/attention_benchmarks/configs/study1_cutlass_numsplits.yaml +++ b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml @@ -45,8 +45,8 @@ benchmark: # Output output: - csv: "study1_cutlass_numsplits_results.csv" - json: "study1_cutlass_numsplits_results.json" + csv: "cutlass_numsplits_results.csv" + json: "cutlass_numsplits_results.json" # Expected outcome: # - Identify if auto-selection heuristic is optimal diff --git a/benchmarks/attention_benchmarks/configs/study3_flashinfer_vs_cutlass.yaml b/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml similarity index 94% rename from benchmarks/attention_benchmarks/configs/study3_flashinfer_vs_cutlass.yaml rename to benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml index 3eb799961c12..abd856547705 100644 --- a/benchmarks/attention_benchmarks/configs/study3_flashinfer_vs_cutlass.yaml +++ b/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml @@ -45,8 +45,8 @@ benchmark: # Output output: - csv: "study3_flashinfer_vs_cutlass_results.csv" - json: "study3_flashinfer_vs_cutlass_results.json" + csv: "flashinfer_vs_cutlass_results.csv" + json: "flashinfer_vs_cutlass_results.json" # Expected outcome: # - Determine if FlashInfer-MLA is competitive with optimized CUTLASS diff --git a/benchmarks/attention_benchmarks/configs/study2_hopper_head_count.yaml b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml similarity index 87% rename from benchmarks/attention_benchmarks/configs/study2_hopper_head_count.yaml rename to benchmarks/attention_benchmarks/configs/hopper_head_count.yaml index fca34736ad96..ef68249cd8b0 100644 --- a/benchmarks/attention_benchmarks/configs/study2_hopper_head_count.yaml +++ b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml @@ -36,14 +36,14 @@ benchmark: # Output output: - csv: "study2_hopper_head_count_results.csv" - json: "study2_hopper_head_count_results.json" + csv: "hopper_head_count_results.csv" + json: 
"hopper_head_count_results.json" # To test different head counts, run: # for heads in 16 32 64 128 256; do -# python benchmark.py --config configs/study2_hopper_head_count.yaml \ +# python benchmark.py --config configs/hopper_head_count.yaml \ # --num-q-heads $heads \ -# --output-csv study2_heads_${heads}.csv +# --output-csv hopper_heads_${heads}.csv # done # Expected outcome: diff --git a/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml similarity index 93% rename from benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml rename to benchmarks/attention_benchmarks/configs/reorder_threshold.yaml index b34ce349d147..a82a97f0b239 100644 --- a/benchmarks/attention_benchmarks/configs/study4_reorder_threshold.yaml +++ b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml @@ -5,7 +5,7 @@ description: "Decode vs Prefill pipeline crossover analysis" -# Test FlashAttn MLA (recommended - FlashMLA has known issues with speculative decode) +# Test FlashAttn MLA backend: flash_attn_mla # Mode: decode_vs_prefill comparison (special sweep mode) @@ -70,8 +70,8 @@ benchmark: # Output output: - csv: "study4_reorder_threshold_results.csv" - json: "study4_reorder_threshold_results.json" + csv: "reorder_threshold_results.csv" + json: "reorder_threshold_results.json" # Expected outcome (reproduces old benchmark_mla_threshold.py study): # - For each batch size, find the crossover point where prefill becomes faster than decode From 7e3fad2df319eea179c3df2c36ee6c7cf9900190 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 13 Oct 2025 18:31:48 +0000 Subject: [PATCH 07/45] disambiguate grammar Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/README.md | 46 +++++++++---------- benchmarks/attention_benchmarks/batch_spec.py | 26 +++++------ .../configs/cutlass_numsplits.yaml | 12 ++--- .../configs/flashinfer_vs_cutlass.yaml | 14 +++--- 
.../configs/hopper_head_count.yaml | 10 ++-- .../configs/mla_decode.yaml | 32 ++++++------- .../configs/mla_mixed_batch.yaml | 32 ++++++------- .../configs/reorder_threshold.yaml | 10 ++-- .../configs/speculative_decode.yaml | 32 ++++++------- .../configs/standard_attention.yaml | 14 +++--- .../attention_benchmarks/test_batch_spec.py | 34 +++++++------- 11 files changed, 131 insertions(+), 131 deletions(-) diff --git a/benchmarks/attention_benchmarks/README.md b/benchmarks/attention_benchmarks/README.md index 7086cc3d575a..8c2b89f305c0 100644 --- a/benchmarks/attention_benchmarks/README.md +++ b/benchmarks/attention_benchmarks/README.md @@ -20,34 +20,34 @@ python benchmark.py --config configs/reorder_threshold.yaml # Or run custom benchmarks python benchmark.py \ --backends flash flashinfer \ - --batch-specs "q2k" "8q1kv1k" "2q2k_32q1kv1k" \ + --batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \ --output-csv results.csv ``` ## Simplified Batch Specification Grammar -Express workloads concisely using query length and KV cache size: +Express workloads concisely using query length and sequence length: ```python -"q2k" # 2048-token prefill (q_len=2048, kv_len=2048) -"q1kv1k" # Decode: 1 token with 1K KV cache -"8q1kv1k" # 8 decode requests -"q4kv1k" # 4-token extend (e.g., spec decode) -"2q2k_32q1kv1k" # Mixed: 2 prefills + 32 decodes -"16q4kv1k" # 16 spec decode (4 tokens each) +"q2k" # 2048-token prefill (q_len=2048, seq_len=2048) +"q1s1k" # Decode: 1 token with 1K sequence +"8q1s1k" # 8 decode requests +"q4s1k" # 4-token extend (e.g., spec decode) +"2q2k_32q1s1k" # Mixed: 2 prefills + 32 decodes +"16q4s1k" # 16 spec decode (4 tokens each) ``` ### Grammar Rule ``` -Format: (?) q(k?) (kv(k?))? +Format: (?) q(k?) (s(k?))? 
-- count: Number of identical requests (optional, default=1) -- q_len: Query length (number of new tokens) -- kv_len: Total KV cache length (optional, defaults to q_len for prefill) -- 'k': Multiplies value by 1024 +- count: Number of identical requests (optional, default=1) +- q_len: Query length (number of new tokens) +- seq_len: Total sequence length (optional, defaults to q_len for prefill) +- 'k': Multiplies value by 1024 -Mixed batches: Use _ to combine (e.g., "2q2k_32q1kv1k") +Mixed batches: Use _ to combine (e.g., "2q2k_32q1s1k") ``` **Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed! @@ -124,7 +124,7 @@ The `benchmark.py` script handles **all** backends - both standard attention and ```bash python benchmark.py \ --backends flash triton flashinfer \ - --batch-specs "q2k" "8q1kv1k" "2q2k_32q1kv1k" \ + --batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \ --num-layers 10 \ --repeats 5 \ --output-csv results.csv @@ -136,7 +136,7 @@ python benchmark.py \ # Compare all MLA backends python benchmark.py \ --backends cutlass_mla flashinfer_mla flash_attn_mla flashmla \ - --batch-specs "64q1kv1k" "64q1kv4k" \ + --batch-specs "64q1s1k" "64q1s4k" \ --output-csv mla_results.csv ``` @@ -147,7 +147,7 @@ python benchmark.py \ ```bash python benchmark.py \ --backend cutlass_mla \ - --batch-specs "64q1kv1k" "64q1kv4k" "64q1kv16k" \ + --batch-specs "64q1s1k" "64q1s4k" "64q1s16k" \ --num-splits 1 2 4 8 16 \ --compare-auto \ --output-json optimal_splits.json @@ -160,7 +160,7 @@ python benchmark.py \ ```bash python benchmark.py \ --backend flashmla \ - --batch-specs "q4kv1k" "q8kv2k" \ + --batch-specs "q4s1k" "q8s2k" \ --thresholds 1 4 16 64 256 512 \ --output-csv threshold_sweep.csv ``` @@ -173,7 +173,7 @@ python benchmark.py \ --backends BACKEND [BACKEND ...] 
# flash, triton, flashinfer, cutlass_mla, # flashinfer_mla, flash_attn_mla, flashmla --backend BACKEND # Single backend (alternative to --backends) ---batch-specs SPEC [SPEC ...] # Batch specifications (default: ["q2k", "8q1kv1k"]) +--batch-specs SPEC [SPEC ...] # Batch specifications (default: ["q2k", "8q1s1k"]) # Model configuration --num-layers N # Number of layers (default: 10) @@ -223,7 +223,7 @@ from common import BenchmarkConfig config = BenchmarkConfig( backend="cutlass_mla", - batch_spec="64q1kv4k", + batch_spec="64q1s4k", num_layers=10, head_dim=576, num_q_heads=128, @@ -255,9 +255,9 @@ from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter # Parse batch specs -requests = parse_batch_spec("2q2k_q4kv1k_32s1k") +requests = parse_batch_spec("2q2k_q4s1k_32s1k") print(format_batch_spec(requests)) -# "2 prefill (2x2k), 1 specdecode (1xq4s1k), 32 decode (32x1k)" +# "2 prefill (2x2k), 1 extend (1xq4s1k), 32 decode (32x1k)" # Get batch statistics stats = get_batch_stats(requests) @@ -318,7 +318,7 @@ source /path/to/vllm/.venv/bin/activate **OOM?** - Reduce batch size: `"32s1k"` → `"16s1k"` -- Reduce sequence length: `"64q1kv16k"` → `"64q1kv4k"` +- Reduce sequence length: `"64q1s16k"` → `"64q1s4k"` ## What's Included diff --git a/benchmarks/attention_benchmarks/batch_spec.py b/benchmarks/attention_benchmarks/batch_spec.py index 11eab551edd8..eaa82ad9dddb 100644 --- a/benchmarks/attention_benchmarks/batch_spec.py +++ b/benchmarks/attention_benchmarks/batch_spec.py @@ -5,25 +5,25 @@ Simplified batch specification grammar for attention benchmarks. Grammar (underscore-separated segments): - Format: (?) q(k?) (kv(k?))? + Format: (?) q(k?) (s(k?))? 
- count: Number of identical requests (optional, default=1) - q_len: Query length (number of new tokens) - - kv_len: Total KV cache length (optional, defaults to q_len for prefill) + - seq_len: Total sequence length (optional, defaults to q_len for prefill) - 'k' suffix: Multiplies value by 1024 Common patterns: - - Prefill: q_len == kv_len (e.g., "q2k" → 2048 new tokens, 2048 KV) - - Decode: q_len == 1 (e.g., "q1kv1k" → 1 token, 1024 KV cache) - - Extend: q_len < kv_len (e.g., "q4kv1k" → 4 tokens, 1024 KV cache) + - Prefill: q_len == seq_len (e.g., "q2k" → 2048 new tokens, 2048 seq) + - Decode: q_len == 1 (e.g., "q1s1k" → 1 token, 1024 seq length) + - Extend: q_len < seq_len (e.g., "q4s1k" → 4 tokens, 1024 seq length) Examples: q2k -> [(2048, 2048)] # Prefill: 2048 tokens - q1kv1k -> [(1, 1024)] # Decode: 1 token, 1K KV cache - 8q1kv1k -> [(1, 1024)] * 8 # 8 decode requests - q4kv1k -> [(4, 1024)] # 4-token extend (spec decode) - 2q1k_32q1kv1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch - 16q4kv1k -> [(4, 1024)] * 16 # 16 spec decode requests + q1s1k -> [(1, 1024)] # Decode: 1 token, 1K sequence + 8q1s1k -> [(1, 1024)] * 8 # 8 decode requests + q4s1k -> [(4, 1024)] # 4-token extend (spec decode) + 2q1k_32q1s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch + 16q4s1k -> [(4, 1024)] * 16 # 16 spec decode requests """ from collections import Counter @@ -100,7 +100,7 @@ def parse_batch_spec(spec: str) -> list[BatchRequest]: """ Parse batch specification string into list of BatchRequest objects. - Grammar: (?) q(k?) (kv(k?))? + Grammar: (?) q(k?) (s(k?))? Args: spec: Batch specification string (see module docstring for grammar) @@ -114,8 +114,8 @@ def parse_batch_spec(spec: str) -> list[BatchRequest]: requests = [] for seg in spec.split("_"): - # Unified pattern: (?) q(k?) (kv(k?))? - m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:kv(\d+)(k?))?$", seg) + # Unified pattern: (?) q(k?) (s(k?))? 
+ m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg) if m: cnt = int(m.group(1)) if m.group(1) else 1 q_len = _parse_size(m.group(2), m.group(3)) diff --git a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml index 430b42a620c4..1b5305c0c866 100644 --- a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml +++ b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml @@ -9,12 +9,12 @@ backend: cutlass_mla # Test various decode batch sizes with different KV cache lengths batch_specs: - - "32q1kv1k" # 32 decode requests, 1k KV cache - - "64q1kv1k" # 64 decode requests, 1k KV cache - - "64q1kv4k" # 64 decode requests, 4k KV cache - - "64q1kv16k" # 64 decode requests, 16k KV cache - - "128q1kv1k" # 128 decode requests, 1k KV cache - - "128q1kv4k" # 128 decode requests, 4k KV cache + - "32q1s1k" # 32 decode requests, 1k KV cache + - "64q1s1k" # 64 decode requests, 1k KV cache + - "64q1s4k" # 64 decode requests, 4k KV cache + - "64q1s16k" # 64 decode requests, 16k KV cache + - "128q1s1k" # 128 decode requests, 1k KV cache + - "128q1s4k" # 128 decode requests, 4k KV cache # Sweep num_kv_splits values num_splits: diff --git a/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml b/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml index abd856547705..1b4a0904f8b4 100644 --- a/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml +++ b/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml @@ -10,13 +10,13 @@ backends: # Test various decode workloads batch_specs: - - "32q1kv1k" # 32 decode requests, 1k KV cache - - "64q1kv1k" # 64 decode requests, 1k KV cache - - "64q1kv4k" # 64 decode requests, 4k KV cache - - "64q1kv16k" # 64 decode requests, 16k KV cache - - "128q1kv1k" # 128 decode requests, 1k KV cache - - "128q1kv4k" # 128 decode requests, 4k KV cache - - "128q1kv16k" # 128 decode requests, 16k KV cache + - 
"32q1s1k" # 32 decode requests, 1k KV cache + - "64q1s1k" # 64 decode requests, 1k KV cache + - "64q1s4k" # 64 decode requests, 4k KV cache + - "64q1s16k" # 64 decode requests, 16k KV cache + - "128q1s1k" # 128 decode requests, 1k KV cache + - "128q1s4k" # 128 decode requests, 4k KV cache + - "128q1s16k" # 128 decode requests, 16k KV cache # For CUTLASS, test optimized num_kv_splits # Based on Study 1 results, you may want to adjust these values diff --git a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml index ef68249cd8b0..c518ed55d32a 100644 --- a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml +++ b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml @@ -11,11 +11,11 @@ backends: # Standard decode workloads batch_specs: - - "32q1kv1k" # 32 decode requests, 1k KV cache - - "64q1kv1k" # 64 decode requests, 1k KV cache - - "64q1kv4k" # 64 decode requests, 4k KV cache - - "128q1kv1k" # 128 decode requests, 1k KV cache - - "128q1kv4k" # 128 decode requests, 4k KV cache + - "32q1s1k" # 32 decode requests, 1k KV cache + - "64q1s1k" # 64 decode requests, 1k KV cache + - "64q1s4k" # 64 decode requests, 4k KV cache + - "128q1s1k" # 128 decode requests, 1k KV cache + - "128q1s4k" # 128 decode requests, 4k KV cache # Model configuration - will test different head counts # Note: You'll need to run this multiple times with different num_q_heads values diff --git a/benchmarks/attention_benchmarks/configs/mla_decode.yaml b/benchmarks/attention_benchmarks/configs/mla_decode.yaml index 2b17d40cb2c6..d46676afcb6e 100644 --- a/benchmarks/attention_benchmarks/configs/mla_decode.yaml +++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml @@ -14,30 +14,30 @@ model: batch_specs: # Small batches, varying sequence lengths - - "16q1kv512" # 16 requests, 512 KV cache - - "16q1kv1k" # 16 requests, 1k KV cache - - "16q1kv2k" # 16 requests, 2k KV cache - - "16q1kv4k" # 16 
requests, 4k KV cache + - "16q1s512" # 16 requests, 512 KV cache + - "16q1s1k" # 16 requests, 1k KV cache + - "16q1s2k" # 16 requests, 2k KV cache + - "16q1s4k" # 16 requests, 4k KV cache # Medium batches - - "32q1kv1k" # 32 requests, 1k KV cache - - "32q1kv2k" # 32 requests, 2k KV cache - - "32q1kv4k" # 32 requests, 4k KV cache - - "32q1kv8k" # 32 requests, 8k KV cache + - "32q1s1k" # 32 requests, 1k KV cache + - "32q1s2k" # 32 requests, 2k KV cache + - "32q1s4k" # 32 requests, 4k KV cache + - "32q1s8k" # 32 requests, 8k KV cache # Large batches - - "64q1kv1k" # 64 requests, 1k KV cache - - "64q1kv2k" # 64 requests, 2k KV cache - - "64q1kv4k" # 64 requests, 4k KV cache - - "64q1kv8k" # 64 requests, 8k KV cache + - "64q1s1k" # 64 requests, 1k KV cache + - "64q1s2k" # 64 requests, 2k KV cache + - "64q1s4k" # 64 requests, 4k KV cache + - "64q1s8k" # 64 requests, 8k KV cache # Very large batches - - "128q1kv1k" # 128 requests, 1k KV cache - - "128q1kv2k" # 128 requests, 2k KV cache + - "128q1s1k" # 128 requests, 1k KV cache + - "128q1s2k" # 128 requests, 2k KV cache # Long context - - "32q1kv16k" # 32 requests, 16k KV cache - - "32q1kv32k" # 32 requests, 32k KV cache + - "32q1s16k" # 32 requests, 16k KV cache + - "32q1s32k" # 32 requests, 32k KV cache backends: - cutlass_mla diff --git a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml index c503e44334b9..ce9e05fd21da 100644 --- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml +++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml @@ -15,34 +15,34 @@ model: batch_specs: # Small prefill + decode - - "1q1k_8q1kv1k" # 1 prefill + 8 decode - - "2q2k_16q1kv1k" # 2 prefill + 16 decode - - "4q1k_32q1kv2k" # 4 prefill + 32 decode + - "1q1k_8q1s1k" # 1 prefill + 8 decode + - "2q2k_16q1s1k" # 2 prefill + 16 decode + - "4q1k_32q1s2k" # 4 prefill + 32 decode # Medium prefill + decode - - "2q4k_32q1kv2k" # 2 medium prefill + 32 
decode - - "4q4k_64q1kv2k" # 4 medium prefill + 64 decode - - "8q2k_64q1kv4k" # 8 prefill + 64 decode + - "2q4k_32q1s2k" # 2 medium prefill + 32 decode + - "4q4k_64q1s2k" # 4 medium prefill + 64 decode + - "8q2k_64q1s4k" # 8 prefill + 64 decode # Large prefill + decode (chunked prefill stress test) - - "2q8k_32q1kv1k" # 2 large prefill + 32 decode - - "1q16k_16q1kv2k" # 1 very large prefill + 16 decode - - "2q16k_32q1kv4k" # 2 very large prefill + 32 decode + - "2q8k_32q1s1k" # 2 large prefill + 32 decode + - "1q16k_16q1s2k" # 1 very large prefill + 16 decode + - "2q16k_32q1s4k" # 2 very large prefill + 32 decode # Context extension + decode - - "2q1kkv2k_16q1kv1k" # 2 extend + 16 decode - - "4q2kkv4k_32q1kv2k" # 4 extend + 32 decode - - "2q1kkv8k_32q1kv2k" # 2 large extend + 32 decode + - "2q1ks2k_16q1s1k" # 2 extend + 16 decode + - "4q2ks4k_32q1s2k" # 4 extend + 32 decode + - "2q1ks8k_32q1s2k" # 2 large extend + 32 decode # Explicitly chunked prefill - "q8k" # 8k prefill with chunking hint - "q16k" # 16k prefill with chunking hint - - "2q8k_32q1kv2k" # 2 chunked prefill + 32 decode + - "2q8k_32q1s2k" # 2 chunked prefill + 32 decode # High decode ratio (realistic serving) - - "1q2k_63q1kv1k" # 1 prefill + 63 decode - - "2q2k_62q1kv2k" # 2 prefill + 62 decode - - "4q4k_60q1kv4k" # 4 prefill + 60 decode + - "1q2k_63q1s1k" # 1 prefill + 63 decode + - "2q2k_62q1s2k" # 2 prefill + 62 decode + - "4q4k_60q1s4k" # 4 prefill + 60 decode backends: - cutlass_mla diff --git a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml index a82a97f0b239..0356d1094001 100644 --- a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml +++ b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml @@ -17,24 +17,24 @@ mode: "decode_vs_prefill" # - decode: threshold >= query_length (forces decode pipeline) # - prefill: threshold < query_length (forces prefill pipeline) # -# We use qkv1k format
which creates q_len=N, kv_len=1024 requests -# This tests different query lengths with fixed KV cache context +# We use qs1k format which creates q_len=N, seq_len=1024 requests +# This tests different query lengths with fixed sequence length context # # Using batch_spec_ranges for automatic generation: batch_spec_ranges: - - template: "q{q_len}kv1k" + - template: "q{q_len}s1k" q_len: start: 1 stop: 16 step: 1 end_inclusive: false - - template: "q{q_len}kv1k" + - template: "q{q_len}s1k" q_len: start: 16 stop: 64 step: 2 end_inclusive: false - - template: "q{q_len}kv1k" + - template: "q{q_len}s1k" q_len: start: 64 stop: 1024 diff --git a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml index 2982cdaa665c..7a80cdee8066 100644 --- a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml +++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml @@ -14,30 +14,30 @@ model: batch_specs: # Pure speculative decode (K-token verification) - - "q2kv1k" # 2-token spec, 1k KV - - "q4kv1k" # 4-token spec, 1k KV - - "q8kv1k" # 8-token spec, 1k KV - - "q16kv1k" # 16-token spec, 1k KV + - "q2s1k" # 2-token spec, 1k KV + - "q4s1k" # 4-token spec, 1k KV + - "q8s1k" # 8-token spec, 1k KV + - "q16s1k" # 16-token spec, 1k KV # Speculative with different context lengths - - "q4kv2k" # 4-token spec, 2k KV - - "q4kv4k" # 4-token spec, 4k KV - - "q8kv2k" # 8-token spec, 2k KV - - "q8kv4k" # 8-token spec, 4k KV + - "q4s2k" # 4-token spec, 2k KV + - "q4s4k" # 4-token spec, 4k KV + - "q8s2k" # 8-token spec, 2k KV + - "q8s4k" # 8-token spec, 4k KV # Mixed: speculative + regular decode - - "32q4kv1k" # 32 spec requests - - "16q4kv1k_16q1kv1k" # 16 spec + 16 regular - - "8q8kv2k_24q1kv2k" # 8 spec (8-tok) + 24 regular + - "32q4s1k" # 32 spec requests + - "16q4s1k_16q1s1k" # 16 spec + 16 regular + - "8q8s2k_24q1s2k" # 8 spec (8-tok) + 24 regular # Mixed: speculative + prefill + decode - - 
"2q1k_16q4kv1k_16q1kv1k" # 2 prefill + 16 spec + 16 decode - - "4q2k_32q4kv2k_32q1kv2k" # 4 prefill + 32 spec + 32 decode + - "2q1k_16q4s1k_16q1s1k" # 2 prefill + 16 spec + 16 decode + - "4q2k_32q4s2k_32q1s2k" # 4 prefill + 32 spec + 32 decode # Large batches with speculation - - "64q4kv1k" # 64 spec requests - - "32q8kv2k" # 32 spec (8-token) - - "16q16kv4k" # 16 spec (16-token) + - "64q4s1k" # 64 spec requests + - "32q8s2k" # 32 spec (8-token) + - "16q16s4k" # 16 spec (16-token) # Backends that support query length > 1 backends: diff --git a/benchmarks/attention_benchmarks/configs/standard_attention.yaml b/benchmarks/attention_benchmarks/configs/standard_attention.yaml index 622223ecd151..5376b62f23f5 100644 --- a/benchmarks/attention_benchmarks/configs/standard_attention.yaml +++ b/benchmarks/attention_benchmarks/configs/standard_attention.yaml @@ -15,15 +15,15 @@ batch_specs: - "q8k" # Very large prefill (8192 tokens) # Pure decode - - "8q1kv1k" # 8 requests, 1k KV cache each - - "16q1kv2k" # 16 requests, 2k KV cache each - - "32q1kv1k" # 32 requests, 1k KV cache each - - "64q1kv4k" # 64 requests, 4k KV cache each + - "8q1s1k" # 8 requests, 1k KV cache each + - "16q1s2k" # 16 requests, 2k KV cache each + - "32q1s1k" # 32 requests, 1k KV cache each + - "64q1s4k" # 64 requests, 4k KV cache each # Mixed prefill/decode - - "2q2k_8q1kv1k" # 2 prefill + 8 decode - - "4q1k_16q1kv2k" # 4 prefill + 16 decode - - "2q4k_32q1kv1k" # 2 large prefill + 32 decode + - "2q2k_8q1s1k" # 2 prefill + 8 decode + - "4q1k_16q1s2k" # 4 prefill + 16 decode + - "2q4k_32q1s1k" # 2 large prefill + 32 decode # Context extension - "q1kkv2k" # 1k query, 2k KV (chunked prefill) diff --git a/benchmarks/attention_benchmarks/test_batch_spec.py b/benchmarks/attention_benchmarks/test_batch_spec.py index 0bfee494387b..c6db153c9102 100644 --- a/benchmarks/attention_benchmarks/test_batch_spec.py +++ b/benchmarks/attention_benchmarks/test_batch_spec.py @@ -39,7 +39,7 @@ def test_basic_patterns(): 
 print(" ✓ 8s1k -> 8 x [(1, 1024)]") # Context extension - result = parse_batch_spec("q1ks2k") + result = parse_batch_spec("q1ks2k") assert len(result) == 1 assert result[0].q_len == 1024 assert result[0].kv_len == 2048 @@ -162,9 +162,9 @@ def test_range_generation_simple(): """Test simple range generation.""" print("\nTesting range generation (simple)...") - ranges = [{"template": "q{q_len}kv1k", "q_len": {"start": 1, "stop": 5, "step": 1}}] + ranges = [{"template": "q{q_len}s1k", "q_len": {"start": 1, "stop": 5, "step": 1}}] specs = generate_batch_specs_from_ranges(ranges) - expected = ["q1kv1k", "q2kv1k", "q3kv1k", "q4kv1k", "q5kv1k"] + expected = ["q1s1k", "q2s1k", "q3s1k", "q4s1k", "q5s1k"] assert specs == expected, f"Expected {expected}, got {specs}" print(f" ✓ Simple range: {len(specs)} specs generated") @@ -174,11 +174,11 @@ def test_range_generation_multiple(): print("\nTesting range generation (multiple ranges)...") ranges = [ - {"template": "q{q_len}kv1k", "q_len": {"start": 1, "stop": 3, "step": 1}}, - {"template": "q{q_len}kv1k", "q_len": {"start": 10, "stop": 20, "step": 5}}, + {"template": "q{q_len}s1k", "q_len": {"start": 1, "stop": 3, "step": 1}}, + {"template": "q{q_len}s1k", "q_len": {"start": 10, "stop": 20, "step": 5}}, ] specs = generate_batch_specs_from_ranges(ranges) - expected = ["q1kv1k", "q2kv1k", "q3kv1k", "q10kv1k", "q15kv1k", "q20kv1k"] + expected = ["q1s1k", "q2s1k", "q3s1k", "q10s1k", "q15s1k", "q20s1k"] assert specs == expected, f"Expected {expected}, got {specs}" print(f" ✓ Multiple ranges: {len(specs)} specs generated") @@ -188,9 +188,9 @@ def test_range_generation_large(): print("\nTesting range generation (large range)...") ranges = [ - {"template": "q{q_len}kv1k", "q_len": {"start": 1, "stop": 16, "step": 1}}, - {"template": "q{q_len}kv1k", "q_len": {"start": 17, "stop": 64, "step": 2}}, - {"template": "q{q_len}kv1k", "q_len": {"start": 65, "stop": 128, "step": 4}}, + {"template": "q{q_len}s1k", "q_len": {"start": 1, "stop": 16,
"step": 1}}, + {"template": "q{q_len}s1k", "q_len": {"start": 17, "stop": 64, "step": 2}}, + {"template": "q{q_len}s1k", "q_len": {"start": 65, "stop": 128, "step": 4}}, ] specs = generate_batch_specs_from_ranges(ranges) expected_count = 16 + 24 + 16 # (1-16) + (17,19,21...63) + (65,69,73...125) @@ -206,14 +206,14 @@ def test_range_generation_cartesian(): ranges = [ { - "template": "q{q_len}kv{kv_len}k", + "template": "q{q_len}s{kv_len}k", "q_len": {"start": 1, "stop": 2, "step": 1}, "kv_len": {"start": 1, "stop": 2, "step": 1}, } ] specs = generate_batch_specs_from_ranges(ranges) # Should generate Cartesian product: (1,1), (1,2), (2,1), (2,2) - expected = ["q1kv1k", "q1kv2k", "q2kv1k", "q2kv2k"] + expected = ["q1s1k", "q1s2k", "q2s1k", "q2s2k"] assert specs == expected, f"Expected {expected}, got {specs}" print(f" ✓ Cartesian product: {len(specs)} specs generated") @@ -224,34 +224,34 @@ def test_range_generation_end_inclusive(): # Test inclusive (default) ranges_inclusive = [ - {"template": "q{q_len}kv1k", "q_len": {"start": 1, "stop": 3, "step": 1}} + {"template": "q{q_len}s1k", "q_len": {"start": 1, "stop": 3, "step": 1}} ] specs = generate_batch_specs_from_ranges(ranges_inclusive) - expected = ["q1kv1k", "q2kv1k", "q3kv1k"] + expected = ["q1s1k", "q2s1k", "q3s1k"] assert specs == expected, f"Expected {expected}, got {specs}" print(f" ✓ end_inclusive default (true): {specs}") # Test explicit inclusive ranges_explicit_inclusive = [ { - "template": "q{q_len}kv1k", + "template": "q{q_len}s1k", "q_len": {"start": 1, "stop": 5, "step": 1, "end_inclusive": True}, } ] specs = generate_batch_specs_from_ranges(ranges_explicit_inclusive) - expected = ["q1kv1k", "q2kv1k", "q3kv1k", "q4kv1k", "q5kv1k"] + expected = ["q1s1k", "q2s1k", "q3s1k", "q4s1k", "q5s1k"] assert specs == expected, f"Expected {expected}, got {specs}" print(" ✓ end_inclusive=true: includes stop value") # Test exclusive ranges_exclusive = [ { - "template": "q{q_len}kv1k", + "template": "q{q_len}s1k", 
"q_len": {"start": 1, "stop": 5, "step": 1, "end_inclusive": False}, } ] specs = generate_batch_specs_from_ranges(ranges_exclusive) - expected = ["q1kv1k", "q2kv1k", "q3kv1k", "q4kv1k"] + expected = ["q1s1k", "q2s1k", "q3s1k", "q4s1k"] assert specs == expected, f"Expected {expected}, got {specs}" print(" ✓ end_inclusive=false: excludes stop value") From 6bc0f8217d5536ebb7697777ba60fe5ee6c15d62 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 13 Oct 2025 21:10:51 +0000 Subject: [PATCH 08/45] use metadata builders Signed-off-by: Matthew Bonanni --- .../configs/reorder_threshold.yaml | 5 +- benchmarks/attention_benchmarks/mla_runner.py | 170 ++++++++---------- 2 files changed, 80 insertions(+), 95 deletions(-) diff --git a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml index 0356d1094001..681eac81e4e7 100644 --- a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml +++ b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml @@ -75,7 +75,7 @@ output: # Expected outcome (reproduces old benchmark_mla_threshold.py study): # - For each batch size, find the crossover point where prefill becomes faster than decode -# - Show decode vs prefill performance across all query lengths (1-125) +# - Show decode vs prefill performance across all query lengths # - Determine optimal reorder_batch_threshold based on last query length where decode is faster # - Understand how crossover point varies with batch size # - Provide data-driven guidance for default threshold value @@ -85,7 +85,4 @@ output: # * decode: threshold >= query_length (forces decode pipeline) # * prefill: threshold < query_length (forces prefill pipeline) # - Compare which is faster to find crossover point -# - Use multiple repeats (15) to handle variance # -# Note: FlashMLA may have issues with speculative decode workloads -# Use flash_attn_mla instead if you encounter errors with flashmla diff --git 
a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index a4907208098e..d01ea82eb2a2 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -247,47 +247,56 @@ def build_mla_metadata_cutlass( return metadata, kv_cache, layer -# Backend configuration mapping for unified runner -_BACKEND_CONFIG = { - "flash_attn_mla": { - "module": "vllm.v1.attention.backends.mla.flashattn_mla", - "impl_class": "FlashAttnMLAImpl", - "metadata_class": "FlashAttnMLAMetadata", - "decode_metadata_class": "FlashAttnMLADecodeMetadata", - "builder_class": "FlashAttnMLAMetadataBuilder", - "query_format": "tuple", # (q_nope, q_pe) - "block_size": None, # Use config block_size - }, +# Backend name to class name prefix mapping +_BACKEND_NAME_MAP = { + "flash_attn_mla": "FlashAttnMLA", + "flashmla": "FlashMLA", + "flashinfer_mla": "FlashInferMLA", + "cutlass_mla": "CutlassMLA", +} + +# Special properties that differ from defaults +_BACKEND_PROPERTIES = { "flashmla": { - "module": "vllm.v1.attention.backends.mla.flashmla", - "impl_class": "FlashMLAImpl", - "metadata_class": "FlashMLAMetadata", - "decode_metadata_class": "FlashMLADecodeMetadata", - "builder_class": None, - "query_format": "concat", # Single concatenated tensor + "query_format": "concat", # Single concatenated tensor (vs tuple) "block_size": 64, # FlashMLA uses fixed block size }, - "flashinfer_mla": { - "module": "vllm.v1.attention.backends.mla.flashinfer_mla", - "impl_class": "FlashInferMLAImpl", - "metadata_class": "MLACommonMetadata", - "decode_metadata_class": "MLACommonDecodeMetadata", - "builder_class": None, - "query_format": "tuple", - "block_size": None, - }, - "cutlass_mla": { - "module": "vllm.v1.attention.backends.mla.cutlass_mla", - "impl_class": "CutlassMLAImpl", - "metadata_class": "MLACommonMetadata", - "decode_metadata_class": "MLACommonDecodeMetadata", - "builder_class": None, - "query_format": "tuple", - 
"block_size": None, - }, } +def _get_backend_config(backend: str) -> dict: + """ + Get backend configuration using naming conventions. + + All MLA backends follow the pattern: + - Module: vllm.v1.attention.backends.mla.{backend} + - Impl: {Name}Impl + - Metadata: {Name}Metadata (or MLACommonMetadata) + - DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata) + - MetadataBuilder: {Name}MetadataBuilder + """ + if backend not in _BACKEND_NAME_MAP: + raise ValueError(f"Unknown backend: {backend}") + + name = _BACKEND_NAME_MAP[backend] + props = _BACKEND_PROPERTIES.get(backend, {}) + + # Check if backend uses common metadata (FlashInfer, CUTLASS) + uses_common = backend in ("flashinfer_mla", "cutlass_mla") + + return { + "module": f"vllm.v1.attention.backends.mla.{backend}", + "impl_class": f"{name}Impl", + "metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata", + "decode_metadata_class": "MLACommonDecodeMetadata" + if uses_common + else f"{name}DecodeMetadata", + "builder_class": f"{name}MetadataBuilder", + "query_format": props.get("query_format", "tuple"), + "block_size": props.get("block_size", None), + } + + def _run_mla_benchmark_batched( backend: str, configs_with_params: list[tuple], # [(config, threshold, num_splits), ...] 
@@ -312,10 +321,7 @@ def _run_mla_benchmark_batched( if not configs_with_params: return [] - if backend not in _BACKEND_CONFIG: - raise ValueError(f"Unknown backend: {backend}") - - backend_cfg = _BACKEND_CONFIG[backend] + backend_cfg = _get_backend_config(backend) device = torch.device(configs_with_params[0][0].device) torch.cuda.set_device(device) @@ -336,25 +342,34 @@ def _run_mla_benchmark_batched( backend_module = importlib.import_module(backend_cfg["module"]) impl_class = getattr(backend_module, backend_cfg["impl_class"]) - metadata_class = getattr(backend_module, backend_cfg["metadata_class"]) - decode_metadata_class = getattr( - backend_module, backend_cfg["decode_metadata_class"] - ) # Import builder class if needed (for threshold setting) builder_class = None + builder_instance = None if backend_cfg["builder_class"]: builder_class = getattr(backend_module, backend_cfg["builder_class"]) - # Import common metadata for backends that use it - if backend_cfg["metadata_class"] == "MLACommonMetadata": - from vllm.v1.attention.backends.mla.common import ( - MLACommonDecodeMetadata as decode_metadata_class, + # Create a builder instance to use build() method + from vllm.v1.kv_cache_interface import FullAttentionSpec + + kv_cache_spec = FullAttentionSpec( + block_size=block_size, + num_kv_heads=1, # MLA uses 1 KV head + head_size=576, # MLA head dim + dtype=torch.float16, ) - from vllm.v1.attention.backends.mla.common import ( - MLACommonMetadata as metadata_class, + + builder_instance = builder_class( + kv_cache_spec=kv_cache_spec, + layer_names=["layer_0"], # Dummy layer name for benchmark + vllm_config=vllm_config, + device=device, ) + # Import common metadata for backends that use it + if backend_cfg["metadata_class"] == "MLACommonMetadata": + pass + with set_current_vllm_config(vllm_config): # Setup MLA dimensions (reused) mla_dims = setup_mla_dims("deepseek-v3") @@ -462,55 +477,28 @@ def _run_mla_benchmark_batched( slot_mapping_list, dtype=torch.int64, 
device=device ) - # Create decode metadata - decode_metadata_kwargs = { - "block_table": block_table_gpu, - "seq_lens": seq_lens_gpu, - "dcp_tot_seq_lens": None, - } - - # FlashAttn MLA needs extra fields - if backend == "flash_attn_mla": - decode_metadata_kwargs.update( - { - "query_start_loc": q_start_gpu, - "max_query_len": max(q_lens), - "max_seq_len": max_kv, - } - ) - - # FlashMLA needs tile_scheduler_metadata and num_splits - if backend == "flashmla": - from vllm.attention.ops.flashmla import get_mla_metadata - - tile_scheduler_metadata, num_splits_auto = get_mla_metadata( - seq_lens_gpu, - mla_dims["num_q_heads"], - 1, # MQA for decode - ) - decode_metadata_kwargs.update( - { - "tile_scheduler_metadata": tile_scheduler_metadata, - "num_splits": num_splits_auto, - } - ) - - decode_metadata = decode_metadata_class(**decode_metadata_kwargs) + # Create CommonAttentionMetadata and use builder.build() + from vllm.v1.attention.backends.utils import CommonAttentionMetadata - # Create metadata - metadata = metadata_class( + common_attn_metadata = CommonAttentionMetadata( num_reqs=len(requests), max_query_len=max(q_lens), max_seq_len=max_kv, num_actual_tokens=total_q, query_start_loc=q_start_gpu, + query_start_loc_cpu=q_start_cpu, + seq_lens=seq_lens_gpu, + seq_lens_cpu=seq_lens_cpu, slot_mapping=slot_mapping, - num_decodes=len(requests), - num_decode_tokens=total_q, - num_prefills=0, - head_dim=mla_dims["head_dim"], - decode=decode_metadata, - prefill=None, + block_table_tensor=block_table_gpu, + dcp_local_seq_lens=None, + ) + + # Use the production build() method! 
+ metadata = builder_instance.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + fast_build=False, ) # Create KV cache From 53f7a0d4e6658fb172abf92ae9609eb02bbe0924 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 11:45:14 -0400 Subject: [PATCH 09/45] bugfixes Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/common.py | 44 +++- benchmarks/attention_benchmarks/mla_runner.py | 225 ++++++------------ 2 files changed, 117 insertions(+), 152 deletions(-) diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py index b4adbf05821b..eeadd9bc531f 100644 --- a/benchmarks/attention_benchmarks/common.py +++ b/benchmarks/attention_benchmarks/common.py @@ -33,10 +33,25 @@ def get_text_config(self): return self -class MockLayer: - """Mock attention layer with scale parameters.""" +# Import AttentionLayerBase at module level to avoid circular dependencies +try: + from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase - def __init__(self, device: torch.device): + _HAS_ATTENTION_LAYER_BASE = True +except ImportError: + _HAS_ATTENTION_LAYER_BASE = False + AttentionLayerBase = object # Fallback + + +class MockLayer(AttentionLayerBase): + """Mock attention layer with scale parameters and impl. + + Inherits from AttentionLayerBase so it passes isinstance checks + in get_layers_from_vllm_config when FlashInfer prefill is enabled. 
+ """ + + def __init__(self, device: torch.device, impl=None): + # Don't call super().__init__() as AttentionLayerBase doesn't have __init__ self._k_scale = torch.tensor(1.0, device=device) self._v_scale = torch.tensor(1.0, device=device) self._q_scale = torch.tensor(1.0, device=device) @@ -44,6 +59,13 @@ def __init__(self, device: torch.device): self._k_scale_float = float(self._k_scale.item()) self._v_scale_float = float(self._v_scale.item()) self._q_scale_float = float(self._q_scale.item()) + # AttentionImpl for metadata builders to query + self.impl = impl + + def get_attn_backend(self): + """Get the attention backend class (required by AttentionLayerBase).""" + # Return None as this is just a mock layer for benchmarking + return None class MockModelConfig: @@ -72,6 +94,22 @@ def get_num_kv_heads(self, _=None) -> int: def get_head_size(self) -> int: return self._d + def get_num_layers(self) -> int: + """Mock method for layer count queries.""" + return 1 + + def get_sliding_window_for_layer(self, _layer_idx: int): + """Mock method for sliding window queries.""" + return None + + def get_logits_soft_cap_for_layer(self, _layer_idx: int): + """Mock method for logits soft cap queries.""" + return None + + def get_sm_scale_for_layer(self, _layer_idx: int) -> float: + """Mock method for SM scale queries.""" + return 1.0 / (self.get_head_size() ** 0.5) + class MockParallelConfig: """Mock parallel configuration.""" diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index d01ea82eb2a2..e9d64550b7c2 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -12,7 +12,7 @@ import numpy as np import torch -from batch_spec import BatchRequest, parse_batch_spec +from batch_spec import parse_batch_spec from common import MockHfConfig, MockLayer, setup_mla_dims from vllm.config import ( @@ -70,6 +70,20 @@ def create_minimal_vllm_config( # Override head counts and dims 
for MLA model_config.hf_config = MockHfConfig(mla_dims) + # Add mock methods for layer-specific queries (needed by metadata builders) + import types + + model_config.get_num_layers = types.MethodType(lambda self: 1, model_config) + model_config.get_sliding_window_for_layer = types.MethodType( + lambda self, _i: None, model_config + ) + model_config.get_logits_soft_cap_for_layer = types.MethodType( + lambda self, _i: None, model_config + ) + model_config.get_sm_scale_for_layer = types.MethodType( + lambda self, _i: 1.0 / model_config.get_head_size() ** 0.5, model_config + ) + # Cache config cache_config = CacheConfig( block_size=block_size, @@ -146,107 +160,6 @@ def create_minimal_vllm_config( return vllm_config -def build_mla_metadata_cutlass( - requests: list[BatchRequest], - block_size: int, - device: torch.device, - mla_dims: dict, -) -> tuple: - """ - Build metadata for CUTLASS MLA backend. - - Args: - requests: List of BatchRequest - block_size: KV cache block size - device: Torch device - mla_dims: MLA dimension configuration - - Returns: - Tuple of (metadata, kv_cache, layer) - """ - from vllm.v1.attention.backends.mla.common import ( - MLACommonDecodeMetadata, - MLACommonMetadata, - ) - - q_lens = [r.q_len for r in requests] - kv_lens = [r.kv_len for r in requests] - total_q = sum(q_lens) - max_kv = max(kv_lens) - - # Build query start locations - q_start_cpu = np.array( - [0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], dtype=np.int32 - ) - q_start_gpu = torch.from_numpy(q_start_cpu).to(device) - - # Build sequence lengths - seq_lens_cpu = np.array(kv_lens, dtype=np.int32) - seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) - - # Build block table - num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens] - max_num_blocks = max(num_blocks_per_req) - - block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32) - for i, num_blocks in enumerate(num_blocks_per_req): - block_table_cpu[i, :num_blocks] = 
np.arange(num_blocks, dtype=np.int32) - block_table_gpu = torch.from_numpy(block_table_cpu).to(device) - - # Slot mapping - slot_mapping_list = [] - for i, (q_len, kv_len, num_blocks) in enumerate( - zip(q_lens, kv_lens, num_blocks_per_req) - ): - context_len = kv_len - q_len - for j in range(q_len): - token_kv_idx = context_len + j - block_idx = token_kv_idx // block_size - offset_in_block = token_kv_idx % block_size - global_block_id = block_table_cpu[i, block_idx] - slot_id = global_block_id * block_size + offset_in_block - slot_mapping_list.append(slot_id) - - slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device) - - # Create decode metadata - decode_metadata = MLACommonDecodeMetadata( - block_table=block_table_gpu, - seq_lens=seq_lens_gpu, - dcp_tot_seq_lens=None, - ) - - # Create common metadata - metadata = MLACommonMetadata( - num_reqs=len(requests), - max_query_len=max(q_lens), - max_seq_len=max_kv, - num_actual_tokens=total_q, - query_start_loc=q_start_gpu, - slot_mapping=slot_mapping, - num_decodes=len(requests), - num_decode_tokens=total_q, - num_prefills=0, - head_dim=mla_dims["head_dim"], - decode=decode_metadata, - prefill=None, - ) - - # Create KV cache - kv_cache = torch.zeros( - max_num_blocks, - block_size, - mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], - device=device, - dtype=torch.float16, - ) - - # Create layer - layer = MockLayer(device) - - return metadata, kv_cache, layer - - # Backend name to class name prefix mapping _BACKEND_NAME_MAP = { "flash_attn_mla": "FlashAttnMLA", @@ -343,69 +256,76 @@ def _run_mla_benchmark_batched( backend_module = importlib.import_module(backend_cfg["module"]) impl_class = getattr(backend_module, backend_cfg["impl_class"]) + # Setup MLA dimensions (reused) + mla_dims = setup_mla_dims("deepseek-v3") + # Import builder class if needed (for threshold setting) builder_class = None builder_instance = None - if backend_cfg["builder_class"]: - builder_class = 
getattr(backend_module, backend_cfg["builder_class"]) - - # Create a builder instance to use build() method - from vllm.v1.kv_cache_interface import FullAttentionSpec - - kv_cache_spec = FullAttentionSpec( - block_size=block_size, - num_kv_heads=1, # MLA uses 1 KV head - head_size=576, # MLA head dim - dtype=torch.float16, - ) - - builder_instance = builder_class( - kv_cache_spec=kv_cache_spec, - layer_names=["layer_0"], # Dummy layer name for benchmark - vllm_config=vllm_config, - device=device, - ) - - # Import common metadata for backends that use it - if backend_cfg["metadata_class"] == "MLACommonMetadata": - pass with set_current_vllm_config(vllm_config): - # Setup MLA dimensions (reused) - mla_dims = setup_mla_dims("deepseek-v3") scale = 1.0 / np.sqrt( mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] ) # Create impl once (reused across all benchmarks) - impl_kwargs = { - "num_heads": mla_dims["num_q_heads"], - "head_size": mla_dims["head_dim"], - "scale": scale, - "num_kv_heads": mla_dims["num_kv_heads"], - "alibi_slopes": None, - "sliding_window": None, - "kv_cache_dtype": "auto", - "logits_soft_cap": None, - "attn_type": "decoder", - "kv_sharing_target_layer_name": None, - "q_lora_rank": None, - "kv_lora_rank": mla_dims["kv_lora_rank"], - "qk_nope_head_dim": mla_dims["qk_nope_head_dim"], - "qk_rope_head_dim": mla_dims["qk_rope_head_dim"], - "qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], - "v_head_dim": mla_dims["v_head_dim"], - "kv_b_proj": None, - } - - impl = impl_class(**impl_kwargs) + impl = impl_class( + num_heads=mla_dims["num_q_heads"], + head_size=mla_dims["head_dim"], + scale=scale, + num_kv_heads=mla_dims["num_kv_heads"], + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=mla_dims["kv_lora_rank"], + qk_nope_head_dim=mla_dims["qk_nope_head_dim"], + 
qk_rope_head_dim=mla_dims["qk_rope_head_dim"], + qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], + v_head_dim=mla_dims["v_head_dim"], + kv_b_proj=None, + ) # Initialize DCP attributes if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size is None: impl.dcp_world_size = 1 impl.dcp_rank = 0 - layer = MockLayer(device) + # Create mock layer (used for benchmarks) + layer = MockLayer(device, impl=impl) + + if backend_cfg["builder_class"]: + builder_class = getattr(backend_module, backend_cfg["builder_class"]) + + # Create a builder instance to use build() method + from vllm.v1.kv_cache_interface import FullAttentionSpec + + kv_cache_spec = FullAttentionSpec( + block_size=block_size, + num_kv_heads=1, # MLA uses 1 KV head + head_size=576, # MLA head dim + dtype=torch.float16, + ) + + # Populate static_forward_context so builder can find the layer + # MockLayer now inherits from AttentionLayerBase, so isinstance checks pass + vllm_config.compilation_config.static_forward_context = { + "placeholder": layer + } + + builder_instance = builder_class( + kv_cache_spec=kv_cache_spec, + layer_names=["placeholder"], # Dummy layer name (like in tests) + vllm_config=vllm_config, + device=device, + ) + + # Import common metadata for backends that use it + if backend_cfg["metadata_class"] == "MLACommonMetadata": + pass results = [] # Run each benchmark with the shared impl @@ -442,6 +362,12 @@ def _run_mla_benchmark_batched( seq_lens_cpu = np.array(kv_lens, dtype=np.int32) seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) + # Build num_computed_tokens (context length for each request) + context_lens = [ + kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens) + ] + num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32) + # Build block table num_blocks_per_req = [ (kv + block_size - 1) // block_size for kv in kv_lens @@ -489,6 +415,7 @@ def _run_mla_benchmark_batched( query_start_loc_cpu=q_start_cpu, seq_lens=seq_lens_gpu, 
seq_lens_cpu=seq_lens_cpu, + num_computed_tokens_cpu=num_computed_tokens_cpu, slot_mapping=slot_mapping, block_table_tensor=block_table_gpu, dcp_local_seq_lens=None, From 269c0cc9041a100f73007c6a5c907d568fd2b01f Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 15:47:59 +0000 Subject: [PATCH 10/45] fix typo Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/README.md | 4 ++-- benchmarks/attention_benchmarks/benchmark.py | 4 ++-- .../configs/hopper_head_count.yaml | 2 +- .../attention_benchmarks/configs/mla_decode.yaml | 4 ++-- .../attention_benchmarks/configs/mla_mixed_batch.yaml | 4 ++-- .../configs/reorder_threshold.yaml | 2 +- .../configs/speculative_decode.yaml | 2 +- benchmarks/attention_benchmarks/mla_runner.py | 10 +++++----- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/benchmarks/attention_benchmarks/README.md b/benchmarks/attention_benchmarks/README.md index 8c2b89f305c0..9e1e6281737a 100644 --- a/benchmarks/attention_benchmarks/README.md +++ b/benchmarks/attention_benchmarks/README.md @@ -135,7 +135,7 @@ python benchmark.py \ ```bash # Compare all MLA backends python benchmark.py \ - --backends cutlass_mla flashinfer_mla flash_attn_mla flashmla \ + --backends cutlass_mla flashinfer_mla flashattn_mla flashmla \ --batch-specs "64q1s1k" "64q1s4k" \ --output-csv mla_results.csv ``` @@ -171,7 +171,7 @@ python benchmark.py \ ``` --backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla, - # flashinfer_mla, flash_attn_mla, flashmla + # flashinfer_mla, flashattn_mla, flashmla --backend BACKEND # Single backend (alternative to --backends) --batch-specs SPEC [SPEC ...] 
# Batch specifications (default: ["q2k", "8q1s1k"]) diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index 787717c8d8e7..f3ae79bd135a 100644 --- a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -162,7 +162,7 @@ def main(): "--backends", nargs="+", help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, " - "flashinfer_mla, flash_attn_mla, flashmla)", + "flashinfer_mla, flashattn_mla, flashmla)", ) parser.add_argument( "--backend", @@ -617,7 +617,7 @@ def main(): if backend in [ "cutlass_mla", "flashinfer_mla", - "flash_attn_mla", + "flashattn_mla", "flashmla", ]: result = run_mla_benchmark(config) diff --git a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml index c518ed55d32a..ac990f75992c 100644 --- a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml +++ b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml @@ -6,7 +6,7 @@ description: "FlashAttn MLA vs FlashMLA head count comparison on Hopper" # Compare these two Hopper backends backends: - - flash_attn_mla + - flashattn_mla - flashmla # Standard decode workloads diff --git a/benchmarks/attention_benchmarks/configs/mla_decode.yaml b/benchmarks/attention_benchmarks/configs/mla_decode.yaml index d46676afcb6e..aaf4eec9b1c8 100644 --- a/benchmarks/attention_benchmarks/configs/mla_decode.yaml +++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml @@ -42,7 +42,7 @@ batch_specs: backends: - cutlass_mla - flashinfer_mla - - flash_attn_mla # Hopper only + - flashattn_mla # Hopper only - flashmla # Hopper only device: "cuda:0" @@ -54,7 +54,7 @@ profile_memory: true cutlass_mla: num_kv_splits: auto # or specific value like 4, 8, 16 -flash_attn_mla: +flashattn_mla: reorder_batch_threshold: 512 flashmla: diff --git a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml 
b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml index ce9e05fd21da..ad3c0dced6ec 100644 --- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml +++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml @@ -47,8 +47,8 @@ batch_specs: backends: - cutlass_mla - flashinfer_mla - - flash_attn_mla # Hopper only - - flashmla # Hopper only + - flashattn_mla # Hopper only + - flashmla # Hopper only device: "cuda:0" repeats: 5 diff --git a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml index 681eac81e4e7..1ea0a12b5338 100644 --- a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml +++ b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml @@ -6,7 +6,7 @@ description: "Decode vs Prefill pipeline crossover analysis" # Test FlashAttn MLA -backend: flash_attn_mla +backend: flashattn_mla # Mode: decode_vs_prefill comparison (special sweep mode) # For each batch spec, we'll test both decode and prefill pipelines diff --git a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml index 7a80cdee8066..8a960ac9555f 100644 --- a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml +++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml @@ -41,7 +41,7 @@ batch_specs: # Backends that support query length > 1 backends: - - flash_attn_mla # reorder_batch_threshold = 512 + - flashattn_mla # reorder_batch_threshold = 512 - flashmla # reorder_batch_threshold = 1 (tunable) # FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index e9d64550b7c2..f37b4ba25e7e 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -162,7 +162,7 @@ def create_minimal_vllm_config( # 
Backend name to class name prefix mapping _BACKEND_NAME_MAP = { - "flash_attn_mla": "FlashAttnMLA", + "flashattn_mla": "FlashAttnMLA", "flashmla": "FlashMLA", "flashinfer_mla": "FlashInferMLA", "cutlass_mla": "CutlassMLA", @@ -217,7 +217,7 @@ def _run_mla_benchmark_batched( """ Unified batched MLA benchmark runner for all backends. - Works for: flash_attn_mla, flashmla, flashinfer_mla, cutlass_mla + Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla This function reuses backend initialization across multiple benchmarks to avoid setup/teardown overhead. @@ -514,12 +514,12 @@ def run_mla_benchmark( """ Unified MLA benchmark runner for all backends. - Works for: flash_attn_mla, flashmla, flashinfer_mla, cutlass_mla + Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla Always uses batched execution internally for optimal performance. Args: - backend: Backend name (flash_attn_mla, flashmla, flashinfer_mla, cutlass_mla) + backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla) config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA (single config mode only) @@ -533,7 +533,7 @@ def run_mla_benchmark( # Already in batched format if len(config) > 0 and isinstance(config[0], tuple): # Format: [(cfg, param), ...] 
where param is threshold or num_splits - if backend in ("flash_attn_mla", "flashmla"): + if backend in ("flashattn_mla", "flashmla"): configs_with_params = [(cfg, param, None) for cfg, param in config] else: # cutlass_mla or flashinfer_mla configs_with_params = [(cfg, None, param) for cfg, param in config] From 0e2039b2251ba6124e7e580d3f98bdca8d4864bf Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 12:02:41 -0400 Subject: [PATCH 11/45] refactor Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/mla_runner.py | 618 +++++++++++------- benchmarks/attention_benchmarks/runner.py | 445 ++++++++++--- 2 files changed, 733 insertions(+), 330 deletions(-) diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index f37b4ba25e7e..e1a652b1be3e 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -8,6 +8,7 @@ needing full VllmConfig integration. """ +import importlib from typing import Optional import numpy as np @@ -22,8 +23,34 @@ ParallelConfig, SchedulerConfig, VllmConfig, + set_current_vllm_config, ) +# ============================================================================ +# VllmConfig Creation +# ============================================================================ + + +def _add_mock_methods_to_model_config(model_config: ModelConfig) -> None: + """ + Add mock methods for layer-specific queries to ModelConfig. + + These methods are needed by metadata builders but aren't normally + present on ModelConfig when used in benchmark contexts. 
+ """ + import types + + model_config.get_num_layers = types.MethodType(lambda self: 1, model_config) + model_config.get_sliding_window_for_layer = types.MethodType( + lambda self, _i: None, model_config + ) + model_config.get_logits_soft_cap_for_layer = types.MethodType( + lambda self, _i: None, model_config + ) + model_config.get_sm_scale_for_layer = types.MethodType( + lambda self, _i: 1.0 / model_config.get_head_size() ** 0.5, model_config + ) + def create_minimal_vllm_config( model_name: str = "deepseek-v3", @@ -70,21 +97,10 @@ def create_minimal_vllm_config( # Override head counts and dims for MLA model_config.hf_config = MockHfConfig(mla_dims) - # Add mock methods for layer-specific queries (needed by metadata builders) - import types + # Add mock methods for layer-specific queries + _add_mock_methods_to_model_config(model_config) - model_config.get_num_layers = types.MethodType(lambda self: 1, model_config) - model_config.get_sliding_window_for_layer = types.MethodType( - lambda self, _i: None, model_config - ) - model_config.get_logits_soft_cap_for_layer = types.MethodType( - lambda self, _i: None, model_config - ) - model_config.get_sm_scale_for_layer = types.MethodType( - lambda self, _i: 1.0 / model_config.get_head_size() ** 0.5, model_config - ) - - # Cache config + # Create sub-configs cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, @@ -97,7 +113,6 @@ def create_minimal_vllm_config( cpu_offload_gb=0, ) - # Scheduler config scheduler_config = SchedulerConfig( task="auto", max_num_seqs=max_num_seqs, @@ -114,7 +129,6 @@ def create_minimal_vllm_config( send_delta_data=False, ) - # Parallel config parallel_config = ParallelConfig( pipeline_parallel_size=1, tensor_parallel_size=1, @@ -127,7 +141,6 @@ def create_minimal_vllm_config( distributed_executor_backend=None, ) - # Compilation config compilation_config = CompilationConfig( level=0, backend="", @@ -148,8 +161,7 @@ def create_minimal_vllm_config( 
cudagraph_backend="flashinfer", ) - # Create VllmConfig - vllm_config = VllmConfig( + return VllmConfig( model_config=model_config, cache_config=cache_config, parallel_config=parallel_config, @@ -157,7 +169,10 @@ def create_minimal_vllm_config( compilation_config=compilation_config, ) - return vllm_config + +# ============================================================================ +# Backend Configuration +# ============================================================================ # Backend name to class name prefix mapping @@ -210,6 +225,328 @@ def _get_backend_config(backend: str) -> dict: } +# ============================================================================ +# Metadata Building Helpers +# ============================================================================ + + +def _build_attention_metadata( + requests: list, + block_size: int, + device: torch.device, + builder_instance, +) -> tuple: + """ + Build attention metadata from batch requests. + + Args: + requests: List of BatchRequest objects + block_size: KV cache block size + device: Target device + builder_instance: Metadata builder instance + + Returns: + Tuple of (metadata, kv_cache_num_blocks) + """ + q_lens = [r.q_len for r in requests] + kv_lens = [r.kv_len for r in requests] + total_q = sum(q_lens) + max_kv = max(kv_lens) + + # Build query start locations + q_start_cpu = np.array( + [0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], + dtype=np.int32, + ) + q_start_gpu = torch.from_numpy(q_start_cpu).to(device) + + # Build sequence lengths + seq_lens_cpu = np.array(kv_lens, dtype=np.int32) + seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) + + # Build num_computed_tokens (context length for each request) + context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)] + num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32) + + # Build block table + num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens] + 
max_num_blocks = max(num_blocks_per_req) + + block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32) + current_block = 0 + for i, num_blocks in enumerate(num_blocks_per_req): + for j in range(num_blocks): + block_table_cpu[i, j] = current_block + current_block += 1 + + block_table_gpu = torch.from_numpy(block_table_cpu).to(device) + + # Build slot mapping + slot_mapping_list = [] + for i, (q_len, kv_len, num_blocks) in enumerate( + zip(q_lens, kv_lens, num_blocks_per_req) + ): + context_len = kv_len - q_len + for j in range(q_len): + token_kv_idx = context_len + j + block_idx = token_kv_idx // block_size + offset_in_block = token_kv_idx % block_size + global_block_id = block_table_cpu[i, block_idx] + slot_id = global_block_id * block_size + offset_in_block + slot_mapping_list.append(slot_id) + + slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device) + + # Create CommonAttentionMetadata + from vllm.v1.attention.backends.utils import CommonAttentionMetadata + + common_attn_metadata = CommonAttentionMetadata( + num_reqs=len(requests), + max_query_len=max(q_lens), + max_seq_len=max_kv, + num_actual_tokens=total_q, + query_start_loc=q_start_gpu, + query_start_loc_cpu=q_start_cpu, + seq_lens=seq_lens_gpu, + seq_lens_cpu=seq_lens_cpu, + num_computed_tokens_cpu=num_computed_tokens_cpu, + slot_mapping=slot_mapping, + block_table_tensor=block_table_gpu, + dcp_local_seq_lens=None, + ) + + # Use the production build() method + metadata = builder_instance.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + fast_build=False, + ) + + return metadata, current_block + + +def _create_query_tensors( + total_q: int, + mla_dims: dict, + query_format: str, + device: torch.device, + dtype: torch.dtype, +): + """ + Create query tensors in the appropriate format for the backend. 
+ + Args: + total_q: Total number of query tokens + mla_dims: MLA dimension configuration + query_format: Either "tuple" or "concat" + device: Target device + dtype: Tensor dtype + + Returns: + Query tensor(s) - either (q_nope, q_pe) tuple or concatenated tensor + """ + if query_format == "tuple": + q_nope = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["kv_lora_rank"], + device=device, + dtype=dtype, + ) + q_pe = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["qk_rope_head_dim"], + device=device, + dtype=dtype, + ) + return (q_nope, q_pe) + else: # concat + return torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], + device=device, + dtype=dtype, + ) + + +# ============================================================================ +# Backend Initialization +# ============================================================================ + + +def _create_backend_impl( + backend_cfg: dict, + mla_dims: dict, + vllm_config: VllmConfig, + device: torch.device, +): + """ + Create backend implementation instance. 
+ + Args: + backend_cfg: Backend configuration dict + mla_dims: MLA dimension configuration + vllm_config: VllmConfig instance + device: Target device + + Returns: + Tuple of (impl, layer, builder_instance) + """ + # Import backend classes + backend_module = importlib.import_module(backend_cfg["module"]) + impl_class = getattr(backend_module, backend_cfg["impl_class"]) + + # Calculate scale + scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]) + + # Create impl + impl = impl_class( + num_heads=mla_dims["num_q_heads"], + head_size=mla_dims["head_dim"], + scale=scale, + num_kv_heads=mla_dims["num_kv_heads"], + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=mla_dims["kv_lora_rank"], + qk_nope_head_dim=mla_dims["qk_nope_head_dim"], + qk_rope_head_dim=mla_dims["qk_rope_head_dim"], + qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], + v_head_dim=mla_dims["v_head_dim"], + kv_b_proj=None, + ) + + # Initialize DCP attributes + if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size is None: + impl.dcp_world_size = 1 + impl.dcp_rank = 0 + + # Create mock layer + layer = MockLayer(device, impl=impl) + + # Create builder instance if needed + builder_instance = None + if backend_cfg["builder_class"]: + builder_class = getattr(backend_module, backend_cfg["builder_class"]) + + from vllm.v1.kv_cache_interface import FullAttentionSpec + + kv_cache_spec = FullAttentionSpec( + block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size, + num_kv_heads=1, # MLA uses 1 KV head + head_size=576, # MLA head dim + dtype=torch.float16, + ) + + # Populate static_forward_context so builder can find the layer + # MockLayer inherits from AttentionLayerBase, so isinstance checks pass + vllm_config.compilation_config.static_forward_context = {"placeholder": layer} + + builder_instance 
= builder_class( + kv_cache_spec=kv_cache_spec, + layer_names=["placeholder"], + vllm_config=vllm_config, + device=device, + ) + + return impl, layer, builder_instance + + +# ============================================================================ +# Benchmark Execution +# ============================================================================ + + +def _run_single_benchmark( + config, + impl, + layer, + builder_instance, + backend_cfg: dict, + mla_dims: dict, + device: torch.device, +) -> dict: + """ + Run a single benchmark iteration. + + Args: + config: BenchmarkConfig instance + impl: Backend implementation instance + layer: MockLayer instance + builder_instance: Metadata builder instance + backend_cfg: Backend configuration dict + mla_dims: MLA dimension configuration + device: Target device + + Returns: + Dict with timing statistics + """ + # Parse batch spec + requests = parse_batch_spec(config.batch_spec) + q_lens = [r.q_len for r in requests] + total_q = sum(q_lens) + + # Determine block size + block_size = backend_cfg["block_size"] or config.block_size + + # Build metadata + metadata, num_blocks = _build_attention_metadata( + requests, block_size, device, builder_instance + ) + + # Create KV cache + kv_cache = torch.zeros( + num_blocks, + block_size, + mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], + device=device, + dtype=torch.float16, + ) + + # Create query tensors + query = _create_query_tensors( + total_q, + mla_dims, + backend_cfg["query_format"], + device, + torch.float16, + ) + + # Warmup + for _ in range(config.warmup_iters): + impl._forward_decode(query, kv_cache, metadata, layer) + torch.cuda.synchronize() + + # Benchmark + times = [] + for _ in range(config.repeats): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(config.num_layers): + impl._forward_decode(query, kv_cache, metadata, layer) + end.record() + + torch.cuda.synchronize() + elapsed_ms = 
start.elapsed_time(end) + times.append(elapsed_ms / 1000.0 / config.num_layers) + + return { + "mean": np.mean(times), + "std": np.std(times), + "min": np.min(times), + "max": np.max(times), + "throughput": total_q / np.mean(times) if times else 0, + } + + def _run_mla_benchmark_batched( backend: str, configs_with_params: list[tuple], # [(config, threshold, num_splits), ...] @@ -248,86 +585,23 @@ def _run_mla_benchmark_batched( block_size=block_size, ) - # Import backend classes dynamically - import importlib - - from vllm.config import set_current_vllm_config - - backend_module = importlib.import_module(backend_cfg["module"]) - impl_class = getattr(backend_module, backend_cfg["impl_class"]) - # Setup MLA dimensions (reused) mla_dims = setup_mla_dims("deepseek-v3") - # Import builder class if needed (for threshold setting) - builder_class = None - builder_instance = None + results = [] with set_current_vllm_config(vllm_config): - scale = 1.0 / np.sqrt( - mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] + # Create backend impl, layer, and builder (reused across benchmarks) + impl, layer, builder_instance = _create_backend_impl( + backend_cfg, mla_dims, vllm_config, device ) - # Create impl once (reused across all benchmarks) - impl = impl_class( - num_heads=mla_dims["num_q_heads"], - head_size=mla_dims["head_dim"], - scale=scale, - num_kv_heads=mla_dims["num_kv_heads"], - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype="auto", - logits_soft_cap=None, - attn_type="decoder", - kv_sharing_target_layer_name=None, - q_lora_rank=None, - kv_lora_rank=mla_dims["kv_lora_rank"], - qk_nope_head_dim=mla_dims["qk_nope_head_dim"], - qk_rope_head_dim=mla_dims["qk_rope_head_dim"], - qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], - v_head_dim=mla_dims["v_head_dim"], - kv_b_proj=None, - ) - - # Initialize DCP attributes - if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size is None: - impl.dcp_world_size = 1 - impl.dcp_rank = 0 
- - # Create mock layer (used for benchmarks) - layer = MockLayer(device, impl=impl) - + # Get builder class for threshold setting (if applicable) + builder_class = None if backend_cfg["builder_class"]: + backend_module = importlib.import_module(backend_cfg["module"]) builder_class = getattr(backend_module, backend_cfg["builder_class"]) - # Create a builder instance to use build() method - from vllm.v1.kv_cache_interface import FullAttentionSpec - - kv_cache_spec = FullAttentionSpec( - block_size=block_size, - num_kv_heads=1, # MLA uses 1 KV head - head_size=576, # MLA head dim - dtype=torch.float16, - ) - - # Populate static_forward_context so builder can find the layer - # MockLayer now inherits from AttentionLayerBase, so isinstance checks pass - vllm_config.compilation_config.static_forward_context = { - "placeholder": layer - } - - builder_instance = builder_class( - kv_cache_spec=kv_cache_spec, - layer_names=["placeholder"], # Dummy layer name (like in tests) - vllm_config=vllm_config, - device=device, - ) - - # Import common metadata for backends that use it - if backend_cfg["metadata_class"] == "MLACommonMetadata": - pass - results = [] - # Run each benchmark with the shared impl for config, threshold, num_splits in configs_with_params: # Set threshold for this benchmark (FlashAttn/FlashMLA only) @@ -343,155 +617,16 @@ def _run_mla_benchmark_batched( impl._num_kv_splits = num_splits try: - # Parse batch spec - requests = parse_batch_spec(config.batch_spec) - - q_lens = [r.q_len for r in requests] - kv_lens = [r.kv_len for r in requests] - total_q = sum(q_lens) - max_kv = max(kv_lens) - - # Build query start locations - q_start_cpu = np.array( - [0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], - dtype=np.int32, - ) - q_start_gpu = torch.from_numpy(q_start_cpu).to(device) - - # Build sequence lengths - seq_lens_cpu = np.array(kv_lens, dtype=np.int32) - seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) - - # Build num_computed_tokens (context 
length for each request) - context_lens = [ - kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens) - ] - num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32) - - # Build block table - num_blocks_per_req = [ - (kv + block_size - 1) // block_size for kv in kv_lens - ] - max_num_blocks = max(num_blocks_per_req) - - block_table_cpu = np.zeros( - (len(requests), max_num_blocks), dtype=np.int32 - ) - current_block = 0 - for i, num_blocks in enumerate(num_blocks_per_req): - for j in range(num_blocks): - block_table_cpu[i, j] = current_block - current_block += 1 - - block_table_gpu = torch.from_numpy(block_table_cpu).to(device) - - # Build slot mapping - slot_mapping_list = [] - for i, (q_len, kv_len, num_blocks) in enumerate( - zip(q_lens, kv_lens, num_blocks_per_req) - ): - context_len = kv_len - q_len - for j in range(q_len): - token_kv_idx = context_len + j - block_idx = token_kv_idx // block_size - offset_in_block = token_kv_idx % block_size - global_block_id = block_table_cpu[i, block_idx] - slot_id = global_block_id * block_size + offset_in_block - slot_mapping_list.append(slot_id) - - slot_mapping = torch.tensor( - slot_mapping_list, dtype=torch.int64, device=device - ) - - # Create CommonAttentionMetadata and use builder.build() - from vllm.v1.attention.backends.utils import CommonAttentionMetadata - - common_attn_metadata = CommonAttentionMetadata( - num_reqs=len(requests), - max_query_len=max(q_lens), - max_seq_len=max_kv, - num_actual_tokens=total_q, - query_start_loc=q_start_gpu, - query_start_loc_cpu=q_start_cpu, - seq_lens=seq_lens_gpu, - seq_lens_cpu=seq_lens_cpu, - num_computed_tokens_cpu=num_computed_tokens_cpu, - slot_mapping=slot_mapping, - block_table_tensor=block_table_gpu, - dcp_local_seq_lens=None, - ) - - # Use the production build() method! 
- metadata = builder_instance.build( - common_prefix_len=0, - common_attn_metadata=common_attn_metadata, - fast_build=False, - ) - - # Create KV cache - kv_cache = torch.zeros( - current_block, - block_size, - mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], - device=device, - dtype=torch.float16, - ) - - # Create query tensors (format depends on backend) - if backend_cfg["query_format"] == "tuple": - q_nope = torch.randn( - total_q, - mla_dims["num_q_heads"], - mla_dims["kv_lora_rank"], - device=device, - dtype=torch.float16, - ) - q_pe = torch.randn( - total_q, - mla_dims["num_q_heads"], - mla_dims["qk_rope_head_dim"], - device=device, - dtype=torch.float16, - ) - query = (q_nope, q_pe) - else: # concat - query = torch.randn( - total_q, - mla_dims["num_q_heads"], - mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], - device=device, - dtype=torch.float16, - ) - - # Warmup - for _ in range(config.warmup_iters): - impl._forward_decode(query, kv_cache, metadata, layer) - torch.cuda.synchronize() - - # Benchmark - times = [] - for _ in range(config.repeats): - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - - start.record() - for _ in range(config.num_layers): - impl._forward_decode(query, kv_cache, metadata, layer) - end.record() - - torch.cuda.synchronize() - elapsed_ms = start.elapsed_time(end) - times.append(elapsed_ms / 1000.0 / config.num_layers) - - results.append( - { - "mean": np.mean(times), - "std": np.std(times), - "min": np.min(times), - "max": np.max(times), - "throughput": total_q / np.mean(times) if times else 0, - } + result = _run_single_benchmark( + config, + impl, + layer, + builder_instance, + backend_cfg, + mla_dims, + device, ) + results.append(result) finally: # Restore original threshold @@ -502,7 +637,12 @@ def _run_mla_benchmark_batched( if original_num_splits is not None: impl._num_kv_splits = original_num_splits - return results + return results + + +# 
============================================================================ +# Public API +# ============================================================================ def run_mla_benchmark( diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index cf9b4a62800d..6ce54d3c5aa4 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -2,43 +2,91 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -Complete benchmark runner with real vLLM integration. +Standard attention benchmark runner - shared utilities for non-MLA benchmarks. -This module provides working implementations that can actually run -attention kernels, not placeholders. +This module provides helpers for running standard attention backends +(FlashAttention, Triton, FlashInfer) with real vLLM integration. """ import numpy as np import torch -from batch_spec import BatchRequest, parse_batch_spec, reorder_for_flashinfer -from common import ( - BenchmarkConfig, - BenchmarkResult, - MockLayer, - MockRunner, - get_attention_scale, -) +from batch_spec import parse_batch_spec, reorder_for_flashinfer +from common import BenchmarkConfig, BenchmarkResult, MockLayer, get_attention_scale from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.worker.block_table import BlockTable +# ============================================================================ +# Backend Configuration +# ============================================================================ + + +_BACKEND_CONFIG = { + "flash": { + "module": "vllm.v1.attention.backends.flash_attn", + "backend_class": "FlashAttentionBackend", + "dtype": torch.float16, + "cache_layout": "standard", + # ^ [2, num_blocks, block_size, num_kv_heads, head_dim] + }, + "triton": { + "module": "vllm.v1.attention.backends.triton_attn", + "backend_class": "TritonAttentionBackend", 
+ "dtype": torch.float32, + "cache_layout": "standard", + }, + "flashinfer": { + "module": "vllm.v1.attention.backends.flashinfer", + "backend_class": "FlashInferBackend", + "dtype": torch.float16, + "cache_layout": "flashinfer", + # ^ [num_blocks, 2, block_size, num_kv_heads, head_dim] + }, +} + + +def _get_backend_config(backend: str) -> dict: + """ + Get backend configuration. + + Args: + backend: Backend name (flash, triton, flashinfer) + + Returns: + Backend configuration dict + + Raises: + ValueError: If backend is unknown + """ + if backend not in _BACKEND_CONFIG: + raise ValueError( + f"Unknown backend: {backend}. " + f"Available: {', '.join(_BACKEND_CONFIG.keys())}" + ) + return _BACKEND_CONFIG[backend] + + +# ============================================================================ +# Metadata Building Helpers +# ============================================================================ -def build_common_metadata( - requests: list[BatchRequest], + +def _build_attention_metadata( + requests: list, block_size: int, device: torch.device, -) -> tuple[CommonAttentionMetadata, torch.Tensor, int]: +) -> tuple: """ Build CommonAttentionMetadata from batch requests. 
Args: - requests: List of BatchRequest + requests: List of BatchRequest objects block_size: KV cache block size - device: Torch device + device: Target device Returns: - Tuple of (CommonAttentionMetadata, slot_mapping, max_num_blocks) + Tuple of (metadata, slot_mapping, max_num_blocks) """ q_lens = [r.q_len for r in requests] kv_lens = [r.kv_len for r in requests] @@ -47,7 +95,8 @@ def build_common_metadata( # Build query start locations q_start_cpu = np.array( - [0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], dtype=np.int32 + [0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], + dtype=np.int32, ) q_start_gpu = torch.from_numpy(q_start_cpu).to(device) @@ -55,9 +104,10 @@ def build_common_metadata( seq_lens_cpu = np.array(kv_lens, dtype=np.int32) seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) - # Computed tokens (context before new query) + # Build num_computed_tokens (context length before new query) computed_tokens_cpu = np.array( - [kv - q for kv, q in zip(kv_lens, q_lens)], dtype=np.int32 + [kv - q for kv, q in zip(kv_lens, q_lens)], + dtype=np.int32, ) # Build block table @@ -74,19 +124,18 @@ def build_common_metadata( for i, (q_len, kv_len, num_blocks) in enumerate( zip(q_lens, kv_lens, num_blocks_per_req) ): - # For each token in the query, map to its slot in the KV cache context_len = kv_len - q_len for j in range(q_len): token_kv_idx = context_len + j block_idx = token_kv_idx // block_size offset_in_block = token_kv_idx % block_size - # Global slot ID global_block_id = block_table_cpu[i, block_idx] slot_id = global_block_id * block_size + offset_in_block slot_mapping_list.append(slot_id) slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device) + # Create CommonAttentionMetadata metadata = CommonAttentionMetadata( query_start_loc=q_start_gpu, query_start_loc_cpu=torch.from_numpy(q_start_cpu), @@ -104,52 +153,71 @@ def build_common_metadata( return metadata, slot_mapping, max_num_blocks -def 
run_attention_benchmark_impl(config: BenchmarkConfig) -> BenchmarkResult: +def _build_block_table( + requests: list, + kv_lens: list[int], + block_size: int, + total_q: int, + max_num_blocks: int, + device: torch.device, +) -> BlockTable: """ - Run standard attention benchmark with real kernels. + Build BlockTable for metadata builder. Args: - config: Benchmark configuration + requests: List of BatchRequest objects + kv_lens: List of KV sequence lengths + block_size: KV cache block size + total_q: Total number of query tokens + max_num_blocks: Maximum number of blocks per request + device: Target device Returns: - BenchmarkResult with actual timing data + BlockTable instance """ - device = torch.device(config.device) - torch.cuda.set_device(device) + bt = BlockTable(len(requests), max_num_blocks, total_q, False, device) + for i in range(len(requests)): + num_blocks = (kv_lens[i] + block_size - 1) // block_size + bt.add_row(list(range(num_blocks)), i) + bt.commit(len(requests)) + return bt - # Parse batch spec - requests = parse_batch_spec(config.batch_spec) - # Reorder for FlashInfer if needed - if config.backend == "flashinfer": - requests = reorder_for_flashinfer(requests) +# ============================================================================ +# Backend Initialization +# ============================================================================ - # Extract dimensions - q_lens = [r.q_len for r in requests] - kv_lens = [r.kv_len for r in requests] - total_q = sum(q_lens) - # Compute scale - scale = get_attention_scale(config.head_dim) +def _create_backend_impl( + backend_cfg: dict, + config: BenchmarkConfig, + device: torch.device, +): + """ + Create backend implementation instance. 
- # Select backend and dtype - if config.backend == "flash": - from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend as BE + Args: + backend_cfg: Backend configuration dict + config: BenchmarkConfig instance + device: Target device - dt = torch.float16 - elif config.backend == "triton": - from vllm.v1.attention.backends.triton_attn import TritonAttentionBackend as BE + Returns: + Tuple of (backend_class, impl, layer, dtype) + """ + # Import backend class + import importlib - dt = torch.float32 - elif config.backend == "flashinfer": - from vllm.v1.attention.backends.flashinfer import FlashInferBackend as BE + backend_module = importlib.import_module(backend_cfg["module"]) + backend_class = getattr(backend_module, backend_cfg["backend_class"]) - dt = torch.float16 - else: - raise ValueError(f"Unknown backend: {config.backend}") + # Calculate scale + scale = get_attention_scale(config.head_dim) + + # Get dtype + dtype = backend_cfg["dtype"] # Create attention impl - impl = BE.get_impl_cls()( + impl = backend_class.get_impl_cls()( num_heads=config.num_q_heads, head_size=config.head_dim, scale=scale, @@ -159,14 +227,37 @@ def run_attention_benchmark_impl(config: BenchmarkConfig) -> BenchmarkResult: kv_cache_dtype="auto", ) + # Create mock layer layer = MockLayer(device) - # Build metadata - common_metadata, slot_mapping, max_num_blocks = build_common_metadata( - requests, config.block_size, device - ) + return backend_class, impl, layer, dtype + + +def _create_metadata_builder( + backend_class, + common_metadata: CommonAttentionMetadata, + block_table: BlockTable, + config: BenchmarkConfig, + dtype: torch.dtype, + device: torch.device, +): + """ + Create metadata builder instance. 
+ + Args: + backend_class: Backend class + common_metadata: CommonAttentionMetadata instance + block_table: BlockTable instance + config: BenchmarkConfig instance + dtype: Tensor dtype + device: Target device + Returns: + Metadata builder instance (caller invokes .build() on it) + """ # Create mock runner for builder + from common import MockRunner + runner = MockRunner( seq_lens=common_metadata.seq_lens_cpu.numpy(), query_start_locs=common_metadata.query_start_loc_cpu.numpy(), @@ -174,60 +265,91 @@ num_q_heads=config.num_q_heads, num_kv_heads=config.num_kv_heads, head_dim=config.head_dim, - dtype=dt, + dtype=dtype, ) - # Build block table - bt = BlockTable(len(requests), max_num_blocks, total_q, False, device) - for i in range(len(requests)): - num_blocks = (kv_lens[i] + config.block_size - 1) // config.block_size - bt.add_row(list(range(num_blocks)), i) - bt.commit(len(requests)) - # Create metadata builder - builder = BE.get_builder_cls()( + builder = backend_class.get_builder_cls()( runner=runner, kv_cache_spec=AttentionSpec( block_size=config.block_size, num_kv_heads=config.num_kv_heads, head_size=config.head_dim, - dtype=dt, + dtype=dtype, use_mla=False, ), - block_table=bt, + block_table=block_table, ) - # Build attention metadata - attn_metadata = builder.build( - num_reqs=len(requests), - num_actual_tokens=total_q, - max_query_len=max(q_lens), - common_prefix_len=0, - common_attn_metadata=common_metadata, - ) + return builder - # Create input tensors + + +# ============================================================================ +# Tensor Creation Helpers +# ============================================================================ + + +def _create_input_tensors( + config: BenchmarkConfig, + total_q: int, + device: torch.device, + dtype: torch.dtype, +) -> tuple: + """ + Create Q, K, V input tensors for all layers.
+ + Args: + config: BenchmarkConfig instance + total_q: Total number of query tokens + device: Target device + dtype: Tensor dtype + + Returns: + Tuple of (q_list, k_list, v_list) + """ q_list = [ torch.randn( - total_q, config.num_q_heads, config.head_dim, device=device, dtype=dt + total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype ) for _ in range(config.num_layers) ] k_list = [ torch.randn( - total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dt + total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype ) for _ in range(config.num_layers) ] v_list = [ torch.randn( - total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dt + total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype ) for _ in range(config.num_layers) ] + return q_list, k_list, v_list - # KV cache - if config.backend == "flashinfer": + +def _create_kv_cache( + config: BenchmarkConfig, + max_num_blocks: int, + cache_layout: str, + device: torch.device, + dtype: torch.dtype, +) -> list: + """ + Create KV cache tensors for all layers. 
+ + Args: + config: BenchmarkConfig instance + max_num_blocks: Maximum number of blocks + cache_layout: Cache layout type ("standard" or "flashinfer") + device: Target device + dtype: Tensor dtype + + Returns: + List of KV cache tensors (one per layer) + """ + if cache_layout == "flashinfer": + # FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim] cache_list = [ torch.zeros( max_num_blocks, @@ -236,11 +358,12 @@ def run_attention_benchmark_impl(config: BenchmarkConfig) -> BenchmarkResult: config.num_kv_heads, config.head_dim, device=device, - dtype=dt, + dtype=dtype, ) for _ in range(config.num_layers) ] else: + # Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim] cache_list = [ torch.zeros( 2, @@ -249,14 +372,52 @@ def run_attention_benchmark_impl(config: BenchmarkConfig) -> BenchmarkResult: config.num_kv_heads, config.head_dim, device=device, - dtype=dt, + dtype=dtype, ) for _ in range(config.num_layers) ] + return cache_list + + +# ============================================================================ +# Benchmark Execution +# ============================================================================ + - # Output buffer +def _run_single_benchmark( + config: BenchmarkConfig, + impl, + layer, + q_list: list, + k_list: list, + v_list: list, + cache_list: list, + attn_metadata, + device: torch.device, + dtype: torch.dtype, +) -> tuple: + """ + Run single benchmark iteration with warmup and timing loop. 
+ + Args: + config: BenchmarkConfig instance + impl: Backend implementation instance + layer: MockLayer instance + q_list: List of Q tensors + k_list: List of K tensors + v_list: List of V tensors + cache_list: List of KV cache tensors + attn_metadata: Attention metadata + device: Target device + dtype: Tensor dtype + + Returns: + Tuple of (times, mem_stats) + """ + # Create output buffer + total_q = q_list[0].shape[0] out = torch.empty( - total_q, config.num_q_heads, config.head_dim, device=device, dtype=dt + total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype ) # Warmup @@ -304,7 +465,95 @@ def run_attention_benchmark_impl(config: BenchmarkConfig) -> BenchmarkResult: "reserved_mb": torch.cuda.memory_reserved(device) / 1024**2, } - # Throughput + return times, mem_stats + + +# ============================================================================ +# Public API +# ============================================================================ + + +def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: + """ + Run standard attention benchmark with real kernels. 
+ + Supports: flash, triton, flashinfer + + Args: + config: Benchmark configuration + + Returns: + BenchmarkResult with timing and memory statistics + """ + device = torch.device(config.device) + torch.cuda.set_device(device) + + # Get backend configuration + backend_cfg = _get_backend_config(config.backend) + + # Parse batch spec + requests = parse_batch_spec(config.batch_spec) + + # Reorder for FlashInfer if needed + if config.backend == "flashinfer": + requests = reorder_for_flashinfer(requests) + + # Extract dimensions + q_lens = [r.q_len for r in requests] + kv_lens = [r.kv_len for r in requests] + total_q = sum(q_lens) + + # Build common metadata + common_metadata, slot_mapping, max_num_blocks = _build_attention_metadata( + requests, config.block_size, device + ) + + # Create backend impl, layer, and dtype + backend_class, impl, layer, dtype = _create_backend_impl( + backend_cfg, config, device + ) + + # Build block table + block_table = _build_block_table( + requests, kv_lens, config.block_size, total_q, max_num_blocks, device + ) + + # Create metadata builder and build metadata + builder = _create_metadata_builder( + backend_class, common_metadata, block_table, config, dtype, device + ) + + attn_metadata = builder.build( + num_reqs=len(requests), + num_actual_tokens=total_q, + max_query_len=max(q_lens), + common_prefix_len=0, + common_attn_metadata=common_metadata, + ) + + # Create input tensors + q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype) + + # Create KV cache + cache_list = _create_kv_cache( + config, max_num_blocks, backend_cfg["cache_layout"], device, dtype + ) + + # Run benchmark + times, mem_stats = _run_single_benchmark( + config, + impl, + layer, + q_list, + k_list, + v_list, + cache_list, + attn_metadata, + device, + dtype, + ) + + # Calculate throughput mean_time = np.mean(times) throughput = total_q / mean_time if mean_time > 0 else 0 @@ -320,15 +569,29 @@ def run_attention_benchmark_impl(config: 
BenchmarkConfig) -> BenchmarkResult: ) +# ============================================================================ +# Backwards Compatibility +# ============================================================================ + + +# Keep old function names for backwards compatibility +def build_common_metadata(*args, **kwargs): + """Deprecated: Use _build_attention_metadata instead.""" + return _build_attention_metadata(*args, **kwargs) + + +def run_attention_benchmark_impl(config: BenchmarkConfig) -> BenchmarkResult: + """Deprecated: Use run_attention_benchmark instead.""" + return run_attention_benchmark(config) + + def run_mla_benchmark_impl(config: BenchmarkConfig) -> BenchmarkResult: """ Run MLA benchmark with real kernels. - This is a template - needs specific backend implementation. + This is a stub - use mla_runner.py for MLA benchmarks. """ - # TODO: Implement for specific MLA backends - # This requires more complex setup due to MLA-specific metadata raise NotImplementedError( - "MLA benchmark runner needs backend-specific implementation. " - "See benchmark_mla_numsplits.py for CUTLASS MLA example." + "MLA benchmark runner is in mla_runner.py. " + "Use run_mla_benchmark() from that module." ) From 9241ced7d62bc8a3edcc0c7da40171836cd72eb1 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 17:50:25 +0000 Subject: [PATCH 12/45] Fix attention benchmark: support decode/prefill modes, add MockKVBProj MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes the attention benchmark to properly support both decode and prefill pipelines for MLA backends after the recent refactor. 
Key changes: - Added MockKVBProj class to mock KV projection layer for prefill mode - Created _create_input_tensors() to generate both decode and prefill inputs - Decode: uses kv_lora_rank (512) dimension - Prefill: uses qk_nope_head_dim (128) to stay under FlashAttention's 256 limit - Added automatic mode selection: calls _forward_decode() or _forward_prefill() based on metadata.decode/metadata.prefill - Fixed threshold setting: changed from class to instance variable - Added traceback printing for better error debugging The benchmark now successfully compares decode vs prefill pipelines: qlen=2: decode=0.000033s, prefill=0.000303s -> decode is 9.09x faster 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/benchmark.py | 3 + benchmarks/attention_benchmarks/common.py | 34 ++++++ benchmarks/attention_benchmarks/mla_runner.py | 112 ++++++++++++++---- 3 files changed, 127 insertions(+), 22 deletions(-) diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index f3ae79bd135a..157c27ae6301 100644 --- a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -403,10 +403,13 @@ def main(): all_results.append(result) except Exception as e: + import traceback console.print( f"[red]Error running batched benchmarks for " f"batch_size={batch_size}: {e}[/]" ) + console.print("[red]Traceback:[/]") + traceback.print_exc() # Add error results for all configs for config, _ in configs_with_thresholds: result = BenchmarkResult( diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py index eeadd9bc531f..a7a6f06d5cd3 100644 --- a/benchmarks/attention_benchmarks/common.py +++ b/benchmarks/attention_benchmarks/common.py @@ -43,6 +43,40 @@ def get_text_config(self): AttentionLayerBase = object # Fallback +class MockKVBProj: + """Mock KV projection layer 
for MLA prefill mode. + + Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends. + Projects kv_c_normed to [qk_nope_head_dim + v_head_dim] per head. + """ + + def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int): + self.num_heads = num_heads + self.qk_nope_head_dim = qk_nope_head_dim + self.v_head_dim = v_head_dim + self.out_dim = qk_nope_head_dim + v_head_dim + + def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]: + """ + Project kv_c_normed to output space. + + Args: + x: Input tensor [num_tokens, kv_lora_rank] + + Returns: + Tuple containing output tensor [num_tokens, num_heads, qk_nope_head_dim + v_head_dim] + """ + num_tokens = x.shape[0] + result = torch.randn( + num_tokens, + self.num_heads, + self.out_dim, + device=x.device, + dtype=x.dtype, + ) + return (result,) # Return as tuple to match ColumnParallelLinear API + + class MockLayer(AttentionLayerBase): """Mock attention layer with scale parameters and impl. diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index e1a652b1be3e..30d9908df0e4 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -14,7 +14,7 @@ import numpy as np import torch from batch_spec import parse_batch_spec -from common import MockHfConfig, MockLayer, setup_mla_dims +from common import MockHfConfig, MockKVBProj, MockLayer, setup_mla_dims from vllm.config import ( CacheConfig, @@ -325,7 +325,7 @@ def _build_attention_metadata( return metadata, current_block -def _create_query_tensors( +def _create_input_tensors( total_q: int, mla_dims: dict, query_format: str, @@ -333,7 +333,11 @@ def _create_query_tensors( dtype: torch.dtype, ): """ - Create query tensors in the appropriate format for the backend. + Create input tensors for both decode and prefill modes. 
+ + MLA requires different tensor formats for decode vs prefill: + - Decode: Uses kv_lora_rank (512) dimension + - Prefill: Uses qk_nope_head_dim (128) to stay under FlashAttention's 256 head dim limit Args: total_q: Total number of query tokens @@ -343,10 +347,13 @@ def _create_query_tensors( dtype: Tensor dtype Returns: - Query tensor(s) - either (q_nope, q_pe) tuple or concatenated tensor + Tuple of (decode_inputs, prefill_inputs) + - decode_inputs: Query tensor(s) for decode mode + - prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale' for prefill mode """ if query_format == "tuple": - q_nope = torch.randn( + # Decode mode format: (q_nope, q_pe) where q_nope has kv_lora_rank dim + q_nope_decode = torch.randn( total_q, mla_dims["num_q_heads"], mla_dims["kv_lora_rank"], @@ -360,15 +367,58 @@ def _create_query_tensors( device=device, dtype=dtype, ) - return (q_nope, q_pe) + decode_inputs = (q_nope_decode, q_pe) + + # For prefill, we need q with qk_nope_head_dim instead of kv_lora_rank + q_nope_prefill = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["qk_nope_head_dim"], + device=device, + dtype=dtype, + ) + prefill_q = torch.cat([q_nope_prefill, q_pe], dim=-1) else: # concat - return torch.randn( + decode_inputs = torch.randn( total_q, mla_dims["num_q_heads"], mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], device=device, dtype=dtype, ) + # For prefill with concat format + prefill_q = torch.randn( + total_q, + mla_dims["num_q_heads"], + mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], + device=device, + dtype=dtype, + ) + + # Create additional inputs needed for prefill forward + k_c_normed = torch.randn( + total_q, + mla_dims["kv_lora_rank"], + device=device, + dtype=dtype, + ) + k_pe = torch.randn( + total_q, + 1, # Single head for MLA + mla_dims["qk_rope_head_dim"], + device=device, + dtype=dtype, + ) + k_scale = torch.ones(1, device=device, dtype=torch.float32) + + prefill_inputs = { + "q": prefill_q, + 
"k_c_normed": k_c_normed, + "k_pe": k_pe, + "k_scale": k_scale, + } + + return decode_inputs, prefill_inputs # ============================================================================ @@ -401,6 +451,13 @@ def _create_backend_impl( # Calculate scale scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]) + # Create mock kv_b_proj layer for prefill mode + mock_kv_b_proj = MockKVBProj( + num_heads=mla_dims["num_q_heads"], + qk_nope_head_dim=mla_dims["qk_nope_head_dim"], + v_head_dim=mla_dims["v_head_dim"], + ) + # Create impl impl = impl_class( num_heads=mla_dims["num_q_heads"], @@ -419,7 +476,7 @@ def _create_backend_impl( qk_rope_head_dim=mla_dims["qk_rope_head_dim"], qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], v_head_dim=mla_dims["v_head_dim"], - kv_b_proj=None, + kv_b_proj=mock_kv_b_proj, ) # Initialize DCP attributes @@ -509,8 +566,8 @@ def _run_single_benchmark( dtype=torch.float16, ) - # Create query tensors - query = _create_query_tensors( + # Create input tensors for both decode and prefill modes + decode_inputs, prefill_inputs = _create_input_tensors( total_q, mla_dims, backend_cfg["query_format"], @@ -518,9 +575,26 @@ def _run_single_benchmark( torch.float16, ) + # Determine which forward method to use based on metadata + if metadata.decode is not None: + forward_fn = lambda: impl._forward_decode( + decode_inputs, kv_cache, metadata, layer + ) + elif metadata.prefill is not None: + forward_fn = lambda: impl._forward_prefill( + prefill_inputs["q"], + prefill_inputs["k_c_normed"], + prefill_inputs["k_pe"], + kv_cache, + metadata, + prefill_inputs["k_scale"], + ) + else: + raise RuntimeError("Metadata has neither decode nor prefill metadata") + # Warmup for _ in range(config.warmup_iters): - impl._forward_decode(query, kv_cache, metadata, layer) + forward_fn() torch.cuda.synchronize() # Benchmark @@ -531,7 +605,7 @@ def _run_single_benchmark( start.record() for _ in range(config.num_layers): - 
impl._forward_decode(query, kv_cache, metadata, layer) + forward_fn() end.record() torch.cuda.synchronize() @@ -596,19 +670,13 @@ def _run_mla_benchmark_batched( backend_cfg, mla_dims, vllm_config, device ) - # Get builder class for threshold setting (if applicable) - builder_class = None - if backend_cfg["builder_class"]: - backend_module = importlib.import_module(backend_cfg["module"]) - builder_class = getattr(backend_module, backend_cfg["builder_class"]) - # Run each benchmark with the shared impl for config, threshold, num_splits in configs_with_params: # Set threshold for this benchmark (FlashAttn/FlashMLA only) original_threshold = None - if threshold is not None and builder_class: - original_threshold = builder_class.reorder_batch_threshold - builder_class.reorder_batch_threshold = threshold + if threshold is not None and builder_instance: + original_threshold = builder_instance.reorder_batch_threshold + builder_instance.reorder_batch_threshold = threshold # Set num_splits for CUTLASS original_num_splits = None @@ -631,7 +699,7 @@ def _run_mla_benchmark_batched( finally: # Restore original threshold if original_threshold is not None: - builder_class.reorder_batch_threshold = original_threshold + builder_instance.reorder_batch_threshold = original_threshold # Restore original num_splits if original_num_splits is not None: From 2ef27cccdda493194d423a57fb84108e640f8f89 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 14:05:58 -0400 Subject: [PATCH 13/45] turn off auto Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml index 1b5305c0c866..2cd6877e77ec 100644 --- a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml +++ b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml @@ -26,7 
+26,7 @@ num_splits: - 32 # Compare against auto-selected num_kv_splits -compare_auto: true +compare_auto: false # Model configuration (DeepSeek V2/V3 defaults) model: From 8a890e70de54252b2777327614c32a9830f3a548 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 14:06:41 -0400 Subject: [PATCH 14/45] abbreviate column titles Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/common.py | 25 +++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py index a7a6f06d5cd3..8542dbc2102b 100644 --- a/benchmarks/attention_benchmarks/common.py +++ b/benchmarks/attention_benchmarks/common.py @@ -64,7 +64,8 @@ def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]: x: Input tensor [num_tokens, kv_lora_rank] Returns: - Tuple containing output tensor [num_tokens, num_heads, qk_nope_head_dim + v_head_dim] + Tuple containing output tensor + [num_tokens, num_heads, qk_nope_head_dim + v_head_dim] """ num_tokens = x.shape[0] result = torch.randn( @@ -340,18 +341,30 @@ def print_table( by_spec[spec] = {} by_spec[spec][r.config.backend] = r + # Create shortened backend names for display + def shorten_backend_name(name: str) -> str: + """Shorten long backend names for table display.""" + # Remove common prefixes + name = name.replace("flashattn_mla", "famla") + name = name.replace("flashinfer_mla", "fimla") + name = name.replace("flashmla", "fmla") + name = name.replace("cutlass_mla", "cmla") + name = name.replace("num_splits", "ns") + return name + table = Table(title="Attention Benchmark Results") - table.add_column("Batch Spec", no_wrap=True) + table.add_column("Batch\nSpec", no_wrap=True) multi = len(backends) > 1 for backend in backends: + short_name = shorten_backend_name(backend) # Time column - col_time = f"{backend} Time (s)" - table.add_column(col_time, justify="right", no_wrap=True) + col_time = f"{short_name}\nTime (s)" + 
table.add_column(col_time, justify="right", no_wrap=False) if multi and compare_to_fastest: # Relative performance column - col_rel = f"{backend} vs Fastest" - table.add_column(col_rel, justify="right", no_wrap=True) + col_rel = f"{short_name}\nvs Best" + table.add_column(col_rel, justify="right", no_wrap=False) # Add rows for spec in sorted(by_spec.keys()): From 3a92b2e53e6ad28451a917bdc07b7065b4b6eda5 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 14:46:20 -0400 Subject: [PATCH 15/45] refactor Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/benchmark.py | 285 ++++++++++-------- benchmarks/attention_benchmarks/common.py | 18 +- .../configs/cutlass_numsplits.yaml | 17 +- .../configs/flashinfer_vs_cutlass.yaml | 12 +- .../configs/speculative_decode.yaml | 26 +- 5 files changed, 202 insertions(+), 156 deletions(-) diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index 157c27ae6301..b45b9f719bfd 100644 --- a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -15,15 +15,14 @@ # MLA backends python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64s1k" - # CUTLASS num-splits sweep + # Parameter sweep (CLI) python benchmark.py --backend cutlass_mla \ --batch-specs "64s1k" \ - --num-splits 1 4 8 16 + --sweep-param num_kv_splits \ + --sweep-values 1 4 8 16 - # Speculative decode threshold tuning - python benchmark.py --backend flashmla \ - --batch-specs "spec4s1k" \ - --thresholds 1 4 16 64 + # Parameter sweep (YAML config - recommended) + python benchmark.py --config configs/cutlass_numsplits.yaml """ import argparse @@ -38,7 +37,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from batch_spec import parse_batch_spec -from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter +from common import BenchmarkConfig, BenchmarkResult, ParameterSweep, ResultsFormatter def 
run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: @@ -64,6 +63,114 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult: ) +def run_parameter_sweep( + backends: list[str], + batch_specs: list[str], + base_config_args: dict, + sweep: ParameterSweep, + console: Console, +) -> list[BenchmarkResult]: + """ + Run parameter sweep for given backends and batch specs. + + Args: + backends: List of backend names + batch_specs: List of batch specifications + base_config_args: Base configuration arguments (num_layers, head_dim, etc.) + sweep: ParameterSweep configuration + console: Rich console for output + + Returns: + List of BenchmarkResult objects + """ + all_results = [] + + # Build list of values to sweep (including auto if requested) + sweep_values = list(sweep.values) + if sweep.include_auto: + sweep_values.append("auto") + + console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]") + + total = len(backends) * len(batch_specs) * len(sweep_values) + + with tqdm(total=total, desc="Benchmarking") as pbar: + for backend in backends: + for spec in batch_specs: + for value in sweep_values: + # Create config with descriptive backend name + backend_label = sweep.get_label(backend, value) + config = BenchmarkConfig( + backend=backend_label, batch_spec=spec, **base_config_args + ) + + try: + # Create clean config with original backend name for actual run + clean_config = replace(config, backend=backend) + + # Prepare kwargs for benchmark runner + kwargs = {} + if value != "auto": + kwargs[sweep.param_name] = value + + # Determine if MLA backend + if backend in [ + "cutlass_mla", + "flashinfer_mla", + "flashattn_mla", + "flashmla", + ]: + result = run_mla_benchmark(clean_config, **kwargs) + else: + result = run_standard_attention_benchmark(clean_config) + + # Replace result's config with labeled version + result = replace(result, config=config) + all_results.append(result) + + except Exception as e: 
+ console.print( + f"[red]Error {backend} {spec} {sweep.param_name}=" + f"{value}: {e}[/]" + ) + result = BenchmarkResult( + config=config, + mean_time=float("inf"), + std_time=0, + min_time=float("inf"), + max_time=float("inf"), + error=str(e), + ) + all_results.append(result) + + pbar.update(1) + + # Display sweep results + console.print("\n[bold green]Sweep Results:[/]") + backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values] + formatter = ResultsFormatter(console) + formatter.print_table(all_results, backend_labels) + + # Show optimal values + console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]") + by_spec = {} + for r in all_results: + if r.success: + spec = r.config.batch_spec + if spec not in by_spec: + by_spec[spec] = [] + by_spec[spec].append(r) + + for spec in sorted(by_spec.keys()): + results = by_spec[spec] + best = min(results, key=lambda r: r.mean_time) + console.print( + f" {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)" + ) + + return all_results + + def load_config_from_yaml(config_path: str) -> dict: """Load configuration from YAML file.""" with open(config_path) as f: @@ -190,23 +297,16 @@ def main(): parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations") parser.add_argument("--profile-memory", action="store_true", help="Profile memory") - # MLA-specific options + # Parameter sweep (use YAML config for advanced sweeps) parser.add_argument( - "--num-splits", - type=int, - nargs="+", - help="CUTLASS MLA: Test multiple num_kv_splits values", + "--sweep-param", + help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)", ) parser.add_argument( - "--thresholds", + "--sweep-values", type=int, nargs="+", - help="FlashMLA/FlashAttn MLA: Test multiple reorder_batch_threshold values", - ) - parser.add_argument( - "--compare-auto", - action="store_true", - help="CUTLASS MLA: Also test auto num_kv_splits", + help="Values to sweep for the 
parameter", ) # Output @@ -283,13 +383,19 @@ def main(): args.warmup_iters = bench.get("warmup_iters", args.warmup_iters) args.profile_memory = bench.get("profile_memory", args.profile_memory) - # MLA-specific sweeps - if "num_splits" in yaml_config: - args.num_splits = yaml_config["num_splits"] - if "thresholds" in yaml_config: - args.thresholds = yaml_config["thresholds"] - if "compare_auto" in yaml_config: - args.compare_auto = yaml_config["compare_auto"] + # Parameter sweep configuration + if "parameter_sweep" in yaml_config: + sweep_config = yaml_config["parameter_sweep"] + args.parameter_sweep = ParameterSweep( + param_name=sweep_config["param_name"], + values=sweep_config["values"], + include_auto=sweep_config.get("include_auto", False), + label_format=sweep_config.get( + "label_format", "{backend}_{param_name}_{value}" + ), + ) + else: + args.parameter_sweep = None # Output if "output" in yaml_config: @@ -301,6 +407,19 @@ def main(): console.print() + # Handle CLI-based parameter sweep (if not from YAML) + if ( + (not hasattr(args, "parameter_sweep") or args.parameter_sweep is None) + and args.sweep_param + and args.sweep_values + ): + args.parameter_sweep = ParameterSweep( + param_name=args.sweep_param, + values=args.sweep_values, + include_auto=False, + label_format="{backend}_{param_name}_{value}", + ) + # Determine backends backends = args.backends or ([args.backend] if args.backend else ["flash"]) console.print(f"Backends: {', '.join(backends)}") @@ -404,6 +523,7 @@ def main(): except Exception as e: import traceback + console.print( f"[red]Error running batched benchmarks for " f"batch_size={batch_size}: {e}[/]" @@ -497,102 +617,23 @@ def main(): f"\n [yellow]Prefill always faster for batch_size={bs}[/]" ) - # Handle special cases: num-splits sweep or threshold sweep - elif args.num_splits or args.thresholds: - # Sweep mode - sweep_param = "num_splits" if args.num_splits else "thresholds" - sweep_values = args.num_splits or args.thresholds - - if 
args.compare_auto and args.num_splits: - sweep_values = list(sweep_values) + ["auto"] - - console.print(f"[yellow]Sweep mode: testing {sweep_param} = {sweep_values}[/]") - - total = len(backends) * len(args.batch_specs) * len(sweep_values) - - with tqdm(total=total, desc="Benchmarking") as pbar: - for backend in backends: - for spec in args.batch_specs: - for value in sweep_values: - # Create config - config = BenchmarkConfig( - backend=f"{backend}_{sweep_param}_{value}", - batch_spec=spec, - num_layers=args.num_layers, - head_dim=args.head_dim, - num_q_heads=args.num_q_heads, - num_kv_heads=args.num_kv_heads, - block_size=args.block_size, - device=args.device, - repeats=args.repeats, - warmup_iters=args.warmup_iters, - profile_memory=args.profile_memory, - ) - - try: - # Create a clean config with just the backend name - # for the actual benchmark but keep the full name - # with sweep params in the result - clean_config = replace(config, backend=backend) - - if args.num_splits: - # CUTLASS num_kv_splits - num_splits = None if value == "auto" else value - result = run_mla_benchmark( - clean_config, num_kv_splits=num_splits - ) - else: - # Threshold sweep - result = run_mla_benchmark( - clean_config, reorder_batch_threshold=value - ) - - # Replace the result's config with the one that has - # the sweep params in the name - result = replace(result, config=config) - all_results.append(result) - except Exception as e: - console.print( - f"[red]Error {backend} {spec} {sweep_param}=" - f"{value}: {e}[/]" - ) - result = BenchmarkResult( - config=config, - mean_time=float("inf"), - std_time=0, - min_time=float("inf"), - max_time=float("inf"), - error=str(e), - ) - all_results.append(result) - - pbar.update(1) - - # Display sweep results - console.print("\n[bold green]Sweep Results:[/]") - backend_names = [ - f"{b}_{sweep_param}_{v}" for b in backends for v in sweep_values - ] - formatter = ResultsFormatter(console) - formatter.print_table(all_results, backend_names) - - # 
Show optimal - console.print(f"\n[bold cyan]Optimal {sweep_param} per batch spec:[/]") - by_spec = {} - for r in all_results: - if r.success: - spec = r.config.batch_spec - if spec not in by_spec: - by_spec[spec] = [] - by_spec[spec].append(r) - - for spec in sorted(by_spec.keys()): - results = by_spec[spec] - best = min(results, key=lambda r: r.mean_time) - console.print( - f" {spec}: [bold green]{best.config.backend}[/] " - f"({best.mean_time:.6f}s)" - ) + # Handle parameter sweep mode (unified) + elif hasattr(args, "parameter_sweep") and args.parameter_sweep: + # Unified parameter sweep + base_config_args = { + "num_layers": args.num_layers, + "head_dim": args.head_dim, + "num_q_heads": args.num_q_heads, + "num_kv_heads": args.num_kv_heads, + "block_size": args.block_size, + "device": args.device, + "repeats": args.repeats, + "warmup_iters": args.warmup_iters, + "profile_memory": args.profile_memory, + } + all_results = run_parameter_sweep( + backends, args.batch_specs, base_config_args, args.parameter_sweep, console + ) else: # Normal mode: compare backends diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py index 8542dbc2102b..0bda71ef04ec 100644 --- a/benchmarks/attention_benchmarks/common.py +++ b/benchmarks/attention_benchmarks/common.py @@ -192,6 +192,22 @@ def __init__( self.dtype = dtype +@dataclass +class ParameterSweep: + """Configuration for sweeping a backend parameter.""" + + param_name: str # Name of the backend parameter to sweep + values: list[Any] # List of values to test + include_auto: bool = False # Also test with param unset (auto mode) + label_format: str = "{backend}_{param_name}_{value}" # Result label template + + def get_label(self, backend: str, value: Any) -> str: + """Generate a label for a specific parameter value.""" + return self.label_format.format( + backend=backend, param_name=self.param_name, value=value + ) + + @dataclass class BenchmarkConfig: """Configuration for a single 
benchmark run.""" @@ -349,7 +365,7 @@ def shorten_backend_name(name: str) -> str: name = name.replace("flashinfer_mla", "fimla") name = name.replace("flashmla", "fmla") name = name.replace("cutlass_mla", "cmla") - name = name.replace("num_splits", "ns") + name = name.replace("numsplits", "ns") return name table = Table(title="Attention Benchmark Results") diff --git a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml index 2cd6877e77ec..0f35334a4489 100644 --- a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml +++ b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml @@ -16,17 +16,12 @@ batch_specs: - "128q1s1k" # 128 decode requests, 1k KV cache - "128q1s4k" # 128 decode requests, 4k KV cache -# Sweep num_kv_splits values -num_splits: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - -# Compare against auto-selected num_kv_splits -compare_auto: false +# Unified parameter sweep configuration +parameter_sweep: + param_name: "num_kv_splits" + values: [1, 2, 4, 8, 16, 32] + include_auto: false + label_format: "{backend}_numsplits_{value}" # Model configuration (DeepSeek V2/V3 defaults) model: diff --git a/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml b/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml index 1b4a0904f8b4..e45f1da0288e 100644 --- a/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml +++ b/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml @@ -20,13 +20,11 @@ batch_specs: # For CUTLASS, test optimized num_kv_splits # Based on Study 1 results, you may want to adjust these values -num_splits: - - 4 # Often optimal for medium batches - - 8 # Often optimal for larger batches - - 16 # Test for very large batches - -# Also compare against auto-selection -compare_auto: true +parameter_sweep: + param_name: "num_kv_splits" + values: [4, 8, 16] # Often optimal for medium to large batches + include_auto: 
true # Also compare against auto-selection + label_format: "{backend}_numsplits_{value}" # Model configuration (DeepSeek V2/V3 defaults) model: diff --git a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml index 8a960ac9555f..56d2428fe74f 100644 --- a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml +++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml @@ -47,20 +47,16 @@ backends: # FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism # - flashinfer_mla -device: "cuda:0" -repeats: 10 # More repeats for statistical significance -warmup_iters: 5 -profile_memory: false +# Benchmark settings +benchmark: + device: "cuda:0" + repeats: 10 # More repeats for statistical significance + warmup_iters: 5 + profile_memory: false # Test these threshold values for optimization -reorder_batch_thresholds: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 +parameter_sweep: + param_name: "reorder_batch_threshold" + values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512] + include_auto: false + label_format: "{backend}_threshold_{value}" From 8377b0201c93a62b4c438f5df894a236efc4f9a3 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 15:00:29 -0400 Subject: [PATCH 16/45] update configurations Signed-off-by: Matthew Bonanni --- .../configs/cutlass_numsplits.yaml | 81 +++++++++++++++++-- 1 file changed, 74 insertions(+), 7 deletions(-) diff --git a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml index 0f35334a4489..90dcb841d8a7 100644 --- a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml +++ b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml @@ -7,14 +7,81 @@ description: "CUTLASS MLA num-splits optimization study" # Single backend for this study backend: cutlass_mla -# Test various decode batch sizes with 
different KV cache lengths +# Fine-grained matrix sweep: batch sizes × sequence lengths +# Batch sizes: 8, 16, 24, 32, 48, 64, 96, 128 +# Sequence lengths: 1k, 2k, 4k, 8k, 16k, 32k, 64k batch_specs: - - "32q1s1k" # 32 decode requests, 1k KV cache - - "64q1s1k" # 64 decode requests, 1k KV cache - - "64q1s4k" # 64 decode requests, 4k KV cache - - "64q1s16k" # 64 decode requests, 16k KV cache - - "128q1s1k" # 128 decode requests, 1k KV cache - - "128q1s4k" # 128 decode requests, 4k KV cache + # Batch size: 8 + - "8q1s1k" + - "8q1s2k" + - "8q1s4k" + - "8q1s8k" + - "8q1s16k" + - "8q1s32k" + - "8q1s64k" + + # Batch size: 16 + - "16q1s1k" + - "16q1s2k" + - "16q1s4k" + - "16q1s8k" + - "16q1s16k" + - "16q1s32k" + - "16q1s64k" + + # Batch size: 24 + - "24q1s1k" + - "24q1s2k" + - "24q1s4k" + - "24q1s8k" + - "24q1s16k" + - "24q1s32k" + - "24q1s64k" + + # Batch size: 32 + - "32q1s1k" + - "32q1s2k" + - "32q1s4k" + - "32q1s8k" + - "32q1s16k" + - "32q1s32k" + - "32q1s64k" + + # Batch size: 48 + - "48q1s1k" + - "48q1s2k" + - "48q1s4k" + - "48q1s8k" + - "48q1s16k" + - "48q1s32k" + - "48q1s64k" + + # Batch size: 64 + - "64q1s1k" + - "64q1s2k" + - "64q1s4k" + - "64q1s8k" + - "64q1s16k" + - "64q1s32k" + - "64q1s64k" + + # Batch size: 96 + - "96q1s1k" + - "96q1s2k" + - "96q1s4k" + - "96q1s8k" + - "96q1s16k" + - "96q1s32k" + - "96q1s64k" + + # Batch size: 128 + - "128q1s1k" + - "128q1s2k" + - "128q1s4k" + - "128q1s8k" + - "128q1s16k" + - "128q1s32k" + - "128q1s64k" # Unified parameter sweep configuration parameter_sweep: From b46342c531663b0b6350672f36dbcb411d88db96 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 15:40:55 -0400 Subject: [PATCH 17/45] fix tests Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/batch_spec.py | 12 +- .../attention_benchmarks/test_batch_spec.py | 103 ++++++++---------- 2 files changed, 47 insertions(+), 68 deletions(-) diff --git a/benchmarks/attention_benchmarks/batch_spec.py 
b/benchmarks/attention_benchmarks/batch_spec.py index eaa82ad9dddb..0b981ecc099c 100644 --- a/benchmarks/attention_benchmarks/batch_spec.py +++ b/benchmarks/attention_benchmarks/batch_spec.py @@ -210,22 +210,16 @@ def split_by_type( requests: List of BatchRequest Returns: - Dict with keys: 'decode', 'prefill', 'extend', 'speculative', 'chunked' + Dict with keys: 'decode', 'prefill', 'extend' """ result = { "decode": [], "prefill": [], "extend": [], - "speculative": [], - "chunked": [], } for req in requests: - if req.is_chunked: - result["chunked"].append(req) - elif req.is_speculative: - result["speculative"].append(req) - elif req.is_decode: + if req.is_decode: result["decode"].append(req) elif req.is_prefill: result["prefill"].append(req) @@ -252,8 +246,6 @@ def get_batch_stats(requests: list[BatchRequest]) -> dict: "num_decode": len(by_type["decode"]), "num_prefill": len(by_type["prefill"]), "num_extend": len(by_type["extend"]), - "num_speculative": len(by_type["speculative"]), - "num_chunked": len(by_type["chunked"]), "total_tokens": sum(r.q_len for r in requests), "total_kv_cache": sum(r.kv_len for r in requests), "max_q_len": max((r.q_len for r in requests), default=0), diff --git a/benchmarks/attention_benchmarks/test_batch_spec.py b/benchmarks/attention_benchmarks/test_batch_spec.py index c6db153c9102..84367d0ed2e9 100644 --- a/benchmarks/attention_benchmarks/test_batch_spec.py +++ b/benchmarks/attention_benchmarks/test_batch_spec.py @@ -32,14 +32,14 @@ def test_basic_patterns(): print(" ✓ q2k -> [(2048, 2048)]") # Decode - result = parse_batch_spec("8s1k") + result = parse_batch_spec("8q1s1k") assert len(result) == 8 assert all(r.q_len == 1 and r.kv_len == 1024 for r in result) assert all(r.is_decode for r in result) - print(" ✓ 8s1k -> 8 x [(1, 1024)]") + print(" ✓ 8q1s1k -> 8 x [(1, 1024)]") # Context extension - result = parse_batch_spec("q1s2k") + result = parse_batch_spec("q1ks2k") assert len(result) == 1 assert result[0].q_len == 1024 assert 
result[0].kv_len == 2048 @@ -51,73 +51,61 @@ def test_combined_patterns(): """Test combined batch specifications.""" print("\nTesting combined patterns...") - result = parse_batch_spec("2q1k_32s1k") + result = parse_batch_spec("2q1k_32q1s1k") assert len(result) == 34 assert sum(1 for r in result if r.is_prefill) == 2 assert sum(1 for r in result if r.is_decode) == 32 - print(" ✓ 2q1k_32s1k -> 2 prefill + 32 decode") + print(" ✓ 2q1k_32q1s1k -> 2 prefill + 32 decode") - result = parse_batch_spec("4q2k_spec8s1k_64s2k") - assert len(result) == 69 - print(" ✓ 4q2k_spec8s1k_64s2k -> complex mix") + result = parse_batch_spec("4q2k_8q4s1k_64q1s2k") + assert len(result) == 76 # 4 + 8 + 64 + print(" ✓ 4q2k_8q4s1k_64q1s2k -> complex mix") -def test_speculative_decode(): - """Test speculative decode patterns.""" - print("\nTesting speculative decode...") +def test_extend_patterns(): + """Test context extension (extend) patterns.""" + print("\nTesting extend patterns...") - result = parse_batch_spec("spec4s1k") + # 4-token extension with 1k context + result = parse_batch_spec("q4s1k") assert len(result) == 1 assert result[0].q_len == 4 assert result[0].kv_len == 1024 - assert result[0].is_speculative - assert result[0].spec_length == 4 - print(" ✓ spec4s1k -> 4-token speculative") + assert result[0].is_extend + assert not result[0].is_decode + assert not result[0].is_prefill + print(" ✓ q4s1k -> 4-token extend with 1k context") - result = parse_batch_spec("8spec8s2k") + # 8 requests of 8-token extension + result = parse_batch_spec("8q8s2k") assert len(result) == 8 - assert all(r.is_speculative and r.spec_length == 8 for r in result) - print(" ✓ 8spec8s2k -> 8 x 8-token speculative") - - -def test_chunked_prefill(): - """Test chunked prefill patterns.""" - print("\nTesting chunked prefill...") - - result = parse_batch_spec("chunk8q16k") - assert len(result) == 1 - assert result[0].q_len == 16384 - assert result[0].is_chunked - assert result[0].chunk_size == 8 - print(" ✓ 
chunk8q16k -> chunked 16k prefill") - - result = parse_batch_spec("2chunk4q8k") - assert len(result) == 2 - assert all(r.is_chunked and r.chunk_size == 4 for r in result) - print(" ✓ 2chunk4q8k -> 2 x chunked 8k prefill") + assert all(r.q_len == 8 and r.kv_len == 2048 for r in result) + assert all(r.is_extend for r in result) + print(" ✓ 8q8s2k -> 8 x 8-token extend with 2k context") def test_formatting(): """Test batch spec formatting.""" print("\nTesting formatting...") - requests = parse_batch_spec("2q2k_32s1k") + requests = parse_batch_spec("2q2k_32q1s1k") formatted = format_batch_spec(requests) assert "2 prefill" in formatted assert "32 decode" in formatted print(f" ✓ Format: {formatted}") - requests = parse_batch_spec("spec4s1k_8s1k") + requests = parse_batch_spec("q4s1k_8q1s1k") formatted = format_batch_spec(requests) - assert "specdecode" in formatted - print(f" ✓ Format with spec: {formatted}") + assert "1 extend" in formatted + assert "8 decode" in formatted + print(f" ✓ Format with extend: {formatted}") def test_batch_stats(): """Test batch statistics.""" print("\nTesting batch statistics...") - requests = parse_batch_spec("2q2k_32s1k") + requests = parse_batch_spec("2q2k_32q1s1k") stats = get_batch_stats(requests) assert stats["total_requests"] == 34 @@ -162,9 +150,9 @@ def test_range_generation_simple(): """Test simple range generation.""" print("\nTesting range generation (simple)...") - ranges = [{"template": "q{q_len}s1k", "q_len": {"start": 1, "stop": 5, "step": 1}}] + ranges = [{"template": "q{q_len}ks1k", "q_len": {"start": 1, "stop": 5, "step": 1}}] specs = generate_batch_specs_from_ranges(ranges) - expected = ["q1s1k", "q2s1k", "q3s1k", "q4s1k", "q5s1k"] + expected = ["q1ks1k", "q2ks1k", "q3ks1k", "q4ks1k", "q5ks1k"] assert specs == expected, f"Expected {expected}, got {specs}" print(f" ✓ Simple range: {len(specs)} specs generated") @@ -174,11 +162,11 @@ def test_range_generation_multiple(): print("\nTesting range generation (multiple 
ranges)...") ranges = [ - {"template": "q{q_len}s1k", "q_len": {"start": 1, "stop": 3, "step": 1}}, - {"template": "q{q_len}s1k", "q_len": {"start": 10, "stop": 20, "step": 5}}, + {"template": "q{q_len}ks1k", "q_len": {"start": 1, "stop": 3, "step": 1}}, + {"template": "q{q_len}ks1k", "q_len": {"start": 10, "stop": 20, "step": 5}}, ] specs = generate_batch_specs_from_ranges(ranges) - expected = ["q1s1k", "q2s1k", "q3s1k", "q10s1k", "q15s1k", "q20s1k"] + expected = ["q1ks1k", "q2ks1k", "q3ks1k", "q10ks1k", "q15ks1k", "q20ks1k"] assert specs == expected, f"Expected {expected}, got {specs}" print(f" ✓ Multiple ranges: {len(specs)} specs generated") @@ -188,9 +176,9 @@ def test_range_generation_large(): print("\nTesting range generation (large range)...") ranges = [ - {"template": "q{q_len}s1k", "q_len": {"start": 1, "stop": 16, "step": 1}}, - {"template": "q{q_len}s1k", "q_len": {"start": 17, "stop": 64, "step": 2}}, - {"template": "q{q_len}s1k", "q_len": {"start": 65, "stop": 128, "step": 4}}, + {"template": "q{q_len}ks1k", "q_len": {"start": 1, "stop": 16, "step": 1}}, + {"template": "q{q_len}ks1k", "q_len": {"start": 17, "stop": 64, "step": 2}}, + {"template": "q{q_len}ks1k", "q_len": {"start": 65, "stop": 128, "step": 4}}, ] specs = generate_batch_specs_from_ranges(ranges) expected_count = 16 + 24 + 16 # (1-16) + (17,19,21...63) + (65,69,73...125) @@ -206,14 +194,14 @@ def test_range_generation_cartesian(): ranges = [ { - "template": "q{q_len}s{kv_len}k", + "template": "q{q_len}ks{kv_len}k", "q_len": {"start": 1, "stop": 2, "step": 1}, "kv_len": {"start": 1, "stop": 2, "step": 1}, } ] specs = generate_batch_specs_from_ranges(ranges) # Should generate Cartesian product: (1,1), (1,2), (2,1), (2,2) - expected = ["q1s1k", "q1s2k", "q2s1k", "q2s2k"] + expected = ["q1ks1k", "q1ks2k", "q2ks1k", "q2ks2k"] assert specs == expected, f"Expected {expected}, got {specs}" print(f" ✓ Cartesian product: {len(specs)} specs generated") @@ -224,34 +212,34 @@ def 
test_range_generation_end_inclusive(): # Test inclusive (default) ranges_inclusive = [ - {"template": "q{q_len}s1k", "q_len": {"start": 1, "stop": 3, "step": 1}} + {"template": "q{q_len}ks1k", "q_len": {"start": 1, "stop": 3, "step": 1}} ] specs = generate_batch_specs_from_ranges(ranges_inclusive) - expected = ["q1s1k", "q2s1k", "q3s1k"] + expected = ["q1ks1k", "q2ks1k", "q3ks1k"] assert specs == expected, f"Expected {expected}, got {specs}" print(f" ✓ end_inclusive default (true): {specs}") # Test explicit inclusive ranges_explicit_inclusive = [ { - "template": "q{q_len}s1k", + "template": "q{q_len}ks1k", "q_len": {"start": 1, "stop": 5, "step": 1, "end_inclusive": True}, } ] specs = generate_batch_specs_from_ranges(ranges_explicit_inclusive) - expected = ["q1s1k", "q2s1k", "q3s1k", "q4s1k", "q5s1k"] + expected = ["q1ks1k", "q2ks1k", "q3ks1k", "q4ks1k", "q5ks1k"] assert specs == expected, f"Expected {expected}, got {specs}" print(" ✓ end_inclusive=true: includes stop value") # Test exclusive ranges_exclusive = [ { - "template": "q{q_len}s1k", + "template": "q{q_len}ks1k", "q_len": {"start": 1, "stop": 5, "step": 1, "end_inclusive": False}, } ] specs = generate_batch_specs_from_ranges(ranges_exclusive) - expected = ["q1s1k", "q2s1k", "q3s1k", "q4s1k"] + expected = ["q1ks1k", "q2ks1k", "q3ks1k", "q4ks1k"] assert specs == expected, f"Expected {expected}, got {specs}" print(" ✓ end_inclusive=false: excludes stop value") @@ -264,8 +252,7 @@ def main(): test_basic_patterns() test_combined_patterns() - test_speculative_decode() - test_chunked_prefill() + test_extend_patterns() test_formatting() test_batch_stats() test_manual_batch() From f09a96326fd9c3357494e21f9cf726e464693630 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 15:46:16 -0400 Subject: [PATCH 18/45] update old batch specs Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/README.md | 6 +++--- benchmarks/attention_benchmarks/benchmark.py | 8 ++++---- 2 files changed, 7 
insertions(+), 7 deletions(-) diff --git a/benchmarks/attention_benchmarks/README.md b/benchmarks/attention_benchmarks/README.md index 9e1e6281737a..41fa60b05fca 100644 --- a/benchmarks/attention_benchmarks/README.md +++ b/benchmarks/attention_benchmarks/README.md @@ -255,9 +255,9 @@ from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter # Parse batch specs -requests = parse_batch_spec("2q2k_q4s1k_32s1k") +requests = parse_batch_spec("2q2k_q4s1k_32q1s1k") print(format_batch_spec(requests)) -# "2 prefill (2x2k), 1 extend (1xq4s1k), 32 decode (32x1k)" +# "2 prefill (2x2k), 1 extend (1xq4kv1k), 32 decode (32x1k)" # Get batch statistics stats = get_batch_stats(requests) @@ -317,7 +317,7 @@ source /path/to/vllm/.venv/bin/activate - Some backends need Hopper/Blackwell **OOM?** -- Reduce batch size: `"32s1k"` → `"16s1k"` +- Reduce batch size: `"32q1s1k"` → `"16q1s1k"` - Reduce sequence length: `"64q1s16k"` → `"64q1s4k"` ## What's Included diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index b45b9f719bfd..d268ce948cd6 100644 --- a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -10,14 +10,14 @@ Examples: # Standard attention - python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8s1k" + python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k" # MLA backends - python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64s1k" + python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k" # Parameter sweep (CLI) python benchmark.py --backend cutlass_mla \ - --batch-specs "64s1k" \ + --batch-specs "64q1s1k" \ --sweep-param num_kv_splits \ --sweep-values 1 4 8 16 @@ -280,7 +280,7 @@ def main(): parser.add_argument( "--batch-specs", nargs="+", - default=["q2k", "8s1k"], + default=["q2k", "8q1s1k"], help="Batch 
specifications using extended grammar", ) From ce3a1eca1c33e7f48857e0e97da7cca10c057b1e Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 15:53:49 -0400 Subject: [PATCH 19/45] bugfix mla dims Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/mla_runner.py | 78 ++++++++++++++++--- 1 file changed, 69 insertions(+), 9 deletions(-) diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index 30d9908df0e4..b7c602d42d32 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -56,20 +56,25 @@ def create_minimal_vllm_config( model_name: str = "deepseek-v3", block_size: int = 128, max_num_seqs: int = 256, + mla_dims: Optional[dict] = None, ) -> VllmConfig: """ Create minimal VllmConfig for MLA benchmarks. Args: - model_name: Model name (deepseek-v2, deepseek-v3, etc.) + model_name: Model name (deepseek-v2, deepseek-v3, etc.) - used if mla_dims not + provided block_size: KV cache block size max_num_seqs: Maximum number of sequences + mla_dims: Optional custom MLA dimensions dict. 
If not provided, uses + setup_mla_dims(model_name) Returns: VllmConfig for benchmarking """ - # Get MLA dimensions - mla_dims = setup_mla_dims(model_name) + # Get MLA dimensions - use provided or load from model name + if mla_dims is None: + mla_dims = setup_mla_dims(model_name) # Create model config model_config = ModelConfig( @@ -337,7 +342,7 @@ def _create_input_tensors( MLA requires different tensor formats for decode vs prefill: - Decode: Uses kv_lora_rank (512) dimension - - Prefill: Uses qk_nope_head_dim (128) to stay under FlashAttention's 256 head dim limit + - Prefill: Uses qk_nope_head_dim (128) to stay under FlashAttention's 256 limit Args: total_q: Total number of query tokens @@ -349,7 +354,7 @@ def _create_input_tensors( Returns: Tuple of (decode_inputs, prefill_inputs) - decode_inputs: Query tensor(s) for decode mode - - prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale' for prefill mode + - prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale' for prefill """ if query_format == "tuple": # Decode mode format: (q_nope, q_pe) where q_nope has kv_lora_rank dim @@ -515,6 +520,55 @@ def _create_backend_impl( return impl, layer, builder_instance +# ============================================================================ +# Config Helpers +# ============================================================================ + + +def _extract_mla_dims_from_config(config) -> Optional[dict]: + """ + Extract MLA dimensions from BenchmarkConfig if all required fields are present. 
+ + Args: + config: BenchmarkConfig instance + + Returns: + Dict with MLA dimensions if all fields are provided, None otherwise + """ + # Check if all MLA-specific fields are provided + if all( + [ + config.kv_lora_rank is not None, + config.qk_nope_head_dim is not None, + config.qk_rope_head_dim is not None, + config.v_head_dim is not None, + ] + ): + return { + "kv_lora_rank": config.kv_lora_rank, + "qk_nope_head_dim": config.qk_nope_head_dim, + "qk_rope_head_dim": config.qk_rope_head_dim, + "v_head_dim": config.v_head_dim, + "num_q_heads": config.num_q_heads, + "num_kv_heads": config.num_kv_heads, + "head_dim": config.head_dim, + } + # Fallback: if MLA fields not fully specified, try to construct from basic fields + elif config.head_dim == 576: + # This looks like a DeepSeek MLA config, use standard dimensions with custom + # head count + return { + "kv_lora_rank": 512, + "qk_nope_head_dim": 128, + "qk_rope_head_dim": 64, + "v_head_dim": 128, + "num_q_heads": config.num_q_heads, + "num_kv_heads": config.num_kv_heads, + "head_dim": config.head_dim, + } + return None + + # ============================================================================ # Benchmark Execution # ============================================================================ @@ -653,15 +707,21 @@ def _run_mla_benchmark_batched( config_block_size = configs_with_params[0][0].block_size block_size = backend_cfg["block_size"] or config_block_size + # Extract MLA dimensions from the first config + first_config = configs_with_params[0][0] + mla_dims = _extract_mla_dims_from_config(first_config) + + # If config didn't provide MLA dims, fall back to default model + if mla_dims is None: + mla_dims = setup_mla_dims("deepseek-v3") + # Create and set vLLM config for MLA (reused across all benchmarks) vllm_config = create_minimal_vllm_config( - model_name="deepseek-v3", + model_name="deepseek-v3", # Used only for model path block_size=block_size, + mla_dims=mla_dims, # Use custom dims from config or 
default ) - # Setup MLA dimensions (reused) - mla_dims = setup_mla_dims("deepseek-v3") - results = [] with set_current_vllm_config(vllm_config): From 4b6e2bfde7e834bc82258ef36a825da699128b10 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 16:01:37 -0400 Subject: [PATCH 20/45] add plotting script Signed-off-by: Matthew Bonanni --- .../tools/visualize_numsplits.py | 401 ++++++++++++++++++ 1 file changed, 401 insertions(+) create mode 100644 benchmarks/attention_benchmarks/tools/visualize_numsplits.py diff --git a/benchmarks/attention_benchmarks/tools/visualize_numsplits.py b/benchmarks/attention_benchmarks/tools/visualize_numsplits.py new file mode 100644 index 000000000000..dd49e4231577 --- /dev/null +++ b/benchmarks/attention_benchmarks/tools/visualize_numsplits.py @@ -0,0 +1,401 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Visualize CUTLASS MLA num_kv_splits benchmark results. + +Usage: + python visualize_numsplits.py cutlass_numsplits_results.json +""" + +import json +import sys +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import regex as re +from matplotlib.colors import LinearSegmentedColormap, ListedColormap + + +def parse_batch_spec(spec: str) -> tuple[int, int]: + """Parse batch spec like '32q1s16k' into (batch_size, seq_length_k).""" + match = re.match(r"(\d+)q1s(\d+)k", spec) + if not match: + raise ValueError(f"Cannot parse batch spec: {spec}") + batch_size = int(match.group(1)) + seq_length_k = int(match.group(2)) + return batch_size, seq_length_k + + +def load_results(json_path: str) -> list: + """Load benchmark results from JSON file.""" + with open(json_path) as f: + return json.load(f) + + +def extract_optimal_splits(results: list) -> dict[tuple[int, int], int]: + """ + Extract optimal num_kv_splits for each (batch_size, seq_length) pair. 
+ + Returns: + Dict mapping (batch_size, seq_length_k) -> optimal_num_kv_splits + """ + # Group results by batch_spec + by_batch_spec = {} + for result in results: + batch_spec = result["config"]["batch_spec"] + if batch_spec not in by_batch_spec: + by_batch_spec[batch_spec] = [] + by_batch_spec[batch_spec].append(result) + + optimal_splits = {} + + for batch_spec, batch_results in by_batch_spec.items(): + batch_size, seq_length_k = parse_batch_spec(batch_spec) + + # Find the configuration with minimum time + min_time = float("inf") + optimal_split = 1 + + for result in batch_results: + if result["error"] is None and "mean_time" in result: + time = result["mean_time"] + if time < min_time: + min_time = time + # Extract num_kv_splits from backend name + backend_name = result["config"]["backend"] + match = re.search(r"numsplits_(\d+)", backend_name) + if match: + optimal_split = int(match.group(1)) + + optimal_splits[(batch_size, seq_length_k)] = optimal_split + + return optimal_splits + + +def create_heatmap(optimal_splits: dict[tuple[int, int], int], output_path: str): + """Create heatmap showing optimal num_kv_splits.""" + # Extract unique batch sizes and sequence lengths + batch_sizes = sorted(set(b for b, _ in optimal_splits)) + seq_lengths = sorted( + set(s for _, s in optimal_splits), reverse=True + ) # Reverse for bottom-to-top + + # Create matrix + matrix = np.zeros((len(seq_lengths), len(batch_sizes))) + for i, seq_len in enumerate(seq_lengths): + for j, batch_size in enumerate(batch_sizes): + matrix[i, j] = optimal_splits.get((batch_size, seq_len), np.nan) + + # Create figure + fig, ax = plt.subplots(figsize=(12, 8)) + + # Convert to log2 scale for coloring + matrix_log2 = np.log2(matrix) + + # Get min/max values from actual data + valid_values = matrix_log2[~np.isnan(matrix_log2)] + min_log2 = np.floor(valid_values.min()) + max_log2 = np.ceil(valid_values.max()) + + # Extend bounds by 0.5 on each side to center the discrete colors + # e.g., value 1 
(log2=0) spans -0.5 to 0.5 + # value 2 (log2=1) spans 0.5 to 1.5, etc. + vmin = min_log2 - 0.5 + vmax = max_log2 + 0.5 + + # Create discrete colormap - one color per power of 2 + # Powers of 2: 1, 2, 4, 8, 16, 32 -> log2: 0, 1, 2, 3, 4, 5 + n_colors = int(max_log2 - min_log2 + 1) + + # Use viridis colormap (no value judgment on num_kv_splits) + # Sample evenly across the colormap + viridis = plt.cm.viridis + indices = np.linspace(0, 1, n_colors) + colors_to_use = [viridis(i) for i in indices] + + cmap = ListedColormap(colors_to_use) + + # Create heatmap with log2 scaled data + im = ax.imshow(matrix_log2, cmap=cmap, aspect="auto", vmin=vmin, vmax=vmax) + + # Set ticks + ax.set_xticks(np.arange(len(batch_sizes))) + ax.set_yticks(np.arange(len(seq_lengths))) + ax.set_xticklabels(batch_sizes) + ax.set_yticklabels([f"{s}k" for s in seq_lengths]) + + # Labels + ax.set_xlabel("Batch Size", fontsize=12, fontweight="bold") + ax.set_ylabel("Sequence Length", fontsize=12, fontweight="bold") + ax.set_title( + "Optimal num_kv_splits for CUTLASS MLA\n(Lower is simpler, higher is more" + " parallelism)", + fontsize=14, + fontweight="bold", + pad=20, + ) + + # Add text annotations + for i in range(len(seq_lengths)): + for j in range(len(batch_sizes)): + value = matrix[i, j] + if not np.isnan(value): + ax.text( + j, + i, + int(value), + ha="center", + va="center", + color="black", + fontsize=10, + fontweight="bold", + ) + + # Colorbar with power-of-2 labels + cbar = plt.colorbar(im, ax=ax) + cbar.set_label("Optimal num_kv_splits", rotation=270, labelpad=20, fontsize=12) + + # Set colorbar ticks at the center of each discrete segment + # Ticks should be at integer log2 values (0, 1, 2, 3...) 
which are centered in each + # color band + tick_positions = np.arange(min_log2, max_log2 + 1) + tick_labels = [str(int(2**i)) for i in tick_positions] + cbar.set_ticks(tick_positions) + cbar.set_ticklabels(tick_labels) + + plt.tight_layout() + plt.savefig(output_path, dpi=300, bbox_inches="tight") + print(f"Saved heatmap to {output_path}") + plt.close() + + +def create_performance_heatmap(results: list, output_path: str): + """Create heatmap showing speedup from optimal splits vs splits=1.""" + # Group results by batch_spec + by_batch_spec = {} + for result in results: + batch_spec = result["config"]["batch_spec"] + if batch_spec not in by_batch_spec: + by_batch_spec[batch_spec] = [] + by_batch_spec[batch_spec].append(result) + + speedup_matrix = {} + + for batch_spec, batch_results in by_batch_spec.items(): + batch_size, seq_length_k = parse_batch_spec(batch_spec) + + # Get time for splits=1 + baseline_time = None + min_time = float("inf") + + for result in batch_results: + if result["error"] is None and "mean_time" in result: + time = result["mean_time"] + backend_name = result["config"]["backend"] + # Match exactly numsplits_1 (not numsplits_16, etc.) 
+ if backend_name.endswith("numsplits_1"): + baseline_time = time + if time < min_time: + min_time = time + + if baseline_time: + speedup = baseline_time / min_time + speedup_matrix[(batch_size, seq_length_k)] = speedup + + # Extract unique batch sizes and sequence lengths + batch_sizes = sorted(set(b for b, _ in speedup_matrix)) + seq_lengths = sorted( + set(s for _, s in speedup_matrix), reverse=True + ) # Reverse for bottom-to-top + + # Create matrix + matrix = np.zeros((len(seq_lengths), len(batch_sizes))) + for i, seq_len in enumerate(seq_lengths): + for j, batch_size in enumerate(batch_sizes): + matrix[i, j] = speedup_matrix.get((batch_size, seq_len), np.nan) + + # Create figure + fig, ax = plt.subplots(figsize=(12, 8)) + + # Create heatmap with colormap: 1.0x = white (neutral), higher = green (good) + + # Create colormap: 1.0 = white, higher = green + max_speedup = np.nanmax(matrix) + colors_dict = { + "red": [ + (0.0, 1.0, 1.0), # At 1.0x (vmin): white + (1.0, 0.0, 0.0), + ], # At max speedup: green + "green": [(0.0, 1.0, 1.0), (1.0, 0.5, 0.5)], + "blue": [(0.0, 1.0, 1.0), (1.0, 0.0, 0.0)], + } + speedup_cmap = LinearSegmentedColormap("Speedup", colors_dict) + + im = ax.imshow(matrix, cmap=speedup_cmap, aspect="auto", vmin=1.0, vmax=max_speedup) + + # Set ticks + ax.set_xticks(np.arange(len(batch_sizes))) + ax.set_yticks(np.arange(len(seq_lengths))) + ax.set_xticklabels(batch_sizes) + ax.set_yticklabels([f"{s}k" for s in seq_lengths]) + + # Labels + ax.set_xlabel("Batch Size", fontsize=12, fontweight="bold") + ax.set_ylabel("Sequence Length", fontsize=12, fontweight="bold") + ax.set_title( + "Speedup from Optimal num_kv_splits vs. 
splits=1\n(Green = better with splits, " + "Red = same)", + fontsize=14, + fontweight="bold", + pad=20, + ) + + # Add text annotations + for i in range(len(seq_lengths)): + for j in range(len(batch_sizes)): + value = matrix[i, j] + if not np.isnan(value): + ax.text( + j, + i, + f"{value:.2f}x", + ha="center", + va="center", + color="black", + fontsize=9, + fontweight="bold", + ) + + # Colorbar + cbar = plt.colorbar(im, ax=ax) + cbar.set_label("Speedup Factor", rotation=270, labelpad=20, fontsize=12) + + plt.tight_layout() + plt.savefig(output_path, dpi=300, bbox_inches="tight") + print(f"Saved speedup heatmap to {output_path}") + plt.close() + + +def analyze_pattern(optimal_splits: dict[tuple[int, int], int]): + """Analyze the pattern and suggest a formula.""" + print("\n" + "=" * 80) + print("PATTERN ANALYSIS") + print("=" * 80) + + # Group by optimal split value + by_split_value = {} + for (batch, seq), split in optimal_splits.items(): + if split not in by_split_value: + by_split_value[split] = [] + by_split_value[split].append((batch, seq)) + + print("\nConfigurations grouped by optimal num_kv_splits:") + for split in sorted(by_split_value.keys()): + configs = by_split_value[split] + print(f"\n num_kv_splits = {split} ({len(configs)} configs):") + for batch, seq in sorted(configs)[:5]: # Show first 5 + print(f" - batch={batch:3d}, seq={seq:3d}k") + if len(configs) > 5: + print(f" ... 
and {len(configs) - 5} more") + + # Analyze ratio: seq_length / batch_size + print("\n" + "-" * 80) + print("Analysis of seq_length/batch_size ratio:") + print("-" * 80) + + ratio_by_split = {split: [] for split in by_split_value} + for (batch, seq), split in optimal_splits.items(): + ratio = seq / batch + ratio_by_split[split].append(ratio) + + print(f"\n{'Split':<8} {'Min Ratio':<12} {'Max Ratio':<12} {'Avg Ratio':<12}") + print("-" * 50) + for split in sorted(ratio_by_split.keys()): + ratios = ratio_by_split[split] + if ratios: + print( + f"{split:<8} {min(ratios):<12.1f} {max(ratios):<12.1f} " + f"{np.mean(ratios):<12.1f}" + ) + + # Suggest heuristic formula + print("\n" + "=" * 80) + print("SUGGESTED HEURISTIC FORMULA") + print("=" * 80) + + print("\nBased on the data, a simple heuristic could be:") + print(""" + ratio = seq_length_k / batch_size + + if ratio >= 4.0: + num_kv_splits = 8 + elif ratio >= 2.0: + num_kv_splits = 4 + elif ratio >= 1.0: + num_kv_splits = 2 + else: + num_kv_splits = 1 + """) + + # Test the formula + print("\nTesting suggested formula against actual results:") + correct = 0 + total = 0 + + for (batch, seq), actual_split in optimal_splits.items(): + ratio = seq / batch + if ratio >= 4.0: + predicted_split = 8 + elif ratio >= 2.0: + predicted_split = 4 + elif ratio >= 1.0: + predicted_split = 2 + else: + predicted_split = 1 + + total += 1 + if predicted_split == actual_split: + correct += 1 + elif total <= 10: # Show first 10 mismatches + print( + f" Mismatch: batch={batch:3d}, seq={seq:3d}k, " + f"predicted={predicted_split}, actual={actual_split}" + ) + + accuracy = 100 * correct / total + print(f"\nFormula accuracy: {correct}/{total} = {accuracy:.1f}%") + print("=" * 80 + "\n") + + +def main(): + if len(sys.argv) < 2: + print("Usage: python visualize_numsplits.py ") + sys.exit(1) + + json_path = sys.argv[1] + output_dir = Path(json_path).parent + + print(f"Loading results from {json_path}...") + results = load_results(json_path) + + 
print("Extracting optimal splits...") + optimal_splits = extract_optimal_splits(results) + + print(f"Found {len(optimal_splits)} configurations") + + # Create visualizations + print("\nGenerating visualizations...") + + create_heatmap(optimal_splits, output_dir / "numsplits_heatmap.png") + create_performance_heatmap(results, output_dir / "numsplits_speedup.png") + + # Analyze pattern + analyze_pattern(optimal_splits) + + print("\nDone! Check the output directory for visualization files.") + + +if __name__ == "__main__": + main() From daf49003c08dacfc9f159705d651398a74052062 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 17:47:13 -0400 Subject: [PATCH 21/45] visualize some potential heuristics Signed-off-by: Matthew Bonanni --- .../tools/visualize_numsplits.py | 346 +++++++++++++++--- 1 file changed, 304 insertions(+), 42 deletions(-) diff --git a/benchmarks/attention_benchmarks/tools/visualize_numsplits.py b/benchmarks/attention_benchmarks/tools/visualize_numsplits.py index dd49e4231577..0f5d9201d097 100644 --- a/benchmarks/attention_benchmarks/tools/visualize_numsplits.py +++ b/benchmarks/attention_benchmarks/tools/visualize_numsplits.py @@ -278,6 +278,270 @@ def create_performance_heatmap(results: list, output_path: str): plt.close() +def heuristic_ratio_based(batch_size: int, seq_length_k: int) -> int: + """Original ratio-based heuristic (from visualize_numsplits.py).""" + ratio = seq_length_k / batch_size + if ratio >= 2.5: + return 8 + elif ratio >= 1.2: + return 4 + elif ratio >= 0.5: + return 2 + else: + return 1 + + +def heuristic_constant(batch_size: int, seq_length_k: int) -> int: + """Ultra-simple constant heuristic: always use 2 for small batches.""" + if batch_size <= 32: + return 2 + else: + return 1 + + +def create_heuristic_policy_heatmaps( + optimal_splits: dict[tuple[int, int], int], output_dir: Path +): + """Create heatmaps showing num_splits chosen by each heuristic policy.""" + # Define heuristics to compare + heuristics 
= { + "Ratio-based": heuristic_ratio_based, + "Constant (batch<=32)": heuristic_constant, + } + + # Extract unique batch sizes and sequence lengths + batch_sizes = sorted(set(b for b, _ in optimal_splits)) + seq_lengths = sorted(set(s for _, s in optimal_splits), reverse=True) + + # Create a separate heatmap for each heuristic + for heuristic_name, heuristic_func in heuristics.items(): + # Build matrix of chosen num_splits + matrix = np.zeros((len(seq_lengths), len(batch_sizes))) + + for i, seq_len in enumerate(seq_lengths): + for j, batch_size in enumerate(batch_sizes): + predicted_splits = heuristic_func(batch_size, seq_len) + matrix[i, j] = predicted_splits + + # Create heatmap + _fig, ax = plt.subplots(figsize=(12, 8)) + + # Convert to log2 scale for coloring (same as optimal heatmap) + matrix_log2 = np.log2(matrix) + + # Get min/max values + valid_values = matrix_log2[~np.isnan(matrix_log2)] + min_log2 = np.floor(valid_values.min()) + max_log2 = np.ceil(valid_values.max()) + + vmin = min_log2 - 0.5 + vmax = max_log2 + 0.5 + + # Create discrete colormap + n_colors = int(max_log2 - min_log2 + 1) + from matplotlib import cm + + viridis = cm.viridis + indices = np.linspace(0, 1, n_colors) + colors_to_use = [viridis(i) for i in indices] + cmap = ListedColormap(colors_to_use) + + # Create heatmap with log2 scaled data + im = ax.imshow(matrix_log2, cmap=cmap, aspect="auto", vmin=vmin, vmax=vmax) + + # Set ticks + ax.set_xticks(np.arange(len(batch_sizes))) + ax.set_yticks(np.arange(len(seq_lengths))) + ax.set_xticklabels(batch_sizes) + ax.set_yticklabels([f"{s}k" for s in seq_lengths]) + + # Labels + ax.set_xlabel("Batch Size", fontsize=12, fontweight="bold") + ax.set_ylabel("Sequence Length", fontsize=12, fontweight="bold") + ax.set_title( + f"num_kv_splits Chosen by {heuristic_name} Policy", + fontsize=14, + fontweight="bold", + pad=20, + ) + + # Add text annotations (show actual value and mark mismatches) + for i in range(len(seq_lengths)): + for j in 
range(len(batch_sizes)): + value = matrix[i, j] + seq_len = seq_lengths[i] + batch_size = batch_sizes[j] + optimal = optimal_splits.get((batch_size, seq_len), None) + + if not np.isnan(value): + # Mark mismatches with red text + if optimal is not None and int(value) != optimal: + color = "red" + text = f"{int(value)}\n✗" + else: + color = "black" + text = str(int(value)) + + ax.text( + j, + i, + text, + ha="center", + va="center", + color=color, + fontsize=10, + fontweight="bold", + ) + + # Colorbar with power-of-2 labels + cbar = plt.colorbar(im, ax=ax) + cbar.set_label("num_kv_splits", rotation=270, labelpad=20, fontsize=12) + tick_positions = np.arange(min_log2, max_log2 + 1) + tick_labels = [str(int(2**i)) for i in tick_positions] + cbar.set_ticks(tick_positions) + cbar.set_ticklabels(tick_labels) + + plt.tight_layout() + + # Save with sanitized filename + safe_name = ( + heuristic_name.lower().replace(" ", "_").replace("(", "").replace(")", "") + ) + output_path = output_dir / f"numsplits_policy_{safe_name}.png" + plt.savefig(output_path, dpi=300, bbox_inches="tight") + print(f"Saved {heuristic_name} policy heatmap to {output_path}") + plt.close() + + +def create_heuristic_speedup_heatmaps( + results: list, optimal_splits: dict[tuple[int, int], int], output_dir: Path +): + """Create speedup heatmaps for each heuristic policy.""" + # Define heuristics to compare + heuristics = { + "Ratio-based (Original)": heuristic_ratio_based, + "Constant (batch<=32)": heuristic_constant, + } + + # Group results by batch_spec for performance lookup + by_batch_spec = {} + for result in results: + batch_spec = result["config"]["batch_spec"] + if batch_spec not in by_batch_spec: + by_batch_spec[batch_spec] = {} + + if result["error"] is None and "mean_time" in result: + backend_name = result["config"]["backend"] + match = re.search(r"numsplits_(\d+)", backend_name) + if match: + num_splits = int(match.group(1)) + by_batch_spec[batch_spec][num_splits] = result["mean_time"] + + # 
Extract unique batch sizes and sequence lengths + batch_sizes = sorted(set(b for b, _ in optimal_splits)) + seq_lengths = sorted(set(s for _, s in optimal_splits), reverse=True) + + # Create a separate heatmap for each heuristic + for heuristic_name, heuristic_func in heuristics.items(): + # Build speedup matrix for this heuristic + speedup_matrix = np.zeros((len(seq_lengths), len(batch_sizes))) + total_speedup = 0.0 + count = 0 + + for i, seq_len in enumerate(seq_lengths): + for j, batch_size in enumerate(batch_sizes): + batch_spec = f"{batch_size}q1s{seq_len}k" + if batch_spec not in by_batch_spec: + speedup_matrix[i, j] = np.nan + continue + + timings = by_batch_spec[batch_spec] + baseline_time = timings.get(1, None) + + if not baseline_time: + speedup_matrix[i, j] = np.nan + continue + + # Get the num_splits predicted by this heuristic + predicted_splits = heuristic_func(batch_size, seq_len) + predicted_time = timings.get(predicted_splits, baseline_time) + speedup = baseline_time / predicted_time + + speedup_matrix[i, j] = speedup + total_speedup += speedup + count += 1 + + avg_speedup = total_speedup / count if count > 0 else 1.0 + + # Create heatmap + _fig, ax = plt.subplots(figsize=(12, 8)) + + # Colormap: 1.0 = white (neutral), higher = green (good) + max_speedup = np.nanmax(speedup_matrix) + colors_dict = { + "red": [(0.0, 1.0, 1.0), (1.0, 0.0, 0.0)], + "green": [(0.0, 1.0, 1.0), (1.0, 0.5, 0.5)], + "blue": [(0.0, 1.0, 1.0), (1.0, 0.0, 0.0)], + } + speedup_cmap = LinearSegmentedColormap("Speedup", colors_dict) + + im = ax.imshow( + speedup_matrix, + cmap=speedup_cmap, + aspect="auto", + vmin=1.0, + vmax=max_speedup, + ) + + # Set ticks + ax.set_xticks(np.arange(len(batch_sizes))) + ax.set_yticks(np.arange(len(seq_lengths))) + ax.set_xticklabels(batch_sizes) + ax.set_yticklabels([f"{s}k" for s in seq_lengths]) + + # Labels + ax.set_xlabel("Batch Size", fontsize=12, fontweight="bold") + ax.set_ylabel("Sequence Length", fontsize=12, fontweight="bold") + 
ax.set_title( + f"Speedup with {heuristic_name} Policy\n" + f"(Average speedup: {avg_speedup:.3f}x vs. splits=1)", + fontsize=14, + fontweight="bold", + pad=20, + ) + + # Add text annotations + for i in range(len(seq_lengths)): + for j in range(len(batch_sizes)): + value = speedup_matrix[i, j] + if not np.isnan(value): + ax.text( + j, + i, + f"{value:.2f}x", + ha="center", + va="center", + color="black", + fontsize=9, + fontweight="bold", + ) + + # Colorbar + cbar = plt.colorbar(im, ax=ax) + cbar.set_label("Speedup Factor", rotation=270, labelpad=20, fontsize=12) + + plt.tight_layout() + + # Save with sanitized filename + safe_name = ( + heuristic_name.lower().replace(" ", "_").replace("(", "").replace(")", "") + ) + output_path = output_dir / f"numsplits_speedup_{safe_name}.png" + plt.savefig(output_path, dpi=300, bbox_inches="tight") + print(f"Saved {heuristic_name} speedup heatmap to {output_path}") + plt.close() + + def analyze_pattern(optimal_splits: dict[tuple[int, int], int]): """Analyze the pattern and suggest a formula.""" print("\n" + "=" * 80) @@ -320,53 +584,49 @@ def analyze_pattern(optimal_splits: dict[tuple[int, int], int]): f"{np.mean(ratios):<12.1f}" ) - # Suggest heuristic formula + # Test heuristics print("\n" + "=" * 80) - print("SUGGESTED HEURISTIC FORMULA") + print("HEURISTIC COMPARISON") print("=" * 80) - print("\nBased on the data, a simple heuristic could be:") - print(""" - ratio = seq_length_k / batch_size - - if ratio >= 4.0: - num_kv_splits = 8 - elif ratio >= 2.0: - num_kv_splits = 4 - elif ratio >= 1.0: - num_kv_splits = 2 - else: - num_kv_splits = 1 - """) - - # Test the formula - print("\nTesting suggested formula against actual results:") - correct = 0 - total = 0 + heuristics = { + "Ratio-based": heuristic_ratio_based, + "Constant (batch<=32)": heuristic_constant, + } - for (batch, seq), actual_split in optimal_splits.items(): - ratio = seq / batch - if ratio >= 4.0: - predicted_split = 8 - elif ratio >= 2.0: - predicted_split = 4 
- elif ratio >= 1.0: - predicted_split = 2 - else: - predicted_split = 1 - - total += 1 - if predicted_split == actual_split: - correct += 1 - elif total <= 10: # Show first 10 mismatches - print( - f" Mismatch: batch={batch:3d}, seq={seq:3d}k, " - f"predicted={predicted_split}, actual={actual_split}" - ) + for name, heuristic_func in heuristics.items(): + correct = 0 + total = 0 + mismatches = [] + + for (batch, seq), actual_split in optimal_splits.items(): + predicted_split = heuristic_func(batch, seq) + total += 1 + if predicted_split == actual_split: + correct += 1 + else: + mismatches.append((batch, seq, predicted_split, actual_split)) + + accuracy = 100 * correct / total + print(f"\n{name}:") + print(f" Accuracy: {correct}/{total} = {accuracy:.1f}%") + + if mismatches and len(mismatches) <= 10: + print(" Mismatches:") + for batch, seq, pred, actual in mismatches: + print( + f" batch={batch:3d}, seq={seq:3d}k -> " + f"predicted={pred}, actual={actual}" + ) + elif mismatches: + print(f" {len(mismatches)} mismatches (showing first 5):") + for batch, seq, pred, actual in mismatches[:5]: + print( + f" batch={batch:3d}, seq={seq:3d}k -> " + f"predicted={pred}, actual={actual}" + ) - accuracy = 100 * correct / total - print(f"\nFormula accuracy: {correct}/{total} = {accuracy:.1f}%") - print("=" * 80 + "\n") + print("\n" + "=" * 80 + "\n") def main(): @@ -390,6 +650,8 @@ def main(): create_heatmap(optimal_splits, output_dir / "numsplits_heatmap.png") create_performance_heatmap(results, output_dir / "numsplits_speedup.png") + create_heuristic_policy_heatmaps(optimal_splits, output_dir) + create_heuristic_speedup_heatmaps(results, optimal_splits, output_dir) # Analyze pattern analyze_pattern(optimal_splits) From ce22932d2691a0280a8ed73fa35c133ccd25a728 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 15 Oct 2025 13:16:57 -0400 Subject: [PATCH 22/45] clean up plotting script Signed-off-by: Matthew Bonanni --- .../configs/cutlass_numsplits.yaml | 44 +- 
.../tools/visualize_numsplits.py | 649 ++++++++++-------- 2 files changed, 415 insertions(+), 278 deletions(-) diff --git a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml index 90dcb841d8a7..a96f044ff4da 100644 --- a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml +++ b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml @@ -11,6 +11,36 @@ backend: cutlass_mla # Batch sizes: 8, 16, 24, 32, 48, 64, 96, 128 # Sequence lengths: 1k, 2k, 4k, 8k, 16k, 32k, 64k batch_specs: + # Batch size: 1 + - "1q1s1k" + - "1q1s2k" + - "1q1s4k" + - "1q1s8k" + - "1q1s16k" + - "1q1s32k" + - "1q1s64k" + - "1q1s128k" + + # Batch size: 2 + - "2q1s1k" + - "2q1s2k" + - "2q1s4k" + - "2q1s8k" + - "2q1s16k" + - "2q1s32k" + - "2q1s64k" + - "2q1s128k" + + # Batch size: 4 + - "4q1s1k" + - "4q1s2k" + - "4q1s4k" + - "4q1s8k" + - "4q1s16k" + - "4q1s32k" + - "4q1s64k" + - "4q1s128k" + # Batch size: 8 - "8q1s1k" - "8q1s2k" @@ -19,6 +49,7 @@ batch_specs: - "8q1s16k" - "8q1s32k" - "8q1s64k" + - "8q1s128k" # Batch size: 16 - "16q1s1k" @@ -28,6 +59,7 @@ batch_specs: - "16q1s16k" - "16q1s32k" - "16q1s64k" + - "16q1s128k" # Batch size: 24 - "24q1s1k" @@ -37,6 +69,7 @@ batch_specs: - "24q1s16k" - "24q1s32k" - "24q1s64k" + - "24q1s128k" # Batch size: 32 - "32q1s1k" @@ -46,6 +79,7 @@ batch_specs: - "32q1s16k" - "32q1s32k" - "32q1s64k" + - "32q1s128k" # Batch size: 48 - "48q1s1k" @@ -55,6 +89,7 @@ batch_specs: - "48q1s16k" - "48q1s32k" - "48q1s64k" + - "48q1s128k" # Batch size: 64 - "64q1s1k" @@ -64,6 +99,7 @@ batch_specs: - "64q1s16k" - "64q1s32k" - "64q1s64k" + - "64q1s128k" # Batch size: 96 - "96q1s1k" @@ -73,6 +109,7 @@ batch_specs: - "96q1s16k" - "96q1s32k" - "96q1s64k" + - "96q1s128k" # Batch size: 128 - "128q1s1k" @@ -82,12 +119,13 @@ batch_specs: - "128q1s16k" - "128q1s32k" - "128q1s64k" + - "128q1s128k" # Unified parameter sweep configuration parameter_sweep: param_name: "num_kv_splits" - values: [1, 
2, 4, 8, 16, 32] - include_auto: false + values: [1, 2, 4, 8, 16] + include_auto: true label_format: "{backend}_numsplits_{value}" # Model configuration (DeepSeek V2/V3 defaults) @@ -101,7 +139,7 @@ model: # Benchmark settings benchmark: device: "cuda:0" - repeats: 10 # More repeats for statistical significance + repeats: 20 warmup_iters: 5 profile_memory: false diff --git a/benchmarks/attention_benchmarks/tools/visualize_numsplits.py b/benchmarks/attention_benchmarks/tools/visualize_numsplits.py index 0f5d9201d097..a56eea6257bb 100644 --- a/benchmarks/attention_benchmarks/tools/visualize_numsplits.py +++ b/benchmarks/attention_benchmarks/tools/visualize_numsplits.py @@ -10,6 +10,7 @@ import json import sys +from collections.abc import Mapping from pathlib import Path import matplotlib.pyplot as plt @@ -34,10 +35,16 @@ def load_results(json_path: str) -> list: return json.load(f) -def extract_optimal_splits(results: list) -> dict[tuple[int, int], int]: +def extract_optimal_splits( + results: list, exclude_auto: bool = False +) -> dict[tuple[int, int], int]: """ Extract optimal num_kv_splits for each (batch_size, seq_length) pair. 
+ Args: + results: List of benchmark results + exclude_auto: If True, exclude "auto" backend from consideration + Returns: Dict mapping (batch_size, seq_length_k) -> optimal_num_kv_splits """ @@ -45,6 +52,12 @@ def extract_optimal_splits(results: list) -> dict[tuple[int, int], int]: by_batch_spec = {} for result in results: batch_spec = result["config"]["batch_spec"] + backend_name = result["config"]["backend"] + + # Skip auto if requested + if exclude_auto and "auto" in backend_name: + continue + if batch_spec not in by_batch_spec: by_batch_spec[batch_spec] = [] by_batch_spec[batch_spec].append(result) @@ -74,70 +87,65 @@ def extract_optimal_splits(results: list) -> dict[tuple[int, int], int]: return optimal_splits -def create_heatmap(optimal_splits: dict[tuple[int, int], int], output_path: str): - """Create heatmap showing optimal num_kv_splits.""" - # Extract unique batch sizes and sequence lengths - batch_sizes = sorted(set(b for b, _ in optimal_splits)) - seq_lengths = sorted( - set(s for _, s in optimal_splits), reverse=True - ) # Reverse for bottom-to-top +def _get_axes_from_splits_dict( + splits_dict: Mapping[tuple[int, int], int | float], +) -> tuple[list[int], list[int]]: + """Extract sorted batch sizes and sequence lengths from splits dictionary.""" + batch_sizes = sorted(set(b for b, _ in splits_dict)) + seq_lengths = sorted(set(s for _, s in splits_dict), reverse=True) + return batch_sizes, seq_lengths + - # Create matrix +def _create_splits_matrix( + splits_dict: Mapping[tuple[int, int], int | float], + batch_sizes: list[int], + seq_lengths: list[int], +) -> np.ndarray: + """Create matrix from splits dictionary.""" matrix = np.zeros((len(seq_lengths), len(batch_sizes))) for i, seq_len in enumerate(seq_lengths): for j, batch_size in enumerate(batch_sizes): - matrix[i, j] = optimal_splits.get((batch_size, seq_len), np.nan) + matrix[i, j] = splits_dict.get((batch_size, seq_len), np.nan) + return matrix - # Create figure - fig, ax = 
plt.subplots(figsize=(12, 8)) - # Convert to log2 scale for coloring - matrix_log2 = np.log2(matrix) - - # Get min/max values from actual data - valid_values = matrix_log2[~np.isnan(matrix_log2)] - min_log2 = np.floor(valid_values.min()) - max_log2 = np.ceil(valid_values.max()) +def _setup_heatmap_axes(ax, batch_sizes: list[int], seq_lengths: list[int], title: str): + """Setup common axes properties for heatmaps.""" + ax.set_xticks(np.arange(len(batch_sizes))) + ax.set_yticks(np.arange(len(seq_lengths))) + ax.set_xticklabels(batch_sizes) + ax.set_yticklabels([f"{s}k" for s in seq_lengths]) + ax.set_xlabel("Batch Size", fontsize=12, fontweight="bold") + ax.set_ylabel("Sequence Length", fontsize=12, fontweight="bold") + ax.set_title(title, fontsize=14, fontweight="bold", pad=20) - # Extend bounds by 0.5 on each side to center the discrete colors - # e.g., value 1 (log2=0) spans -0.5 to 0.5 - # value 2 (log2=1) spans 0.5 to 1.5, etc. - vmin = min_log2 - 0.5 - vmax = max_log2 + 0.5 - # Create discrete colormap - one color per power of 2 - # Powers of 2: 1, 2, 4, 8, 16, 32 -> log2: 0, 1, 2, 3, 4, 5 +def _create_log2_colormap(min_log2: float, max_log2: float) -> tuple: + """Create discrete log2 colormap and bounds.""" n_colors = int(max_log2 - min_log2 + 1) - - # Use viridis colormap (no value judgment on num_kv_splits) - # Sample evenly across the colormap viridis = plt.cm.viridis indices = np.linspace(0, 1, n_colors) - colors_to_use = [viridis(i) for i in indices] - - cmap = ListedColormap(colors_to_use) + colors = [viridis(i) for i in indices] + cmap = ListedColormap(colors) + vmin = min_log2 - 0.5 + vmax = max_log2 + 0.5 + return cmap, vmin, vmax - # Create heatmap with log2 scaled data - im = ax.imshow(matrix_log2, cmap=cmap, aspect="auto", vmin=vmin, vmax=vmax) - # Set ticks - ax.set_xticks(np.arange(len(batch_sizes))) - ax.set_yticks(np.arange(len(seq_lengths))) - ax.set_xticklabels(batch_sizes) - ax.set_yticklabels([f"{s}k" for s in seq_lengths]) +def 
_add_log2_colorbar(im, ax, label: str, min_log2: float, max_log2: float): + """Add colorbar with power-of-2 labels.""" + cbar = plt.colorbar(im, ax=ax) + cbar.set_label(label, rotation=270, labelpad=20, fontsize=12) + tick_positions = np.arange(min_log2, max_log2 + 1) + tick_labels = [str(int(2**i)) for i in tick_positions] + cbar.set_ticks(tick_positions) + cbar.set_ticklabels(tick_labels) - # Labels - ax.set_xlabel("Batch Size", fontsize=12, fontweight="bold") - ax.set_ylabel("Sequence Length", fontsize=12, fontweight="bold") - ax.set_title( - "Optimal num_kv_splits for CUTLASS MLA\n(Lower is simpler, higher is more" - " parallelism)", - fontsize=14, - fontweight="bold", - pad=20, - ) - # Add text annotations +def _annotate_splits_matrix( + ax, matrix: np.ndarray, batch_sizes: list[int], seq_lengths: list[int] +): + """Add text annotations showing split values.""" for i in range(len(seq_lengths)): for j in range(len(batch_sizes)): value = matrix[i, j] @@ -153,17 +161,32 @@ def create_heatmap(optimal_splits: dict[tuple[int, int], int], output_path: str) fontweight="bold", ) - # Colorbar with power-of-2 labels - cbar = plt.colorbar(im, ax=ax) - cbar.set_label("Optimal num_kv_splits", rotation=270, labelpad=20, fontsize=12) - # Set colorbar ticks at the center of each discrete segment - # Ticks should be at integer log2 values (0, 1, 2, 3...) 
which are centered in each - # color band - tick_positions = np.arange(min_log2, max_log2 + 1) - tick_labels = [str(int(2**i)) for i in tick_positions] - cbar.set_ticks(tick_positions) - cbar.set_ticklabels(tick_labels) +def create_heatmap(optimal_splits: dict[tuple[int, int], int], output_path: str): + """Create heatmap showing optimal num_kv_splits.""" + batch_sizes, seq_lengths = _get_axes_from_splits_dict(optimal_splits) + matrix = _create_splits_matrix(optimal_splits, batch_sizes, seq_lengths) + + _fig, ax = plt.subplots(figsize=(12, 8)) + + # Convert to log2 scale for coloring + matrix_log2 = np.log2(matrix) + valid_values = matrix_log2[~np.isnan(matrix_log2)] + min_log2 = np.floor(valid_values.min()) + max_log2 = np.ceil(valid_values.max()) + + cmap, vmin, vmax = _create_log2_colormap(min_log2, max_log2) + im = ax.imshow(matrix_log2, cmap=cmap, aspect="auto", vmin=vmin, vmax=vmax) + + _setup_heatmap_axes( + ax, + batch_sizes, + seq_lengths, + "Optimal num_kv_splits for CUTLASS MLA\n" + "(Lower is simpler, higher is more parallelism)", + ) + _annotate_splits_matrix(ax, matrix, batch_sizes, seq_lengths) + _add_log2_colorbar(im, ax, "Optimal num_kv_splits", min_log2, max_log2) plt.tight_layout() plt.savefig(output_path, dpi=300, bbox_inches="tight") @@ -171,22 +194,56 @@ def create_heatmap(optimal_splits: dict[tuple[int, int], int], output_path: str) plt.close() -def create_performance_heatmap(results: list, output_path: str): - """Create heatmap showing speedup from optimal splits vs splits=1.""" - # Group results by batch_spec +def _create_speedup_colormap(): + """Create colormap for speedup: 1.0 = white, higher = green.""" + colors_dict = { + "red": [(0.0, 1.0, 1.0), (1.0, 0.0, 0.0)], + "green": [(0.0, 1.0, 1.0), (1.0, 0.5, 0.5)], + "blue": [(0.0, 1.0, 1.0), (1.0, 0.0, 0.0)], + } + return LinearSegmentedColormap("Speedup", colors_dict) + + +def _annotate_speedup_matrix( + ax, matrix: np.ndarray, batch_sizes: list[int], seq_lengths: list[int] +): + """Add 
text annotations showing speedup values.""" + for i in range(len(seq_lengths)): + for j in range(len(batch_sizes)): + value = matrix[i, j] + if not np.isnan(value): + ax.text( + j, + i, + f"{value:.2f}x", + ha="center", + va="center", + color="black", + fontsize=9, + fontweight="bold", + ) + + +def _compute_speedup_matrix( + results: list, exclude_auto: bool = False +) -> dict[tuple[int, int], float]: + """Compute speedup matrix from results (optimal vs splits=1).""" by_batch_spec = {} for result in results: batch_spec = result["config"]["batch_spec"] + backend_name = result["config"]["backend"] + + if exclude_auto and "auto" in backend_name: + continue + if batch_spec not in by_batch_spec: by_batch_spec[batch_spec] = [] by_batch_spec[batch_spec].append(result) speedup_matrix = {} - for batch_spec, batch_results in by_batch_spec.items(): batch_size, seq_length_k = parse_batch_spec(batch_spec) - # Get time for splits=1 baseline_time = None min_time = float("inf") @@ -194,7 +251,6 @@ def create_performance_heatmap(results: list, output_path: str): if result["error"] is None and "mean_time" in result: time = result["mean_time"] backend_name = result["config"]["backend"] - # Match exactly numsplits_1 (not numsplits_16, etc.) 
if backend_name.endswith("numsplits_1"): baseline_time = time if time < min_time: @@ -204,71 +260,32 @@ def create_performance_heatmap(results: list, output_path: str): speedup = baseline_time / min_time speedup_matrix[(batch_size, seq_length_k)] = speedup - # Extract unique batch sizes and sequence lengths - batch_sizes = sorted(set(b for b, _ in speedup_matrix)) - seq_lengths = sorted( - set(s for _, s in speedup_matrix), reverse=True - ) # Reverse for bottom-to-top + return speedup_matrix - # Create matrix - matrix = np.zeros((len(seq_lengths), len(batch_sizes))) - for i, seq_len in enumerate(seq_lengths): - for j, batch_size in enumerate(batch_sizes): - matrix[i, j] = speedup_matrix.get((batch_size, seq_len), np.nan) - # Create figure - fig, ax = plt.subplots(figsize=(12, 8)) +def create_performance_heatmap( + results: list, output_path: str, exclude_auto: bool = False +): + """Create heatmap showing speedup from optimal splits vs splits=1.""" + speedup_dict = _compute_speedup_matrix(results, exclude_auto) + batch_sizes, seq_lengths = _get_axes_from_splits_dict(speedup_dict) + matrix = _create_splits_matrix(speedup_dict, batch_sizes, seq_lengths) - # Create heatmap with colormap: 1.0x = white (neutral), higher = green (good) + _fig, ax = plt.subplots(figsize=(12, 8)) - # Create colormap: 1.0 = white, higher = green max_speedup = np.nanmax(matrix) - colors_dict = { - "red": [ - (0.0, 1.0, 1.0), # At 1.0x (vmin): white - (1.0, 0.0, 0.0), - ], # At max speedup: green - "green": [(0.0, 1.0, 1.0), (1.0, 0.5, 0.5)], - "blue": [(0.0, 1.0, 1.0), (1.0, 0.0, 0.0)], - } - speedup_cmap = LinearSegmentedColormap("Speedup", colors_dict) - + speedup_cmap = _create_speedup_colormap() im = ax.imshow(matrix, cmap=speedup_cmap, aspect="auto", vmin=1.0, vmax=max_speedup) - # Set ticks - ax.set_xticks(np.arange(len(batch_sizes))) - ax.set_yticks(np.arange(len(seq_lengths))) - ax.set_xticklabels(batch_sizes) - ax.set_yticklabels([f"{s}k" for s in seq_lengths]) - - # Labels - 
ax.set_xlabel("Batch Size", fontsize=12, fontweight="bold") - ax.set_ylabel("Sequence Length", fontsize=12, fontweight="bold") - ax.set_title( - "Speedup from Optimal num_kv_splits vs. splits=1\n(Green = better with splits, " - "Red = same)", - fontsize=14, - fontweight="bold", - pad=20, + _setup_heatmap_axes( + ax, + batch_sizes, + seq_lengths, + "Speedup from Optimal num_kv_splits vs. splits=1\n" + "(Green = better with splits, White = same)", ) + _annotate_speedup_matrix(ax, matrix, batch_sizes, seq_lengths) - # Add text annotations - for i in range(len(seq_lengths)): - for j in range(len(batch_sizes)): - value = matrix[i, j] - if not np.isnan(value): - ax.text( - j, - i, - f"{value:.2f}x", - ha="center", - va="center", - color="black", - fontsize=9, - fontweight="bold", - ) - - # Colorbar cbar = plt.colorbar(im, ax=ax) cbar.set_label("Speedup Factor", rotation=270, labelpad=20, fontsize=12) @@ -299,111 +316,119 @@ def heuristic_constant(batch_size: int, seq_length_k: int) -> int: return 1 +def heuristic_batch_based(batch_size: int, seq_length_k: int) -> int: + """ + Improved batch-size-based heuristic with zero slowdowns. + + This policy avoids all slowdowns by never splitting large batches, + while still achieving significant speedups for small batches. 
+ """ + if batch_size <= 4: + # Very small batch: aggressive splitting + if seq_length_k >= 16: + return 8 + elif seq_length_k >= 4: + return 4 + elif seq_length_k >= 2: + return 2 + else: + return 1 + elif batch_size <= 8: + # Small batch: moderate splitting + if seq_length_k >= 16: + return 8 + elif seq_length_k >= 4: + return 4 + else: + return 1 + elif batch_size <= 16: + # Medium batch: conservative splitting + if seq_length_k >= 16: + return 2 + else: + return 1 + else: + # Large batch: never split (avoids slowdowns) + return 1 + + +def _annotate_heuristic_matrix( + ax, + matrix: np.ndarray, + batch_sizes: list[int], + seq_lengths: list[int], + optimal_splits: dict[tuple[int, int], int], +): + """Add text annotations showing heuristic values and mismatches.""" + for i in range(len(seq_lengths)): + for j in range(len(batch_sizes)): + value = matrix[i, j] + seq_len = seq_lengths[i] + batch_size = batch_sizes[j] + optimal = optimal_splits.get((batch_size, seq_len), None) + + if not np.isnan(value): + # Mark mismatches with red text + if optimal is not None and int(value) != optimal: + color = "red" + text = f"{int(value)}\n✗" + else: + color = "black" + text = str(int(value)) + + ax.text( + j, + i, + text, + ha="center", + va="center", + color=color, + fontsize=10, + fontweight="bold", + ) + + def create_heuristic_policy_heatmaps( optimal_splits: dict[tuple[int, int], int], output_dir: Path ): """Create heatmaps showing num_splits chosen by each heuristic policy.""" - # Define heuristics to compare heuristics = { "Ratio-based": heuristic_ratio_based, + "Batch-based (improved)": heuristic_batch_based, "Constant (batch<=32)": heuristic_constant, } - # Extract unique batch sizes and sequence lengths - batch_sizes = sorted(set(b for b, _ in optimal_splits)) - seq_lengths = sorted(set(s for _, s in optimal_splits), reverse=True) + batch_sizes, seq_lengths = _get_axes_from_splits_dict(optimal_splits) - # Create a separate heatmap for each heuristic for heuristic_name, 
heuristic_func in heuristics.items(): # Build matrix of chosen num_splits matrix = np.zeros((len(seq_lengths), len(batch_sizes))) - for i, seq_len in enumerate(seq_lengths): for j, batch_size in enumerate(batch_sizes): - predicted_splits = heuristic_func(batch_size, seq_len) - matrix[i, j] = predicted_splits + matrix[i, j] = heuristic_func(batch_size, seq_len) - # Create heatmap _fig, ax = plt.subplots(figsize=(12, 8)) - # Convert to log2 scale for coloring (same as optimal heatmap) + # Convert to log2 scale for coloring matrix_log2 = np.log2(matrix) - - # Get min/max values valid_values = matrix_log2[~np.isnan(matrix_log2)] min_log2 = np.floor(valid_values.min()) max_log2 = np.ceil(valid_values.max()) - vmin = min_log2 - 0.5 - vmax = max_log2 + 0.5 - - # Create discrete colormap - n_colors = int(max_log2 - min_log2 + 1) - from matplotlib import cm - - viridis = cm.viridis - indices = np.linspace(0, 1, n_colors) - colors_to_use = [viridis(i) for i in indices] - cmap = ListedColormap(colors_to_use) - - # Create heatmap with log2 scaled data + cmap, vmin, vmax = _create_log2_colormap(min_log2, max_log2) im = ax.imshow(matrix_log2, cmap=cmap, aspect="auto", vmin=vmin, vmax=vmax) - # Set ticks - ax.set_xticks(np.arange(len(batch_sizes))) - ax.set_yticks(np.arange(len(seq_lengths))) - ax.set_xticklabels(batch_sizes) - ax.set_yticklabels([f"{s}k" for s in seq_lengths]) - - # Labels - ax.set_xlabel("Batch Size", fontsize=12, fontweight="bold") - ax.set_ylabel("Sequence Length", fontsize=12, fontweight="bold") - ax.set_title( + _setup_heatmap_axes( + ax, + batch_sizes, + seq_lengths, f"num_kv_splits Chosen by {heuristic_name} Policy", - fontsize=14, - fontweight="bold", - pad=20, ) - - # Add text annotations (show actual value and mark mismatches) - for i in range(len(seq_lengths)): - for j in range(len(batch_sizes)): - value = matrix[i, j] - seq_len = seq_lengths[i] - batch_size = batch_sizes[j] - optimal = optimal_splits.get((batch_size, seq_len), None) - - if not 
np.isnan(value): - # Mark mismatches with red text - if optimal is not None and int(value) != optimal: - color = "red" - text = f"{int(value)}\n✗" - else: - color = "black" - text = str(int(value)) - - ax.text( - j, - i, - text, - ha="center", - va="center", - color=color, - fontsize=10, - fontweight="bold", - ) - - # Colorbar with power-of-2 labels - cbar = plt.colorbar(im, ax=ax) - cbar.set_label("num_kv_splits", rotation=270, labelpad=20, fontsize=12) - tick_positions = np.arange(min_log2, max_log2 + 1) - tick_labels = [str(int(2**i)) for i in tick_positions] - cbar.set_ticks(tick_positions) - cbar.set_ticklabels(tick_labels) + _annotate_heuristic_matrix(ax, matrix, batch_sizes, seq_lengths, optimal_splits) + _add_log2_colorbar(im, ax, "num_kv_splits", min_log2, max_log2) plt.tight_layout() - # Save with sanitized filename safe_name = ( heuristic_name.lower().replace(" ", "_").replace("(", "").replace(")", "") ) @@ -413,17 +438,8 @@ def create_heuristic_policy_heatmaps( plt.close() -def create_heuristic_speedup_heatmaps( - results: list, optimal_splits: dict[tuple[int, int], int], output_dir: Path -): - """Create speedup heatmaps for each heuristic policy.""" - # Define heuristics to compare - heuristics = { - "Ratio-based (Original)": heuristic_ratio_based, - "Constant (batch<=32)": heuristic_constant, - } - - # Group results by batch_spec for performance lookup +def _build_timings_lookup(results: list) -> dict[str, dict[int, float]]: + """Build lookup table of timings by batch_spec and num_splits.""" by_batch_spec = {} for result in results: batch_spec = result["config"]["batch_spec"] @@ -436,14 +452,23 @@ def create_heuristic_speedup_heatmaps( if match: num_splits = int(match.group(1)) by_batch_spec[batch_spec][num_splits] = result["mean_time"] + return by_batch_spec - # Extract unique batch sizes and sequence lengths - batch_sizes = sorted(set(b for b, _ in optimal_splits)) - seq_lengths = sorted(set(s for _, s in optimal_splits), reverse=True) - # Create a 
separate heatmap for each heuristic +def create_heuristic_speedup_heatmaps( + results: list, optimal_splits: dict[tuple[int, int], int], output_dir: Path +): + """Create speedup heatmaps for each heuristic policy.""" + heuristics = { + "Ratio-based (Original)": heuristic_ratio_based, + "Batch-based (improved)": heuristic_batch_based, + "Constant (batch<=32)": heuristic_constant, + } + + by_batch_spec = _build_timings_lookup(results) + batch_sizes, seq_lengths = _get_axes_from_splits_dict(optimal_splits) + for heuristic_name, heuristic_func in heuristics.items(): - # Build speedup matrix for this heuristic speedup_matrix = np.zeros((len(seq_lengths), len(batch_sizes))) total_speedup = 0.0 count = 0 @@ -451,40 +476,25 @@ def create_heuristic_speedup_heatmaps( for i, seq_len in enumerate(seq_lengths): for j, batch_size in enumerate(batch_sizes): batch_spec = f"{batch_size}q1s{seq_len}k" - if batch_spec not in by_batch_spec: + timings = by_batch_spec.get(batch_spec, {}) + baseline_time = timings.get(1) + + if baseline_time: + predicted_splits = heuristic_func(batch_size, seq_len) + predicted_time = timings.get(predicted_splits, baseline_time) + speedup = baseline_time / predicted_time + speedup_matrix[i, j] = speedup + total_speedup += speedup + count += 1 + else: speedup_matrix[i, j] = np.nan - continue - - timings = by_batch_spec[batch_spec] - baseline_time = timings.get(1, None) - - if not baseline_time: - speedup_matrix[i, j] = np.nan - continue - - # Get the num_splits predicted by this heuristic - predicted_splits = heuristic_func(batch_size, seq_len) - predicted_time = timings.get(predicted_splits, baseline_time) - speedup = baseline_time / predicted_time - - speedup_matrix[i, j] = speedup - total_speedup += speedup - count += 1 avg_speedup = total_speedup / count if count > 0 else 1.0 - # Create heatmap _fig, ax = plt.subplots(figsize=(12, 8)) - # Colormap: 1.0 = white (neutral), higher = green (good) max_speedup = np.nanmax(speedup_matrix) - colors_dict = { - 
"red": [(0.0, 1.0, 1.0), (1.0, 0.0, 0.0)], - "green": [(0.0, 1.0, 1.0), (1.0, 0.5, 0.5)], - "blue": [(0.0, 1.0, 1.0), (1.0, 0.0, 0.0)], - } - speedup_cmap = LinearSegmentedColormap("Speedup", colors_dict) - + speedup_cmap = _create_speedup_colormap() im = ax.imshow( speedup_matrix, cmap=speedup_cmap, @@ -493,46 +503,20 @@ def create_heuristic_speedup_heatmaps( vmax=max_speedup, ) - # Set ticks - ax.set_xticks(np.arange(len(batch_sizes))) - ax.set_yticks(np.arange(len(seq_lengths))) - ax.set_xticklabels(batch_sizes) - ax.set_yticklabels([f"{s}k" for s in seq_lengths]) - - # Labels - ax.set_xlabel("Batch Size", fontsize=12, fontweight="bold") - ax.set_ylabel("Sequence Length", fontsize=12, fontweight="bold") - ax.set_title( + _setup_heatmap_axes( + ax, + batch_sizes, + seq_lengths, f"Speedup with {heuristic_name} Policy\n" f"(Average speedup: {avg_speedup:.3f}x vs. splits=1)", - fontsize=14, - fontweight="bold", - pad=20, ) + _annotate_speedup_matrix(ax, speedup_matrix, batch_sizes, seq_lengths) - # Add text annotations - for i in range(len(seq_lengths)): - for j in range(len(batch_sizes)): - value = speedup_matrix[i, j] - if not np.isnan(value): - ax.text( - j, - i, - f"{value:.2f}x", - ha="center", - va="center", - color="black", - fontsize=9, - fontweight="bold", - ) - - # Colorbar cbar = plt.colorbar(im, ax=ax) cbar.set_label("Speedup Factor", rotation=270, labelpad=20, fontsize=12) plt.tight_layout() - # Save with sanitized filename safe_name = ( heuristic_name.lower().replace(" ", "_").replace("(", "").replace(")", "") ) @@ -542,6 +526,113 @@ def create_heuristic_speedup_heatmaps( plt.close() +def create_auto_heatmap(results: list, output_path: str): + """Create heatmap showing num_kv_splits chosen by auto policy.""" + # Find all configs with auto results + auto_configs = set() + for result in results: + if "auto" in result["config"]["backend"] and result["error"] is None: + batch_spec = result["config"]["batch_spec"] + batch_size, seq_length_k = 
parse_batch_spec(batch_spec) + auto_configs.add((batch_size, seq_length_k)) + + if not auto_configs: + print("Skipping auto heatmap (no auto results found)") + return + + batch_sizes, seq_lengths = _get_axes_from_splits_dict({k: 1 for k in auto_configs}) + matrix = np.zeros((len(seq_lengths), len(batch_sizes))) + for i, seq_len in enumerate(seq_lengths): + for j, batch_size in enumerate(batch_sizes): + matrix[i, j] = 1 if (batch_size, seq_len) in auto_configs else np.nan + + _fig, ax = plt.subplots(figsize=(12, 8)) + + cmap = ListedColormap(["#2ca02c"]) # Green for auto + _im = ax.imshow(matrix, cmap=cmap, aspect="auto", vmin=0, vmax=1) + + _setup_heatmap_axes( + ax, batch_sizes, seq_lengths, "Auto num_kv_splits Policy Coverage" + ) + + # Add "AUTO" text annotations + for i in range(len(seq_lengths)): + for j in range(len(batch_sizes)): + if not np.isnan(matrix[i, j]): + ax.text( + j, + i, + "AUTO", + ha="center", + va="center", + color="white", + fontsize=10, + fontweight="bold", + ) + + plt.tight_layout() + plt.savefig(output_path, dpi=300, bbox_inches="tight") + print(f"Saved auto heatmap to {output_path}") + plt.close() + + +def create_auto_speedup_heatmap(results: list, output_path: str): + """Create heatmap showing speedup from auto vs splits=1.""" + # Build speedup dictionary + speedup_dict = {} + timings_by_spec = {} + + for result in results: + if result["error"] is not None or "mean_time" not in result: + continue + + batch_spec = result["config"]["batch_spec"] + backend_name = result["config"]["backend"] + + if batch_spec not in timings_by_spec: + timings_by_spec[batch_spec] = {} + + if "auto" in backend_name: + timings_by_spec[batch_spec]["auto"] = result["mean_time"] + elif backend_name.endswith("numsplits_1"): + timings_by_spec[batch_spec]["baseline"] = result["mean_time"] + + for batch_spec, timings in timings_by_spec.items(): + if "baseline" in timings and "auto" in timings: + batch_size, seq_length_k = parse_batch_spec(batch_spec) + speedup = 
timings["baseline"] / timings["auto"] + speedup_dict[(batch_size, seq_length_k)] = speedup + + if not speedup_dict: + print("Skipping auto speedup heatmap (no auto results found)") + return + + batch_sizes, seq_lengths = _get_axes_from_splits_dict(speedup_dict) + matrix = _create_splits_matrix(speedup_dict, batch_sizes, seq_lengths) + + _fig, ax = plt.subplots(figsize=(12, 8)) + + max_speedup = np.nanmax(matrix) + speedup_cmap = _create_speedup_colormap() + im = ax.imshow(matrix, cmap=speedup_cmap, aspect="auto", vmin=1.0, vmax=max_speedup) + + _setup_heatmap_axes( + ax, + batch_sizes, + seq_lengths, + "Speedup from Auto Policy vs. splits=1\n(Green = better with auto)", + ) + _annotate_speedup_matrix(ax, matrix, batch_sizes, seq_lengths) + + cbar = plt.colorbar(im, ax=ax) + cbar.set_label("Speedup Factor", rotation=270, labelpad=20, fontsize=12) + + plt.tight_layout() + plt.savefig(output_path, dpi=300, bbox_inches="tight") + print(f"Saved auto speedup heatmap to {output_path}") + plt.close() + + def analyze_pattern(optimal_splits: dict[tuple[int, int], int]): """Analyze the pattern and suggest a formula.""" print("\n" + "=" * 80) @@ -591,6 +682,7 @@ def analyze_pattern(optimal_splits: dict[tuple[int, int], int]): heuristics = { "Ratio-based": heuristic_ratio_based, + "Batch-based (improved)": heuristic_batch_based, "Constant (batch<=32)": heuristic_constant, } @@ -640,19 +732,26 @@ def main(): print(f"Loading results from {json_path}...") results = load_results(json_path) - print("Extracting optimal splits...") - optimal_splits = extract_optimal_splits(results) + print("Extracting optimal splits (excluding auto)...") + optimal_splits = extract_optimal_splits(results, exclude_auto=True) print(f"Found {len(optimal_splits)} configurations") # Create visualizations print("\nGenerating visualizations...") - create_heatmap(optimal_splits, output_dir / "numsplits_heatmap.png") - create_performance_heatmap(results, output_dir / "numsplits_speedup.png") + print("\n--- 
Manual Configuration Plots (excluding auto) ---") + create_heatmap(optimal_splits, str(output_dir / "numsplits_heatmap.png")) + create_performance_heatmap( + results, str(output_dir / "numsplits_speedup.png"), exclude_auto=True + ) create_heuristic_policy_heatmaps(optimal_splits, output_dir) create_heuristic_speedup_heatmaps(results, optimal_splits, output_dir) + print("\n--- Auto Policy Plots ---") + create_auto_heatmap(results, str(output_dir / "numsplits_heatmap_auto.png")) + create_auto_speedup_heatmap(results, str(output_dir / "numsplits_speedup_auto.png")) + # Analyze pattern analyze_pattern(optimal_splits) From a8882688bde5a8be28d5e5e453d1438b1c8015a5 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 15 Oct 2025 13:44:08 -0400 Subject: [PATCH 23/45] new policy Signed-off-by: Matthew Bonanni --- .../tools/visualize_numsplits.py | 42 ++++++------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/benchmarks/attention_benchmarks/tools/visualize_numsplits.py b/benchmarks/attention_benchmarks/tools/visualize_numsplits.py index a56eea6257bb..bb463de5b8dd 100644 --- a/benchmarks/attention_benchmarks/tools/visualize_numsplits.py +++ b/benchmarks/attention_benchmarks/tools/visualize_numsplits.py @@ -318,37 +318,21 @@ def heuristic_constant(batch_size: int, seq_length_k: int) -> int: def heuristic_batch_based(batch_size: int, seq_length_k: int) -> int: """ - Improved batch-size-based heuristic with zero slowdowns. - - This policy avoids all slowdowns by never splitting large batches, - while still achieving significant speedups for small batches. + Simple batch-based heuristic with zero slowdowns. 
""" - if batch_size <= 4: - # Very small batch: aggressive splitting - if seq_length_k >= 16: - return 8 - elif seq_length_k >= 4: - return 4 - elif seq_length_k >= 2: - return 2 - else: - return 1 - elif batch_size <= 8: - # Small batch: moderate splitting - if seq_length_k >= 16: - return 8 - elif seq_length_k >= 4: - return 4 - else: - return 1 - elif batch_size <= 16: - # Medium batch: conservative splitting - if seq_length_k >= 16: - return 2 - else: - return 1 + if batch_size <= 4 and seq_length_k >= 8: + return 16 + elif batch_size <= 8 and seq_length_k >= 2: + return 8 + elif (batch_size <= 16 and seq_length_k >= 4) or ( + batch_size == 48 and seq_length_k >= 32 + ): + return 4 + elif (batch_size <= 32 and seq_length_k >= 8) or ( + batch_size == 96 and seq_length_k >= 16 + ): + return 2 else: - # Large batch: never split (avoids slowdowns) return 1 From 24bf31d4045728842d65bbdcb47ee86544d7a280 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 15 Oct 2025 14:40:15 -0400 Subject: [PATCH 24/45] comments Signed-off-by: Matthew Bonanni --- .../attention_benchmarks/configs/cutlass_numsplits.yaml | 4 +--- .../attention_benchmarks/configs/flashinfer_vs_cutlass.yaml | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml index a96f044ff4da..faeef5ffc09c 100644 --- a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml +++ b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml @@ -1,6 +1,4 @@ -# Study 1: Should we revert CUTLASS MLA num-splits heuristic? -# Question: What is the optimal num_kv_splits for different batch sizes? -# Related PRs: #24966, #25509 +# Study 1: What is the optimal CUTLASS_MLA num_kv_splits for different batch sizes? 
description: "CUTLASS MLA num-splits optimization study" diff --git a/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml b/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml index e45f1da0288e..76593c9b3fd8 100644 --- a/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml +++ b/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml @@ -1,5 +1,4 @@ # Study 3: Is FlashInfer-MLA better than CUTLASS MLA after num-splits optimization? -# Question: After optimizing CUTLASS MLA's num_kv_splits, is FlashInfer-MLA still competitive? description: "FlashInfer-MLA vs optimized CUTLASS MLA comparison" From 07e680cc9cbf33da0357a1f876f3f12a0fa29931 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 21 Oct 2025 18:00:14 +0000 Subject: [PATCH 25/45] bugfixes, add model parameter sweep Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/benchmark.py | 227 +++++++++++++++++- benchmarks/attention_benchmarks/common.py | 47 +++- .../configs/hopper_head_count.yaml | 139 +++++++++-- benchmarks/attention_benchmarks/mla_runner.py | 40 +-- benchmarks/attention_benchmarks/runner.py | 13 +- 5 files changed, 413 insertions(+), 53 deletions(-) diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index d268ce948cd6..bfdbba1d7e8b 100644 --- a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -37,7 +37,13 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from batch_spec import parse_batch_spec -from common import BenchmarkConfig, BenchmarkResult, ParameterSweep, ResultsFormatter +from common import ( + BenchmarkConfig, + BenchmarkResult, + ModelParameterSweep, + ParameterSweep, + ResultsFormatter, +) def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: @@ -63,6 +69,190 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult: ) +def 
run_model_parameter_sweep( + backends: list[str], + batch_specs: list[str], + base_config_args: dict, + sweep: ModelParameterSweep, + console: Console, +) -> list[BenchmarkResult]: + """ + Run model parameter sweep for given backends and batch specs. + + Args: + backends: List of backend names + batch_specs: List of batch specifications + base_config_args: Base configuration arguments (num_layers, head_dim, etc.) + sweep: ModelParameterSweep configuration + console: Rich console for output + + Returns: + List of BenchmarkResult objects + """ + all_results = [] + + console.print( + f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]" + ) + + total = len(backends) * len(batch_specs) * len(sweep.values) + + with tqdm(total=total, desc="Benchmarking") as pbar: + for backend in backends: + for spec in batch_specs: + for value in sweep.values: + # Create config with modified model parameter + config_args = base_config_args.copy() + config_args[sweep.param_name] = value + + # Create descriptive backend name + backend_label = sweep.get_label(backend, value) + config = BenchmarkConfig( + backend=backend_label, batch_spec=spec, **config_args + ) + + try: + # Create clean config with original backend name for actual run + clean_config = replace(config, backend=backend) + + # Determine if MLA backend + if backend in [ + "cutlass_mla", + "flashinfer_mla", + "flashattn_mla", + "flashmla", + ]: + result = run_mla_benchmark(clean_config) + else: + result = run_standard_attention_benchmark(clean_config) + + # Replace result's config with labeled version + result = replace(result, config=config) + all_results.append(result) + + except Exception as e: + console.print( + f"[red]Error {backend} {spec} {sweep.param_name}=" + f"{value}: {e}[/]" + ) + result = BenchmarkResult( + config=config, + mean_time=float("inf"), + std_time=0, + min_time=float("inf"), + max_time=float("inf"), + error=str(e), + ) + all_results.append(result) + + pbar.update(1) + + # Display 
sweep results - create separate table for each parameter value + console.print("\n[bold green]Model Parameter Sweep Results:[/]") + formatter = ResultsFormatter(console) + + # Group results by parameter value and extract backend mapping + by_param_value = {} + backend_mapping = {} # Maps labeled backend -> original backend + + for r in all_results: + # Extract original backend and param value from labeled backend + # The label format is: {backend}_{param_name}_{value} + # We need to reverse engineer this + labeled_backend = r.config.backend + + # Try each backend to find which one this result belongs to + for backend in backends: + for value in sweep.values: + expected_label = sweep.get_label(backend, value) + if labeled_backend == expected_label: + backend_mapping[labeled_backend] = backend + param_value = str(value) + + if param_value not in by_param_value: + by_param_value[param_value] = [] + by_param_value[param_value].append(r) + break + + # Create a table for each parameter value + sorted_param_values = sorted( + by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x + ) + + for param_value in sorted_param_values: + console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]") + param_results = by_param_value[param_value] + + # Create modified results with original backend names + modified_results = [] + for r in param_results: + # Get the original backend name from our mapping + original_backend = backend_mapping[r.config.backend] + modified_config = replace(r.config, backend=original_backend) + modified_result = replace(r, config=modified_config) + modified_results.append(modified_result) + + # Print table with original backend names + formatter.print_table(modified_results, backends, compare_to_fastest=True) + + # Show optimal backend for each (param_value, batch_spec) combination + console.print( + f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]" + ) + + # Group by (param_value, batch_spec) + 
by_param_and_spec = {} + for r in all_results: + if r.success: + # Find which (backend, value) this result corresponds to + labeled_backend = r.config.backend + for backend in backends: + for value in sweep.values: + expected_label = sweep.get_label(backend, value) + if labeled_backend == expected_label: + param_value = str(value) + spec = r.config.batch_spec + key = (param_value, spec) + + if key not in by_param_and_spec: + by_param_and_spec[key] = [] + by_param_and_spec[key].append(r) + break + + # Sort by param value then spec + sorted_keys = sorted( + by_param_and_spec.keys(), + key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]), + ) + + current_param_value = None + for param_value, spec in sorted_keys: + # Print header when param value changes + if param_value != current_param_value: + console.print(f"\n [bold]{sweep.param_name}={param_value}:[/]") + current_param_value = param_value + + results = by_param_and_spec[(param_value, spec)] + best = min(results, key=lambda r: r.mean_time) + + # Extract original backend name using the mapping + backend_name = backend_mapping[best.config.backend] + + # Show all backends' times for comparison + times_str = " | ".join( + [ + f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s" + for r in sorted(results, key=lambda r: r.mean_time) + ] + ) + + console.print( + f" {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})" + ) + + return all_results + + def run_parameter_sweep( backends: list[str], batch_specs: list[str], @@ -397,6 +587,19 @@ def main(): else: args.parameter_sweep = None + # Model parameter sweep configuration + if "model_parameter_sweep" in yaml_config: + sweep_config = yaml_config["model_parameter_sweep"] + args.model_parameter_sweep = ModelParameterSweep( + param_name=sweep_config["param_name"], + values=sweep_config["values"], + label_format=sweep_config.get( + "label_format", "{backend}_{param_name}_{value}" + ), + ) + else: + args.model_parameter_sweep = None + # Output if "output" 
in yaml_config: output = yaml_config["output"] @@ -617,6 +820,28 @@ def main(): f"\n [yellow]Prefill always faster for batch_size={bs}[/]" ) + # Handle model parameter sweep mode + elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep: + # Model parameter sweep + base_config_args = { + "num_layers": args.num_layers, + "head_dim": args.head_dim, + "num_q_heads": args.num_q_heads, + "num_kv_heads": args.num_kv_heads, + "block_size": args.block_size, + "device": args.device, + "repeats": args.repeats, + "warmup_iters": args.warmup_iters, + "profile_memory": args.profile_memory, + } + all_results = run_model_parameter_sweep( + backends, + args.batch_specs, + base_config_args, + args.model_parameter_sweep, + console, + ) + # Handle parameter sweep mode (unified) elif hasattr(args, "parameter_sweep") and args.parameter_sweep: # Unified parameter sweep diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py index 0bda71ef04ec..8cd17bbdb0c9 100644 --- a/benchmarks/attention_benchmarks/common.py +++ b/benchmarks/attention_benchmarks/common.py @@ -9,7 +9,7 @@ import time from dataclasses import asdict, dataclass from pathlib import Path -from typing import Any, Optional +from typing import Any import numpy as np import torch @@ -85,7 +85,7 @@ class MockLayer(AttentionLayerBase): in get_layers_from_vllm_config when FlashInfer prefill is enabled. 
""" - def __init__(self, device: torch.device, impl=None): + def __init__(self, device: torch.device, impl=None, kv_cache_spec=None): # Don't call super().__init__() as AttentionLayerBase doesn't have __init__ self._k_scale = torch.tensor(1.0, device=device) self._v_scale = torch.tensor(1.0, device=device) @@ -96,12 +96,18 @@ def __init__(self, device: torch.device, impl=None): self._q_scale_float = float(self._q_scale.item()) # AttentionImpl for metadata builders to query self.impl = impl + # KV cache spec for get_kv_cache_spec + self._kv_cache_spec = kv_cache_spec def get_attn_backend(self): """Get the attention backend class (required by AttentionLayerBase).""" # Return None as this is just a mock layer for benchmarking return None + def get_kv_cache_spec(self): + """Get the KV cache spec (required by AttentionLayerBase).""" + return self._kv_cache_spec + class MockModelConfig: """Mock model configuration.""" @@ -208,6 +214,21 @@ def get_label(self, backend: str, value: Any) -> str: ) +@dataclass +class ModelParameterSweep: + """Configuration for sweeping a model configuration parameter.""" + + param_name: str # Name of the model config parameter to sweep (e.g., "num_q_heads") + values: list[Any] # List of values to test + label_format: str = "{backend}_{param_name}_{value}" # Result label template + + def get_label(self, backend: str, value: Any) -> str: + """Generate a label for a specific parameter value.""" + return self.label_format.format( + backend=backend, param_name=self.param_name, value=value + ) + + @dataclass class BenchmarkConfig: """Configuration for a single benchmark run.""" @@ -227,14 +248,14 @@ class BenchmarkConfig: use_cuda_graphs: bool = False # MLA-specific - kv_lora_rank: Optional[int] = None - qk_nope_head_dim: Optional[int] = None - qk_rope_head_dim: Optional[int] = None - v_head_dim: Optional[int] = None + kv_lora_rank: int | None = None + qk_nope_head_dim: int | None = None + qk_rope_head_dim: int | None = None + v_head_dim: int | 
None = None # Backend-specific tuning - num_kv_splits: Optional[int] = None # CUTLASS MLA - reorder_batch_threshold: Optional[int] = None # FlashAttn MLA, FlashMLA + num_kv_splits: int | None = None # CUTLASS MLA + reorder_batch_threshold: int | None = None # FlashAttn MLA, FlashMLA @dataclass @@ -246,10 +267,10 @@ class BenchmarkResult: std_time: float # seconds min_time: float # seconds max_time: float # seconds - throughput_tokens_per_sec: Optional[float] = None - memory_allocated_mb: Optional[float] = None - memory_reserved_mb: Optional[float] = None - error: Optional[str] = None + throughput_tokens_per_sec: float | None = None + memory_allocated_mb: float | None = None + memory_reserved_mb: float | None = None + error: str | None = None @property def success(self) -> bool: @@ -332,7 +353,7 @@ def _get_memory_stats(self) -> dict: class ResultsFormatter: """Format and display benchmark results.""" - def __init__(self, console: Optional[Console] = None): + def __init__(self, console: Console | None = None): self.console = console or Console() def print_table( diff --git a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml index ac990f75992c..1c693bf98859 100644 --- a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml +++ b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml @@ -9,24 +9,134 @@ backends: - flashattn_mla - flashmla -# Standard decode workloads +# Comprehensive batch spec matrix: batch sizes × sequence lengths +# Batch sizes: 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128 +# Sequence lengths: 1k, 2k, 4k, 8k, 16k, 32k, 64k, 128k batch_specs: - - "32q1s1k" # 32 decode requests, 1k KV cache - - "64q1s1k" # 64 decode requests, 1k KV cache - - "64q1s4k" # 64 decode requests, 4k KV cache - - "128q1s1k" # 128 decode requests, 1k KV cache - - "128q1s4k" # 128 decode requests, 4k KV cache - -# Model configuration - will test different head counts -# Note: You'll need 
to run this multiple times with different num_q_heads values -# Or modify benchmark.py to support head_counts parameter + # Batch size: 1 + - "1q1s1k" + - "1q1s2k" + - "1q1s4k" + - "1q1s8k" + - "1q1s16k" + - "1q1s32k" + - "1q1s64k" + - "1q1s128k" + + # Batch size: 2 + - "2q1s1k" + - "2q1s2k" + - "2q1s4k" + - "2q1s8k" + - "2q1s16k" + - "2q1s32k" + - "2q1s64k" + - "2q1s128k" + + # Batch size: 4 + - "4q1s1k" + - "4q1s2k" + - "4q1s4k" + - "4q1s8k" + - "4q1s16k" + - "4q1s32k" + - "4q1s64k" + - "4q1s128k" + + # Batch size: 8 + - "8q1s1k" + - "8q1s2k" + - "8q1s4k" + - "8q1s8k" + - "8q1s16k" + - "8q1s32k" + - "8q1s64k" + - "8q1s128k" + + # Batch size: 16 + - "16q1s1k" + - "16q1s2k" + - "16q1s4k" + - "16q1s8k" + - "16q1s16k" + - "16q1s32k" + - "16q1s64k" + - "16q1s128k" + + # Batch size: 24 + - "24q1s1k" + - "24q1s2k" + - "24q1s4k" + - "24q1s8k" + - "24q1s16k" + - "24q1s32k" + - "24q1s64k" + - "24q1s128k" + + # Batch size: 32 + - "32q1s1k" + - "32q1s2k" + - "32q1s4k" + - "32q1s8k" + - "32q1s16k" + - "32q1s32k" + - "32q1s64k" + - "32q1s128k" + + # Batch size: 48 + - "48q1s1k" + - "48q1s2k" + - "48q1s4k" + - "48q1s8k" + - "48q1s16k" + - "48q1s32k" + - "48q1s64k" + - "48q1s128k" + + # Batch size: 64 + - "64q1s1k" + - "64q1s2k" + - "64q1s4k" + - "64q1s8k" + - "64q1s16k" + - "64q1s32k" + - "64q1s64k" + - "64q1s128k" + + # Batch size: 96 + - "96q1s1k" + - "96q1s2k" + - "96q1s4k" + - "96q1s8k" + - "96q1s16k" + - "96q1s32k" + - "96q1s64k" + - "96q1s128k" + + # Batch size: 128 + - "128q1s1k" + - "128q1s2k" + - "128q1s4k" + - "128q1s8k" + - "128q1s16k" + - "128q1s32k" + - "128q1s64k" + - "128q1s128k" + +# Model configuration model: num_layers: 10 head_dim: 576 # MLA uses 576 - num_q_heads: 128 # Test with: 16, 32, 64, 128, 256 + num_q_heads: 128 # Default value (will be overridden by sweep) num_kv_heads: 1 # MLA uses single KV head block_size: 128 +# Model parameter sweep - test different head counts +model_parameter_sweep: + param_name: "num_q_heads" + values: [16, 32, 64, 128, 256] 
+ label_format: "{backend}_heads_{value}" + # Benchmark settings benchmark: device: "cuda:0" @@ -39,13 +149,6 @@ output: csv: "hopper_head_count_results.csv" json: "hopper_head_count_results.json" -# To test different head counts, run: -# for heads in 16 32 64 128 256; do -# python benchmark.py --config configs/hopper_head_count.yaml \ -# --num-q-heads $heads \ -# --output-csv hopper_heads_${heads}.csv -# done - # Expected outcome: # - Determine which backend is faster on Hopper # - Identify if head count impacts relative performance diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index b7c602d42d32..bfb60fc35d93 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -9,7 +9,6 @@ """ import importlib -from typing import Optional import numpy as np import torch @@ -56,7 +55,7 @@ def create_minimal_vllm_config( model_name: str = "deepseek-v3", block_size: int = 128, max_num_seqs: int = 256, - mla_dims: Optional[dict] = None, + mla_dims: dict | None = None, ) -> VllmConfig: """ Create minimal VllmConfig for MLA benchmarks. 
@@ -259,15 +258,15 @@ def _build_attention_metadata( max_kv = max(kv_lens) # Build query start locations - q_start_cpu = np.array( + q_start_cpu = torch.tensor( [0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], - dtype=np.int32, + dtype=torch.int32, ) - q_start_gpu = torch.from_numpy(q_start_cpu).to(device) + q_start_gpu = q_start_cpu.to(device) # Build sequence lengths - seq_lens_cpu = np.array(kv_lens, dtype=np.int32) - seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) + seq_lens_cpu = torch.tensor(kv_lens, dtype=torch.int32) + seq_lens_gpu = seq_lens_cpu.to(device) # Build num_computed_tokens (context length for each request) context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)] @@ -489,23 +488,24 @@ def _create_backend_impl( impl.dcp_world_size = 1 impl.dcp_rank = 0 + # Create KV cache spec for MockLayer + from vllm.v1.kv_cache_interface import FullAttentionSpec + + kv_cache_spec = FullAttentionSpec( + block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size, + num_kv_heads=1, # MLA uses 1 KV head + head_size=576, # MLA head dim + dtype=torch.float16, + ) + # Create mock layer - layer = MockLayer(device, impl=impl) + layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec) # Create builder instance if needed builder_instance = None if backend_cfg["builder_class"]: builder_class = getattr(backend_module, backend_cfg["builder_class"]) - from vllm.v1.kv_cache_interface import FullAttentionSpec - - kv_cache_spec = FullAttentionSpec( - block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size, - num_kv_heads=1, # MLA uses 1 KV head - head_size=576, # MLA head dim - dtype=torch.float16, - ) - # Populate static_forward_context so builder can find the layer # MockLayer inherits from AttentionLayerBase, so isinstance checks pass vllm_config.compilation_config.static_forward_context = {"placeholder": layer} @@ -525,7 +525,7 @@ def _create_backend_impl( # 
============================================================================ -def _extract_mla_dims_from_config(config) -> Optional[dict]: +def _extract_mla_dims_from_config(config) -> dict | None: """ Extract MLA dimensions from BenchmarkConfig if all required fields are present. @@ -776,8 +776,8 @@ def _run_mla_benchmark_batched( def run_mla_benchmark( backend: str, config, - reorder_batch_threshold: Optional[int] = None, - num_kv_splits: Optional[int] = None, + reorder_batch_threshold: int | None = None, + num_kv_splits: int | None = None, ) -> dict: """ Unified MLA benchmark runner for all backends. diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index 6ce54d3c5aa4..9e465b5808d2 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -227,8 +227,19 @@ def _create_backend_impl( kv_cache_dtype="auto", ) + # Create KV cache spec for MockLayer + from vllm.v1.kv_cache_interface import AttentionSpec + + kv_cache_spec = AttentionSpec( + block_size=config.block_size, + num_kv_heads=config.num_kv_heads, + head_size=config.head_dim, + dtype=dtype, + use_mla=False, + ) + # Create mock layer - layer = MockLayer(device) + layer = MockLayer(device, kv_cache_spec=kv_cache_spec) return backend_class, impl, layer, dtype From 6419a6da543798a4947c99290b4e9962f8d66372 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 21 Oct 2025 18:34:49 +0000 Subject: [PATCH 26/45] don't download from HF Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/common.py | 5 ++ benchmarks/attention_benchmarks/mla_runner.py | 81 +++++++++++++------ 2 files changed, 62 insertions(+), 24 deletions(-) diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py index 8cd17bbdb0c9..0d25489e5815 100644 --- a/benchmarks/attention_benchmarks/common.py +++ b/benchmarks/attention_benchmarks/common.py @@ -28,6 +28,11 @@ def __init__(self, mla_dims: 
dict): self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"] self.model_type = "deepseek_v2" self.is_encoder_decoder = False + self.kv_lora_rank = mla_dims["kv_lora_rank"] + self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"] + self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"] + self.v_head_dim = mla_dims["v_head_dim"] + self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] def get_text_config(self): return self diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index bfb60fc35d93..d4818efa7374 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -75,31 +75,64 @@ def create_minimal_vllm_config( if mla_dims is None: mla_dims = setup_mla_dims(model_name) - # Create model config - model_config = ModelConfig( - model=f"deepseek-ai/{model_name}", - tokenizer=None, - tokenizer_mode="auto", - trust_remote_code=True, - dtype="float16", - seed=0, - max_model_len=32768, - quantization=None, - quantization_param_path=None, - enforce_eager=False, - max_context_len_to_capture=None, - max_seq_len_to_capture=8192, - max_logprobs=20, - disable_sliding_window=False, - skip_tokenizer_init=True, - served_model_name=None, - limit_mm_per_prompt=None, - use_async_output_proc=True, - config_format="auto", - ) + # Create mock HF config first (avoids downloading from HuggingFace) + mock_hf_config = MockHfConfig(mla_dims) + + # Create a temporary minimal config.json to avoid HF downloads + # This ensures consistent ModelConfig construction without network access + import json + import os + import shutil + import tempfile + + minimal_config = { + "architectures": ["DeepseekV2ForCausalLM"], + "model_type": "deepseek_v2", + "num_attention_heads": mla_dims["num_q_heads"], + "num_key_value_heads": mla_dims["num_kv_heads"], + "hidden_size": mla_dims["head_dim"] * mla_dims["num_q_heads"], + "torch_dtype": "float16", + 
"max_position_embeddings": 163840, # DeepSeek V3 default + "rope_theta": 10000.0, + "vocab_size": 128256, + } + + # Create temporary directory with config.json + temp_dir = tempfile.mkdtemp(prefix="vllm_bench_") + config_path = os.path.join(temp_dir, "config.json") + with open(config_path, "w") as f: + json.dump(minimal_config, f) + + try: + # Create model config using local path - no HF downloads + model_config = ModelConfig( + model=temp_dir, # Use local temp directory + tokenizer=None, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="float16", + seed=0, + max_model_len=32768, + quantization=None, + quantization_param_path=None, + enforce_eager=False, + max_context_len_to_capture=None, + max_seq_len_to_capture=8192, + max_logprobs=20, + disable_sliding_window=False, + skip_tokenizer_init=True, + served_model_name=None, + limit_mm_per_prompt=None, + use_async_output_proc=True, + config_format="auto", + ) + finally: + # Clean up temporary directory + shutil.rmtree(temp_dir, ignore_errors=True) - # Override head counts and dims for MLA - model_config.hf_config = MockHfConfig(mla_dims) + # Override with our mock config + model_config.hf_config = mock_hf_config + model_config.hf_text_config = mock_hf_config # Add mock methods for layer-specific queries _add_mock_methods_to_model_config(model_config) From 33c95b67ff885ea793c62885e024cda2766aa579 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 21 Oct 2025 18:35:03 +0000 Subject: [PATCH 27/45] update specs Signed-off-by: Matthew Bonanni --- .../configs/hopper_head_count.yaml | 185 ++++++++++-------- 1 file changed, 104 insertions(+), 81 deletions(-) diff --git a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml index 1c693bf98859..cbe6f28a8bd8 100644 --- a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml +++ b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml @@ -9,119 +9,142 @@ backends: - 
flashattn_mla - flashmla -# Comprehensive batch spec matrix: batch sizes × sequence lengths -# Batch sizes: 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128 -# Sequence lengths: 1k, 2k, 4k, 8k, 16k, 32k, 64k, 128k +# Comprehensive batch spec matrix: batch sizes × query lengths +# Batch sizes (num requests): 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128 +# Query lengths: 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 +# KV cache length: 1k (fixed) batch_specs: # Batch size: 1 - "1q1s1k" - - "1q1s2k" - - "1q1s4k" - - "1q1s8k" - - "1q1s16k" - - "1q1s32k" - - "1q1s64k" - - "1q1s128k" + - "1q2s1k" + - "1q4s1k" + - "1q8s1k" + - "1q16s1k" + - "1q32s1k" + - "1q64s1k" + - "1q128s1k" + - "1q256s1k" + - "1q512s1k" # Batch size: 2 - "2q1s1k" - - "2q1s2k" - - "2q1s4k" - - "2q1s8k" - - "2q1s16k" - - "2q1s32k" - - "2q1s64k" - - "2q1s128k" + - "2q2s1k" + - "2q4s1k" + - "2q8s1k" + - "2q16s1k" + - "2q32s1k" + - "2q64s1k" + - "2q128s1k" + - "2q256s1k" + - "2q512s1k" # Batch size: 4 - "4q1s1k" - - "4q1s2k" - - "4q1s4k" - - "4q1s8k" - - "4q1s16k" - - "4q1s32k" - - "4q1s64k" - - "4q1s128k" + - "4q2s1k" + - "4q4s1k" + - "4q8s1k" + - "4q16s1k" + - "4q32s1k" + - "4q64s1k" + - "4q128s1k" + - "4q256s1k" + - "4q512s1k" # Batch size: 8 - "8q1s1k" - - "8q1s2k" - - "8q1s4k" - - "8q1s8k" - - "8q1s16k" - - "8q1s32k" - - "8q1s64k" - - "8q1s128k" + - "8q2s1k" + - "8q4s1k" + - "8q8s1k" + - "8q16s1k" + - "8q32s1k" + - "8q64s1k" + - "8q128s1k" + - "8q256s1k" + - "8q512s1k" # Batch size: 16 - "16q1s1k" - - "16q1s2k" - - "16q1s4k" - - "16q1s8k" - - "16q1s16k" - - "16q1s32k" - - "16q1s64k" - - "16q1s128k" + - "16q2s1k" + - "16q4s1k" + - "16q8s1k" + - "16q16s1k" + - "16q32s1k" + - "16q64s1k" + - "16q128s1k" + - "16q256s1k" + - "16q512s1k" # Batch size: 24 - "24q1s1k" - - "24q1s2k" - - "24q1s4k" - - "24q1s8k" - - "24q1s16k" - - "24q1s32k" - - "24q1s64k" - - "24q1s128k" + - "24q2s1k" + - "24q4s1k" + - "24q8s1k" + - "24q16s1k" + - "24q32s1k" + - "24q64s1k" + - "24q128s1k" + - "24q256s1k" + - "24q512s1k" # Batch size: 32 - "32q1s1k" 
- - "32q1s2k" - - "32q1s4k" - - "32q1s8k" - - "32q1s16k" - - "32q1s32k" - - "32q1s64k" - - "32q1s128k" + - "32q2s1k" + - "32q4s1k" + - "32q8s1k" + - "32q16s1k" + - "32q32s1k" + - "32q64s1k" + - "32q128s1k" + - "32q256s1k" + - "32q512s1k" # Batch size: 48 - "48q1s1k" - - "48q1s2k" - - "48q1s4k" - - "48q1s8k" - - "48q1s16k" - - "48q1s32k" - - "48q1s64k" - - "48q1s128k" + - "48q2s1k" + - "48q4s1k" + - "48q8s1k" + - "48q16s1k" + - "48q32s1k" + - "48q64s1k" + - "48q128s1k" + - "48q256s1k" + - "48q512s1k" # Batch size: 64 - "64q1s1k" - - "64q1s2k" - - "64q1s4k" - - "64q1s8k" - - "64q1s16k" - - "64q1s32k" - - "64q1s64k" - - "64q1s128k" + - "64q2s1k" + - "64q4s1k" + - "64q8s1k" + - "64q16s1k" + - "64q32s1k" + - "64q64s1k" + - "64q128s1k" + - "64q256s1k" + - "64q512s1k" # Batch size: 96 - "96q1s1k" - - "96q1s2k" - - "96q1s4k" - - "96q1s8k" - - "96q1s16k" - - "96q1s32k" - - "96q1s64k" - - "96q1s128k" + - "96q2s1k" + - "96q4s1k" + - "96q8s1k" + - "96q16s1k" + - "96q32s1k" + - "96q64s1k" + - "96q128s1k" + - "96q256s1k" + - "96q512s1k" # Batch size: 128 - "128q1s1k" - - "128q1s2k" - - "128q1s4k" - - "128q1s8k" - - "128q1s16k" - - "128q1s32k" - - "128q1s64k" - - "128q1s128k" + - "128q2s1k" + - "128q4s1k" + - "128q8s1k" + - "128q16s1k" + - "128q32s1k" + - "128q64s1k" + - "128q128s1k" + - "128q256s1k" + - "128q512s1k" # Model configuration model: @@ -129,7 +152,7 @@ model: head_dim: 576 # MLA uses 576 num_q_heads: 128 # Default value (will be overridden by sweep) num_kv_heads: 1 # MLA uses single KV head - block_size: 128 + block_size: 64 # Model parameter sweep - test different head counts model_parameter_sweep: From c26b90a71f65d219d4a9fcc08a8a392a7be86f97 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 21 Oct 2025 18:41:49 +0000 Subject: [PATCH 28/45] remove batch size > 16 Signed-off-by: Matthew Bonanni --- .../configs/hopper_head_count.yaml | 75 +------------------ 1 file changed, 2 insertions(+), 73 deletions(-) diff --git 
a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml index cbe6f28a8bd8..b7baa5abe142 100644 --- a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml +++ b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml @@ -10,7 +10,8 @@ backends: - flashmla # Comprehensive batch spec matrix: batch sizes × query lengths -# Batch sizes (num requests): 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128 +# Batch sizes (num requests): 1, 2, 4, 8, 16 +# Note: FlashMLA has IMA issues for batch_size > 16, so limiting to ≤ 16 # Query lengths: 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 # KV cache length: 1k (fixed) batch_specs: @@ -74,78 +75,6 @@ batch_specs: - "16q256s1k" - "16q512s1k" - # Batch size: 24 - - "24q1s1k" - - "24q2s1k" - - "24q4s1k" - - "24q8s1k" - - "24q16s1k" - - "24q32s1k" - - "24q64s1k" - - "24q128s1k" - - "24q256s1k" - - "24q512s1k" - - # Batch size: 32 - - "32q1s1k" - - "32q2s1k" - - "32q4s1k" - - "32q8s1k" - - "32q16s1k" - - "32q32s1k" - - "32q64s1k" - - "32q128s1k" - - "32q256s1k" - - "32q512s1k" - - # Batch size: 48 - - "48q1s1k" - - "48q2s1k" - - "48q4s1k" - - "48q8s1k" - - "48q16s1k" - - "48q32s1k" - - "48q64s1k" - - "48q128s1k" - - "48q256s1k" - - "48q512s1k" - - # Batch size: 64 - - "64q1s1k" - - "64q2s1k" - - "64q4s1k" - - "64q8s1k" - - "64q16s1k" - - "64q32s1k" - - "64q64s1k" - - "64q128s1k" - - "64q256s1k" - - "64q512s1k" - - # Batch size: 96 - - "96q1s1k" - - "96q2s1k" - - "96q4s1k" - - "96q8s1k" - - "96q16s1k" - - "96q32s1k" - - "96q64s1k" - - "96q128s1k" - - "96q256s1k" - - "96q512s1k" - - # Batch size: 128 - - "128q1s1k" - - "128q2s1k" - - "128q4s1k" - - "128q8s1k" - - "128q16s1k" - - "128q32s1k" - - "128q64s1k" - - "128q128s1k" - - "128q256s1k" - - "128q512s1k" - # Model configuration model: num_layers: 10 From 0522e0be4293b34ac368997bfb58eddb8dcf54e8 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 22 Oct 2025 18:55:15 +0000 Subject: [PATCH 29/45] update 
configs Signed-off-by: Matthew Bonanni --- .../configs/hopper_head_count.yaml | 41 +++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml index b7baa5abe142..620cc2050c14 100644 --- a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml +++ b/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml @@ -10,8 +10,7 @@ backends: - flashmla # Comprehensive batch spec matrix: batch sizes × query lengths -# Batch sizes (num requests): 1, 2, 4, 8, 16 -# Note: FlashMLA has IMA issues for batch_size > 16, so limiting to ≤ 16 +# Batch sizes (num requests): 1, 2, 4, 8, 16, 32, 64, 128 # Query lengths: 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 # KV cache length: 1k (fixed) batch_specs: @@ -73,7 +72,43 @@ batch_specs: - "16q64s1k" - "16q128s1k" - "16q256s1k" - - "16q512s1k" + # - "16q512s1k" + + # Batch size: 32 + - "32q1s1k" + - "32q2s1k" + - "32q4s1k" + - "32q8s1k" + - "32q16s1k" + - "32q32s1k" + - "32q64s1k" + - "32q128s1k" + - "32q256s1k" + # - "32q512s1k" + + # # Batch size: 64 + - "64q1s1k" + - "64q2s1k" + - "64q4s1k" + - "64q8s1k" + - "64q16s1k" + - "64q32s1k" + - "64q64s1k" + - "64q128s1k" + - "64q256s1k" + # - "64q512s1k" + + # # Batch size: 128 + - "128q1s1k" + - "128q2s1k" + - "128q4s1k" + - "128q8s1k" + - "128q16s1k" + - "128q32s1k" + - "128q64s1k" + - "128q128s1k" + - "128q256s1k" + # - "128q512s1k" # Model configuration model: From bcc63d0d8012ab6b725b428e5075527a2e251159 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 22 Oct 2025 19:55:01 +0000 Subject: [PATCH 30/45] rename Signed-off-by: Matthew Bonanni --- .../configs/{hopper_head_count.yaml => famla_vs_fmla.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename benchmarks/attention_benchmarks/configs/{hopper_head_count.yaml => famla_vs_fmla.yaml} (100%) diff --git 
a/benchmarks/attention_benchmarks/configs/hopper_head_count.yaml b/benchmarks/attention_benchmarks/configs/famla_vs_fmla.yaml similarity index 100% rename from benchmarks/attention_benchmarks/configs/hopper_head_count.yaml rename to benchmarks/attention_benchmarks/configs/famla_vs_fmla.yaml From fdc1a597a890f0abd069b02d29c81815625f78f2 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 3 Nov 2025 16:28:29 -0500 Subject: [PATCH 31/45] fix pre-commit Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/benchmarks/attention_benchmarks/README.md b/benchmarks/attention_benchmarks/README.md index 41fa60b05fca..9a9047fd06c5 100644 --- a/benchmarks/attention_benchmarks/README.md +++ b/benchmarks/attention_benchmarks/README.md @@ -39,7 +39,7 @@ Express workloads concisely using query length and sequence length: ### Grammar Rule -``` +```text Format: (?) q(k?) (s(k?))? - count: Number of identical requests (optional, default=1) @@ -99,15 +99,18 @@ Compares FlashInfer-MLA against CUTLASS MLA with optimized `num_kv_splits` value **Question:** At what query length does the prefill pipeline become faster than the decode pipeline? **Methodology:** Reproduces the original `benchmark_mla_threshold.py` study using the new interface: + - For each query length (1-2048), test BOTH decode and prefill pipelines - Find the crossover point where prefill becomes faster - Analyze how this varies across batch sizes (1-256) + ```bash python benchmark.py --config configs/reorder_threshold.yaml ``` Tests query lengths from 1-2048 (fine-grained steps at low values, coarser at high values) across 9 batch sizes. For each query length, compares: + - **Decode pipeline**: `threshold >= query_length` - **Prefill pipeline**: `threshold < query_length` @@ -169,7 +172,7 @@ python benchmark.py \ ### All Command-Line Options -``` +```text --backends BACKEND [BACKEND ...] 
# flash, triton, flashinfer, cutlass_mla, # flashinfer_mla, flashattn_mla, flashmla --backend BACKEND # Single backend (alternative to --backends) @@ -272,7 +275,7 @@ formatter.save_json(results, "output.json") ## File Structure -``` +```text attention_benchmarks/ ├── README.md # This file │ @@ -308,15 +311,19 @@ attention_benchmarks/ ## Troubleshooting **Import errors?** + ```bash source /path/to/vllm/.venv/bin/activate ``` **Backend not supported?** + - Check hardware requirements above - Some backends need Hopper/Blackwell + **OOM?** + - Reduce batch size: `"32q1s1k"` → `"16q1s1k"` - Reduce sequence length: `"64q1s16k"` → `"64q1s4k"` From b60e5fcb2e37395b4cc7f8ba2cbae1bf7048a251 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 3 Nov 2025 16:43:39 -0500 Subject: [PATCH 32/45] fix pre-commit Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/attention_benchmarks/README.md b/benchmarks/attention_benchmarks/README.md index 9a9047fd06c5..16e6be5497f6 100644 --- a/benchmarks/attention_benchmarks/README.md +++ b/benchmarks/attention_benchmarks/README.md @@ -104,7 +104,6 @@ Compares FlashInfer-MLA against CUTLASS MLA with optimized `num_kv_splits` value - Find the crossover point where prefill becomes faster - Analyze how this varies across batch sizes (1-256) - ```bash python benchmark.py --config configs/reorder_threshold.yaml ``` @@ -321,7 +320,6 @@ source /path/to/vllm/.venv/bin/activate - Check hardware requirements above - Some backends need Hopper/Blackwell - **OOM?** - Reduce batch size: `"32q1s1k"` → `"16q1s1k"` From 55c00fa2225b2575da41f86d8a0d615846ce997e Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 19:16:38 +0000 Subject: [PATCH 33/45] Fix Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/runner.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/benchmarks/attention_benchmarks/runner.py 
b/benchmarks/attention_benchmarks/runner.py index 9e465b5808d2..aa32138c9eea 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -175,7 +175,16 @@ def _build_block_table( Returns: BlockTable instance """ - bt = BlockTable(len(requests), max_num_blocks, total_q, False, device) + bt = BlockTable( + block_size=block_size, + max_num_reqs=len(requests), + max_num_blocks_per_req=max_num_blocks, + max_num_batched_tokens=total_q, + pin_memory=False, + device=device, + kernel_block_size=block_size, + cp_kv_cache_interleave_size=1, + ) for i in range(len(requests)): num_blocks = (kv_lens[i] + block_size - 1) // block_size bt.add_row(list(range(num_blocks)), i) @@ -235,7 +244,6 @@ def _create_backend_impl( num_kv_heads=config.num_kv_heads, head_size=config.head_dim, dtype=dtype, - use_mla=False, ) # Create mock layer @@ -287,7 +295,6 @@ def _create_metadata_builder( num_kv_heads=config.num_kv_heads, head_size=config.head_dim, dtype=dtype, - use_mla=False, ), block_table=block_table, ) From af0578fd1a316c108c04301049d790d8481422f5 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 19:17:07 +0000 Subject: [PATCH 34/45] Cleanup Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/runner.py | 28 ----------------------- 1 file changed, 28 deletions(-) diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index aa32138c9eea..444f7c90fc28 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -585,31 +585,3 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: memory_allocated_mb=mem_stats.get("allocated_mb"), memory_reserved_mb=mem_stats.get("reserved_mb"), ) - - -# ============================================================================ -# Backwards Compatibility -# ============================================================================ - - -# Keep old function names for 
backwards compatibility -def build_common_metadata(*args, **kwargs): - """Deprecated: Use _build_attention_metadata instead.""" - return _build_attention_metadata(*args, **kwargs) - - -def run_attention_benchmark_impl(config: BenchmarkConfig) -> BenchmarkResult: - """Deprecated: Use run_attention_benchmark instead.""" - return run_attention_benchmark(config) - - -def run_mla_benchmark_impl(config: BenchmarkConfig) -> BenchmarkResult: - """ - Run MLA benchmark with real kernels. - - This is a stub - use mla_runner.py for MLA benchmarks. - """ - raise NotImplementedError( - "MLA benchmark runner is in mla_runner.py. " - "Use run_mla_benchmark() from that module." - ) From bf50878d4bb67b4c0ba4163dd56f1638f64e2f90 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 19:20:58 +0000 Subject: [PATCH 35/45] Remove visualize_numsplits.py Signed-off-by: Matthew Bonanni --- .../tools/visualize_numsplits.py | 746 ------------------ 1 file changed, 746 deletions(-) delete mode 100644 benchmarks/attention_benchmarks/tools/visualize_numsplits.py diff --git a/benchmarks/attention_benchmarks/tools/visualize_numsplits.py b/benchmarks/attention_benchmarks/tools/visualize_numsplits.py deleted file mode 100644 index bb463de5b8dd..000000000000 --- a/benchmarks/attention_benchmarks/tools/visualize_numsplits.py +++ /dev/null @@ -1,746 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Visualize CUTLASS MLA num_kv_splits benchmark results. 
- -Usage: - python visualize_numsplits.py cutlass_numsplits_results.json -""" - -import json -import sys -from collections.abc import Mapping -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np -import regex as re -from matplotlib.colors import LinearSegmentedColormap, ListedColormap - - -def parse_batch_spec(spec: str) -> tuple[int, int]: - """Parse batch spec like '32q1s16k' into (batch_size, seq_length_k).""" - match = re.match(r"(\d+)q1s(\d+)k", spec) - if not match: - raise ValueError(f"Cannot parse batch spec: {spec}") - batch_size = int(match.group(1)) - seq_length_k = int(match.group(2)) - return batch_size, seq_length_k - - -def load_results(json_path: str) -> list: - """Load benchmark results from JSON file.""" - with open(json_path) as f: - return json.load(f) - - -def extract_optimal_splits( - results: list, exclude_auto: bool = False -) -> dict[tuple[int, int], int]: - """ - Extract optimal num_kv_splits for each (batch_size, seq_length) pair. - - Args: - results: List of benchmark results - exclude_auto: If True, exclude "auto" backend from consideration - - Returns: - Dict mapping (batch_size, seq_length_k) -> optimal_num_kv_splits - """ - # Group results by batch_spec - by_batch_spec = {} - for result in results: - batch_spec = result["config"]["batch_spec"] - backend_name = result["config"]["backend"] - - # Skip auto if requested - if exclude_auto and "auto" in backend_name: - continue - - if batch_spec not in by_batch_spec: - by_batch_spec[batch_spec] = [] - by_batch_spec[batch_spec].append(result) - - optimal_splits = {} - - for batch_spec, batch_results in by_batch_spec.items(): - batch_size, seq_length_k = parse_batch_spec(batch_spec) - - # Find the configuration with minimum time - min_time = float("inf") - optimal_split = 1 - - for result in batch_results: - if result["error"] is None and "mean_time" in result: - time = result["mean_time"] - if time < min_time: - min_time = time - # Extract num_kv_splits from 
backend name - backend_name = result["config"]["backend"] - match = re.search(r"numsplits_(\d+)", backend_name) - if match: - optimal_split = int(match.group(1)) - - optimal_splits[(batch_size, seq_length_k)] = optimal_split - - return optimal_splits - - -def _get_axes_from_splits_dict( - splits_dict: Mapping[tuple[int, int], int | float], -) -> tuple[list[int], list[int]]: - """Extract sorted batch sizes and sequence lengths from splits dictionary.""" - batch_sizes = sorted(set(b for b, _ in splits_dict)) - seq_lengths = sorted(set(s for _, s in splits_dict), reverse=True) - return batch_sizes, seq_lengths - - -def _create_splits_matrix( - splits_dict: Mapping[tuple[int, int], int | float], - batch_sizes: list[int], - seq_lengths: list[int], -) -> np.ndarray: - """Create matrix from splits dictionary.""" - matrix = np.zeros((len(seq_lengths), len(batch_sizes))) - for i, seq_len in enumerate(seq_lengths): - for j, batch_size in enumerate(batch_sizes): - matrix[i, j] = splits_dict.get((batch_size, seq_len), np.nan) - return matrix - - -def _setup_heatmap_axes(ax, batch_sizes: list[int], seq_lengths: list[int], title: str): - """Setup common axes properties for heatmaps.""" - ax.set_xticks(np.arange(len(batch_sizes))) - ax.set_yticks(np.arange(len(seq_lengths))) - ax.set_xticklabels(batch_sizes) - ax.set_yticklabels([f"{s}k" for s in seq_lengths]) - ax.set_xlabel("Batch Size", fontsize=12, fontweight="bold") - ax.set_ylabel("Sequence Length", fontsize=12, fontweight="bold") - ax.set_title(title, fontsize=14, fontweight="bold", pad=20) - - -def _create_log2_colormap(min_log2: float, max_log2: float) -> tuple: - """Create discrete log2 colormap and bounds.""" - n_colors = int(max_log2 - min_log2 + 1) - viridis = plt.cm.viridis - indices = np.linspace(0, 1, n_colors) - colors = [viridis(i) for i in indices] - cmap = ListedColormap(colors) - vmin = min_log2 - 0.5 - vmax = max_log2 + 0.5 - return cmap, vmin, vmax - - -def _add_log2_colorbar(im, ax, label: str, min_log2: 
float, max_log2: float): - """Add colorbar with power-of-2 labels.""" - cbar = plt.colorbar(im, ax=ax) - cbar.set_label(label, rotation=270, labelpad=20, fontsize=12) - tick_positions = np.arange(min_log2, max_log2 + 1) - tick_labels = [str(int(2**i)) for i in tick_positions] - cbar.set_ticks(tick_positions) - cbar.set_ticklabels(tick_labels) - - -def _annotate_splits_matrix( - ax, matrix: np.ndarray, batch_sizes: list[int], seq_lengths: list[int] -): - """Add text annotations showing split values.""" - for i in range(len(seq_lengths)): - for j in range(len(batch_sizes)): - value = matrix[i, j] - if not np.isnan(value): - ax.text( - j, - i, - int(value), - ha="center", - va="center", - color="black", - fontsize=10, - fontweight="bold", - ) - - -def create_heatmap(optimal_splits: dict[tuple[int, int], int], output_path: str): - """Create heatmap showing optimal num_kv_splits.""" - batch_sizes, seq_lengths = _get_axes_from_splits_dict(optimal_splits) - matrix = _create_splits_matrix(optimal_splits, batch_sizes, seq_lengths) - - _fig, ax = plt.subplots(figsize=(12, 8)) - - # Convert to log2 scale for coloring - matrix_log2 = np.log2(matrix) - valid_values = matrix_log2[~np.isnan(matrix_log2)] - min_log2 = np.floor(valid_values.min()) - max_log2 = np.ceil(valid_values.max()) - - cmap, vmin, vmax = _create_log2_colormap(min_log2, max_log2) - im = ax.imshow(matrix_log2, cmap=cmap, aspect="auto", vmin=vmin, vmax=vmax) - - _setup_heatmap_axes( - ax, - batch_sizes, - seq_lengths, - "Optimal num_kv_splits for CUTLASS MLA\n" - "(Lower is simpler, higher is more parallelism)", - ) - _annotate_splits_matrix(ax, matrix, batch_sizes, seq_lengths) - _add_log2_colorbar(im, ax, "Optimal num_kv_splits", min_log2, max_log2) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight") - print(f"Saved heatmap to {output_path}") - plt.close() - - -def _create_speedup_colormap(): - """Create colormap for speedup: 1.0 = white, higher = green.""" - colors_dict = { - 
"red": [(0.0, 1.0, 1.0), (1.0, 0.0, 0.0)], - "green": [(0.0, 1.0, 1.0), (1.0, 0.5, 0.5)], - "blue": [(0.0, 1.0, 1.0), (1.0, 0.0, 0.0)], - } - return LinearSegmentedColormap("Speedup", colors_dict) - - -def _annotate_speedup_matrix( - ax, matrix: np.ndarray, batch_sizes: list[int], seq_lengths: list[int] -): - """Add text annotations showing speedup values.""" - for i in range(len(seq_lengths)): - for j in range(len(batch_sizes)): - value = matrix[i, j] - if not np.isnan(value): - ax.text( - j, - i, - f"{value:.2f}x", - ha="center", - va="center", - color="black", - fontsize=9, - fontweight="bold", - ) - - -def _compute_speedup_matrix( - results: list, exclude_auto: bool = False -) -> dict[tuple[int, int], float]: - """Compute speedup matrix from results (optimal vs splits=1).""" - by_batch_spec = {} - for result in results: - batch_spec = result["config"]["batch_spec"] - backend_name = result["config"]["backend"] - - if exclude_auto and "auto" in backend_name: - continue - - if batch_spec not in by_batch_spec: - by_batch_spec[batch_spec] = [] - by_batch_spec[batch_spec].append(result) - - speedup_matrix = {} - for batch_spec, batch_results in by_batch_spec.items(): - batch_size, seq_length_k = parse_batch_spec(batch_spec) - - baseline_time = None - min_time = float("inf") - - for result in batch_results: - if result["error"] is None and "mean_time" in result: - time = result["mean_time"] - backend_name = result["config"]["backend"] - if backend_name.endswith("numsplits_1"): - baseline_time = time - if time < min_time: - min_time = time - - if baseline_time: - speedup = baseline_time / min_time - speedup_matrix[(batch_size, seq_length_k)] = speedup - - return speedup_matrix - - -def create_performance_heatmap( - results: list, output_path: str, exclude_auto: bool = False -): - """Create heatmap showing speedup from optimal splits vs splits=1.""" - speedup_dict = _compute_speedup_matrix(results, exclude_auto) - batch_sizes, seq_lengths = 
_get_axes_from_splits_dict(speedup_dict) - matrix = _create_splits_matrix(speedup_dict, batch_sizes, seq_lengths) - - _fig, ax = plt.subplots(figsize=(12, 8)) - - max_speedup = np.nanmax(matrix) - speedup_cmap = _create_speedup_colormap() - im = ax.imshow(matrix, cmap=speedup_cmap, aspect="auto", vmin=1.0, vmax=max_speedup) - - _setup_heatmap_axes( - ax, - batch_sizes, - seq_lengths, - "Speedup from Optimal num_kv_splits vs. splits=1\n" - "(Green = better with splits, White = same)", - ) - _annotate_speedup_matrix(ax, matrix, batch_sizes, seq_lengths) - - cbar = plt.colorbar(im, ax=ax) - cbar.set_label("Speedup Factor", rotation=270, labelpad=20, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight") - print(f"Saved speedup heatmap to {output_path}") - plt.close() - - -def heuristic_ratio_based(batch_size: int, seq_length_k: int) -> int: - """Original ratio-based heuristic (from visualize_numsplits.py).""" - ratio = seq_length_k / batch_size - if ratio >= 2.5: - return 8 - elif ratio >= 1.2: - return 4 - elif ratio >= 0.5: - return 2 - else: - return 1 - - -def heuristic_constant(batch_size: int, seq_length_k: int) -> int: - """Ultra-simple constant heuristic: always use 2 for small batches.""" - if batch_size <= 32: - return 2 - else: - return 1 - - -def heuristic_batch_based(batch_size: int, seq_length_k: int) -> int: - """ - Simple batch-based heuristic with zero slowdowns. 
- """ - if batch_size <= 4 and seq_length_k >= 8: - return 16 - elif batch_size <= 8 and seq_length_k >= 2: - return 8 - elif (batch_size <= 16 and seq_length_k >= 4) or ( - batch_size == 48 and seq_length_k >= 32 - ): - return 4 - elif (batch_size <= 32 and seq_length_k >= 8) or ( - batch_size == 96 and seq_length_k >= 16 - ): - return 2 - else: - return 1 - - -def _annotate_heuristic_matrix( - ax, - matrix: np.ndarray, - batch_sizes: list[int], - seq_lengths: list[int], - optimal_splits: dict[tuple[int, int], int], -): - """Add text annotations showing heuristic values and mismatches.""" - for i in range(len(seq_lengths)): - for j in range(len(batch_sizes)): - value = matrix[i, j] - seq_len = seq_lengths[i] - batch_size = batch_sizes[j] - optimal = optimal_splits.get((batch_size, seq_len), None) - - if not np.isnan(value): - # Mark mismatches with red text - if optimal is not None and int(value) != optimal: - color = "red" - text = f"{int(value)}\n✗" - else: - color = "black" - text = str(int(value)) - - ax.text( - j, - i, - text, - ha="center", - va="center", - color=color, - fontsize=10, - fontweight="bold", - ) - - -def create_heuristic_policy_heatmaps( - optimal_splits: dict[tuple[int, int], int], output_dir: Path -): - """Create heatmaps showing num_splits chosen by each heuristic policy.""" - heuristics = { - "Ratio-based": heuristic_ratio_based, - "Batch-based (improved)": heuristic_batch_based, - "Constant (batch<=32)": heuristic_constant, - } - - batch_sizes, seq_lengths = _get_axes_from_splits_dict(optimal_splits) - - for heuristic_name, heuristic_func in heuristics.items(): - # Build matrix of chosen num_splits - matrix = np.zeros((len(seq_lengths), len(batch_sizes))) - for i, seq_len in enumerate(seq_lengths): - for j, batch_size in enumerate(batch_sizes): - matrix[i, j] = heuristic_func(batch_size, seq_len) - - _fig, ax = plt.subplots(figsize=(12, 8)) - - # Convert to log2 scale for coloring - matrix_log2 = np.log2(matrix) - valid_values = 
matrix_log2[~np.isnan(matrix_log2)] - min_log2 = np.floor(valid_values.min()) - max_log2 = np.ceil(valid_values.max()) - - cmap, vmin, vmax = _create_log2_colormap(min_log2, max_log2) - im = ax.imshow(matrix_log2, cmap=cmap, aspect="auto", vmin=vmin, vmax=vmax) - - _setup_heatmap_axes( - ax, - batch_sizes, - seq_lengths, - f"num_kv_splits Chosen by {heuristic_name} Policy", - ) - _annotate_heuristic_matrix(ax, matrix, batch_sizes, seq_lengths, optimal_splits) - _add_log2_colorbar(im, ax, "num_kv_splits", min_log2, max_log2) - - plt.tight_layout() - - safe_name = ( - heuristic_name.lower().replace(" ", "_").replace("(", "").replace(")", "") - ) - output_path = output_dir / f"numsplits_policy_{safe_name}.png" - plt.savefig(output_path, dpi=300, bbox_inches="tight") - print(f"Saved {heuristic_name} policy heatmap to {output_path}") - plt.close() - - -def _build_timings_lookup(results: list) -> dict[str, dict[int, float]]: - """Build lookup table of timings by batch_spec and num_splits.""" - by_batch_spec = {} - for result in results: - batch_spec = result["config"]["batch_spec"] - if batch_spec not in by_batch_spec: - by_batch_spec[batch_spec] = {} - - if result["error"] is None and "mean_time" in result: - backend_name = result["config"]["backend"] - match = re.search(r"numsplits_(\d+)", backend_name) - if match: - num_splits = int(match.group(1)) - by_batch_spec[batch_spec][num_splits] = result["mean_time"] - return by_batch_spec - - -def create_heuristic_speedup_heatmaps( - results: list, optimal_splits: dict[tuple[int, int], int], output_dir: Path -): - """Create speedup heatmaps for each heuristic policy.""" - heuristics = { - "Ratio-based (Original)": heuristic_ratio_based, - "Batch-based (improved)": heuristic_batch_based, - "Constant (batch<=32)": heuristic_constant, - } - - by_batch_spec = _build_timings_lookup(results) - batch_sizes, seq_lengths = _get_axes_from_splits_dict(optimal_splits) - - for heuristic_name, heuristic_func in heuristics.items(): - 
speedup_matrix = np.zeros((len(seq_lengths), len(batch_sizes))) - total_speedup = 0.0 - count = 0 - - for i, seq_len in enumerate(seq_lengths): - for j, batch_size in enumerate(batch_sizes): - batch_spec = f"{batch_size}q1s{seq_len}k" - timings = by_batch_spec.get(batch_spec, {}) - baseline_time = timings.get(1) - - if baseline_time: - predicted_splits = heuristic_func(batch_size, seq_len) - predicted_time = timings.get(predicted_splits, baseline_time) - speedup = baseline_time / predicted_time - speedup_matrix[i, j] = speedup - total_speedup += speedup - count += 1 - else: - speedup_matrix[i, j] = np.nan - - avg_speedup = total_speedup / count if count > 0 else 1.0 - - _fig, ax = plt.subplots(figsize=(12, 8)) - - max_speedup = np.nanmax(speedup_matrix) - speedup_cmap = _create_speedup_colormap() - im = ax.imshow( - speedup_matrix, - cmap=speedup_cmap, - aspect="auto", - vmin=1.0, - vmax=max_speedup, - ) - - _setup_heatmap_axes( - ax, - batch_sizes, - seq_lengths, - f"Speedup with {heuristic_name} Policy\n" - f"(Average speedup: {avg_speedup:.3f}x vs. 
splits=1)", - ) - _annotate_speedup_matrix(ax, speedup_matrix, batch_sizes, seq_lengths) - - cbar = plt.colorbar(im, ax=ax) - cbar.set_label("Speedup Factor", rotation=270, labelpad=20, fontsize=12) - - plt.tight_layout() - - safe_name = ( - heuristic_name.lower().replace(" ", "_").replace("(", "").replace(")", "") - ) - output_path = output_dir / f"numsplits_speedup_{safe_name}.png" - plt.savefig(output_path, dpi=300, bbox_inches="tight") - print(f"Saved {heuristic_name} speedup heatmap to {output_path}") - plt.close() - - -def create_auto_heatmap(results: list, output_path: str): - """Create heatmap showing num_kv_splits chosen by auto policy.""" - # Find all configs with auto results - auto_configs = set() - for result in results: - if "auto" in result["config"]["backend"] and result["error"] is None: - batch_spec = result["config"]["batch_spec"] - batch_size, seq_length_k = parse_batch_spec(batch_spec) - auto_configs.add((batch_size, seq_length_k)) - - if not auto_configs: - print("Skipping auto heatmap (no auto results found)") - return - - batch_sizes, seq_lengths = _get_axes_from_splits_dict({k: 1 for k in auto_configs}) - matrix = np.zeros((len(seq_lengths), len(batch_sizes))) - for i, seq_len in enumerate(seq_lengths): - for j, batch_size in enumerate(batch_sizes): - matrix[i, j] = 1 if (batch_size, seq_len) in auto_configs else np.nan - - _fig, ax = plt.subplots(figsize=(12, 8)) - - cmap = ListedColormap(["#2ca02c"]) # Green for auto - _im = ax.imshow(matrix, cmap=cmap, aspect="auto", vmin=0, vmax=1) - - _setup_heatmap_axes( - ax, batch_sizes, seq_lengths, "Auto num_kv_splits Policy Coverage" - ) - - # Add "AUTO" text annotations - for i in range(len(seq_lengths)): - for j in range(len(batch_sizes)): - if not np.isnan(matrix[i, j]): - ax.text( - j, - i, - "AUTO", - ha="center", - va="center", - color="white", - fontsize=10, - fontweight="bold", - ) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight") - print(f"Saved auto heatmap 
to {output_path}") - plt.close() - - -def create_auto_speedup_heatmap(results: list, output_path: str): - """Create heatmap showing speedup from auto vs splits=1.""" - # Build speedup dictionary - speedup_dict = {} - timings_by_spec = {} - - for result in results: - if result["error"] is not None or "mean_time" not in result: - continue - - batch_spec = result["config"]["batch_spec"] - backend_name = result["config"]["backend"] - - if batch_spec not in timings_by_spec: - timings_by_spec[batch_spec] = {} - - if "auto" in backend_name: - timings_by_spec[batch_spec]["auto"] = result["mean_time"] - elif backend_name.endswith("numsplits_1"): - timings_by_spec[batch_spec]["baseline"] = result["mean_time"] - - for batch_spec, timings in timings_by_spec.items(): - if "baseline" in timings and "auto" in timings: - batch_size, seq_length_k = parse_batch_spec(batch_spec) - speedup = timings["baseline"] / timings["auto"] - speedup_dict[(batch_size, seq_length_k)] = speedup - - if not speedup_dict: - print("Skipping auto speedup heatmap (no auto results found)") - return - - batch_sizes, seq_lengths = _get_axes_from_splits_dict(speedup_dict) - matrix = _create_splits_matrix(speedup_dict, batch_sizes, seq_lengths) - - _fig, ax = plt.subplots(figsize=(12, 8)) - - max_speedup = np.nanmax(matrix) - speedup_cmap = _create_speedup_colormap() - im = ax.imshow(matrix, cmap=speedup_cmap, aspect="auto", vmin=1.0, vmax=max_speedup) - - _setup_heatmap_axes( - ax, - batch_sizes, - seq_lengths, - "Speedup from Auto Policy vs. 
splits=1\n(Green = better with auto)", - ) - _annotate_speedup_matrix(ax, matrix, batch_sizes, seq_lengths) - - cbar = plt.colorbar(im, ax=ax) - cbar.set_label("Speedup Factor", rotation=270, labelpad=20, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight") - print(f"Saved auto speedup heatmap to {output_path}") - plt.close() - - -def analyze_pattern(optimal_splits: dict[tuple[int, int], int]): - """Analyze the pattern and suggest a formula.""" - print("\n" + "=" * 80) - print("PATTERN ANALYSIS") - print("=" * 80) - - # Group by optimal split value - by_split_value = {} - for (batch, seq), split in optimal_splits.items(): - if split not in by_split_value: - by_split_value[split] = [] - by_split_value[split].append((batch, seq)) - - print("\nConfigurations grouped by optimal num_kv_splits:") - for split in sorted(by_split_value.keys()): - configs = by_split_value[split] - print(f"\n num_kv_splits = {split} ({len(configs)} configs):") - for batch, seq in sorted(configs)[:5]: # Show first 5 - print(f" - batch={batch:3d}, seq={seq:3d}k") - if len(configs) > 5: - print(f" ... 
and {len(configs) - 5} more") - - # Analyze ratio: seq_length / batch_size - print("\n" + "-" * 80) - print("Analysis of seq_length/batch_size ratio:") - print("-" * 80) - - ratio_by_split = {split: [] for split in by_split_value} - for (batch, seq), split in optimal_splits.items(): - ratio = seq / batch - ratio_by_split[split].append(ratio) - - print(f"\n{'Split':<8} {'Min Ratio':<12} {'Max Ratio':<12} {'Avg Ratio':<12}") - print("-" * 50) - for split in sorted(ratio_by_split.keys()): - ratios = ratio_by_split[split] - if ratios: - print( - f"{split:<8} {min(ratios):<12.1f} {max(ratios):<12.1f} " - f"{np.mean(ratios):<12.1f}" - ) - - # Test heuristics - print("\n" + "=" * 80) - print("HEURISTIC COMPARISON") - print("=" * 80) - - heuristics = { - "Ratio-based": heuristic_ratio_based, - "Batch-based (improved)": heuristic_batch_based, - "Constant (batch<=32)": heuristic_constant, - } - - for name, heuristic_func in heuristics.items(): - correct = 0 - total = 0 - mismatches = [] - - for (batch, seq), actual_split in optimal_splits.items(): - predicted_split = heuristic_func(batch, seq) - total += 1 - if predicted_split == actual_split: - correct += 1 - else: - mismatches.append((batch, seq, predicted_split, actual_split)) - - accuracy = 100 * correct / total - print(f"\n{name}:") - print(f" Accuracy: {correct}/{total} = {accuracy:.1f}%") - - if mismatches and len(mismatches) <= 10: - print(" Mismatches:") - for batch, seq, pred, actual in mismatches: - print( - f" batch={batch:3d}, seq={seq:3d}k -> " - f"predicted={pred}, actual={actual}" - ) - elif mismatches: - print(f" {len(mismatches)} mismatches (showing first 5):") - for batch, seq, pred, actual in mismatches[:5]: - print( - f" batch={batch:3d}, seq={seq:3d}k -> " - f"predicted={pred}, actual={actual}" - ) - - print("\n" + "=" * 80 + "\n") - - -def main(): - if len(sys.argv) < 2: - print("Usage: python visualize_numsplits.py ") - sys.exit(1) - - json_path = sys.argv[1] - output_dir = Path(json_path).parent - - 
print(f"Loading results from {json_path}...") - results = load_results(json_path) - - print("Extracting optimal splits (excluding auto)...") - optimal_splits = extract_optimal_splits(results, exclude_auto=True) - - print(f"Found {len(optimal_splits)} configurations") - - # Create visualizations - print("\nGenerating visualizations...") - - print("\n--- Manual Configuration Plots (excluding auto) ---") - create_heatmap(optimal_splits, str(output_dir / "numsplits_heatmap.png")) - create_performance_heatmap( - results, str(output_dir / "numsplits_speedup.png"), exclude_auto=True - ) - create_heuristic_policy_heatmaps(optimal_splits, output_dir) - create_heuristic_speedup_heatmaps(results, optimal_splits, output_dir) - - print("\n--- Auto Policy Plots ---") - create_auto_heatmap(results, str(output_dir / "numsplits_heatmap_auto.png")) - create_auto_speedup_heatmap(results, str(output_dir / "numsplits_speedup_auto.png")) - - # Analyze pattern - analyze_pattern(optimal_splits) - - print("\nDone! 
Check the output directory for visualization files.") - - -if __name__ == "__main__": - main() From 0a3f9877d18f54eac31ef87bc53cb8423c287d95 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 19:41:36 +0000 Subject: [PATCH 36/45] Refactor and simplify Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/benchmark.py | 4 +- benchmarks/attention_benchmarks/runner.py | 392 +++++++------------ 2 files changed, 144 insertions(+), 252 deletions(-) diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index bfdbba1d7e8b..004dcc9521a3 100644 --- a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -48,9 +48,9 @@ def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: """Run standard attention benchmark (Flash/Triton/FlashInfer).""" - from runner import run_attention_benchmark_impl + from runner import run_attention_benchmark - return run_attention_benchmark_impl(config) + return run_attention_benchmark(config) def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult: diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index 444f7c90fc28..68b385b70a63 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -8,14 +8,25 @@ (FlashAttention, Triton, FlashInfer) with real vLLM integration. 
""" +import types + import numpy as np import torch from batch_spec import parse_batch_spec, reorder_for_flashinfer from common import BenchmarkConfig, BenchmarkResult, MockLayer, get_attention_scale +from vllm.config import ( + CacheConfig, + CompilationConfig, + DeviceConfig, + LoadConfig, + ModelConfig, + ParallelConfig, + SchedulerConfig, + VllmConfig, +) from vllm.v1.attention.backends.utils import CommonAttentionMetadata -from vllm.v1.kv_cache_interface import AttentionSpec -from vllm.v1.worker.block_table import BlockTable +from vllm.v1.kv_cache_interface import FullAttentionSpec # ============================================================================ # Backend Configuration @@ -47,18 +58,6 @@ def _get_backend_config(backend: str) -> dict: - """ - Get backend configuration. - - Args: - backend: Backend name (flash, triton, flashinfer) - - Returns: - Backend configuration dict - - Raises: - ValueError: If backend is unknown - """ if backend not in _BACKEND_CONFIG: raise ValueError( f"Unknown backend: {backend}. " @@ -72,124 +71,120 @@ def _get_backend_config(backend: str) -> dict: # ============================================================================ -def _build_attention_metadata( - requests: list, +def _build_common_attn_metadata( + q_lens: list[int], + kv_lens: list[int], block_size: int, device: torch.device, -) -> tuple: - """ - Build CommonAttentionMetadata from batch requests. 
- - Args: - requests: List of BatchRequest objects - block_size: KV cache block size - device: Target device - - Returns: - Tuple of (metadata, slot_mapping, max_num_blocks) - """ - q_lens = [r.q_len for r in requests] - kv_lens = [r.kv_len for r in requests] - total_q = sum(q_lens) - max_kv = max(kv_lens) - - # Build query start locations - q_start_cpu = np.array( - [0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))], - dtype=np.int32, - ) - q_start_gpu = torch.from_numpy(q_start_cpu).to(device) - - # Build sequence lengths - seq_lens_cpu = np.array(kv_lens, dtype=np.int32) - seq_lens_gpu = torch.from_numpy(seq_lens_cpu).to(device) - - # Build num_computed_tokens (context length before new query) - computed_tokens_cpu = np.array( - [kv - q for kv, q in zip(kv_lens, q_lens)], - dtype=np.int32, +) -> CommonAttentionMetadata: + """Build CommonAttentionMetadata from query/kv lengths.""" + batch_size = len(q_lens) + total_tokens = sum(q_lens) + + query_start_loc = torch.zeros(batch_size + 1, dtype=torch.int32, device=device) + query_start_loc[1:] = torch.tensor(q_lens, dtype=torch.int32, device=device).cumsum( + 0 ) - - # Build block table - num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens] - max_num_blocks = max(num_blocks_per_req) - - block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32) - for i, num_blocks in enumerate(num_blocks_per_req): - block_table_cpu[i, :num_blocks] = np.arange(num_blocks, dtype=np.int32) - block_table_gpu = torch.from_numpy(block_table_cpu).to(device) - - # Build slot mapping (maps each token to its KV cache slot) - slot_mapping_list = [] - for i, (q_len, kv_len, num_blocks) in enumerate( - zip(q_lens, kv_lens, num_blocks_per_req) - ): - context_len = kv_len - q_len - for j in range(q_len): - token_kv_idx = context_len + j - block_idx = token_kv_idx // block_size - offset_in_block = token_kv_idx % block_size - global_block_id = block_table_cpu[i, block_idx] - slot_id = global_block_id * 
block_size + offset_in_block - slot_mapping_list.append(slot_id) - - slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device) - - # Create CommonAttentionMetadata - metadata = CommonAttentionMetadata( - query_start_loc=q_start_gpu, - query_start_loc_cpu=torch.from_numpy(q_start_cpu), - seq_lens=seq_lens_gpu, - seq_lens_cpu=torch.from_numpy(seq_lens_cpu), - num_computed_tokens_cpu=torch.from_numpy(computed_tokens_cpu), - num_reqs=len(requests), - num_actual_tokens=total_q, - max_query_len=max(q_lens), - max_seq_len=max_kv, - block_table_tensor=block_table_gpu, + query_start_loc_cpu = query_start_loc.cpu() + + seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device) + seq_lens_cpu = seq_lens.cpu() + max_seq_len = int(seq_lens_cpu.max()) + + context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)] + num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32) + + max_blocks = (max(kv_lens) + block_size - 1) // block_size + num_blocks = batch_size * max_blocks + block_table_tensor = torch.arange( + num_blocks, dtype=torch.int32, device=device + ).view(batch_size, max_blocks) + slot_mapping = torch.arange(total_tokens, dtype=torch.int64, device=device) + + max_query_len = max(q_lens) + + return CommonAttentionMetadata( + query_start_loc=query_start_loc, + query_start_loc_cpu=query_start_loc_cpu, + seq_lens=seq_lens, + seq_lens_cpu=seq_lens_cpu, + num_computed_tokens_cpu=num_computed_tokens_cpu, + num_reqs=batch_size, + num_actual_tokens=total_tokens, + max_query_len=max_query_len, + max_seq_len=max_seq_len, + block_table_tensor=block_table_tensor, slot_mapping=slot_mapping, + causal=True, ) - return metadata, slot_mapping, max_num_blocks - -def _build_block_table( - requests: list, - kv_lens: list[int], - block_size: int, - total_q: int, +def _create_vllm_config( + config: BenchmarkConfig, + dtype: torch.dtype, max_num_blocks: int, - device: torch.device, -) -> BlockTable: - """ - Build BlockTable for metadata builder. 
+) -> VllmConfig: + """Create a VllmConfig for benchmarking with mock model methods.""" + model_config = ModelConfig( + model="meta-llama/Meta-Llama-3-8B", + tokenizer="meta-llama/Meta-Llama-3-8B", + trust_remote_code=False, + dtype=dtype, + seed=0, + max_model_len=1024, + ) - Args: - requests: List of BatchRequest objects - kv_lens: List of KV sequence lengths - block_size: KV cache block size - total_q: Total number of query tokens - max_num_blocks: Maximum number of blocks per request - device: Target device + cache_config = CacheConfig( + block_size=config.block_size, + cache_dtype="auto", + swap_space=0, + ) + cache_config.num_gpu_blocks = max_num_blocks + cache_config.num_cpu_blocks = 0 + + parallel_config = ParallelConfig(tensor_parallel_size=1) + scheduler_config = SchedulerConfig( + max_num_seqs=256, + max_num_batched_tokens=8192, + enable_chunked_prefill=True, + ) + device_config = DeviceConfig() + load_config = LoadConfig() + compilation_config = CompilationConfig() - Returns: - BlockTable instance - """ - bt = BlockTable( - block_size=block_size, - max_num_reqs=len(requests), - max_num_blocks_per_req=max_num_blocks, - max_num_batched_tokens=total_q, - pin_memory=False, - device=device, - kernel_block_size=block_size, - cp_kv_cache_interleave_size=1, + # Add mock methods for benchmark config values + model_config.get_num_layers = types.MethodType( + lambda self: config.num_layers, model_config + ) + model_config.get_sliding_window_for_layer = types.MethodType( + lambda self, i: None, model_config + ) + model_config.get_logits_soft_cap_for_layer = types.MethodType( + lambda self, i: 0.0, model_config + ) + model_config.get_sm_scale_for_layer = types.MethodType( + lambda self, i: 1.0 / config.head_dim**0.5, model_config + ) + model_config.get_num_attention_heads = types.MethodType( + lambda self, parallel_config=None: config.num_q_heads, model_config + ) + model_config.get_num_kv_heads = types.MethodType( + lambda self, parallel_config=None: 
config.num_kv_heads, model_config + ) + model_config.get_head_size = types.MethodType( + lambda self: config.head_dim, model_config + ) + model_config.get_sliding_window = types.MethodType(lambda self: None, model_config) + + return VllmConfig( + model_config=model_config, + cache_config=cache_config, + parallel_config=parallel_config, + scheduler_config=scheduler_config, + device_config=device_config, + load_config=load_config, + compilation_config=compilation_config, ) - for i in range(len(requests)): - num_blocks = (kv_lens[i] + block_size - 1) // block_size - bt.add_row(list(range(num_blocks)), i) - bt.commit(len(requests)) - return bt # ============================================================================ @@ -202,30 +197,15 @@ def _create_backend_impl( config: BenchmarkConfig, device: torch.device, ): - """ - Create backend implementation instance. - - Args: - backend_cfg: Backend configuration dict - config: BenchmarkConfig instance - device: Target device - - Returns: - Tuple of (backend_class, impl, layer, dtype) - """ - # Import backend class + """Create backend implementation instance.""" import importlib backend_module = importlib.import_module(backend_cfg["module"]) backend_class = getattr(backend_module, backend_cfg["backend_class"]) - # Calculate scale scale = get_attention_scale(config.head_dim) - - # Get dtype dtype = backend_cfg["dtype"] - # Create attention impl impl = backend_class.get_impl_cls()( num_heads=config.num_q_heads, head_size=config.head_dim, @@ -236,17 +216,13 @@ def _create_backend_impl( kv_cache_dtype="auto", ) - # Create KV cache spec for MockLayer - from vllm.v1.kv_cache_interface import AttentionSpec - - kv_cache_spec = AttentionSpec( + kv_cache_spec = FullAttentionSpec( block_size=config.block_size, num_kv_heads=config.num_kv_heads, head_size=config.head_dim, dtype=dtype, ) - # Create mock layer layer = MockLayer(device, kv_cache_spec=kv_cache_spec) return backend_class, impl, layer, dtype @@ -254,53 +230,18 @@ def 
_create_backend_impl( def _create_metadata_builder( backend_class, - common_metadata: CommonAttentionMetadata, - block_table: BlockTable, - config: BenchmarkConfig, - dtype: torch.dtype, + kv_cache_spec: FullAttentionSpec, + vllm_config: VllmConfig, device: torch.device, ): - """ - Create metadata builder instance. - - Args: - backend_class: Backend class - common_metadata: CommonAttentionMetadata instance - block_table: BlockTable instance - config: BenchmarkConfig instance - dtype: Tensor dtype - device: Target device - - Returns: - Built attention metadata - """ - # Create mock runner for builder - from common import MockRunner - - runner = MockRunner( - seq_lens=common_metadata.seq_lens_cpu.numpy(), - query_start_locs=common_metadata.query_start_loc_cpu.numpy(), + """Create metadata builder instance.""" + return backend_class.get_builder_cls()( + kv_cache_spec=kv_cache_spec, + layer_names=["layer_0"], + vllm_config=vllm_config, device=device, - num_q_heads=config.num_q_heads, - num_kv_heads=config.num_kv_heads, - head_dim=config.head_dim, - dtype=dtype, ) - # Create metadata builder - builder = backend_class.get_builder_cls()( - runner=runner, - kv_cache_spec=AttentionSpec( - block_size=config.block_size, - num_kv_heads=config.num_kv_heads, - head_size=config.head_dim, - dtype=dtype, - ), - block_table=block_table, - ) - - return builder - # ============================================================================ # Tensor Creation Helpers @@ -313,18 +254,7 @@ def _create_input_tensors( device: torch.device, dtype: torch.dtype, ) -> tuple: - """ - Create Q, K, V input tensors for all layers. 
- - Args: - config: BenchmarkConfig instance - total_q: Total number of query tokens - device: Target device - dtype: Tensor dtype - - Returns: - Tuple of (q_list, k_list, v_list) - """ + """Create Q, K, V input tensors for all layers.""" q_list = [ torch.randn( total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype @@ -353,19 +283,7 @@ def _create_kv_cache( device: torch.device, dtype: torch.dtype, ) -> list: - """ - Create KV cache tensors for all layers. - - Args: - config: BenchmarkConfig instance - max_num_blocks: Maximum number of blocks - cache_layout: Cache layout type ("standard" or "flashinfer") - device: Target device - dtype: Tensor dtype - - Returns: - List of KV cache tensors (one per layer) - """ + """Create KV cache tensors for all layers.""" if cache_layout == "flashinfer": # FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim] cache_list = [ @@ -414,25 +332,7 @@ def _run_single_benchmark( device: torch.device, dtype: torch.dtype, ) -> tuple: - """ - Run single benchmark iteration with warmup and timing loop. 
- - Args: - config: BenchmarkConfig instance - impl: Backend implementation instance - layer: MockLayer instance - q_list: List of Q tensors - k_list: List of K tensors - v_list: List of V tensors - cache_list: List of KV cache tensors - attn_metadata: Attention metadata - device: Target device - dtype: Tensor dtype - - Returns: - Tuple of (times, mem_stats) - """ - # Create output buffer + """Run single benchmark iteration with warmup and timing loop.""" total_q = q_list[0].shape[0] out = torch.empty( total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype @@ -475,7 +375,6 @@ def _run_single_benchmark( elapsed_ms = start.elapsed_time(end) times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer - # Memory stats mem_stats = {} if config.profile_memory: mem_stats = { @@ -506,58 +405,52 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: device = torch.device(config.device) torch.cuda.set_device(device) - # Get backend configuration backend_cfg = _get_backend_config(config.backend) - # Parse batch spec requests = parse_batch_spec(config.batch_spec) - # Reorder for FlashInfer if needed if config.backend == "flashinfer": requests = reorder_for_flashinfer(requests) - # Extract dimensions q_lens = [r.q_len for r in requests] kv_lens = [r.kv_len for r in requests] total_q = sum(q_lens) + max_kv = max(kv_lens) - # Build common metadata - common_metadata, slot_mapping, max_num_blocks = _build_attention_metadata( - requests, config.block_size, device - ) + max_num_blocks = (max_kv + config.block_size - 1) // config.block_size - # Create backend impl, layer, and dtype backend_class, impl, layer, dtype = _create_backend_impl( backend_cfg, config, device ) - # Build block table - block_table = _build_block_table( - requests, kv_lens, config.block_size, total_q, max_num_blocks, device + common_metadata = _build_common_attn_metadata( + q_lens, kv_lens, config.block_size, device + ) + + kv_cache_spec = FullAttentionSpec( + 
block_size=config.block_size, + num_kv_heads=config.num_kv_heads, + head_size=config.head_dim, + dtype=dtype, ) - # Create metadata builder and build metadata + vllm_config = _create_vllm_config(config, dtype, max_num_blocks) + builder = _create_metadata_builder( - backend_class, common_metadata, block_table, config, dtype, device + backend_class, kv_cache_spec, vllm_config, device ) attn_metadata = builder.build( - num_reqs=len(requests), - num_actual_tokens=total_q, - max_query_len=max(q_lens), common_prefix_len=0, common_attn_metadata=common_metadata, ) - # Create input tensors q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype) - # Create KV cache cache_list = _create_kv_cache( config, max_num_blocks, backend_cfg["cache_layout"], device, dtype ) - # Run benchmark times, mem_stats = _run_single_benchmark( config, impl, @@ -571,7 +464,6 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: dtype, ) - # Calculate throughput mean_time = np.mean(times) throughput = total_q / mean_time if mean_time > 0 else 0 From 332df87b46e1c3f9555179aa657faa3661204a75 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 19:52:20 +0000 Subject: [PATCH 37/45] Fix MLA Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/mla_runner.py | 45 ++----------------- 1 file changed, 3 insertions(+), 42 deletions(-) diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index d4818efa7374..023291f04015 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -143,60 +143,21 @@ def create_minimal_vllm_config( gpu_memory_utilization=0.9, swap_space=0, cache_dtype="auto", - num_gpu_blocks=None, - num_cpu_blocks=None, - sliding_window=None, enable_prefix_caching=False, - cpu_offload_gb=0, ) scheduler_config = SchedulerConfig( - task="auto", max_num_seqs=max_num_seqs, - max_num_batched_tokens=None, + 
max_num_batched_tokens=8192, max_model_len=32768, - num_scheduler_steps=1, - multi_step_stream_outputs=False, - enable_chunked_prefill=None, - preemption_mode="swap", - num_lookahead_slots=0, - delay_factor=0.0, - enable_prefix_caching=False, - policy="fcfs", - send_delta_data=False, + enable_chunked_prefill=True, ) parallel_config = ParallelConfig( - pipeline_parallel_size=1, tensor_parallel_size=1, - worker_cls="auto", - max_parallel_loading_workers=None, - disable_custom_all_reduce=False, - tokenizer_pool_config=None, - ray_workers_use_nsight=False, - placement_group=None, - distributed_executor_backend=None, ) - compilation_config = CompilationConfig( - level=0, - backend="", - custom_ops=[], - splitting_ops=[], - use_inductor=True, - enable_fusion=True, - use_cudagraph=False, - cudagraph_num_of_warmups=0, - cudagraph_capture_sizes=None, - cudagraph_copy_inputs=False, - use_cudagraph_for_prefill=False, - enabled_custom_ops=None, - disabled_custom_ops=None, - inductor_compile_sizes=[], - inductor_compile_config={}, - inductor_passes={}, - cudagraph_backend="flashinfer", - ) + compilation_config = CompilationConfig() return VllmConfig( model_config=model_config, From 73de818c2e607f126980bab4fdf4eca9c57d64e5 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 20:40:42 +0000 Subject: [PATCH 38/45] Fix Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/mla_runner.py | 18 +++++++++++------- benchmarks/attention_benchmarks/runner.py | 2 ++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index 023291f04015..2f8725738222 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -91,7 +91,7 @@ def create_minimal_vllm_config( "num_attention_heads": mla_dims["num_q_heads"], "num_key_value_heads": mla_dims["num_kv_heads"], "hidden_size": mla_dims["head_dim"] * 
mla_dims["num_q_heads"], - "torch_dtype": "float16", + "torch_dtype": "bfloat16", "max_position_embeddings": 163840, # DeepSeek V3 default "rope_theta": 10000.0, "vocab_size": 128256, @@ -110,7 +110,7 @@ def create_minimal_vllm_config( tokenizer=None, tokenizer_mode="auto", trust_remote_code=True, - dtype="float16", + dtype="bfloat16", seed=0, max_model_len=32768, quantization=None, @@ -150,6 +150,7 @@ def create_minimal_vllm_config( max_num_seqs=max_num_seqs, max_num_batched_tokens=8192, max_model_len=32768, + is_encoder_decoder=False, enable_chunked_prefill=True, ) @@ -187,6 +188,9 @@ def create_minimal_vllm_config( "query_format": "concat", # Single concatenated tensor (vs tuple) "block_size": 64, # FlashMLA uses fixed block size }, + "flashinfer_mla": { + "block_size": 64, # FlashInfer MLA only supports 32 or 64 + }, } @@ -306,8 +310,8 @@ def _build_attention_metadata( query_start_loc=q_start_gpu, query_start_loc_cpu=q_start_cpu, seq_lens=seq_lens_gpu, - seq_lens_cpu=seq_lens_cpu, - num_computed_tokens_cpu=num_computed_tokens_cpu, + _seq_lens_cpu=seq_lens_cpu, + _num_computed_tokens_cpu=num_computed_tokens_cpu, slot_mapping=slot_mapping, block_table_tensor=block_table_gpu, dcp_local_seq_lens=None, @@ -489,7 +493,7 @@ def _create_backend_impl( block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size, num_kv_heads=1, # MLA uses 1 KV head head_size=576, # MLA head dim - dtype=torch.float16, + dtype=torch.bfloat16, ) # Create mock layer @@ -611,7 +615,7 @@ def _run_single_benchmark( block_size, mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"], device=device, - dtype=torch.float16, + dtype=torch.bfloat16, ) # Create input tensors for both decode and prefill modes @@ -620,7 +624,7 @@ def _run_single_benchmark( mla_dims, backend_cfg["query_format"], device, - torch.float16, + torch.bfloat16, ) # Determine which forward method to use based on metadata diff --git a/benchmarks/attention_benchmarks/runner.py 
b/benchmarks/attention_benchmarks/runner.py index 68b385b70a63..bf08a1550c0c 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -146,6 +146,8 @@ def _create_vllm_config( scheduler_config = SchedulerConfig( max_num_seqs=256, max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, enable_chunked_prefill=True, ) device_config = DeviceConfig() From 2d69eaf21f8a926e925ee87213dc6bda4eb03666 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 20:49:47 +0000 Subject: [PATCH 39/45] Fix Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/mla_runner.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index 2f8725738222..d533263aa0c8 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -413,11 +413,19 @@ def _create_input_tensors( ) k_scale = torch.ones(1, device=device, dtype=torch.float32) + output = torch.zeros( + total_q, + mla_dims["num_q_heads"] * mla_dims["v_head_dim"], + device=device, + dtype=dtype, + ) + prefill_inputs = { "q": prefill_q, "k_c_normed": k_c_normed, "k_pe": k_pe, "k_scale": k_scale, + "output": output, } return decode_inputs, prefill_inputs @@ -640,6 +648,7 @@ def _run_single_benchmark( kv_cache, metadata, prefill_inputs["k_scale"], + prefill_inputs["output"], ) else: raise RuntimeError("Metadata has neither decode nor prefill metadata") From 2cdfbca8b2242a5810ff4ae819e1a486ea9537a1 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 20:56:15 +0000 Subject: [PATCH 40/45] Fix Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/mla_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index d533263aa0c8..eefef2ec6ee0 100644 --- 
a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -490,7 +490,7 @@ def _create_backend_impl( ) # Initialize DCP attributes - if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size is None: + if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1): impl.dcp_world_size = 1 impl.dcp_rank = 0 From b9d9573e8d81a5aa6f85d63c051022ee9b1d2731 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 21:54:23 +0000 Subject: [PATCH 41/45] Cleanup Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/__init__.py | 6 +- benchmarks/attention_benchmarks/batch_spec.py | 26 --- benchmarks/attention_benchmarks/benchmark.py | 167 +++++++----------- benchmarks/attention_benchmarks/common.py | 78 ++------ benchmarks/attention_benchmarks/mla_runner.py | 36 ++-- .../attention_benchmarks/test_batch_spec.py | 20 --- 6 files changed, 105 insertions(+), 228 deletions(-) diff --git a/benchmarks/attention_benchmarks/__init__.py b/benchmarks/attention_benchmarks/__init__.py index 617ea863499d..df7a6328569d 100644 --- a/benchmarks/attention_benchmarks/__init__.py +++ b/benchmarks/attention_benchmarks/__init__.py @@ -8,18 +8,17 @@ format_batch_spec, get_batch_stats, parse_batch_spec, - parse_manual_batch, reorder_for_flashinfer, split_by_type, ) from .common import ( BenchmarkConfig, BenchmarkResult, - BenchmarkRunner, MockLayer, MockModelConfig, ResultsFormatter, get_attention_scale, + is_mla_backend, setup_mla_dims, ) @@ -27,7 +26,6 @@ # Batch specification "BatchRequest", "parse_batch_spec", - "parse_manual_batch", "format_batch_spec", "reorder_for_flashinfer", "split_by_type", @@ -35,7 +33,6 @@ # Benchmarking infrastructure "BenchmarkConfig", "BenchmarkResult", - "BenchmarkRunner", "ResultsFormatter", # Mock objects "MockLayer", @@ -43,4 +40,5 @@ # Utilities "setup_mla_dims", "get_attention_scale", + "is_mla_backend", ] diff --git a/benchmarks/attention_benchmarks/batch_spec.py 
b/benchmarks/attention_benchmarks/batch_spec.py index 0b981ecc099c..41681796e2e6 100644 --- a/benchmarks/attention_benchmarks/batch_spec.py +++ b/benchmarks/attention_benchmarks/batch_spec.py @@ -64,32 +64,6 @@ def as_tuple(self) -> tuple[int, int]: return (self.q_len, self.kv_len) -def parse_manual_batch(batch_args: list[str]) -> list[BatchRequest]: - """ - Parse manual batch pairs ['q,kv', ...] into list of BatchRequest. - - Args: - batch_args: List of strings in format "q_len,kv_len" - - Returns: - List of BatchRequest objects - - Raises: - ValueError: If format is invalid or kv_len < q_len - """ - requests = [] - for s in batch_args: - try: - q_str, kv_str = s.split(",") - q, kv = int(q_str), int(kv_str) - if kv < q: - raise ValueError(f"kv_len ({kv}) must be >= q_len ({q})") - requests.append(BatchRequest(q_len=q, kv_len=kv)) - except Exception as e: - raise ValueError(f"Invalid batch pair '{s}': {e}") from e - return requests - - def _parse_size(size_str: str, k_suffix: str) -> int: """Parse size string with optional 'k' suffix.""" size = int(size_str) diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index 004dcc9521a3..ba11fca7452f 100644 --- a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -43,6 +43,7 @@ ModelParameterSweep, ParameterSweep, ResultsFormatter, + is_mla_backend, ) @@ -57,16 +58,34 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult: """Run MLA benchmark with appropriate backend.""" from mla_runner import run_mla_benchmark as run_mla - result_dict = run_mla(config.backend, config, **kwargs) + return run_mla(config.backend, config, **kwargs) - return BenchmarkResult( - config=config, - mean_time=result_dict["mean"], - std_time=result_dict["std"], - min_time=result_dict["min"], - max_time=result_dict["max"], - throughput_tokens_per_sec=result_dict["throughput"], - ) + +def run_benchmark(config: BenchmarkConfig, 
**kwargs) -> BenchmarkResult: + """ + Run a single benchmark with proper backend selection. + + Args: + config: BenchmarkConfig with backend, batch_spec, and model params + **kwargs: Additional arguments passed to MLA benchmarks + + Returns: + BenchmarkResult (may have error field set on failure) + """ + try: + if is_mla_backend(config.backend): + return run_mla_benchmark(config, **kwargs) + else: + return run_standard_attention_benchmark(config) + except Exception as e: + return BenchmarkResult( + config=config, + mean_time=float("inf"), + std_time=0, + min_time=float("inf"), + max_time=float("inf"), + error=str(e), + ) def run_model_parameter_sweep( @@ -105,45 +124,25 @@ def run_model_parameter_sweep( config_args = base_config_args.copy() config_args[sweep.param_name] = value - # Create descriptive backend name - backend_label = sweep.get_label(backend, value) - config = BenchmarkConfig( - backend=backend_label, batch_spec=spec, **config_args + # Create config with original backend for running + clean_config = BenchmarkConfig( + backend=backend, batch_spec=spec, **config_args ) - try: - # Create clean config with original backend name for actual run - clean_config = replace(config, backend=backend) - - # Determine if MLA backend - if backend in [ - "cutlass_mla", - "flashinfer_mla", - "flashattn_mla", - "flashmla", - ]: - result = run_mla_benchmark(clean_config) - else: - result = run_standard_attention_benchmark(clean_config) - - # Replace result's config with labeled version - result = replace(result, config=config) - all_results.append(result) + # Run benchmark + result = run_benchmark(clean_config) - except Exception as e: + # Replace backend with labeled version for display + backend_label = sweep.get_label(backend, value) + labeled_config = replace(result.config, backend=backend_label) + result = replace(result, config=labeled_config) + all_results.append(result) + + if not result.success: console.print( f"[red]Error {backend} {spec} {sweep.param_name}=" - 
f"{value}: {e}[/]" - ) - result = BenchmarkResult( - config=config, - mean_time=float("inf"), - std_time=0, - min_time=float("inf"), - max_time=float("inf"), - error=str(e), + f"{value}: {result.error}[/]" ) - all_results.append(result) pbar.update(1) @@ -288,50 +287,30 @@ def run_parameter_sweep( for backend in backends: for spec in batch_specs: for value in sweep_values: - # Create config with descriptive backend name - backend_label = sweep.get_label(backend, value) + # Create config with original backend for running config = BenchmarkConfig( - backend=backend_label, batch_spec=spec, **base_config_args + backend=backend, batch_spec=spec, **base_config_args ) - try: - # Create clean config with original backend name for actual run - clean_config = replace(config, backend=backend) - - # Prepare kwargs for benchmark runner - kwargs = {} - if value != "auto": - kwargs[sweep.param_name] = value - - # Determine if MLA backend - if backend in [ - "cutlass_mla", - "flashinfer_mla", - "flashattn_mla", - "flashmla", - ]: - result = run_mla_benchmark(clean_config, **kwargs) - else: - result = run_standard_attention_benchmark(clean_config) - - # Replace result's config with labeled version - result = replace(result, config=config) - all_results.append(result) + # Prepare kwargs for benchmark runner + kwargs = {} + if value != "auto": + kwargs[sweep.param_name] = value + + # Run benchmark + result = run_benchmark(config, **kwargs) + + # Replace backend with labeled version for display + backend_label = sweep.get_label(backend, value) + labeled_config = replace(result.config, backend=backend_label) + result = replace(result, config=labeled_config) + all_results.append(result) - except Exception as e: + if not result.success: console.print( f"[red]Error {backend} {spec} {sweep.param_name}=" - f"{value}: {e}[/]" - ) - result = BenchmarkResult( - config=config, - mean_time=float("inf"), - std_time=0, - min_time=float("inf"), - max_time=float("inf"), - error=str(e), + f"{value}: 
{result.error}[/]" ) - all_results.append(result) pbar.update(1) @@ -881,33 +860,11 @@ def main(): profile_memory=args.profile_memory, ) - try: - # Determine if MLA backend - if backend in [ - "cutlass_mla", - "flashinfer_mla", - "flashattn_mla", - "flashmla", - ]: - result = run_mla_benchmark(config) - else: - result = run_standard_attention_benchmark(config) - - all_results.append(result) - except Exception as e: - console.print(f"[red]Error {backend} {spec}: {e}[/]") - import traceback + result = run_benchmark(config) + all_results.append(result) - traceback.print_exc() - result = BenchmarkResult( - config=config, - mean_time=float("inf"), - std_time=0, - min_time=float("inf"), - max_time=float("inf"), - error=str(e), - ) - all_results.append(result) + if not result.success: + console.print(f"[red]Error {backend} {spec}: {result.error}[/]") pbar.update(1) diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py index 0d25489e5815..7155bdc3fc5b 100644 --- a/benchmarks/attention_benchmarks/common.py +++ b/benchmarks/attention_benchmarks/common.py @@ -6,7 +6,6 @@ import csv import json import math -import time from dataclasses import asdict, dataclass from pathlib import Path from typing import Any @@ -297,64 +296,6 @@ def to_dict(self) -> dict[str, Any]: } -class BenchmarkRunner: - """Base class for running attention benchmarks.""" - - def __init__(self, config: BenchmarkConfig): - self.config = config - self.device = torch.device(config.device) - torch.cuda.set_device(self.device) - - def run(self, **kwargs) -> BenchmarkResult: - """ - Run benchmark with current configuration. - - Returns: - BenchmarkResult with timing and memory statistics - """ - raise NotImplementedError - - def _time_kernel(self, fn, warmup: int = 3, repeats: int = 10) -> dict: - """ - Time a kernel function with warmup and multiple repeats. 
- - Args: - fn: Callable to time - warmup: Number of warmup iterations - repeats: Number of measurement iterations - - Returns: - Dict with timing statistics - """ - # Warmup - for _ in range(warmup): - fn() - torch.cuda.synchronize() - - # Measure - times = [] - for _ in range(repeats): - torch.cuda.synchronize() - start = time.time() - fn() - torch.cuda.synchronize() - times.append(time.time() - start) - - return { - "mean": np.mean(times), - "std": np.std(times), - "min": np.min(times), - "max": np.max(times), - } - - def _get_memory_stats(self) -> dict: - """Get current CUDA memory statistics.""" - return { - "allocated_mb": torch.cuda.memory_allocated(self.device) / 1024**2, - "reserved_mb": torch.cuda.memory_reserved(self.device) / 1024**2, - } - - class ResultsFormatter: """Format and display benchmark results.""" @@ -541,3 +482,22 @@ def setup_mla_dims(model_name: str = "deepseek-v3") -> dict: def get_attention_scale(head_dim: int) -> float: """Compute attention scale factor (1/sqrt(d)).""" return 1.0 / math.sqrt(head_dim) + + +def is_mla_backend(backend: str) -> bool: + """ + Check if backend is an MLA backend using the backend's is_mla() property. 
+ + Args: + backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA") + + Returns: + True if the backend is an MLA backend, False otherwise + """ + from vllm.v1.attention.backends.registry import AttentionBackendEnum + + try: + backend_class = AttentionBackendEnum[backend.upper()].get_class() + return backend_class.is_mla() + except (KeyError, ValueError, ImportError): + return False diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index eefef2ec6ee0..2c6c3aaac360 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -13,7 +13,13 @@ import numpy as np import torch from batch_spec import parse_batch_spec -from common import MockHfConfig, MockKVBProj, MockLayer, setup_mla_dims +from common import ( + BenchmarkResult, + MockHfConfig, + MockKVBProj, + MockLayer, + setup_mla_dims, +) from vllm.config import ( CacheConfig, @@ -588,7 +594,7 @@ def _run_single_benchmark( backend_cfg: dict, mla_dims: dict, device: torch.device, -) -> dict: +) -> BenchmarkResult: """ Run a single benchmark iteration. 
@@ -602,7 +608,7 @@ def _run_single_benchmark( device: Target device Returns: - Dict with timing statistics + BenchmarkResult with timing statistics """ # Parse batch spec requests = parse_batch_spec(config.batch_spec) @@ -673,19 +679,21 @@ def _run_single_benchmark( elapsed_ms = start.elapsed_time(end) times.append(elapsed_ms / 1000.0 / config.num_layers) - return { - "mean": np.mean(times), - "std": np.std(times), - "min": np.min(times), - "max": np.max(times), - "throughput": total_q / np.mean(times) if times else 0, - } + mean_time = float(np.mean(times)) + return BenchmarkResult( + config=config, + mean_time=mean_time, + std_time=float(np.std(times)), + min_time=float(np.min(times)), + max_time=float(np.max(times)), + throughput_tokens_per_sec=total_q / mean_time if mean_time > 0 else 0, + ) def _run_mla_benchmark_batched( backend: str, configs_with_params: list[tuple], # [(config, threshold, num_splits), ...] -) -> list[dict]: +) -> list[BenchmarkResult]: """ Unified batched MLA benchmark runner for all backends. @@ -701,7 +709,7 @@ def _run_mla_benchmark_batched( - num_splits: num_kv_splits (CUTLASS only) Returns: - List of dicts with timing statistics + List of BenchmarkResult objects """ if not configs_with_params: return [] @@ -785,7 +793,7 @@ def run_mla_benchmark( config, reorder_batch_threshold: int | None = None, num_kv_splits: int | None = None, -) -> dict: +) -> BenchmarkResult | list[BenchmarkResult]: """ Unified MLA benchmark runner for all backends. 
@@ -801,7 +809,7 @@ def run_mla_benchmark( num_kv_splits: Number of KV splits for CUTLASS (single config mode only) Returns: - Dict with timing statistics (single mode) or list of dicts (batched mode) + BenchmarkResult (single mode) or list of BenchmarkResult (batched mode) """ # Normalize to batched mode: (config, threshold, num_splits) if isinstance(config, list): diff --git a/benchmarks/attention_benchmarks/test_batch_spec.py b/benchmarks/attention_benchmarks/test_batch_spec.py index 84367d0ed2e9..05b7fe6bef1c 100644 --- a/benchmarks/attention_benchmarks/test_batch_spec.py +++ b/benchmarks/attention_benchmarks/test_batch_spec.py @@ -14,7 +14,6 @@ format_batch_spec, get_batch_stats, parse_batch_spec, - parse_manual_batch, ) from benchmark import generate_batch_specs_from_ranges @@ -117,18 +116,6 @@ def test_batch_stats(): ) -def test_manual_batch(): - """Test manual batch specification.""" - print("\nTesting manual batch...") - - requests = parse_manual_batch(["1,1024", "2048,2048", "1,2048"]) - assert len(requests) == 3 - assert requests[0].as_tuple() == (1, 1024) - assert requests[1].as_tuple() == (2048, 2048) - assert requests[2].as_tuple() == (1, 2048) - print(" ✓ Manual batch: 3 requests") - - def test_error_handling(): """Test error handling.""" print("\nTesting error handling...") @@ -139,12 +126,6 @@ def test_error_handling(): except ValueError: print(" ✓ Invalid spec raises ValueError") - try: - parse_manual_batch(["1024,512"]) # kv < q - raise AssertionError("Should have raised ValueError") - except ValueError: - print(" ✓ Invalid kv_len raises ValueError") - def test_range_generation_simple(): """Test simple range generation.""" @@ -255,7 +236,6 @@ def main(): test_extend_patterns() test_formatting() test_batch_stats() - test_manual_batch() test_error_handling() test_range_generation_simple() test_range_generation_multiple() From 9f3a76e4ba2028b2057059d2ee66ed8adba2485e Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 22:10:22 
+0000 Subject: [PATCH 42/45] Clean up Signed-off-by: Matthew Bonanni --- .../configs/cutlass_numsplits.yaml | 152 ------------------ .../configs/famla_vs_fmla.yaml | 142 ---------------- .../configs/flashinfer_vs_cutlass.yaml | 52 ------ 3 files changed, 346 deletions(-) delete mode 100644 benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml delete mode 100644 benchmarks/attention_benchmarks/configs/famla_vs_fmla.yaml delete mode 100644 benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml diff --git a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml b/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml deleted file mode 100644 index faeef5ffc09c..000000000000 --- a/benchmarks/attention_benchmarks/configs/cutlass_numsplits.yaml +++ /dev/null @@ -1,152 +0,0 @@ -# Study 1: What is the optimal CUTLASS_MLA num_kv_splits for different batch sizes? - -description: "CUTLASS MLA num-splits optimization study" - -# Single backend for this study -backend: cutlass_mla - -# Fine-grained matrix sweep: batch sizes × sequence lengths -# Batch sizes: 8, 16, 24, 32, 48, 64, 96, 128 -# Sequence lengths: 1k, 2k, 4k, 8k, 16k, 32k, 64k -batch_specs: - # Batch size: 1 - - "1q1s1k" - - "1q1s2k" - - "1q1s4k" - - "1q1s8k" - - "1q1s16k" - - "1q1s32k" - - "1q1s64k" - - "1q1s128k" - - # Batch size: 2 - - "2q1s1k" - - "2q1s2k" - - "2q1s4k" - - "2q1s8k" - - "2q1s16k" - - "2q1s32k" - - "2q1s64k" - - "2q1s128k" - - # Batch size: 4 - - "4q1s1k" - - "4q1s2k" - - "4q1s4k" - - "4q1s8k" - - "4q1s16k" - - "4q1s32k" - - "4q1s64k" - - "4q1s128k" - - # Batch size: 8 - - "8q1s1k" - - "8q1s2k" - - "8q1s4k" - - "8q1s8k" - - "8q1s16k" - - "8q1s32k" - - "8q1s64k" - - "8q1s128k" - - # Batch size: 16 - - "16q1s1k" - - "16q1s2k" - - "16q1s4k" - - "16q1s8k" - - "16q1s16k" - - "16q1s32k" - - "16q1s64k" - - "16q1s128k" - - # Batch size: 24 - - "24q1s1k" - - "24q1s2k" - - "24q1s4k" - - "24q1s8k" - - "24q1s16k" - - "24q1s32k" - - "24q1s64k" - - "24q1s128k" - - # Batch 
size: 32 - - "32q1s1k" - - "32q1s2k" - - "32q1s4k" - - "32q1s8k" - - "32q1s16k" - - "32q1s32k" - - "32q1s64k" - - "32q1s128k" - - # Batch size: 48 - - "48q1s1k" - - "48q1s2k" - - "48q1s4k" - - "48q1s8k" - - "48q1s16k" - - "48q1s32k" - - "48q1s64k" - - "48q1s128k" - - # Batch size: 64 - - "64q1s1k" - - "64q1s2k" - - "64q1s4k" - - "64q1s8k" - - "64q1s16k" - - "64q1s32k" - - "64q1s64k" - - "64q1s128k" - - # Batch size: 96 - - "96q1s1k" - - "96q1s2k" - - "96q1s4k" - - "96q1s8k" - - "96q1s16k" - - "96q1s32k" - - "96q1s64k" - - "96q1s128k" - - # Batch size: 128 - - "128q1s1k" - - "128q1s2k" - - "128q1s4k" - - "128q1s8k" - - "128q1s16k" - - "128q1s32k" - - "128q1s64k" - - "128q1s128k" - -# Unified parameter sweep configuration -parameter_sweep: - param_name: "num_kv_splits" - values: [1, 2, 4, 8, 16] - include_auto: true - label_format: "{backend}_numsplits_{value}" - -# Model configuration (DeepSeek V2/V3 defaults) -model: - num_layers: 10 - head_dim: 576 # MLA uses 576 (kv_lora_rank=512 + 64) - num_q_heads: 128 - num_kv_heads: 1 # MLA uses single KV head - block_size: 128 - -# Benchmark settings -benchmark: - device: "cuda:0" - repeats: 20 - warmup_iters: 5 - profile_memory: false - -# Output -output: - csv: "cutlass_numsplits_results.csv" - json: "cutlass_numsplits_results.json" - -# Expected outcome: -# - Identify if auto-selection heuristic is optimal -# - Determine if we should revert PRs #24966, #25509 -# - Find optimal num_kv_splits per batch configuration diff --git a/benchmarks/attention_benchmarks/configs/famla_vs_fmla.yaml b/benchmarks/attention_benchmarks/configs/famla_vs_fmla.yaml deleted file mode 100644 index 620cc2050c14..000000000000 --- a/benchmarks/attention_benchmarks/configs/famla_vs_fmla.yaml +++ /dev/null @@ -1,142 +0,0 @@ -# Study 2: Does head count matter for FlashAttn MLA vs FlashMLA on Hopper? -# Question: Which backend performs better on Hopper GPUs (SM90+)? -# Question: Does the number of attention heads affect relative performance? 
- -description: "FlashAttn MLA vs FlashMLA head count comparison on Hopper" - -# Compare these two Hopper backends -backends: - - flashattn_mla - - flashmla - -# Comprehensive batch spec matrix: batch sizes × query lengths -# Batch sizes (num requests): 1, 2, 4, 8, 16, 32, 64, 128 -# Query lengths: 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 -# KV cache length: 1k (fixed) -batch_specs: - # Batch size: 1 - - "1q1s1k" - - "1q2s1k" - - "1q4s1k" - - "1q8s1k" - - "1q16s1k" - - "1q32s1k" - - "1q64s1k" - - "1q128s1k" - - "1q256s1k" - - "1q512s1k" - - # Batch size: 2 - - "2q1s1k" - - "2q2s1k" - - "2q4s1k" - - "2q8s1k" - - "2q16s1k" - - "2q32s1k" - - "2q64s1k" - - "2q128s1k" - - "2q256s1k" - - "2q512s1k" - - # Batch size: 4 - - "4q1s1k" - - "4q2s1k" - - "4q4s1k" - - "4q8s1k" - - "4q16s1k" - - "4q32s1k" - - "4q64s1k" - - "4q128s1k" - - "4q256s1k" - - "4q512s1k" - - # Batch size: 8 - - "8q1s1k" - - "8q2s1k" - - "8q4s1k" - - "8q8s1k" - - "8q16s1k" - - "8q32s1k" - - "8q64s1k" - - "8q128s1k" - - "8q256s1k" - - "8q512s1k" - - # Batch size: 16 - - "16q1s1k" - - "16q2s1k" - - "16q4s1k" - - "16q8s1k" - - "16q16s1k" - - "16q32s1k" - - "16q64s1k" - - "16q128s1k" - - "16q256s1k" - # - "16q512s1k" - - # Batch size: 32 - - "32q1s1k" - - "32q2s1k" - - "32q4s1k" - - "32q8s1k" - - "32q16s1k" - - "32q32s1k" - - "32q64s1k" - - "32q128s1k" - - "32q256s1k" - # - "32q512s1k" - - # # Batch size: 64 - - "64q1s1k" - - "64q2s1k" - - "64q4s1k" - - "64q8s1k" - - "64q16s1k" - - "64q32s1k" - - "64q64s1k" - - "64q128s1k" - - "64q256s1k" - # - "64q512s1k" - - # # Batch size: 128 - - "128q1s1k" - - "128q2s1k" - - "128q4s1k" - - "128q8s1k" - - "128q16s1k" - - "128q32s1k" - - "128q64s1k" - - "128q128s1k" - - "128q256s1k" - # - "128q512s1k" - -# Model configuration -model: - num_layers: 10 - head_dim: 576 # MLA uses 576 - num_q_heads: 128 # Default value (will be overridden by sweep) - num_kv_heads: 1 # MLA uses single KV head - block_size: 64 - -# Model parameter sweep - test different head counts 
-model_parameter_sweep: - param_name: "num_q_heads" - values: [16, 32, 64, 128, 256] - label_format: "{backend}_heads_{value}" - -# Benchmark settings -benchmark: - device: "cuda:0" - repeats: 10 - warmup_iters: 5 - profile_memory: true # Track memory usage differences - -# Output -output: - csv: "hopper_head_count_results.csv" - json: "hopper_head_count_results.json" - -# Expected outcome: -# - Determine which backend is faster on Hopper -# - Identify if head count impacts relative performance -# - Inform backend selection for DeepSeek V2/V3 models diff --git a/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml b/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml deleted file mode 100644 index 76593c9b3fd8..000000000000 --- a/benchmarks/attention_benchmarks/configs/flashinfer_vs_cutlass.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# Study 3: Is FlashInfer-MLA better than CUTLASS MLA after num-splits optimization? - -description: "FlashInfer-MLA vs optimized CUTLASS MLA comparison" - -# Compare these two backends -backends: - - cutlass_mla - - flashinfer_mla - -# Test various decode workloads -batch_specs: - - "32q1s1k" # 32 decode requests, 1k KV cache - - "64q1s1k" # 64 decode requests, 1k KV cache - - "64q1s4k" # 64 decode requests, 4k KV cache - - "64q1s16k" # 64 decode requests, 16k KV cache - - "128q1s1k" # 128 decode requests, 1k KV cache - - "128q1s4k" # 128 decode requests, 4k KV cache - - "128q1s16k" # 128 decode requests, 16k KV cache - -# For CUTLASS, test optimized num_kv_splits -# Based on Study 1 results, you may want to adjust these values -parameter_sweep: - param_name: "num_kv_splits" - values: [4, 8, 16] # Often optimal for medium to large batches - include_auto: true # Also compare against auto-selection - label_format: "{backend}_numsplits_{value}" - -# Model configuration (DeepSeek V2/V3 defaults) -model: - num_layers: 10 - head_dim: 576 - num_q_heads: 128 - num_kv_heads: 1 - block_size: 128 - -# Benchmark settings 
-benchmark: - device: "cuda:0" - repeats: 10 - warmup_iters: 5 - profile_memory: true # Compare memory efficiency - -# Output -output: - csv: "flashinfer_vs_cutlass_results.csv" - json: "flashinfer_vs_cutlass_results.json" - -# Expected outcome: -# - Determine if FlashInfer-MLA is competitive with optimized CUTLASS -# - Identify which backend to use for different batch sizes -# - Assess memory efficiency trade-offs -# - Inform default backend selection strategy From b632ae06df89f55d4dd86d8dac571e2383768d37 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 22:15:19 +0000 Subject: [PATCH 43/45] Remove unused test Signed-off-by: Matthew Bonanni --- .../attention_benchmarks/test_batch_spec.py | 252 ------------------ 1 file changed, 252 deletions(-) delete mode 100644 benchmarks/attention_benchmarks/test_batch_spec.py diff --git a/benchmarks/attention_benchmarks/test_batch_spec.py b/benchmarks/attention_benchmarks/test_batch_spec.py deleted file mode 100644 index 05b7fe6bef1c..000000000000 --- a/benchmarks/attention_benchmarks/test_batch_spec.py +++ /dev/null @@ -1,252 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -"""Test suite for batch specification parser.""" - -import sys -from pathlib import Path - -# Add parent dir to path -sys.path.insert(0, str(Path(__file__).parent)) - -from batch_spec import ( - format_batch_spec, - get_batch_stats, - parse_batch_spec, -) -from benchmark import generate_batch_specs_from_ranges - - -def test_basic_patterns(): - """Test basic batch specification patterns.""" - print("Testing basic patterns...") - - # Prefill - result = parse_batch_spec("q2k") - assert len(result) == 1 - assert result[0].q_len == 2048 - assert result[0].kv_len == 2048 - assert result[0].is_prefill - print(" ✓ q2k -> [(2048, 2048)]") - - # Decode - result = parse_batch_spec("8q1s1k") - assert len(result) == 8 - assert all(r.q_len == 1 and r.kv_len 
== 1024 for r in result) - assert all(r.is_decode for r in result) - print(" ✓ 8q1s1k -> 8 x [(1, 1024)]") - - # Context extension - result = parse_batch_spec("q1ks2k") - assert len(result) == 1 - assert result[0].q_len == 1024 - assert result[0].kv_len == 2048 - assert result[0].is_extend - print(" ✓ q1ks2k -> [(1024, 2048)]") - - -def test_combined_patterns(): - """Test combined batch specifications.""" - print("\nTesting combined patterns...") - - result = parse_batch_spec("2q1k_32q1s1k") - assert len(result) == 34 - assert sum(1 for r in result if r.is_prefill) == 2 - assert sum(1 for r in result if r.is_decode) == 32 - print(" ✓ 2q1k_32q1s1k -> 2 prefill + 32 decode") - - result = parse_batch_spec("4q2k_8q4s1k_64q1s2k") - assert len(result) == 76 # 4 + 8 + 64 - print(" ✓ 4q2k_8q4s1k_64q1s2k -> complex mix") - - -def test_extend_patterns(): - """Test context extension (extend) patterns.""" - print("\nTesting extend patterns...") - - # 4-token extension with 1k context - result = parse_batch_spec("q4s1k") - assert len(result) == 1 - assert result[0].q_len == 4 - assert result[0].kv_len == 1024 - assert result[0].is_extend - assert not result[0].is_decode - assert not result[0].is_prefill - print(" ✓ q4s1k -> 4-token extend with 1k context") - - # 8 requests of 8-token extension - result = parse_batch_spec("8q8s2k") - assert len(result) == 8 - assert all(r.q_len == 8 and r.kv_len == 2048 for r in result) - assert all(r.is_extend for r in result) - print(" ✓ 8q8s2k -> 8 x 8-token extend with 2k context") - - -def test_formatting(): - """Test batch spec formatting.""" - print("\nTesting formatting...") - - requests = parse_batch_spec("2q2k_32q1s1k") - formatted = format_batch_spec(requests) - assert "2 prefill" in formatted - assert "32 decode" in formatted - print(f" ✓ Format: {formatted}") - - requests = parse_batch_spec("q4s1k_8q1s1k") - formatted = format_batch_spec(requests) - assert "1 extend" in formatted - assert "8 decode" in formatted - print(f" ✓ Format 
with extend: {formatted}") - - -def test_batch_stats(): - """Test batch statistics.""" - print("\nTesting batch statistics...") - - requests = parse_batch_spec("2q2k_32q1s1k") - stats = get_batch_stats(requests) - - assert stats["total_requests"] == 34 - assert stats["num_prefill"] == 2 - assert stats["num_decode"] == 32 - assert stats["total_tokens"] == 2048 * 2 + 32 * 1 - print( - f" ✓ Stats: {stats['total_requests']} requests, {stats['total_tokens']} tokens" - ) - - -def test_error_handling(): - """Test error handling.""" - print("\nTesting error handling...") - - try: - parse_batch_spec("invalid") - raise AssertionError("Should have raised ValueError") - except ValueError: - print(" ✓ Invalid spec raises ValueError") - - -def test_range_generation_simple(): - """Test simple range generation.""" - print("\nTesting range generation (simple)...") - - ranges = [{"template": "q{q_len}ks1k", "q_len": {"start": 1, "stop": 5, "step": 1}}] - specs = generate_batch_specs_from_ranges(ranges) - expected = ["q1ks1k", "q2ks1k", "q3ks1k", "q4ks1k", "q5ks1k"] - assert specs == expected, f"Expected {expected}, got {specs}" - print(f" ✓ Simple range: {len(specs)} specs generated") - - -def test_range_generation_multiple(): - """Test multiple range specifications.""" - print("\nTesting range generation (multiple ranges)...") - - ranges = [ - {"template": "q{q_len}ks1k", "q_len": {"start": 1, "stop": 3, "step": 1}}, - {"template": "q{q_len}ks1k", "q_len": {"start": 10, "stop": 20, "step": 5}}, - ] - specs = generate_batch_specs_from_ranges(ranges) - expected = ["q1ks1k", "q2ks1k", "q3ks1k", "q10ks1k", "q15ks1k", "q20ks1k"] - assert specs == expected, f"Expected {expected}, got {specs}" - print(f" ✓ Multiple ranges: {len(specs)} specs generated") - - -def test_range_generation_large(): - """Test large range similar to study4 config.""" - print("\nTesting range generation (large range)...") - - ranges = [ - {"template": "q{q_len}ks1k", "q_len": {"start": 1, "stop": 16, "step": 1}}, 
- {"template": "q{q_len}ks1k", "q_len": {"start": 17, "stop": 64, "step": 2}}, - {"template": "q{q_len}ks1k", "q_len": {"start": 65, "stop": 128, "step": 4}}, - ] - specs = generate_batch_specs_from_ranges(ranges) - expected_count = 16 + 24 + 16 # (1-16) + (17,19,21...63) + (65,69,73...125) - assert len(specs) == expected_count, ( - f"Expected {expected_count} specs, got {len(specs)}" - ) - print(f" ✓ Large range: {len(specs)} specs generated") - - -def test_range_generation_cartesian(): - """Test Cartesian product with multiple parameters.""" - print("\nTesting range generation (Cartesian product)...") - - ranges = [ - { - "template": "q{q_len}ks{kv_len}k", - "q_len": {"start": 1, "stop": 2, "step": 1}, - "kv_len": {"start": 1, "stop": 2, "step": 1}, - } - ] - specs = generate_batch_specs_from_ranges(ranges) - # Should generate Cartesian product: (1,1), (1,2), (2,1), (2,2) - expected = ["q1ks1k", "q1ks2k", "q2ks1k", "q2ks2k"] - assert specs == expected, f"Expected {expected}, got {specs}" - print(f" ✓ Cartesian product: {len(specs)} specs generated") - - -def test_range_generation_end_inclusive(): - """Test end_inclusive parameter.""" - print("\nTesting range generation (end_inclusive)...") - - # Test inclusive (default) - ranges_inclusive = [ - {"template": "q{q_len}ks1k", "q_len": {"start": 1, "stop": 3, "step": 1}} - ] - specs = generate_batch_specs_from_ranges(ranges_inclusive) - expected = ["q1ks1k", "q2ks1k", "q3ks1k"] - assert specs == expected, f"Expected {expected}, got {specs}" - print(f" ✓ end_inclusive default (true): {specs}") - - # Test explicit inclusive - ranges_explicit_inclusive = [ - { - "template": "q{q_len}ks1k", - "q_len": {"start": 1, "stop": 5, "step": 1, "end_inclusive": True}, - } - ] - specs = generate_batch_specs_from_ranges(ranges_explicit_inclusive) - expected = ["q1ks1k", "q2ks1k", "q3ks1k", "q4ks1k", "q5ks1k"] - assert specs == expected, f"Expected {expected}, got {specs}" - print(" ✓ end_inclusive=true: includes stop value") - - # 
Test exclusive - ranges_exclusive = [ - { - "template": "q{q_len}ks1k", - "q_len": {"start": 1, "stop": 5, "step": 1, "end_inclusive": False}, - } - ] - specs = generate_batch_specs_from_ranges(ranges_exclusive) - expected = ["q1ks1k", "q2ks1k", "q3ks1k", "q4ks1k"] - assert specs == expected, f"Expected {expected}, got {specs}" - print(" ✓ end_inclusive=false: excludes stop value") - - -def main(): - """Run all tests.""" - print("=" * 60) - print("Batch Specification Parser Tests") - print("=" * 60) - - test_basic_patterns() - test_combined_patterns() - test_extend_patterns() - test_formatting() - test_batch_stats() - test_error_handling() - test_range_generation_simple() - test_range_generation_multiple() - test_range_generation_large() - test_range_generation_cartesian() - test_range_generation_end_inclusive() - - print("\n" + "=" * 60) - print("All tests passed! ✓") - print("=" * 60) - - -if __name__ == "__main__": - main() From 10a68b7e4974aa5c83a7503a7a1ac4b837abdf23 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 22:21:32 +0000 Subject: [PATCH 44/45] Update README and sample Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/README.md | 163 ++++++++---------- .../configs/standard_attention.yaml | 4 +- 2 files changed, 72 insertions(+), 95 deletions(-) diff --git a/benchmarks/attention_benchmarks/README.md b/benchmarks/attention_benchmarks/README.md index 16e6be5497f6..4fd20daaec71 100644 --- a/benchmarks/attention_benchmarks/README.md +++ b/benchmarks/attention_benchmarks/README.md @@ -7,14 +7,11 @@ Fast, flexible benchmarking for vLLM attention and MLA backends with an extended ```bash cd benchmarks/attention_benchmarks -# Test the parser -python test_batch_spec.py -# ✓ All tests pass - -# Run one of the 4 research studies -python benchmark.py --config configs/cutlass_numsplits.yaml -python benchmark.py --config configs/hopper_head_count.yaml -python benchmark.py --config configs/flashinfer_vs_cutlass.yaml +# Run a 
pre-configured benchmark +python benchmark.py --config configs/mla_decode.yaml +python benchmark.py --config configs/mla_mixed_batch.yaml +python benchmark.py --config configs/speculative_decode.yaml +python benchmark.py --config configs/standard_attention.yaml python benchmark.py --config configs/reorder_threshold.yaml # Or run custom benchmarks @@ -52,69 +49,52 @@ Mixed batches: Use _ to combine (e.g., "2q2k_32q1s1k") **Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed! -## Research Studies +## Pre-configured Benchmarks -The suite includes 4 pre-configured studies to answer key MLA optimization questions. Each study is a single YAML file you can run directly: +The suite includes several pre-configured YAML benchmark configurations: -### Study 1: CUTLASS MLA num-splits Optimization +### MLA Decode Benchmark -**Question:** Should we revert the CUTLASS MLA num-splits heuristic (PRs #24966, #25509)? +Tests pure decode performance across MLA backends with varying batch sizes and sequence lengths. ```bash -python benchmark.py --config configs/cutlass_numsplits.yaml +python benchmark.py --config configs/mla_decode.yaml ``` -Tests CUTLASS MLA with different `num_kv_splits` values (1, 2, 4, 8, 16, 32) across various batch sizes and compares against auto-selection. +### MLA Mixed Batch Benchmark -### Study 2: FlashAttn MLA vs FlashMLA on Hopper - -**Question:** Does head count matter for FlashAttn MLA vs FlashMLA on Hopper GPUs? +Tests chunked prefill performance with mixed prefill + decode batches. 
```bash -# Test with default head count (128) -python benchmark.py --config configs/hopper_head_count.yaml - -# Test with different head counts -for heads in 16 32 64 128 256; do - python benchmark.py --config configs/hopper_head_count.yaml \ - --num-q-heads $heads \ - --output-csv hopper_heads_${heads}.csv -done +python benchmark.py --config configs/mla_mixed_batch.yaml ``` -Compares FlashAttn MLA and FlashMLA performance with varying attention head counts. - -### Study 3: FlashInfer-MLA vs Optimized CUTLASS +### Speculative Decoding Benchmark -**Question:** Is FlashInfer-MLA better than CUTLASS MLA after num-splits optimization? +Tests speculative decode scenarios (K-token verification) and reorder_batch_threshold optimization. ```bash -python benchmark.py --config configs/flashinfer_vs_cutlass.yaml +python benchmark.py --config configs/speculative_decode.yaml ``` -Compares FlashInfer-MLA against CUTLASS MLA with optimized `num_kv_splits` values. +### Standard Attention Benchmark -### Study 4: Reorder Batch Threshold Optimization (Decode vs Prefill Crossover) +Tests standard attention backends (Flash/Triton/FlashInfer) with pure prefill, decode, and mixed batches. -**Question:** At what query length does the prefill pipeline become faster than the decode pipeline? +```bash +python benchmark.py --config configs/standard_attention.yaml +``` + +### Reorder Threshold Study -**Methodology:** Reproduces the original `benchmark_mla_threshold.py` study using the new interface: +**Question:** At what query length does the prefill pipeline become faster than the decode pipeline? -- For each query length (1-2048), test BOTH decode and prefill pipelines -- Find the crossover point where prefill becomes faster -- Analyze how this varies across batch sizes (1-256) +Tests query lengths from 1-1024 across 9 batch sizes to find the crossover point. Uses `decode_vs_prefill` mode to compare both pipelines for each query length. 
```bash python benchmark.py --config configs/reorder_threshold.yaml ``` -Tests query lengths from 1-2048 (fine-grained steps at low values, coarser at high values) across 9 batch sizes. For each query length, compares: - -- **Decode pipeline**: `threshold >= query_length` -- **Prefill pipeline**: `threshold < query_length` - -Outputs the optimal threshold (last query length where decode is faster) for each batch size. - --- ## Universal Benchmark @@ -144,14 +124,16 @@ python benchmark.py \ ### Parameter Sweeps +Use `--sweep-param` and `--sweep-values` to run parameter sweeps from the CLI: + #### CUTLASS MLA num-splits Optimization ```bash python benchmark.py \ --backend cutlass_mla \ --batch-specs "64q1s1k" "64q1s4k" "64q1s16k" \ - --num-splits 1 2 4 8 16 \ - --compare-auto \ + --sweep-param num_kv_splits \ + --sweep-values 1 2 4 8 16 \ --output-json optimal_splits.json ``` @@ -163,7 +145,8 @@ python benchmark.py \ python benchmark.py \ --backend flashmla \ --batch-specs "q4s1k" "q8s2k" \ - --thresholds 1 4 16 64 256 512 \ + --sweep-param reorder_batch_threshold \ + --sweep-values 1 4 16 64 256 512 \ --output-csv threshold_sweep.csv ``` @@ -172,28 +155,29 @@ python benchmark.py \ ### All Command-Line Options ```text +--config CONFIG # Path to YAML config file (overrides other args) --backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla, # flashinfer_mla, flashattn_mla, flashmla --backend BACKEND # Single backend (alternative to --backends) ---batch-specs SPEC [SPEC ...] # Batch specifications (default: ["q2k", "8q1s1k"]) +--batch-specs SPEC [SPEC ...] 
# Batch specifications using extended grammar # Model configuration ---num-layers N # Number of layers (default: 10) ---head-dim N # Head dimension (default: 128) ---num-q-heads N # Query heads (default: 32) ---num-kv-heads N # KV heads (default: 8) ---block-size N # Block size (default: 16) +--num-layers N # Number of layers +--head-dim N # Head dimension +--num-q-heads N # Query heads +--num-kv-heads N # KV heads +--block-size N # Block size # Benchmark settings --device DEVICE # Device (default: cuda:0) ---repeats N # Repetitions (default: 1) ---warmup-iters N # Warmup iterations (default: 3) +--repeats N # Repetitions +--warmup-iters N # Warmup iterations --profile-memory # Profile memory usage -# MLA-specific parameter sweeps ---num-splits N [N ...] # CUTLASS MLA: Test multiple num_kv_splits ---thresholds N [N ...] # FlashMLA/FlashAttn MLA: Test multiple thresholds ---compare-auto # CUTLASS MLA: Also test auto num_kv_splits +# Parameter sweeps +--sweep-param PARAM # Parameter name to sweep (e.g., num_kv_splits, + # reorder_batch_threshold) +--sweep-values N [N ...] 
# Values to sweep for the parameter # Output --output-csv FILE # Save to CSV @@ -212,15 +196,10 @@ python benchmark.py \ ## Using MLA Runner Directly -All MLA backends are available in `mla_runner.py`: +All MLA backends are available through `mla_runner.run_mla_benchmark()`: ```python -from mla_runner import ( - run_cutlass_mla_benchmark, - run_flashinfer_mla_benchmark, - run_flashattn_mla_benchmark, - run_flashmla_benchmark, -) +from mla_runner import run_mla_benchmark from common import BenchmarkConfig config = BenchmarkConfig( @@ -237,17 +216,17 @@ config = BenchmarkConfig( ) # CUTLASS MLA with specific num_kv_splits -result = run_cutlass_mla_benchmark(config, num_kv_splits=4) -print(f"Time: {result['mean']:.6f}s, Throughput: {result['throughput']:.1f} tok/s") +result = run_mla_benchmark("cutlass_mla", config, num_kv_splits=4) +print(f"Time: {result.mean_time:.6f}s") # FlashInfer-MLA -result = run_flashinfer_mla_benchmark(config) +result = run_mla_benchmark("flashinfer_mla", config) # FlashAttn MLA (Hopper SM90+) -result = run_flashattn_mla_benchmark(config, reorder_batch_threshold=64) +result = run_mla_benchmark("flashattn_mla", config, reorder_batch_threshold=64) # FlashMLA (Hopper SM90+) -result = run_flashmla_benchmark(config, reorder_batch_threshold=64) +result = run_mla_benchmark("flashmla", config, reorder_batch_threshold=64) ``` ## Python API @@ -278,19 +257,19 @@ formatter.save_json(results, "output.json") attention_benchmarks/ ├── README.md # This file │ -├── batch_spec.py # Grammar parser (tested) -├── common.py # Infrastructure -├── runner.py # Standard attention helpers -├── mla_runner.py # MLA helpers (ALL 4 backends) -├── test_batch_spec.py # Tests (all passing) +├── batch_spec.py # Batch specification grammar parser +├── common.py # Shared utilities and data classes +├── runner.py # Standard attention benchmark runner +├── mla_runner.py # MLA benchmark runner (all 4 backends) │ -├── benchmark.py # Universal benchmark script +├── benchmark.py # 
Universal benchmark CLI │ -└── configs/ # Pre-configured studies - ├── cutlass_numsplits.yaml # CUTLASS num-splits optimization - ├── hopper_head_count.yaml # FlashAttn vs FlashMLA head count - ├── flashinfer_vs_cutlass.yaml # FlashInfer vs optimized CUTLASS - └── reorder_threshold.yaml # Reorder threshold optimization +└── configs/ # Pre-configured benchmarks + ├── mla_decode.yaml # MLA decode-only benchmark + ├── mla_mixed_batch.yaml # MLA mixed prefill/decode benchmark + ├── speculative_decode.yaml # Speculative decoding benchmark + ├── standard_attention.yaml # Standard attention benchmark + └── reorder_threshold.yaml # Reorder threshold optimization study ``` ## Tips @@ -305,7 +284,7 @@ attention_benchmarks/ **5. Extended grammar** - Leverage spec decode, chunked prefill patterns -**6. Parameter sweeps** - Use `--num-splits` or `--thresholds` to find optimal values +**6. Parameter sweeps** - Use `--sweep-param` and `--sweep-values` to find optimal values ## Troubleshooting @@ -327,12 +306,10 @@ source /path/to/vllm/.venv/bin/activate ## What's Included -✅ Extended batch spec grammar with tests (all passing!) 
-✅ Universal benchmark script for all backends -✅ Standard attention support (Flash/Triton/FlashInfer) -✅ MLA runner with ALL 4 backends -✅ Parameter sweep modes (num-splits, thresholds) -✅ Rich console output + CSV/JSON export -✅ Pre-built configuration files (optional) - -**~5,000 lines of code, fully simplified, ready to benchmark!** 🚀 +- Extended batch spec grammar +- Universal benchmark script for all backends +- Standard attention support (Flash/Triton/FlashInfer) +- MLA runner with all 4 backends (CUTLASS, FlashInfer, FlashAttn, FlashMLA) +- Parameter sweep support via `--sweep-param` and `--sweep-values` +- Rich console output + CSV/JSON export +- Pre-built YAML configuration files diff --git a/benchmarks/attention_benchmarks/configs/standard_attention.yaml b/benchmarks/attention_benchmarks/configs/standard_attention.yaml index 5376b62f23f5..c0bdb98fbf62 100644 --- a/benchmarks/attention_benchmarks/configs/standard_attention.yaml +++ b/benchmarks/attention_benchmarks/configs/standard_attention.yaml @@ -26,8 +26,8 @@ batch_specs: - "2q4k_32q1s1k" # 2 large prefill + 32 decode # Context extension - - "q1kkv2k" # 1k query, 2k KV (chunked prefill) - - "2q1kkv4k" # 2 requests: 1k query, 4k KV + - "q1ks2k" # 1k query, 2k sequence (chunked prefill) + - "2q1ks4k" # 2 requests: 1k query, 4k sequence backends: - flash From 32e9280c5d5c3563db85b9c435c3db5c73bd61b5 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 27 Jan 2026 22:28:30 +0000 Subject: [PATCH 45/45] Update README Signed-off-by: Matthew Bonanni --- benchmarks/attention_benchmarks/README.md | 57 ++--------------------- 1 file changed, 4 insertions(+), 53 deletions(-) diff --git a/benchmarks/attention_benchmarks/README.md b/benchmarks/attention_benchmarks/README.md index 4fd20daaec71..788ce94f23fb 100644 --- a/benchmarks/attention_benchmarks/README.md +++ b/benchmarks/attention_benchmarks/README.md @@ -128,6 +128,8 @@ Use `--sweep-param` and `--sweep-values` to run parameter sweeps from the CLI: #### 
CUTLASS MLA num-splits Optimization +**Question:** What is the optimal `num_kv_splits` for CUTLASS MLA? + ```bash python benchmark.py \ --backend cutlass_mla \ @@ -137,10 +139,10 @@ python benchmark.py \ --output-json optimal_splits.json ``` -**Answers:** What is the optimal `num_kv_splits` for CUTLASS MLA? - #### Reorder Batch Threshold Optimization +**Question:** What's the optimal `reorder_batch_threshold` for speculative decoding? + ```bash python benchmark.py \ --backend flashmla \ @@ -150,8 +152,6 @@ python benchmark.py \ --output-csv threshold_sweep.csv ``` -**Answers:** What's the optimal `reorder_batch_threshold` for speculative decoding? - ### All Command-Line Options ```text @@ -251,27 +251,6 @@ formatter.save_csv(results, "output.csv") formatter.save_json(results, "output.json") ``` -## File Structure - -```text -attention_benchmarks/ -├── README.md # This file -│ -├── batch_spec.py # Batch specification grammar parser -├── common.py # Shared utilities and data classes -├── runner.py # Standard attention benchmark runner -├── mla_runner.py # MLA benchmark runner (all 4 backends) -│ -├── benchmark.py # Universal benchmark CLI -│ -└── configs/ # Pre-configured benchmarks - ├── mla_decode.yaml # MLA decode-only benchmark - ├── mla_mixed_batch.yaml # MLA mixed prefill/decode benchmark - ├── speculative_decode.yaml # Speculative decoding benchmark - ├── standard_attention.yaml # Standard attention benchmark - └── reorder_threshold.yaml # Reorder threshold optimization study -``` - ## Tips **1. Warmup matters** - Use `--warmup-iters 10` for stable results @@ -285,31 +264,3 @@ attention_benchmarks/ **5. Extended grammar** - Leverage spec decode, chunked prefill patterns **6. 
Parameter sweeps** - Use `--sweep-param` and `--sweep-values` to find optimal values - -## Troubleshooting - -**Import errors?** - -```bash -source /path/to/vllm/.venv/bin/activate -``` - -**Backend not supported?** - -- Check hardware requirements above -- Some backends need Hopper/Blackwell - -**OOM?** - -- Reduce batch size: `"32q1s1k"` → `"16q1s1k"` -- Reduce sequence length: `"64q1s16k"` → `"64q1s4k"` - -## What's Included - -- Extended batch spec grammar -- Universal benchmark script for all backends -- Standard attention support (Flash/Triton/FlashInfer) -- MLA runner with all 4 backends (CUTLASS, FlashInfer, FlashAttn, FlashMLA) -- Parameter sweep support via `--sweep-param` and `--sweep-values` -- Rich console output + CSV/JSON export -- Pre-built YAML configuration files