Merged
10 changes: 10 additions & 0 deletions .buildkite/test_areas/lm_eval.yaml
@@ -91,6 +91,16 @@ steps:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt


- label: LM Eval TurboQuant KV Cache
timeout_in_minutes: 75
source_file_dependencies:
- vllm/model_executor/layers/quantization/turboquant/
- vllm/v1/attention/backends/turboquant_attn.py
- vllm/v1/attention/ops/triton_turboquant_decode.py
- vllm/v1/attention/ops/triton_turboquant_store.py
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/models-turboquant.txt
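Each `source_file_dependencies` entry gates this CI step: it should run only when a changed file falls under one of the listed paths. A minimal sketch of that matching logic (an illustrative prefix check, not the pipeline's actual implementation):

```python
def step_should_run(changed_files: list[str], source_file_dependencies: list[str]) -> bool:
    """Return True if any changed file matches a dependency path prefix."""
    return any(
        changed.startswith(dep)
        for changed in changed_files
        for dep in source_file_dependencies
    )

deps = [
    "vllm/model_executor/layers/quantization/turboquant/",
    "vllm/v1/attention/backends/turboquant_attn.py",
]
# A change under a TurboQuant path triggers the step; an unrelated edit does not.
```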

- label: GPQA Eval (GPT-OSS) (H100)
timeout_in_minutes: 120
device: h100
1 change: 1 addition & 0 deletions docs/design/attention_backends.md
@@ -178,6 +178,7 @@ Priority is **1 = highest** (tried first).
| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ✅ | ❌ | Decoder, Encoder, Encoder Only | N/A |
| `TREE_ATTN` | | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2`, `int8_per_token_head`, `fp8_per_token_head` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
| `TURBOQUANT` | | fp16, bf16 | `turboquant_k8v4`, `turboquant_4bit_nc`, `turboquant_k3v4_nc`, `turboquant_3bit_nc` | 16, 32, 64, 128 | Any | ❌ | ❌ | ❌ | Decoder | Any |
> **Review comment:** One table row is the only user-facing documentation for this whole PR. No prose on when to pick TurboQuant vs FP8, which head dims work, quality/throughput tradeoffs, or composition with prefix caching/LoRA/chunked prefill. A short section under docs/features/quantization/ (or similar) would make this discoverable; right now users will have to read the PR body.

> **Reply (Member):** This should be follow-up work, tbh.


> **†** FlashInfer uses TRTLLM attention on Blackwell (SM100), which supports sinks. Disable via `--attention-config.use_trtllm_attention=0`.
>
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -170,6 +170,9 @@ eles = "eles"
datas = "datas"
ser = "ser"
ure = "ure"
# Walsh-Hadamard Transform
wht = "wht"
WHT = "WHT"

[tool.uv]
no-build-isolation-package = ["torch"]
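The `wht`/`WHT` entries above whitelist the abbreviation for the Walsh-Hadamard Transform, which the TurboQuant kernel paths in this PR reference, so the spell-checker stops flagging it. For context, a minimal unnormalized fast WHT (a textbook butterfly sketch, not the Triton kernels from this PR):

```python
import numpy as np

def fwht(x: np.ndarray) -> np.ndarray:
    """Unnormalized fast Walsh-Hadamard transform; length must be a power of two."""
    out = np.asarray(x, dtype=np.float64).copy()
    n = out.shape[-1]
    assert n & (n - 1) == 0, "length must be a power of two"
    h = 1
    while h < n:
        for i in range(0, n, 2 * h):
            for j in range(i, i + h):
                a, b = out[j], out[j + h]
                out[j], out[j + h] = a + b, a - b  # butterfly: sum and difference
        h *= 2
    return out
```

Applying `fwht` twice scales the input by `n`, so `fwht(x) / sqrt(n)` gives the orthonormal form.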
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/Qwen3-4B-TQ-k3v4nc.yaml
@@ -0,0 +1,5 @@
model_name: "Qwen/Qwen3-4B"
accuracy_threshold: 0.78
num_questions: 1319
num_fewshot: 5
server_args: "--kv-cache-dtype turboquant_k3v4_nc --enforce-eager --max-model-len 4096"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/Qwen3-4B-TQ-k8v4.yaml
@@ -0,0 +1,5 @@
model_name: "Qwen/Qwen3-4B"
accuracy_threshold: 0.80
num_questions: 1319
num_fewshot: 5
server_args: "--kv-cache-dtype turboquant_k8v4 --enforce-eager --max-model-len 4096"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/Qwen3-4B-TQ-t3nc.yaml
@@ -0,0 +1,5 @@
model_name: "Qwen/Qwen3-4B"
accuracy_threshold: 0.75
num_questions: 1319
num_fewshot: 5
server_args: "--kv-cache-dtype turboquant_3bit_nc --enforce-eager --max-model-len 4096"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/Qwen3-4B-TQ-t4nc.yaml
@@ -0,0 +1,5 @@
model_name: "Qwen/Qwen3-4B"
accuracy_threshold: 0.80
num_questions: 1319
num_fewshot: 5
server_args: "--kv-cache-dtype turboquant_4bit_nc --enforce-eager --max-model-len 4096"
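Each config pairs a quantization variant with its own `accuracy_threshold`, reflecting the expected quality drop as bits shrink (0.80 for k8v4 and 4-bit, 0.78 for k3v4, 0.75 for 3-bit). The pass/fail check presumably reduces to a comparison like this (a hypothetical sketch; `eval_passes` is not a function from the test suite):

```python
def eval_passes(measured_accuracy: float, config: dict) -> bool:
    """GSM8K gate: measured accuracy must meet the config's floor."""
    return measured_accuracy >= config["accuracy_threshold"]

# Values mirror Qwen3-4B-TQ-t4nc.yaml above.
config = {
    "model_name": "Qwen/Qwen3-4B",
    "accuracy_threshold": 0.80,
    "num_questions": 1319,
    "num_fewshot": 5,
}
```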
4 changes: 4 additions & 0 deletions tests/evals/gsm8k/configs/models-turboquant.txt
vibhavagarwal5 marked this conversation as resolved.
@@ -0,0 +1,4 @@
Qwen3-4B-TQ-k8v4.yaml
Qwen3-4B-TQ-t4nc.yaml
Qwen3-4B-TQ-k3v4nc.yaml
Qwen3-4B-TQ-t3nc.yaml