vllm-project · caozuoba · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.fla.ops import (
+    fused_recurrent_gated_delta_rule,
+    fused_recurrent_gated_delta_rule_packed_decode_fwd,
+)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Need CUDA device")
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("strided_mixed_qkv", [False, True])
+def test_fused_recurrent_packed_decode_matches_reference(
+    dtype: torch.dtype, strided_mixed_qkv: bool
+):
+    torch.manual_seed(0)
+
+    # Small but representative GDN config (Qwen3Next defaults are K=128, V=128).
+    B = 32
+    H = 4
+    HV = 8  # grouped value attention: HV must be divisible by H
+    K = 128
+    V = 128
+    qkv_dim = 2 * (H * K) + (HV * V)
+
+    device = torch.device("cuda")
+
+    if strided_mixed_qkv:
+        # Simulate a packed view into a larger projection buffer:
+        # mixed_qkv.stride(0) > mixed_qkv.shape[1]
+        proj = torch.randn((B, qkv_dim + 64), device=device, dtype=dtype)
+        mixed_qkv = proj[:, :qkv_dim]
+    else:
+        mixed_qkv = torch.randn((B, qkv_dim), device=device, dtype=dtype)
+
+    a = torch.randn((B, HV), device=device, dtype=dtype)
+    b = torch.randn((B, HV), device=device, dtype=dtype)
+    A_log = torch.randn((HV,), device=device, dtype=dtype)
+    dt_bias = torch.randn((HV,), device=device, dtype=dtype)
+
+    # Continuous batching indices (include PAD_SLOT_ID=-1 cases).
+    ssm_state_indices = torch.arange(B, device=device, dtype=torch.int32)
+    ssm_state_indices[-3:] = -1
+
+    state0 = torch.randn((B, HV, V, K), device=device, dtype=dtype)
+    state_ref = state0.clone()
+    state_packed = state0.clone()
+
+    out_ref = torch.empty((B, 1, HV, V), device=device, dtype=dtype)
+    out_packed = torch.empty((B, 1, HV, V), device=device, dtype=dtype)
+
+    # Reference path: materialize contiguous Q/K/V + explicit gating.
+    q, k, v = torch.split(mixed_qkv, [H * K, H * K, HV * V], dim=-1)
+    q = q.view(B, H, K).unsqueeze(1).contiguous()
+    k = k.view(B, H, K).unsqueeze(1).contiguous()
+    v = v.view(B, HV, V).unsqueeze(1).contiguous()
+
+    x = a.float() + dt_bias.float()
+    softplus_x = torch.where(
+        x <= 20.0, torch.log1p(torch.exp(torch.clamp(x, max=20.0))), x
+    )
+    g = (-torch.exp(A_log.float()) * softplus_x).unsqueeze(1)
+    beta = torch.sigmoid(b.float()).to(dtype).unsqueeze(1)
+
+    fused_recurrent_gated_delta_rule(
+        q=q,
+        k=k,
+        v=v,
+        g=g,
+        beta=beta,
+        initial_state=state_ref,
+        out=out_ref,
+        inplace_final_state=True,
+        cu_seqlens=None,
+        ssm_state_indices=ssm_state_indices,
+        use_qk_l2norm_in_kernel=True,
+    )
+
+    # Packed path: fused gating + recurrent directly from packed mixed_qkv.
+    fused_recurrent_gated_delta_rule_packed_decode_fwd(
+        mixed_qkv=mixed_qkv,
+        a=a,
+        b=b,
+        A_log=A_log,
+        dt_bias=dt_bias,
+        scale=K**-0.5,
+        initial_state=state_packed,
+        out=out_packed,
+        ssm_state_indices=ssm_state_indices,
+        use_qk_l2norm_in_kernel=True,
+    )
+
+    torch.testing.assert_close(out_packed, out_ref, rtol=1e-2, atol=2e-2)
+    torch.testing.assert_close(state_packed, state_ref, rtol=1e-2, atol=2e-2)
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -218,6 +218,7 @@
     VLLM_NVTX_SCOPES_FOR_PROFILING: bool = False
     VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True
     VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER"
+    VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE: bool = False
     VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024
     VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE: bool = False
     VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL: bool = False
@@ -899,6 +900,12 @@ def _get_or_set_default() -> str:
     "VLLM_DISABLED_KERNELS": lambda: []
     if "VLLM_DISABLED_KERNELS" not in os.environ
     else os.environ["VLLM_DISABLED_KERNELS"].split(","),
+    # Enable an experimental packed recurrent decode fast path for FLA models.
+    # Disabled by default for safety; when disabled, code falls back to the
+    # default unfused path.
+    "VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE", "0"))
+    ),
     # Disable pynccl (using torch.distributed instead)
     "VLLM_DISABLE_PYNCCL": lambda: (
         os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")

@@ -7,11 +7,15 @@
 # the following copyright notice:
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 from .chunk import chunk_gated_delta_rule
-from .fused_recurrent import fused_recurrent_gated_delta_rule
+from .fused_recurrent import (
+    fused_recurrent_gated_delta_rule,
+    fused_recurrent_gated_delta_rule_packed_decode_fwd,
+)
 from .layernorm_guard import RMSNormGated
 
 __all__ = [
     "RMSNormGated",
     "chunk_gated_delta_rule",
     "fused_recurrent_gated_delta_rule",
+    "fused_recurrent_gated_delta_rule_packed_decode_fwd",
 ]