From 7a4d2fe6e88f8169f0da61b85929c86edc49a063 Mon Sep 17 00:00:00 2001
From: Harish Subramony <harish.subramony@intel.com>
Date: Fri, 15 May 2026 00:31:45 -0700
Subject: [PATCH 01/29] Mrope accuracy fix for qwen (#1437)

When mrope_interleaved is enabled, HPUMRotaryEmbedding was still using
the non-interleaved split/concat section mapping for cos/sin.
This produced incorrect rotary channel ordering for multimodal MRoPE
inputs and could cause sample-level mismatches against upstream vLLM
behavior.
Use apply_interleaved_rope for the interleaved branch, and preserve the
existing split/concat logic for non-interleaved layouts.

Signed-off-by: Harish Subramony <harish.subramony@intel.com>
Co-authored-by: Jimin Ha <jimin.ha@intel.com>
Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
Co-authored-by: Seunghyuk Park (shepark) <separk@habana.ai>
---
 vllm_gaudi/ops/hpu_rotary_embedding.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/vllm_gaudi/ops/hpu_rotary_embedding.py b/vllm_gaudi/ops/hpu_rotary_embedding.py
index 38d02d23f9..cfc452927a 100644
--- a/vllm_gaudi/ops/hpu_rotary_embedding.py
+++ b/vllm_gaudi/ops/hpu_rotary_embedding.py
@@ -668,9 +668,14 @@ def forward_oot(
         cos, sin = cos_sin.chunk(2, dim=-1)
         if positions.ndim == 2:
             assert self.mrope_section
-
-            cos = torch.cat([m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))], dim=-1)
-            sin = torch.cat([m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))], dim=-1)
+            if getattr(self, "mrope_interleaved", False):
+                from vllm.model_executor.layers.rotary_embedding.mrope import apply_interleaved_rope
+
+                cos = apply_interleaved_rope(cos, self.mrope_section)
+                sin = apply_interleaved_rope(sin, self.mrope_section)
+            else:
+                cos = torch.cat([m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))], dim=-1)
+                sin = torch.cat([m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))], dim=-1)
         if self.is_neox_style:
             cos = torch.cat((cos, cos), dim=-1).unsqueeze(-2)
             sin = torch.cat((sin, sin), dim=-1).unsqueeze(-2)

From f1abfec3e7ee4ecb7bb937624911ce112569ecca Mon Sep 17 00:00:00 2001
From: Iryna Boiko <iryna.boiko@intel.com>
Date: Fri, 15 May 2026 10:20:13 +0200
Subject: [PATCH 02/29] Fix for MoE refactor #35178 (#1442)

Signed-off-by: Iryna Boiko <iboiko@habana.ai>
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 27 ++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index e0478d3e2b..9665faf742 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -4667,10 +4667,29 @@ def _remove_duplicate_submodules(self):
                 if mlp is not None:
                     block_gate = getattr(mlp, 'gate', None) or getattr(mlp, 'router', None)
                     experts = getattr(mlp, 'experts', None)
-                    if (block_gate is not None and experts is not None
-                            and getattr(experts, '_gate', None) is block_gate):
-                        experts._gate = None
-                        self._detached_moe_gates.add(id(experts))
+                    if block_gate is not None and experts is not None:
+                        if getattr(experts, '_gate', None) is block_gate:
+                            experts._gate = None
+                            self._detached_moe_gates.add(id(experts))
+                        # With upstream vLLM PR #35178 MoERunner is an
+                        # nn.Module, so `self.runner.gate = gate` in
+                        # FusedMoE.__init__ registers the shared gate
+                        # as a child of runner. INC's
+                        # generate_model_info() walks named_children()
+                        # and the last-seen parent wins, so INC patches
+                        # runner._modules['gate'] and leaves
+                        # mlp._modules['gate'] pointing at a stale
+                        # module whose weight Parameter has been
+                        # mutated in-place to fp8. Unregister the gate
+                        # from runner._modules (but keep runner.gate
+                        # as a plain attribute so is_internal_router()
+                        # and the runner's internal forward path keep
+                        # working) so INC sees mlp as the sole parent.
+                        runner = getattr(experts, 'runner', None)
+                        if (runner is not None and isinstance(runner, torch.nn.Module)
+                                and runner._modules.get('gate', None) is block_gate):
+                            del runner._modules['gate']
+                            object.__setattr__(runner, 'gate', block_gate)
 
     def _sync_shared_moe_gates(self):
         """Apply SharedFusedMoE post-INC synchronization and compatibility.

From 9566f70a4fa50f317b66a469af45f9e61efd2b5c Mon Sep 17 00:00:00 2001
From: Harish Subramony <harish.subramony@intel.com>
Date: Sun, 17 May 2026 23:00:22 -0700
Subject: [PATCH 03/29] fix: HPU-specific bug fixes for KV-offload + async
 spec-decode (#1264) (#1401)

Bug 1 (hpu_async_scheduler): clamp num_external_computed_tokens to 0 in
the _update_requests_with_invalid_blocks() override. When OOM causes block
invalidation, the affected-token span can exceed the externally-computed
prefix, incorrectly driving num_external_computed_tokens negative.

Bug 2 (hpu_async_scheduler): fix stale num_cached_tokens after
preemption. After OOM preemption and requeue, a request restarts from
num_computed_tokens=0; the OffloadingConnector may then assign new external
cache hits, leaving num_cached_tokens inconsistent (i.e. less than
num_external_computed_tokens). A schedule() post-processing pass detects
and corrects this.

Bug 2b (utils): clamp PromptTokenStats.get_by_source() to 0 via
monkey-patch. During the brief inconsistency window the Prometheus
counter would crash with "Counters can only be incremented by
non-negative amounts".

Bug 3 (hpu_model_runner): fix tensor shape mismatch [N,1] vs [N,M] in
the async scheduling path of _create_decode_input_data when a
spec-decode request has num_tokens > 1.

Bug 4 (hpu_model_runner): prevent Habana workspace OOM triggered by
OffloadingConnector requeuing a decode request with many scheduled
tokens. Route multi-token non-spec-decode requests through the prefill
bucket path (which handles large context correctly) instead of the
decode bucket path (which has no prepared bucket for
batch_size=N*blocks, causing JIT recompile with a 107 GiB workspace
allocation).

    Co-authored-by: GitHub Copilot

---------

---------

Signed-off-by: Harish Subramony <harish.subramony@intel.com>
Signed-off-by: Artur Fierka <artur.fierka@intel.com>
Co-authored-by: Iryna Boiko <iryna.boiko@intel.com>
Co-authored-by: Artur Fierka <artur.fierka@intel.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Kamil Kaczor <kamil.kaczor@intel.com>
---
 .../test_ensure_multi_token_decodes_last.py   | 160 ++++++++++++++++++
 vllm_gaudi/utils.py                           |  18 ++
 .../v1/core/sched/hpu_async_scheduler.py      | 110 +++++++++++-
 vllm_gaudi/v1/worker/hpu_model_runner.py      |  77 ++++++++-
 4 files changed, 357 insertions(+), 8 deletions(-)
 create mode 100644 tests/unit_tests/worker/test_ensure_multi_token_decodes_last.py

diff --git a/tests/unit_tests/worker/test_ensure_multi_token_decodes_last.py b/tests/unit_tests/worker/test_ensure_multi_token_decodes_last.py
new file mode 100644
index 0000000000..c3cb83a956
--- /dev/null
+++ b/tests/unit_tests/worker/test_ensure_multi_token_decodes_last.py
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for hpu_model_runner.ensure_multi_token_decodes_last.
+
+Covers the routing invariant introduced for KV-offload + async spec-decode
+(PR #1401, originally #1264): when speculative decoding is disabled, a decode
+request with more than one scheduled token (a resumed/catch-up request from
+e.g. OffloadingConnector requeue) must be sorted to the end of the decode
+region so that `_get_prompts_and_decodes` routes it through the prefill path,
+avoiding bucket overflow / Habana workspace OOM.
+"""
+
+import pytest
+import torch
+import habana_frameworks.torch  # noqa: F401
+
+from vllm.sampling_params import SamplingParams
+from vllm.utils.platform_utils import is_pin_memory_available
+
+from vllm_gaudi.v1.worker.hpu_input_batch import (CachedRequestState, InputBatch)
+from vllm_gaudi.v1.worker.hpu_model_runner import ensure_multi_token_decodes_last
+
+
+def _make_request(req_id: str, prompt_len: int, num_computed_tokens: int) -> CachedRequestState:
+    return CachedRequestState(
+        req_id=req_id,
+        prompt_token_ids=[0] * prompt_len,
+        sampling_params=SamplingParams(),
+        pooling_params=None,
+        mm_features=[],
+        block_ids=([], ),
+        generator=None,
+        num_computed_tokens=num_computed_tokens,
+        output_token_ids=[],
+    )
+
+
+def _make_input_batch(reqs: list[CachedRequestState]) -> InputBatch:
+    batch = InputBatch(
+        max_num_reqs=max(len(reqs), 1),
+        max_model_len=1024,
+        max_num_batched_tokens=1024,
+        device=torch.device("hpu"),
+        pin_memory=is_pin_memory_available(),
+        vocab_size=1024,
+        block_sizes=[1],
+        kernel_block_sizes=[1],
+    )
+    for i, req in enumerate(reqs):
+        assigned = batch.add_request(req)
+        assert assigned == i
+    return batch
+
+
+def test_multi_token_decode_sorted_to_end_of_decode_region():
+    """[1-tok decode, multi-tok decode, 1-tok decode, prompt] should become
+    [1-tok decode, 1-tok decode, multi-tok decode, prompt]."""
+    reqs = [
+        # 1-tok decode: num_computed >= num_prompt
+        _make_request("d0", prompt_len=4, num_computed_tokens=4),
+        # multi-tok catch-up decode (num_scheduled_tokens > 1)
+        _make_request("d_multi", prompt_len=4, num_computed_tokens=4),
+        # another 1-tok decode
+        _make_request("d1", prompt_len=4, num_computed_tokens=5),
+        # prompt: num_computed < num_prompt
+        _make_request("p0", prompt_len=8, num_computed_tokens=2),
+    ]
+    batch = _make_input_batch(reqs)
+    scheduled = {"d0": 1, "d_multi": 5, "d1": 1, "p0": 8}
+
+    ensure_multi_token_decodes_last(batch, scheduled)
+
+    # Expected layout: 1-tok decodes first, then multi-tok decode, then prompt.
+    assert list(batch.req_ids[:batch.num_reqs]) == ["d0", "d1", "d_multi", "p0"]
+    # Decode region (first 3) preserves the prompt boundary.
+    for i in range(3):
+        assert batch.num_computed_tokens_cpu[i] >= batch.num_prompt_tokens[i]
+    # Prompt stays last.
+    assert batch.num_computed_tokens_cpu[3] < batch.num_prompt_tokens[3]
+
+
+def test_no_op_when_only_single_token_decodes():
+    reqs = [
+        _make_request("d0", prompt_len=4, num_computed_tokens=4),
+        _make_request("d1", prompt_len=4, num_computed_tokens=5),
+        _make_request("p0", prompt_len=8, num_computed_tokens=2),
+    ]
+    batch = _make_input_batch(reqs)
+    scheduled = {"d0": 1, "d1": 1, "p0": 8}
+    original_order = list(batch.req_ids[:batch.num_reqs])
+
+    ensure_multi_token_decodes_last(batch, scheduled)
+
+    assert list(batch.req_ids[:batch.num_reqs]) == original_order
+
+
+def test_no_op_when_only_multi_token_decodes():
+    """All decodes are multi-token: order of decode region is preserved."""
+    reqs = [
+        _make_request("d0", prompt_len=4, num_computed_tokens=4),
+        _make_request("d1", prompt_len=4, num_computed_tokens=5),
+        _make_request("p0", prompt_len=8, num_computed_tokens=2),
+    ]
+    batch = _make_input_batch(reqs)
+    scheduled = {"d0": 3, "d1": 4, "p0": 8}
+    original_order = list(batch.req_ids[:batch.num_reqs])
+
+    ensure_multi_token_decodes_last(batch, scheduled)
+
+    # Both d0 and d1 are multi-tok; write_pos never advances, no swaps occur.
+    assert list(batch.req_ids[:batch.num_reqs]) == original_order
+
+
+def test_decode_only_batch_no_prompt():
+    """No prompt in the batch: decode_end == num_reqs."""
+    reqs = [
+        _make_request("d_multi", prompt_len=4, num_computed_tokens=4),
+        _make_request("d0", prompt_len=4, num_computed_tokens=4),
+        _make_request("d1", prompt_len=4, num_computed_tokens=5),
+    ]
+    batch = _make_input_batch(reqs)
+    scheduled = {"d_multi": 7, "d0": 1, "d1": 1}
+
+    ensure_multi_token_decodes_last(batch, scheduled)
+
+    assert list(batch.req_ids[:batch.num_reqs]) == ["d0", "d1", "d_multi"]
+
+
+def test_prompt_only_batch_unchanged():
+    """No decodes: function should be a no-op."""
+    reqs = [
+        _make_request("p0", prompt_len=8, num_computed_tokens=2),
+        _make_request("p1", prompt_len=8, num_computed_tokens=0),
+    ]
+    batch = _make_input_batch(reqs)
+    scheduled = {"p0": 6, "p1": 8}
+    original_order = list(batch.req_ids[:batch.num_reqs])
+
+    ensure_multi_token_decodes_last(batch, scheduled)
+
+    assert list(batch.req_ids[:batch.num_reqs]) == original_order
+
+
+def test_missing_req_id_treated_as_single_token():
+    """Defensive: scheduled_tokens.get(req_id, 1) defaults to 1 if missing."""
+    reqs = [
+        _make_request("d0", prompt_len=4, num_computed_tokens=4),
+        _make_request("d_multi", prompt_len=4, num_computed_tokens=4),
+    ]
+    batch = _make_input_batch(reqs)
+    # d_multi is the only key; d0 absent -> treated as 1-tok decode.
+    scheduled = {"d_multi": 3}
+
+    ensure_multi_token_decodes_last(batch, scheduled)
+
+    assert list(batch.req_ids[:batch.num_reqs]) == ["d0", "d_multi"]
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/vllm_gaudi/utils.py b/vllm_gaudi/utils.py
index ecceb0e4bf..5f7dbb8298 100644
--- a/vllm_gaudi/utils.py
+++ b/vllm_gaudi/utils.py
@@ -287,6 +287,24 @@ def get_compile_args(self) -> dict[str, Any]:
 
 _async_sched_module.AsyncScheduler = HPUAsyncScheduler  # type: ignore[misc]
 
+# Guard Prometheus counters against negative prompt-token counts that can arise
+# when KV-cache blocks are invalidated during OOM and token-count bookkeeping
+# becomes temporarily inconsistent.  Prometheus counters require non-negative
+# increments; clamping here prevents a crash in PrometheusStatLogger.record().
+# noqa: E402 cannot be hoisted to the module top: this monkey-patch must run
+# AFTER the `_async_sched_module.AsyncScheduler = HPUAsyncScheduler` rebind
+# above so vllm.v1.metrics.stats picks up the HPU scheduler symbol.
+import vllm.v1.metrics.stats as _stats_module  # noqa: E402
+
+_stats_get_by_source_orig = _stats_module.PromptTokenStats.get_by_source
+
+
+def _hpu_get_by_source(self, source: str) -> int:
+    return max(0, _stats_get_by_source_orig(self, source))
+
+
+_stats_module.PromptTokenStats.get_by_source = _hpu_get_by_source
+
 
 def patch_nixl_utils_for_hpu():
     """Patch vllm.distributed.nixl_utils to use nixl._api instead of rixl._api.
diff --git a/vllm_gaudi/v1/core/sched/hpu_async_scheduler.py b/vllm_gaudi/v1/core/sched/hpu_async_scheduler.py
index 9fd41b1130..19a8ddebb0 100644
--- a/vllm_gaudi/v1/core/sched/hpu_async_scheduler.py
+++ b/vllm_gaudi/v1/core/sched/hpu_async_scheduler.py
@@ -1,10 +1,118 @@
 # SPDX-License-Identifier: Apache-2.0
+from collections.abc import Iterable
+
 from vllm.v1.core.sched.async_scheduler import AsyncScheduler
-from vllm.v1.request import Request
+from vllm.v1.request import Request, RequestStatus
 
 
 class HPUAsyncScheduler(AsyncScheduler):
 
+    def schedule(self):
+        """HPU override: fix stale cached-token accounting after preemption.
+
+        After preemption a request is requeued with num_computed_tokens reset.
+        On the next schedule() the OffloadingConnector may assign new external
+        cache hits, raising num_external_computed_tokens above the stale
+        num_cached_tokens (which upstream only refreshes when negative). After
+        super().schedule() has advanced num_computed_tokens for this step, we
+        post-process running requests to detect this staleness
+        (num_cached_tokens < num_external_computed_tokens) and resync
+        num_cached_tokens.
+
+        NOTE: only requests that were actually scheduled this step land in
+        self.running here; a request requeued by the connector but not yet
+        re-scheduled stays in self.waiting and the inconsistency persists
+        until it is picked up. The Prometheus clamp in vllm_gaudi/utils.py
+        guards the metrics path during that window.
+        """
+        output = super().schedule()
+        for request in self.running:
+            # vLLM Request no longer exposes num_cached_tokens on newer
+            # branches. Keep the old fix only when the field exists.
+            if (hasattr(request, "num_cached_tokens")
+                    and request.num_cached_tokens < request.num_external_computed_tokens):
+                request.num_cached_tokens = request.num_computed_tokens
+        return output
+
+    def _update_requests_with_invalid_blocks(
+        self,
+        requests: Iterable[Request],
+        invalid_block_ids: set[int],
+        num_scheduled_tokens: dict[str, int],
+        evict_blocks: bool = True,
+    ) -> tuple[set[str], int, set[int]]:
+        """HPU override: clamp num_external_computed_tokens to 0 instead of
+        allowing it to go negative when OOM-invalidated blocks span both
+        externally-computed and locally-computed token ranges.
+
+        NOTE: This is a near-verbatim copy of the upstream
+        ``vllm.v1.core.sched.async_scheduler.AsyncScheduler
+        ._update_requests_with_invalid_blocks``. The only functional delta is
+        the ``max(0, ...)`` clamp on ``request.num_external_computed_tokens``
+        below (search for "HPU delta"). Keep this method in sync with
+        upstream when that routine evolves (hybrid memory allocator support,
+        new connector types, etc.). An upstream issue tracking the negative
+        clamp should be filed against vllm-project/vllm.
+        """
+        affected_req_ids: set[str] = set()
+        total_affected_tokens = 0
+        blocks_to_evict: set[int] = set()
+        marked_invalid_block_ids: set[int] = set()
+        for request in requests:
+            is_affected = False
+            marked_invalid_block = False
+            req_id = request.request_id
+            # TODO (davidb): add support for hybrid memory allocator
+            (req_block_ids, ) = self.kv_cache_manager.get_block_ids(req_id)
+            if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                req_num_computed_tokens = (request.num_computed_tokens - num_scheduled_tokens.get(req_id, 0) if req_id
+                                           in self.failed_recving_kv_req_ids else len(req_block_ids) * self.block_size)
+            else:
+                # vLLM removed Request.num_cached_tokens in newer branches.
+                # Fall back to upstream-equivalent computed-token accounting.
+                req_num_computed_tokens = (request.num_cached_tokens if hasattr(request, "num_cached_tokens") else
+                                           request.num_computed_tokens - num_scheduled_tokens.get(req_id, 0))
+
+            req_num_computed_blocks = (req_num_computed_tokens + self.block_size - 1) // self.block_size
+            for idx, block_id in zip(range(req_num_computed_blocks), req_block_ids):
+                if block_id not in invalid_block_ids:
+                    continue
+
+                is_affected = True
+
+                if block_id in marked_invalid_block_ids:
+                    continue
+
+                marked_invalid_block_ids.add(block_id)
+
+                if marked_invalid_block:
+                    continue
+
+                marked_invalid_block = True
+                request.num_computed_tokens = idx * self.block_size
+                num_affected_tokens = (req_num_computed_tokens - request.num_computed_tokens)
+                total_affected_tokens += num_affected_tokens
+                # HPU delta vs upstream: clamp to 0. num_affected_tokens may
+                # exceed the number of externally-computed tokens when
+                # OOM-invalidation spans locally-computed blocks too, which
+                # would otherwise drive num_external_computed_tokens negative.
+                if hasattr(request, "num_external_computed_tokens"):
+                    request.num_external_computed_tokens = max(
+                        0,
+                        request.num_external_computed_tokens - num_affected_tokens,
+                    )
+                if evict_blocks:
+                    blocks_to_evict.update(req_block_ids[idx:])
+
+            if is_affected:
+                if not marked_invalid_block:
+                    total_affected_tokens += (request.num_computed_tokens - req_num_computed_tokens)
+                    request.num_computed_tokens = req_num_computed_tokens
+
+                affected_req_ids.add(request.request_id)
+
+        return affected_req_ids, total_affected_tokens, blocks_to_evict
+
     def _mamba_block_aligned_split(
         self,
         request: Request,
diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 9665faf742..9d4983f6ab 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 import collections
+from collections.abc import Mapping
 import copy
 import contextlib
 from copy import deepcopy
@@ -427,6 +428,34 @@ def gather_list(input, indices, v):
     return [input[i] if i is not None else v for i in indices]
 
 
+def ensure_multi_token_decodes_last(b: InputBatch, scheduled_tokens: Mapping[str, int]) -> None:
+    """Within the decode region, sort single-token decodes before multi-token ones.
+
+    When spec-decode is not configured, resumed/catch-up decode requests with
+    many scheduled tokens (e.g. from KV offloading requeue) must be processed
+    via the prefill path to avoid bucket overflow in get_habana_paged_attn_buffers.
+    Moving them to the end of the decode region lets _get_prompts_and_decodes
+    route them to the prefill batch where the large scheduled-token count is
+    handled correctly.
+
+    After _ensure_decodes_first the layout is: [decodes ... | prompts ...]
+    This function produces:                    [1-tok decodes | multi-tok decodes | prompts]
+    """
+    num_reqs = b.num_reqs
+    decode_end = num_reqs
+    for i in range(num_reqs):
+        if b.num_computed_tokens_cpu[i] < b.num_prompt_tokens[i]:
+            decode_end = i
+            break
+    write_pos = 0
+    for read_pos in range(decode_end):
+        req_id = b.req_ids[read_pos]
+        if scheduled_tokens.get(req_id, 1) == 1:
+            if read_pos != write_pos:
+                b.swap_states(write_pos, read_pos)
+            write_pos += 1
+
+
 def get_target_layer_suffix_list(model_type) -> list[str]:
     # This sets the suffix for the hidden layer name, which is controlled by
     # VLLM_CONFIG_HIDDEN_LAYERS. The default suffix is "DecoderLayer," which is
@@ -2037,6 +2066,17 @@ def _get_prompts_and_decodes(
             if self._is_prompt(i, scheduler_output):
                 break
 
+            # When spec-decode is not configured, a decode request with more
+            # than 1 scheduled token is a resumed/catch-up request that must
+            # be processed via the prefill (prompt) path instead. After
+            # ensure_multi_token_decodes_last these requests are sorted to the
+            # end of the decode region so the break here is correct.
+            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
+            if num_scheduled_tokens > 1 and \
+                    not self.vllm_config.speculative_config:
+                break
+
+            # This is decode
             # NOTE(chendi): To support spec decode,
             # we don't assume num_scheduled_tokens == 1.
             decode_req_ids.append(req_id)
@@ -2053,11 +2093,20 @@ def _get_prompts_and_decodes(
             req_id = self.input_batch.req_ids[i]
             assert req_id is not None
 
-            # Must be prompt
-            assert self._is_prompt(i, scheduler_output)
+            num_computed_tokens = self.input_batch.num_computed_tokens_cpu[i]
+            num_prompt_tokens = self.input_batch.num_prompt_tokens[i]
+            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
+
+            # Prompt traversal must follow the exact same predicate used above
+            # to partition decode vs prompt requests, including preempted
+            # prompt / catch-up cases handled by `_is_prompt()`.
+            assert self._is_prompt(i, scheduler_output), (f"Unexpected at prompt-traversal req_id={req_id} idx={i}: "
+                                                          f"computed={num_computed_tokens}, "
+                                                          f"prompt={num_prompt_tokens}, "
+                                                          f"scheduled={num_scheduled_tokens}")
+            # NOTE(kzawora): In preempted sequences, num_output_tokens can be > 0, and still be a valid prefill
 
             prompt_req_ids.append(req_id)
-            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
             prompt_scheduled_tokens.append(int(num_scheduled_tokens))
 
         return PromptDecodeInfo(prompt_req_ids, decode_req_ids, prompt_scheduled_tokens)
@@ -2956,11 +3005,16 @@ def _create_decode_input_data(self,
                 token_ids_device[:num_decodes] = self.input_ids_hpu[:num_decodes].view(-1, 1)
             else:
                 token_ids_split_tensors = torch.split(self.input_ids_hpu[:total_num_scheduled_tokens],
-                                                      num_tokens_per_req)
-                token_ids_device[:num_decodes] = \
+                                                      num_tokens_per_req[:num_decodes])
+                # token_ids was already reshaped to [padded_batch*num_tokens, 1]
+                # (via view(-1,1) in the CPU prepare path above) before the
+                # async_h2d_copy, so token_ids_device has the same flat shape.
+                # Index [:num_decodes*num_tokens] to write all rows for the
+                # decode region (not just the first num_decodes rows).
+                token_ids_device[:num_decodes * num_tokens] = \
                     pad_sequence(list(token_ids_split_tensors),
                                     batch_first=True,
-                                    padding_value=0)[:num_decodes]
+                                    padding_value=0)[:num_decodes].view(-1, 1)
 
             #####################################
             # NOTE(Chendi): Since we can't actually do num_tokens = 2,
@@ -3985,8 +4039,17 @@ def sample_tokens(self, grammar_output: "GrammarOutput | None") -> ModelRunnerOu
         # Return [tokD0, tokD1, tokD2, tokP0, tokP1, tokP2]
 
         batch_changed = self.batch_changed
-        # If necessary, swap decodes/prompts to have all decodes on the start
+        # If necessary, swap decodes/prompts to have all decodes on the start.
+        # Use the method form (not a module-level helper) so that the
+        # partitioning predicate matches `_is_prompt()` exactly, including
+        # preempted-prompt, decoder-only, and spec-decode cases.
         self._ensure_decodes_first(scheduler_output)
+        # When spec-decode is not configured, sort multi-token catch-up
+        # decode requests to the end of the decode region so that
+        # _get_prompts_and_decodes routes them through the prefill path,
+        # preventing bucket overflow and Habana workspace OOM.
+        if not self.vllm_config.speculative_config:
+            ensure_multi_token_decodes_last(self.input_batch, scheduler_output.num_scheduled_tokens)
         # Prepare prompts/decodes info
         pd_info = self._get_prompts_and_decodes(scheduler_output)
         num_decodes = len(pd_info.decode_req_ids)

From 397f5624139f75c77d99fa74230c7a616dc39a7f Mon Sep 17 00:00:00 2001
From: Youlei Yang <youlei.yang@intel.com>
Date: Mon, 18 May 2026 15:37:02 +0800
Subject: [PATCH 04/29] [DOC] Fix torchaudio version (#1425)

Signed-off-by: Youlei Yang <youlei.yang@intel.com>
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 74a84f3cc5..14b0682dfc 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,8 @@ The vLLM Hardware Plugin for Intel® Gaudi® integrates [Intel® Gaudi® AI acce
 5. Install torchaudio (required by some upstream vLLM models such as QWEN3_5). Use the CPU wheel with `--no-deps` to avoid pulling a conflicting CUDA torch:
 
     ```bash
-    pip install --no-deps torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+    TORCH_VERSION=$(python3 -c "import re, torch; print(re.match(r'(\d+\.\d+\.\d+)', torch.__version__).group(1))")
+    pip install --no-deps torchaudio==$TORCH_VERSION --extra-index-url https://download.pytorch.org/whl/cpu
     ```
 
     To see all the available installation methods, such as NIXL, see the [Installation](https://vllm-gaudi.readthedocs.io/en/latest/getting_started/installation.html) guide.

From 252970e83a9ad594fa5dd40257c35b0edb7d10d1 Mon Sep 17 00:00:00 2001
From: "Seunghyuk Park (shepark)" <separk@habana.ai>
Date: Mon, 18 May 2026 00:48:56 -0700
Subject: [PATCH 05/29] Harden Qwen3.5 CI test to detect regressions (#1443)

https://github.com/vllm-project/vllm-gaudi/pull/1433 fixed a Qwen3.5
accuracy regression that is only detectable when the prompt bucket batch
size is large. Adding VLLM_PROMPT_BS_BUCKET_MAX=32 to the CI test covers
that case. Also tighten the passing threshold (0.75 -> 0.9) to better
catch future regressions.

Signed-off-by: Seunghyuk Park <separk@habana.ai>
Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
Co-authored-by: Libin Tang <libin.tang@intel.com>
---
 tests/full_tests/ci_e2e_discoverable_tests.sh     | 1 +
 tests/full_tests/model_cards/qwen3.5-35b-a3b.yaml | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/full_tests/ci_e2e_discoverable_tests.sh b/tests/full_tests/ci_e2e_discoverable_tests.sh
index c4fcc8650a..9f5e5ed0ab 100755
--- a/tests/full_tests/ci_e2e_discoverable_tests.sh
+++ b/tests/full_tests/ci_e2e_discoverable_tests.sh
@@ -415,6 +415,7 @@ run_longbench_qwen3_30b_fp8_static_fp8_fsdpa_slicing_compile_test() {
 run_gsm8k_qwen35_35b_a3b_test() {
     echo "➡️ Testing GSM8K on Qwen3.5-35B-A3B..."
     VLLM_SKIP_WARMUP=True ENABLE_APC=False VLLM_FUSED_BLOCK_SOFTMAX_ADJUSTMENT=False VLLM_GRAPH_RESERVED_MEM=0.8 \
+    VLLM_PROMPT_BS_BUCKET_MAX=32 \
     pytest -v -s "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/test_common.py" --model_card_path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/qwen3.5-35b-a3b.yaml"
     echo "✅ Test with Qwen3.5-35B-A3B passed."
 }
diff --git a/tests/full_tests/model_cards/qwen3.5-35b-a3b.yaml b/tests/full_tests/model_cards/qwen3.5-35b-a3b.yaml
index 2f92da9722..fdf42e5366 100644
--- a/tests/full_tests/model_cards/qwen3.5-35b-a3b.yaml
+++ b/tests/full_tests/model_cards/qwen3.5-35b-a3b.yaml
@@ -15,4 +15,4 @@ model_card:
 
 metrics:
   name: exact_match,strict-match
-  value: 0.75
+  value: 0.9

From e9b8f0859ded7a8c52195b5f905c73dc6448ea18 Mon Sep 17 00:00:00 2001
From: Youlei Yang <youlei.yang@intel.com>
Date: Mon, 18 May 2026 15:57:49 +0800
Subject: [PATCH 06/29] Fix decode bucket filter issues from #1122 (#1447)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Fixes

Two bugs introduced by #1122 (commit f24f3f9d):

### 1. IndexError when using file-based bucketing (GAUDISW-248587)
When `VLLM_BUCKETING_FROM_FILE` is used (e.g. GraniteMoeHybrid model),
`ctx_range` is passed as an empty list to `generate_buckets()`. The
`num_ctx_tokens_less_or_equal_batched_max_model_len` filter accessed
`ctx_range[0]` unconditionally, causing `IndexError: list index out of
range`.

**Fix**: Safe access with fallback to 0 when `ctx_range` is empty.

### 2. Contiguous PA decode buckets incorrectly filtered
(GAUDISW-248598)
The ctx filter was applied to contiguous PA decode buckets, incorrectly
dropping valid buckets. For example, with `max_model_len=2048`,
`block_size=256`, `max_num_seqs=256`, bucket `(256, 1, 2112)` was
filtered because `2112 > ceil(2048/256)*256 = 2048`, but 2112 is a valid
user-configured `VLLM_DECODE_BLOCK_BUCKET_MAX`.

**Fix**: Remove the ctx filter from contiguous PA decode buckets. For
contiguous PA, the block range is already bounded by `max_blocks` in the
bucketing strategies.

## Tests
- Added `test_file_buckets_bypass_filters` — passes an empty `ctx_range`
together with file buckets, the setup behind the server.log IndexError
- Added `test_contiguous_pa_decode_buckets_not_filtered_by_ctx` —
reproduces the std_out.txt issue
- Narrowed `test_decode_buckets_satisfy_ctx_filter` to non-contiguous PA
only
- Updated docstrings

Signed-off-by: Youlei Yang <youlei.yang@intel.com>

---------

Signed-off-by: Youlei Yang <youlei.yang@intel.com>
---
 tests/unit_tests/test_bucketing.py       | 101 ++++++++++++++++++++---
 vllm_gaudi/extension/bucketing/common.py |   9 +-
 2 files changed, 96 insertions(+), 14 deletions(-)

diff --git a/tests/unit_tests/test_bucketing.py b/tests/unit_tests/test_bucketing.py
index 3145c8d3c5..bc2fa92e91 100644
--- a/tests/unit_tests/test_bucketing.py
+++ b/tests/unit_tests/test_bucketing.py
@@ -530,9 +530,9 @@ def test_real_scenario_fallback_ctx_7408_not_truncated():
 def test_exponential_decode_block_limit_uncapped(monkeypatch):
     """Verify that decode block limit is computed from log2(max_decode_blocks).
 
-    With the new approach, excessive warmup buckets are controlled by
-    filters in generate_buckets() (num_ctx_tokens_less_or_equal_batched_max_model_len)
-    rather than by capping the block limit in get_decode_cfgs().
+    For contiguous PA, max_decode_blocks = min(max_blocks, ceil(max_model_len/block_size)*max_num_seqs).
+    The block range is already bounded by max_blocks, so no additional
+    ctx filter is applied to contiguous PA decode buckets.
     """
     monkeypatch.setenv("VLLM_EXPONENTIAL_BUCKETING", "true")
     monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true")
@@ -669,7 +669,6 @@ def test_padding_aware_decode_cfgs_contiguous_pa_clamps_block_range(mock_get_con
 # --- Tests that num_ctx_tokens_less_or_equal_batched_max_model_len filter is applied ---
 
 
-@pytest.mark.parametrize("use_contiguous_pa", [True, False], ids=["contiguous_pa", "non_contiguous_pa"])
 @pytest.mark.parametrize(
     ("max_model_len", "block_size", "max_num_seqs", "max_blocks", "max_num_batched_tokens"),
     [
@@ -679,13 +678,15 @@ def test_padding_aware_decode_cfgs_contiguous_pa_clamps_block_range(mock_get_con
     ],
     ids=["qwen3_32b", "small_model", "long_ctx"],
 )
-def test_decode_buckets_satisfy_ctx_filter(monkeypatch, use_contiguous_pa, max_model_len, block_size, max_num_seqs,
-                                           max_blocks, max_num_batched_tokens):
-    """Every decode bucket returned by generate_buckets must satisfy
-    num_ctx_tokens_less_or_equal_batched_max_model_len:
-        ctx <= ceil(max_model_len / block_size) * bs   (when ctx > ctx_range[0])
+def test_decode_buckets_satisfy_ctx_filter_non_contiguous_pa(monkeypatch, max_model_len, block_size, max_num_seqs,
+                                                             max_blocks, max_num_batched_tokens):
+    """For non-contiguous PA, every decode bucket returned by generate_buckets
+    must satisfy ctx <= ceil(max_model_len / block_size) * bs (when ctx > ctx_range[0]).
+
+    The filter is only applied to non-contiguous PA; contiguous PA decode
+    buckets are not filtered since their block range is already bounded by max_blocks.
     """
-    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", str(use_contiguous_pa).lower())
+    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "false")
     clear_config()
     get_config()
 
@@ -727,3 +728,83 @@ def test_decode_buckets_satisfy_ctx_filter(monkeypatch, use_contiguous_pa, max_m
                             f"ctx <= ceil(max_model_len/block_size) * bs "
                             f"(max_blocks_per_seq={max_blocks_per_seq}):\n" +
                             "\n".join(f"  bs={bs}, query={query}, ctx={ctx}" for bs, query, ctx in violations[:20]))
+
+
+def test_contiguous_pa_decode_buckets_not_filtered_by_ctx(monkeypatch):
+    """For contiguous PA, the ctx filter must NOT be applied to decode buckets.
+
+    Reproduces std_out.txt issue: with max_model_len=2048, block_size=256,
+    max_num_seqs=256, the bucket (256, 1, 2112) was incorrectly filtered
+    because 2112 > ceil(2048/256)*256 = 2048.
+    """
+    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true")
+    clear_config()
+    get_config()
+
+    max_model_len = 2048
+    block_size = 256
+    max_num_seqs = 256
+    max_blocks = 2113
+    max_num_batched_tokens = 1048832
+
+    bs_range = [256]
+    query_range = [1]
+    ctx_range = list(range(1280, 2113, 64))  # 1280, 1344, ..., 2048, 2112
+    ctx_range.append(max_blocks)  # append num_hpu_blocks as done in generate_decode_buckets
+
+    buckets = generate_buckets(
+        bs_range=bs_range,
+        query_range=query_range,
+        ctx_range=ctx_range,
+        is_prompt=False,
+        max_model_len=max_model_len,
+        max_num_seqs=max_num_seqs,
+        max_num_prefill_seqs=1,
+        max_num_batched_tokens=max_num_batched_tokens,
+        block_size=block_size,
+        max_blocks=max_blocks,
+    )
+
+    bucket_ctxs = [ctx for _, _, ctx in buckets]
+    assert 2112 in bucket_ctxs, (f"Bucket ctx=2112 was incorrectly filtered out. "
+                                 f"Max ctx in buckets: {max(bucket_ctxs)}")
+    assert max_blocks in bucket_ctxs, (f"Bucket ctx={max_blocks} (num_hpu_blocks) was incorrectly filtered out.")
+
+
+def test_file_buckets_bypass_filters(monkeypatch):
+    """File-based bucketing (VLLM_BUCKETING_FROM_FILE) skips all filters.
+
+    Buckets such as (512,1,256) and (1,1,512) would normally be rejected by
+    the batch_size_smaller_than_blocks or ctx filters in non-file mode.
+    Since file buckets bypass filters entirely, all provided buckets
+    must appear in the output unchanged.
+    """
+    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true")
+    clear_config()
+    get_config()
+
+    max_model_len = 2048
+    block_size = 256
+    max_num_seqs = 32
+    max_blocks = 2424
+
+    # (512,1,256) would be rejected by batch_size_smaller_than_blocks (bs > ctx)
+    # All buckets pass through because file_buckets bypass filters entirely
+    file_buckets = [(1, 1, 256), (1, 1, 512), (2, 1, 256), (512, 1, 256), (32, 1, 2424)]
+
+    buckets = generate_buckets(
+        bs_range=[],
+        query_range=[],
+        ctx_range=[],
+        is_prompt=False,
+        max_model_len=max_model_len,
+        max_num_seqs=max_num_seqs,
+        max_num_prefill_seqs=1,
+        max_num_batched_tokens=8192,
+        block_size=block_size,
+        max_blocks=max_blocks,
+        file_buckets=file_buckets,
+    )
+
+    assert set(buckets) == set(file_buckets), (f"All file buckets should pass through unfiltered.\n"
+                                               f"Expected: {sorted(file_buckets)}\nGot: {sorted(buckets)}")
diff --git a/vllm_gaudi/extension/bucketing/common.py b/vllm_gaudi/extension/bucketing/common.py
index 2fd85dbdae..870cdb0ac3 100644
--- a/vllm_gaudi/extension/bucketing/common.py
+++ b/vllm_gaudi/extension/bucketing/common.py
@@ -448,10 +448,11 @@ def batch_size_smaller_than_blocks(bs, query, ctx):
         return bs <= ctx
 
     def num_ctx_tokens_less_or_equal_batched_max_model_len(bs, query, ctx):
-        is_valid = ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_range[0] else True
+        ctx_min = ctx_range[0] if ctx_range else 0
+        is_valid = ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_min else True
         if not is_valid:
             omitted_buckets.add(
-                ("condition: ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_range[0] else True",
+                ("condition: ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_min else True",
                  "-> bs, query, ctx: ", bs, query, ctx))
         return is_valid
 
@@ -463,7 +464,7 @@ def num_ctx_tokens_less_or_equal_batched_max_model_len(bs, query, ctx):
         },
         "decode": {
             # depends only on contiguous PA
-            True: [num_ctx_tokens_less_or_equal_batched_max_model_len],
+            True: [],
             False: [batch_size_smaller_than_blocks, num_ctx_tokens_less_or_equal_batched_max_model_len],
         }
     }
@@ -490,7 +491,7 @@ def is_ctx_allowed(ctx):
     buckets = set()
     buckets_2d = set()
     omitted_buckets = set()
-    filters = get_filters(is_prompt, use_merged_prefill, use_contiguous_pa)
+    filters = [] if file_buckets else get_filters(is_prompt, use_merged_prefill, use_contiguous_pa)
     corrector = get_corrector(is_prompt, use_contiguous_pa)
 
     if file_buckets:

From e5b23b22af2a32fb572df8b3c75758ba3df1795f Mon Sep 17 00:00:00 2001
From: "Seunghyuk Park (shepark)" <separk@habana.ai>
Date: Mon, 18 May 2026 01:01:01 -0700
Subject: [PATCH 07/29] Fix mamba_type comparison for GDN hybrid cache
 allocation (#1449)

Upstream vllm commit 5536fc0c0 changed MambaSpec.mamba_type from str to
MambaAttentionBackendEnum. The hybrid cache allocation in
hpu_model_runner.py still compared against str literals, causing GDN
layers to fall through to the Mamba2 shared-buffer path. This created
mixed-dtype views (bf16 conv_state+fp32 ssm_state) on the same storage,
triggering an aot_autograd assertion error during compilation.

Use a module-level _GDN_MAMBA_TYPES tuple that includes both enum values
and string literals for backward compatibility with older upstream
versions.

---------

Signed-off-by: Seunghyuk Park <separk@habana.ai>
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 21 ++++++++++++++-------
 vllm_gaudi/v1/worker/hpu_worker.py       | 15 +++++----------
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 9d4983f6ab..bf5b50293b 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -137,6 +137,14 @@
 except ImportError:
     LMCacheConnectorMetadata = None
 
+_GDN_MAMBA_TYPES: tuple[object, ...] = ("gdn_attention", "linear_attention")
+try:
+    from vllm.v1.attention.backends.registry import MambaAttentionBackendEnum
+    _GDN_MAMBA_TYPES = (MambaAttentionBackendEnum.GDN_ATTN, MambaAttentionBackendEnum.LINEAR, "gdn_attention",
+                        "linear_attention")
+except (ImportError, AttributeError):
+    pass
+
 _TYPE_CACHE: dict[str, dict[str, Any]] = {}
 
 HPU_TORCH_DTYPE_TO_STR_DTYPE = {
@@ -5943,8 +5951,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         if self.num_mamba_like_layers > 0 and self._compact_gdn_enabled:
             self._num_gdn_groups = sum(
                 1 for g in kv_cache_config.kv_cache_groups
-                if isinstance(g.kv_cache_spec, MambaSpec) and g.kv_cache_spec.mamba_type in ("gdn_attention",
-                                                                                             "linear_attention"))
+                if isinstance(g.kv_cache_spec, MambaSpec) and g.kv_cache_spec.mamba_type in _GDN_MAMBA_TYPES)
         # Profiling may request more sequences than max_num_seqs
         # (e.g. VLLM_PROFILE_DECODE=16,64 with max_num_seqs=1).
         # Ensure GDN compact tensors and free-list are large enough.
@@ -5979,7 +5986,7 @@ def _needs_raw_buffer(kv_cache_tensor) -> bool:
                     if isinstance(spec, FullAttentionSpec):
                         continue
                     if isinstance(spec, MambaSpec) and \
-                            spec.mamba_type in ("gdn_attention", "linear_attention"):
+                            spec.mamba_type in _GDN_MAMBA_TYPES:
                         continue
                     # Standard Mamba2 or unknown spec — needs raw buffer
                     return True
@@ -6027,7 +6034,7 @@ def _needs_raw_buffer(kv_cache_tensor) -> bool:
                         vc = torch.zeros(kv_cache_shape, dtype=kv_cache_spec.dtype, device=self.device)
                         kv_caches[layer_name] = (kc, vc, None, None)
                     elif isinstance(kv_cache_spec, MambaSpec) and \
-                            kv_cache_spec.mamba_type in ("gdn_attention", "linear_attention") and \
+                            kv_cache_spec.mamba_type in _GDN_MAMBA_TYPES and \
                             self._compact_gdn_enabled:
                         # GDN/linear_attention: compact allocation.
                         # All GDN groups share the same state tensor, so each
@@ -6054,7 +6061,7 @@ def _needs_raw_buffer(kv_cache_tensor) -> bool:
                                 kv_caches[shared_layer] = tuple(state_tensors)
                             break
                     elif isinstance(kv_cache_spec, MambaSpec) and \
-                            kv_cache_spec.mamba_type in ("gdn_attention", "linear_attention"):
+                            kv_cache_spec.mamba_type in _GDN_MAMBA_TYPES:
                         # GDN/linear_attention: non-compact (baseline) allocation
                         # using contiguous tensors with num_blocks+1 slots.
                         if isinstance(kv_caches.get(layer_name), tuple):
@@ -6119,7 +6126,7 @@ def _needs_raw_buffer(kv_cache_tensor) -> bool:
                         vc = torch.zeros(kv_cache_shape, dtype=kv_cache_spec.dtype, device=self.device)
                         kv_caches[layer_name] = (kc, vc, None, None)
                     elif isinstance(kv_cache_spec, MambaSpec) and \
-                            kv_cache_spec.mamba_type in ("gdn_attention", "linear_attention") and \
+                            kv_cache_spec.mamba_type in _GDN_MAMBA_TYPES and \
                             self._compact_gdn_enabled:
                         # GDN/linear_attention: compact allocation.
                         self._compact_gdn_group_ids.add(group_idx)
@@ -6139,7 +6146,7 @@ def _needs_raw_buffer(kv_cache_tensor) -> bool:
                                 kv_caches[shared_layer] = tuple(state_tensors)
                             break
                     elif isinstance(kv_cache_spec, MambaSpec) and \
-                            kv_cache_spec.mamba_type in ("gdn_attention", "linear_attention"):
+                            kv_cache_spec.mamba_type in _GDN_MAMBA_TYPES:
                         # GDN/linear_attention: non-compact (baseline) allocation.
                         if isinstance(kv_caches.get(layer_name), tuple):
                             continue
diff --git a/vllm_gaudi/v1/worker/hpu_worker.py b/vllm_gaudi/v1/worker/hpu_worker.py
index 89ec62e9d7..b9967a8280 100644
--- a/vllm_gaudi/v1/worker/hpu_worker.py
+++ b/vllm_gaudi/v1/worker/hpu_worker.py
@@ -31,7 +31,7 @@
 from vllm.v1.outputs import (DraftTokenIds, AsyncModelRunnerOutput, ModelRunnerOutput)
 from vllm.v1.worker.utils import bind_kv_cache
 from vllm_gaudi.utils import is_fake_hpu
-from vllm_gaudi.v1.worker.hpu_model_runner import HPUModelRunner
+from vllm_gaudi.v1.worker.hpu_model_runner import HPUModelRunner, _GDN_MAMBA_TYPES
 from vllm.v1.worker.worker_base import CompilationTimes, WorkerBase
 
 from vllm_gaudi.extension.logger import logger as init_logger
@@ -441,12 +441,9 @@ def determine_available_memory(self) -> int:
         # Reduce reported memory so the scheduler computes fewer
         # num_blocks that fit the HPU separate-allocation model.
         has_attn = any(isinstance(s, FullAttentionSpec) for s in kv_cache_spec.values())
-        has_gdn = any(
-            isinstance(s, MambaSpec) and s.mamba_type in ("gdn_attention", "linear_attention")
-            for s in kv_cache_spec.values())
+        has_gdn = any(isinstance(s, MambaSpec) and s.mamba_type in _GDN_MAMBA_TYPES for s in kv_cache_spec.values())
         has_standard_mamba = any(
-            isinstance(s, MambaSpec) and s.mamba_type not in ("gdn_attention", "linear_attention")
-            for s in kv_cache_spec.values())
+            isinstance(s, MambaSpec) and s.mamba_type not in _GDN_MAMBA_TYPES for s in kv_cache_spec.values())
         compact_gdn = os.environ.get("VLLM_COMPACT_GDN", "0").strip().lower() in ("1", "true")
         if has_attn and has_gdn and not compact_gdn:
             # When compact GDN is OFF, GDN state scales with num_blocks
@@ -462,8 +459,7 @@ def determine_available_memory(self) -> int:
             real_attn = next(s.real_page_size_bytes for s in kv_cache_spec.values() if isinstance(s, FullAttentionSpec))
             real_mamba = next(
                 sum(math.prod(sh) * get_dtype_size(dt) for sh, dt in zip(s.shapes, s.dtypes))
-                for s in kv_cache_spec.values()
-                if isinstance(s, MambaSpec) and s.mamba_type in ("gdn_attention", "linear_attention"))
+                for s in kv_cache_spec.values() if isinstance(s, MambaSpec) and s.mamba_type in _GDN_MAMBA_TYPES)
             total_real = real_attn + real_mamba
             if total_real > padded_page:
                 factor = padded_page / total_real
@@ -484,8 +480,7 @@ def determine_available_memory(self) -> int:
             attn_page_size = next(s.page_size_bytes for s in kv_cache_spec.values() if isinstance(s, FullAttentionSpec))
             mamba_state_per_block = next(
                 sum(math.prod(sh) * get_dtype_size(dt) for sh, dt in zip(s.shapes, s.dtypes))
-                for s in kv_cache_spec.values()
-                if isinstance(s, MambaSpec) and s.mamba_type not in ("gdn_attention", "linear_attention"))
+                for s in kv_cache_spec.values() if isinstance(s, MambaSpec) and s.mamba_type not in _GDN_MAMBA_TYPES)
             if attn_page_size > 0:
                 ratio = attn_page_size / (attn_page_size + mamba_state_per_block)
                 adjusted = int(available * ratio)

From 27c367b9f04b3f8a7085862f8a5ca0a1166c617e Mon Sep 17 00:00:00 2001
From: Kamil Kaczor <kamil.kaczor@intel.com>
Date: Mon, 18 May 2026 12:56:07 +0200
Subject: [PATCH 08/29] fix: replace batched_count_greater_than to avoid
 dynamic shape TypeError on HPU (#1412)

## Summary

Upstream vLLM decorates `batched_count_greater_than` with
`@torch.compile(dynamic=True)`, which causes Habana's `recipe_compiler`
to raise `TypeError: Cannot convert symbols to int` when processing
symbolic shapes. Additionally, `mark_unbacked` in the caller
(`gather_logprobs`) prevents `dynamic=False` from being a viable
alternative.

## Fix

Replace with a plain (uncompiled) version of the same function. The
patching is deferred to `load_general_plugins` time via a hook on
`vllm.plugins.load_general_plugins`, because importing
`vllm.v1.sample.sampler` during early plugin registration triggers a
heavy import chain that interferes with platform initialisation.

## Why deferred patching?

- Importing `vllm.v1.sample.sampler` during `apply()` (called from
`register()`) triggers a heavy import chain that resets platform
detection, causing `Device string must not be empty`.
- The patching hooks into `load_general_plugins` which runs in every
process (parent + EngineCore subprocess) after the platform is ready.
- `sampler.py` uses `from ... import batched_count_greater_than` which
creates a module-level global resolved via `LOAD_GLOBAL` at call time,
so patching the module attribute works.

## Testing

- `test_skip_tokenizer_initialization` PASSES
- `test_engine_args` (3 tests) PASS
- Inference with `logprobs=5` produces correct output

Signed-off-by: Kamil Kaczor <kamil.kaczor@intel.com>
---
 vllm_gaudi/patches.py | 56 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/vllm_gaudi/patches.py b/vllm_gaudi/patches.py
index 4a784dd26d..ac3931eb73 100644
--- a/vllm_gaudi/patches.py
+++ b/vllm_gaudi/patches.py
@@ -17,6 +17,16 @@
   requires the device's allocator to be a ``c10::DeviceAllocator``.  We
   replace it with an HPU-safe variant that uses
   ``current_platform.empty_cache()`` instead (see GAUDISW-247825).
+
+* ``vllm.v1.sample.ops.logprobs.batched_count_greater_than`` — upstream
+  decorates this function with ``@torch.compile(dynamic=True, ...)``.
+  Habana's ``recipe_compiler`` backend cannot handle the symbolic shapes
+  produced by ``dynamic=True`` (and by ``mark_unbacked`` in the caller),
+  raising ``TypeError: Cannot convert symbols to int``.  We replace it
+  with a plain (uncompiled) version of the same function.  The replacement
+  is deferred to ``load_general_plugins`` time to avoid importing
+  ``vllm.v1.sample.sampler`` during early plugin registration, which would
+  trigger a heavy import chain that interferes with platform initialisation.
 """
 
 import functools
@@ -74,6 +84,31 @@ def _hpu_cleanup_dist_env_and_memory(shutdown_ray: bool = False) -> None:
         parallel_state.logger.warning("torch._C._host_emptyCache() only available in Pytorch >=2.5")
 
 
+def _hpu_batched_count_greater_than(x: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
+    """HPU-safe replacement for ``batched_count_greater_than``.
+
+    Identical logic to the upstream implementation but *not* wrapped in
+    ``torch.compile``.  The upstream decorator uses ``dynamic=True`` whose
+    symbolic shapes are incompatible with Habana's ``recipe_compiler``
+    backend, and ``mark_unbacked`` in the caller prevents ``dynamic=False``
+    from helping.
+    """
+    return (x >= values).sum(-1)
+
+
+def _patch_batched_count_greater_than() -> None:
+    """Replace ``batched_count_greater_than`` in the sampler & logprobs modules.
+
+    Called from the ``load_general_plugins`` hook so that the heavy
+    ``vllm.v1.sample.*`` import chain runs *after* platform initialisation.
+    """
+    import vllm.v1.sample.ops.logprobs as _logprobs_mod
+    import vllm.v1.sample.sampler as _sampler_mod
+
+    _logprobs_mod.batched_count_greater_than = _hpu_batched_count_greater_than
+    _sampler_mod.batched_count_greater_than = _hpu_batched_count_greater_than
+
+
 def apply() -> None:
     """Install all HPU runtime monkey-patches."""
     # --- torch.accelerator.empty_cache ---
@@ -83,15 +118,28 @@ def apply() -> None:
     if not hasattr(torch._C, "_host_emptyCache"):
         torch._C._host_emptyCache = lambda: None
 
-    # Patch the canonical definition.
+    # --- cleanup_dist_env_and_memory ---
     parallel_state.cleanup_dist_env_and_memory = _hpu_cleanup_dist_env_and_memory
-    # Patch the re-export from ``vllm.distributed`` so ``from vllm.distributed
-    # import cleanup_dist_env_and_memory`` (used by the upstream pytest
-    # conftest) also picks up the HPU-safe version.
     import vllm.distributed as _vllm_distributed
 
     _vllm_distributed.cleanup_dist_env_and_memory = _hpu_cleanup_dist_env_and_memory
 
+    # --- batched_count_greater_than (deferred) ---
+    # We cannot import the sampler modules here because the import chain
+    # triggers platform re-initialisation ("Device string must not be
+    # empty").  Instead we hook into ``load_general_plugins`` which runs
+    # in every process (parent + EngineCore subprocess) after the platform
+    # is ready.
+    import vllm.plugins as _plugins_mod
+
+    _original_load_general = _plugins_mod.load_general_plugins
+
+    def _load_general_with_hpu_patches():
+        _original_load_general()
+        _patch_batched_count_greater_than()
+
+    _plugins_mod.load_general_plugins = _load_general_with_hpu_patches
+
 
 def patch_hf3fs_mock_client():
     """Guard CUDA sync in the HF3FS mock client on non-CUDA platforms.

From 4d6d38c256c754d4c03d611f9820070f656b1b15 Mon Sep 17 00:00:00 2001
From: Kamil Kaczor <kamil.kaczor@intel.com>
Date: Mon, 18 May 2026 14:41:03 +0200
Subject: [PATCH 09/29] fix: bypass _forward_impl for dp_size==1 to fix
 DeepSeek R1 FP8 crash (#1441)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

DeepSeek R1 (671B) crashes during warmup on G3 with FP8 quantization
(GAUDISW-248418).

Two error manifestations:
- `RuntimeError: Incompatible input shapes, broadcast not possible.
Tensor1 Size: 7168 30720 Tensor2 Size: 256 1`
- `RuntimeError: Attempting to broadcast a dimension of length 256 at
-1! Mismatching argument at index 1 had torch.Size([1, 256]); but
expected shape should be broadcastable to [8192, 7168]`

Both crash at `hpu_grouped_topk_router.py:64` during MoE gate
application.

## Root Cause

`_forward_impl` introduces graph breaks via
`_sequence_parallel_context()` (calls `get_forward_context()`). Combined
with double gate application (gate called in `patched_fused_moe_forward`
AND again inside `_forward_impl`), Dynamo miscompiles the graph on HPU
Synapse, causing shape mismatches.

Regression window: Build 254 (good) → Build 260 (broken), introduced by
commit `98863a7` (MoE dynamo recompilation fix).

## Fix

For `dp_size==1` (the common single-node case), bypass `_forward_impl`
entirely and call `_apply_quant_method` + `_maybe_combine` directly.
This:
1. Eliminates graph breaks from `_sequence_parallel_context()` and
`get_forward_context()`
2. Skips the no-op `_maybe_dispatch()` (only needed for dp_size > 1)
3. Prevents double gate application
4. Adds a RuntimeError guard for `pcp_size > 1` (unsupported in fast
path)

The `dp_size > 1` fallback via `_forward_entry` is unchanged.

## Testing

Tested on G3 (8x HL-325L) with DeepSeek R1 671B FP8 TP=8:
- ✅ Prompt warmup: 54/54 items completed (crash site in original bug)
- ✅ Decode warmup: 25/25 items completed
- ✅ End-to-end inference: valid completions returned

Fixes: GAUDISW-248418

---------

Signed-off-by: Kamil Kaczor <kamil.kaczor@intel.com>
Co-authored-by: Iryna Boiko <iryna.boiko@intel.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
---
 vllm_gaudi/ops/hpu_fused_moe.py | 48 ++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py
index c496c79679..21db0b541a 100644
--- a/vllm_gaudi/ops/hpu_fused_moe.py
+++ b/vllm_gaudi/ops/hpu_fused_moe.py
@@ -284,7 +284,8 @@ def patched_fused_moe_forward(
     ensure_moe_quant_config_init, and _sequence_parallel_context — all of
     which access ForwardContext and cause torch.compile graph breaks), we
     use a layer reference stashed on the runner at FusedMoE.__init__ time
-    (self._hpu_layer_ref) and call _forward_impl directly. This also
+    (self._hpu_layer_ref) and bypass _forward_impl for dp_size==1,
+    calling _apply_quant_method + _maybe_combine directly. This also
     bypasses self.layer_name (a per-layer string) so dynamo no longer
     emits per-layer string guards that trigger recompilation.
 
@@ -297,17 +298,27 @@ def patched_fused_moe_forward(
     hidden_states, og_hidden_dims = self._maybe_pad_hidden_states(shared_experts_input, hidden_states)
 
     if self.moe_config.dp_size == 1:
-        # Use layer ref saved at FusedMoE.__init__ to avoid both the
-        # get_layer_from_name(self.layer_name) lookup (graph break) and
-        # the per-layer string guard from accessing self.layer_name.
-        # Replicate the remaining forward_dispatch logic that we bypass:
-        # 1. Sync shared experts stream for multi-stream overlap
+        # Bypass _forward_impl entirely for dp_size==1 to eliminate
+        # graph breaks from _sequence_parallel_context() (which calls
+        # get_forward_context()), skip the no-op _maybe_dispatch(), and
+        # avoid double gate / stream-sync calls that _forward_impl
+        # would redundantly repeat.
+        if self.moe_config.pcp_size > 1:
+            raise RuntimeError("dp_size==1 fast path does not support pcp_size > 1")
+        layer = self._hpu_layer_ref
+        layer.ensure_moe_quant_config_init()
         self._maybe_sync_shared_experts_stream(shared_experts_input)
-        # 2. Apply gate if the runner owns it (internal router mode)
-        if self.gate is not None:
-            router_logits, _ = self.gate(hidden_states)
-
-        result = self._forward_impl(self._hpu_layer_ref, hidden_states, router_logits, shared_experts_input, input_ids)
+        gate = self.gate or getattr(self, "_hpu_gate_ref", None)
+        if gate is not None:
+            router_logits, _ = gate(hidden_states)
+        shared_output, fused_hidden = self._apply_quant_method(
+            layer=layer,
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+            shared_experts_input=shared_experts_input,
+            input_ids=input_ids,
+        )
+        result = self._maybe_combine(shared_output, fused_hidden)
     else:
         result = self._forward_entry(hidden_states, router_logits, shared_experts_input, input_ids,
                                      self._encode_layer_name(), self._trtllm_mxfp4_unpadded_dim())
@@ -546,15 +557,14 @@ def _patched_default_moe_runner_forward(self, *args, **kwargs):
 
 def _hpu_fused_moe_init(self, *args, **kwargs):
     _orig_fused_moe_init(self, *args, **kwargs)
-    if hasattr(self, 'runner'):
-        object.__setattr__(self.runner, '_hpu_layer_ref', self)
+    if hasattr(self, "runner"):
+        object.__setattr__(self.runner, "_hpu_layer_ref", self)
+        if self.runner.gate is not None:
+            object.__setattr__(self.runner, "_hpu_gate_ref", self.runner.gate)
 
 
 FusedMoE.__init__ = _hpu_fused_moe_init
 
-vllm.model_executor.layers.fused_moe.layer.get_compressed_expert_map = \
-    get_compressed_expert_map
-vllm.model_executor.layers.fused_moe.router.router_factory.create_fused_moe_router = \
-    create_fused_moe_router
-vllm.model_executor.layers.fused_moe.layer.create_fused_moe_router = \
-    create_fused_moe_router
+vllm.model_executor.layers.fused_moe.layer.get_compressed_expert_map = get_compressed_expert_map
+vllm.model_executor.layers.fused_moe.router.router_factory.create_fused_moe_router = create_fused_moe_router
+vllm.model_executor.layers.fused_moe.layer.create_fused_moe_router = create_fused_moe_router

From fc43efa3d242d7abf6a39ce3dd24c4d5a1de3312 Mon Sep 17 00:00:00 2001
From: Youlei Yang <youlei.yang@intel.com>
Date: Tue, 19 May 2026 00:34:10 +0800
Subject: [PATCH 10/29] Remove
 num_ctx_tokens_less_or_equal_batched_max_model_len filter (#1454)

Revert the decode bucket filter introduced in f24f3f9, which drops buckets
whose batched context exceeds the batched max_model_len, because it
functionally duplicates
[correct_for_max_model_len](https://github.com/vllm-project/vllm-gaudi/blob/e5b23b22af2a32fb572df8b3c75758ba3df1795f/vllm_gaudi/extension/bucketing/common.py#L442).

## Changes:
- Remove the `num_ctx_tokens_less_or_equal_batched_max_model_len` filter
function from `generate_buckets()`
- Revert `filters_map` decode filters to pre-f24f3f9 state (`True: []`,
`False: [batch_size_smaller_than_blocks]`)
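- Make the `get_filters(...)` call unconditional (remove the
`[] if file_buckets else ...` special case)
- Remove corresponding tests
(`test_exponential_decode_block_limit_uncapped`,
`test_decode_buckets_satisfy_ctx_filter_non_contiguous_pa`,
`test_contiguous_pa_decode_buckets_not_filtered_by_ctx`,
`test_file_buckets_bypass_filters`)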

Signed-off-by: Youlei Yang <youlei.yang@intel.com>

---------

Signed-off-by: Youlei Yang <youlei.yang@intel.com>
Co-authored-by: Kamil Kaczor <kamil.kaczor@intel.com>
---
 tests/unit_tests/test_bucketing.py       | 180 +----------------------
 vllm_gaudi/extension/bucketing/common.py |  13 +-
 2 files changed, 3 insertions(+), 190 deletions(-)

diff --git a/tests/unit_tests/test_bucketing.py b/tests/unit_tests/test_bucketing.py
index bc2fa92e91..24dfcfbbf1 100644
--- a/tests/unit_tests/test_bucketing.py
+++ b/tests/unit_tests/test_bucketing.py
@@ -232,11 +232,7 @@ def test_exponential_decode_cfgs_contiguous_pa_uses_max_blocks(mock_get_config):
 
 @patch('vllm_gaudi.extension.bucketing.exponential.get_config')
 def test_exponential_decode_cfgs_non_contiguous_pa_formula(mock_get_config):
-    """Verify non-contiguous PA decode cfg uses ceil(max_model_len/block_size)*max_num_seqs.
-
-    Actual bounding of excessive buckets happens via the
-    num_ctx_tokens_less_or_equal_batched_max_model_len filter in generate_buckets().
-    """
+    """Verify non-contiguous PA decode cfg uses ceil(max_model_len/block_size)*max_num_seqs."""
     mock_get_config.return_value = _MockConfig(use_contiguous_pa=False)
     strategy = ExponentialBucketingStrategy()
 
@@ -527,36 +523,6 @@ def test_real_scenario_fallback_ctx_7408_not_truncated():
     assert new_ctx == calc_fallback_value(7408, 32), (f"Fallback ctx {new_ctx} should equal calc_fallback_value result")
 
 
-def test_exponential_decode_block_limit_uncapped(monkeypatch):
-    """Verify that decode block limit is computed from log2(max_decode_blocks).
-
-    For contiguous PA, max_decode_blocks = min(max_blocks, ceil(max_model_len/block_size)*max_num_seqs).
-    The block range is already bounded by max_blocks, so no additional
-    ctx filter is applied to contiguous PA decode buckets.
-    """
-    monkeypatch.setenv("VLLM_EXPONENTIAL_BUCKETING", "true")
-    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true")
-    clear_config()
-    get_config()
-
-    strategy = ExponentialBucketingStrategy()
-    max_num_seqs = 21
-    block_size = 128
-    max_num_batched_tokens = 8192
-    max_model_len = 131072
-    max_blocks = 65536
-
-    bs_cfg, query_cfg, block_cfg = strategy.get_decode_cfgs(max_num_seqs, block_size, max_num_batched_tokens,
-                                                            max_model_len, max_blocks)
-
-    # max_decode_blocks = min(65536, ceil(131072/128)*21) = min(65536, 21504) = 21504
-    expected_max_decode_blocks = min(max_blocks, math.ceil(max_model_len / block_size) * max_num_seqs)
-    expected_limit = math.ceil(math.log2(expected_max_decode_blocks)) + 1
-    assert block_cfg[2] == expected_max_decode_blocks, (
-        f"Expected max_decode_blocks={expected_max_decode_blocks}, got {block_cfg[2]}")
-    assert block_cfg[3] == expected_limit, (f"Expected decode_blocks_limit={expected_limit}, got {block_cfg[3]}")
-
-
 # --- Padding-aware bucketing tests ---
 
 
@@ -664,147 +630,3 @@ def test_padding_aware_decode_cfgs_contiguous_pa_clamps_block_range(mock_get_con
                                                max_blocks=3593)
 
     assert block_cfg == [3465, 128, 3593, 899, 25]
-
-
-# --- Tests that num_ctx_tokens_less_or_equal_batched_max_model_len filter is applied ---
-
-
-@pytest.mark.parametrize(
-    ("max_model_len", "block_size", "max_num_seqs", "max_blocks", "max_num_batched_tokens"),
-    [
-        (91964, 128, 256, 3593, 2048),  # Qwen3-32B real scenario
-        (4096, 128, 64, 500, 2048),  # small model
-        (131072, 128, 21, 65536, 8192),  # long context
-    ],
-    ids=["qwen3_32b", "small_model", "long_ctx"],
-)
-def test_decode_buckets_satisfy_ctx_filter_non_contiguous_pa(monkeypatch, max_model_len, block_size, max_num_seqs,
-                                                             max_blocks, max_num_batched_tokens):
-    """For non-contiguous PA, every decode bucket returned by generate_buckets
-    must satisfy ctx <= ceil(max_model_len / block_size) * bs (when ctx > ctx_range[0]).
-
-    The filter is only applied to non-contiguous PA; contiguous PA decode
-    buckets are not filtered since their block range is already bounded by max_blocks.
-    """
-    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "false")
-    clear_config()
-    get_config()
-
-    strategy = ExponentialBucketingStrategy()
-
-    bs_cfg, query_cfg, block_cfg = strategy.get_decode_cfgs(
-        max_num_seqs=max_num_seqs,
-        block_size=block_size,
-        max_num_batched_tokens=max_num_batched_tokens,
-        max_model_len=max_model_len,
-        max_blocks=max_blocks,
-    )
-    bs_range = strategy.get_range(bs_cfg)
-    query_range = strategy.get_range(query_cfg)
-    ctx_range = strategy.get_range(block_cfg)
-
-    buckets = generate_buckets(
-        bs_range=bs_range,
-        query_range=query_range,
-        ctx_range=ctx_range,
-        is_prompt=False,
-        max_model_len=max_model_len,
-        max_num_seqs=max_num_seqs,
-        max_num_prefill_seqs=1,
-        max_num_batched_tokens=max_num_batched_tokens,
-        block_size=block_size,
-        max_blocks=max_blocks,
-    )
-
-    ctx_min = ctx_range[0]
-    max_blocks_per_seq = math.ceil(max_model_len / block_size)
-
-    violations = []
-    for bs, query, ctx in buckets:
-        if ctx > ctx_min and ctx > max_blocks_per_seq * bs:
-            violations.append((bs, query, ctx))
-
-    assert not violations, (f"Found {len(violations)} decode bucket(s) violating "
-                            f"ctx <= ceil(max_model_len/block_size) * bs "
-                            f"(max_blocks_per_seq={max_blocks_per_seq}):\n" +
-                            "\n".join(f"  bs={bs}, query={query}, ctx={ctx}" for bs, query, ctx in violations[:20]))
-
-
-def test_contiguous_pa_decode_buckets_not_filtered_by_ctx(monkeypatch):
-    """For contiguous PA, the ctx filter must NOT be applied to decode buckets.
-
-    Reproduces std_out.txt issue: with max_model_len=2048, block_size=256,
-    max_num_seqs=256, the bucket (256, 1, 2112) was incorrectly filtered
-    because 2112 > ceil(2048/256)*256 = 2048.
-    """
-    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true")
-    clear_config()
-    get_config()
-
-    max_model_len = 2048
-    block_size = 256
-    max_num_seqs = 256
-    max_blocks = 2113
-    max_num_batched_tokens = 1048832
-
-    bs_range = [256]
-    query_range = [1]
-    ctx_range = list(range(1280, 2113, 64))  # 1280, 1344, ..., 2048, 2112
-    ctx_range.append(max_blocks)  # append num_hpu_blocks as done in generate_decode_buckets
-
-    buckets = generate_buckets(
-        bs_range=bs_range,
-        query_range=query_range,
-        ctx_range=ctx_range,
-        is_prompt=False,
-        max_model_len=max_model_len,
-        max_num_seqs=max_num_seqs,
-        max_num_prefill_seqs=1,
-        max_num_batched_tokens=max_num_batched_tokens,
-        block_size=block_size,
-        max_blocks=max_blocks,
-    )
-
-    bucket_ctxs = [ctx for _, _, ctx in buckets]
-    assert 2112 in bucket_ctxs, (f"Bucket ctx=2112 was incorrectly filtered out. "
-                                 f"Max ctx in buckets: {max(bucket_ctxs)}")
-    assert max_blocks in bucket_ctxs, (f"Bucket ctx={max_blocks} (num_hpu_blocks) was incorrectly filtered out.")
-
-
-def test_file_buckets_bypass_filters(monkeypatch):
-    """File-based bucketing (VLLM_BUCKETING_FROM_FILE) skips all filters.
-
-    Buckets (1,1,256) and (2,1,512) would normally be rejected by the
-    batch_size_smaller_than_blocks or ctx filters in non-file mode.
-    Since file buckets bypass filters entirely, all provided buckets
-    must appear in the output unchanged.
-    """
-    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true")
-    clear_config()
-    get_config()
-
-    max_model_len = 2048
-    block_size = 256
-    max_num_seqs = 32
-    max_blocks = 2424
-
-    # (512,1,256) would be rejected by batch_size_smaller_than_blocks (bs > ctx)
-    # All buckets pass through because file_buckets bypass filters entirely
-    file_buckets = [(1, 1, 256), (1, 1, 512), (2, 1, 256), (512, 1, 256), (32, 1, 2424)]
-
-    buckets = generate_buckets(
-        bs_range=[],
-        query_range=[],
-        ctx_range=[],
-        is_prompt=False,
-        max_model_len=max_model_len,
-        max_num_seqs=max_num_seqs,
-        max_num_prefill_seqs=1,
-        max_num_batched_tokens=8192,
-        block_size=block_size,
-        max_blocks=max_blocks,
-        file_buckets=file_buckets,
-    )
-
-    assert set(buckets) == set(file_buckets), (f"All file buckets should pass through unfiltered.\n"
-                                               f"Expected: {sorted(file_buckets)}\nGot: {sorted(buckets)}")
diff --git a/vllm_gaudi/extension/bucketing/common.py b/vllm_gaudi/extension/bucketing/common.py
index 870cdb0ac3..ba4b11a7a4 100644
--- a/vllm_gaudi/extension/bucketing/common.py
+++ b/vllm_gaudi/extension/bucketing/common.py
@@ -447,15 +447,6 @@ def batch_size_smaller_than_blocks(bs, query, ctx):
             omitted_buckets.add(("condition: bs <= ctx, ", "-> bs, query, ctx: ", bs, query, ctx))
         return bs <= ctx
 
-    def num_ctx_tokens_less_or_equal_batched_max_model_len(bs, query, ctx):
-        ctx_min = ctx_range[0] if ctx_range else 0
-        is_valid = ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_min else True
-        if not is_valid:
-            omitted_buckets.add(
-                ("condition: ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_min else True",
-                 "-> bs, query, ctx: ", bs, query, ctx))
-        return is_valid
-
     filters_map = {
         "prompt": {
             # depends only on merged_prefill
@@ -465,7 +456,7 @@ def num_ctx_tokens_less_or_equal_batched_max_model_len(bs, query, ctx):
         "decode": {
             # depends only on contiguous PA
             True: [],
-            False: [batch_size_smaller_than_blocks, num_ctx_tokens_less_or_equal_batched_max_model_len],
+            False: [batch_size_smaller_than_blocks],
         }
     }
 
@@ -491,7 +482,7 @@ def is_ctx_allowed(ctx):
     buckets = set()
     buckets_2d = set()
     omitted_buckets = set()
-    filters = [] if file_buckets else get_filters(is_prompt, use_merged_prefill, use_contiguous_pa)
+    filters = get_filters(is_prompt, use_merged_prefill, use_contiguous_pa)
     corrector = get_corrector(is_prompt, use_contiguous_pa)
 
     if file_buckets:

From a3319301bbcf3d802c1e2392d9a5f08af0ee79bc Mon Sep 17 00:00:00 2001
From: Iryna Boiko <iryna.boiko@intel.com>
Date: Tue, 19 May 2026 08:36:16 +0200
Subject: [PATCH 11/29] fix kernel block size, port of #1439 (#1453)

Signed-off-by: Iryna Boiko <iboiko@habana.ai>
---
 vllm_gaudi/v1/attention/backends/hpu_attn.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm_gaudi/v1/attention/backends/hpu_attn.py b/vllm_gaudi/v1/attention/backends/hpu_attn.py
index 1a36a7d56c..5f6f10acf7 100644
--- a/vllm_gaudi/v1/attention/backends/hpu_attn.py
+++ b/vllm_gaudi/v1/attention/backends/hpu_attn.py
@@ -35,10 +35,11 @@ def get_metadata_cls() -> type["AttentionMetadata"]:
 
     @staticmethod
     def get_supported_kernel_block_sizes() -> list[Union[int, MultipleOf]]:
-        # 128 is the standard HPU kernel block size; 528 is required for
-        # Granite 4.0-H (granitemoehybrid) without prefix caching (16-token
-        # FA alignment), 768 with prefix caching (chunk-aligned).
-        return [128, 528, 768]
+        # 16 is supported for testing/smaller models; 128 is the standard HPU
+        # kernel block size; 528 is required for Granite 4.0-H
+        # (granitemoehybrid) without prefix caching (16-token FA alignment),
+        # 768 with prefix caching (chunk-aligned).
+        return [16, 128, 528, 768]
 
     @classmethod
     def get_preferred_block_size(cls, default_block_size: int) -> int:

From 0e5850678866a3cf2239179a34d357550f51b43e Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
Date: Tue, 19 May 2026 11:06:15 +0200
Subject: [PATCH 12/29] fix: hybrid model warmup block_size mismatch
 (Qwen3.5-35B-A3B) (#1434)

## Problem

For hybrid models (e.g., Qwen3.5-35B-A3B), decode buckets warmed during
startup are later reported as "not warmed-up" during inference. As a
result, every decode step falls back to the `_check_config` warning
path, which can lead to suboptimal performance.

## Root Cause

Two related issues:

### 1. `initialize_kv_cache` overwrites `block_size` with inflated
KV-manager page size

Lines added in main (not present in v0.19.0) in `initialize_kv_cache`:
```python
self.block_size = self.vllm_config.cache_config.block_size
self.bucketing_manager.block_size = self.block_size
```

For hybrid models, `HybridAttentionMambaModelConfig` sets
`cache_config.block_size` to a large aligned page size (e.g., 1152 for
Qwen3.5 with Mamba layers). This overwrites `self.block_size` from 128
to 1152 **after** the HPU platform's `check_and_update_config` had
already reset it to 128.

This causes `generate_buckets()` to produce decode buckets at 1152-token
granularity (max ~10,260 blocks), while `_create_decode_input_data`
computes `num_blocks` using `attn_block_size=128` (max ~92,160 blocks).
The runtime values exceed warmed buckets, triggering "not warmed-up"
warnings.

### 2. `_prepare_dummy_scenario` used wrong block_size for decode

The decode dummy sequence generation used `self.block_size` instead of
`self.attn_block_size`, causing a mismatch with
`_create_decode_input_data` which uses `self.attn_block_size`.

## Fix

1. **Remove the `block_size` overwrite in `initialize_kv_cache`** -
These lines must not be present because `self.block_size` is already set
correctly during `__init__` and must remain at 128 (the HPU kernel block
size) for proper bucket generation. The KV-manager page size (1152) is a
separate concept used for memory allocation, not for bucketing.

2. **Use `self.attn_block_size` in `_prepare_dummy_scenario`** for
decode sequences, matching what `_create_decode_input_data` uses.

## Verification

- Tested on Gaudi3 (HL-325) with Qwen/Qwen3.5-35B-A3B, TP=2, EP=2
- 247 prompt + 117 decode buckets warmed successfully
- Decode bucket range: 1 to 21,858 blocks (correct, using 128-token
granularity)
- Multiple inference requests completed with **zero** "not warmed-up"
warnings
- Server log (537 lines) contains no `_check_config` or warmup mismatch
warnings

## Why v0.19.0 worked

The `initialize_kv_cache` method in v0.19.0 did **not** have the
`self.block_size = self.vllm_config.cache_config.block_size` lines, so
`block_size` stayed at 128 throughout the lifecycle.

Signed-off-by: Agata Dobrzyniewicz <agata.dobrzyniewicz@intel.com>
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 27 ++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index bf5b50293b..6aa12158c5 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -5288,11 +5288,17 @@ def _prepare_dummy_scenario(self, prompt_cfg, decode_cfg):
                                             is_prompt=True)
         if decode_cfg:
             decode_bs, decode_query_len, decode_num_blocks = decode_cfg
+            # Use attn_block_size (the actual kernel block granularity used in
+            # _create_decode_input_data) rather than block_size (the KV-manager
+            # page size).  For hybrid models these differ after
+            # initialize_kv_cache aligns attn page size to mamba page size,
+            # causing warmup to record wrong num_blocks otherwise.
+            decode_block_size = self.attn_block_size
             if self.use_contiguous_pa:
-                decode_seq_lengths = [self.block_size] * decode_bs
+                decode_seq_lengths = [decode_block_size] * decode_bs
                 block_id = decode_num_blocks - 1
             else:
-                decode_seq_lengths = self._generate_seq_lengths(decode_bs, decode_num_blocks, self.block_size)
+                decode_seq_lengths = self._generate_seq_lengths(decode_bs, decode_num_blocks, decode_block_size)
                 block_id = 0
             for dsl in decode_seq_lengths:
                 self._add_dummy_request(requests,
@@ -5901,10 +5907,19 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         self.is_encoder_only_attn = False
         self.may_add_encoder_only_layers_to_kv_cache_config()
         if self.num_mamba_like_layers > 0:
-            # Reassign block size for hybrid models after platform.py alignments
-            self.block_size = self.vllm_config.cache_config.block_size
-            if self.enable_bucketing:
-                self.bucketing_manager.block_size = self.block_size
+            # NOTE: Do NOT reassign self.block_size or
+            # bucketing_manager.block_size from cache_config here.
+            # For hybrid models the upstream HybridAttentionMambaModelConfig
+            # inflates cache_config.block_size to align mamba pages (e.g.
+            # 1152 for Qwen3.5), but the HPU attention kernel operates at
+            # 128-token granularity.  _create_decode_input_data computes
+            # num_blocks using self.attn_block_size (set below from
+            # prepare_kernel_block_sizes), so the bucketing manager must
+            # also use that same granularity.  Overwriting block_size with
+            # the inflated KV-manager page size caused decode buckets to be
+            # generated at 1152-token granularity while runtime used
+            # 128-token granularity, leading to permanent "not warmed-up"
+            # warnings and recompilations.
             maybe_set_mamba_kv_cache_groups_ids(self.model, self.kv_cache_config)
         self.initialize_attn_backend(kv_cache_config)
 

From d999b2e0ab2db13c5b495bda629205da89523077 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Smyrek?= <radoslawx.smyrek@intel.com>
Date: Tue, 19 May 2026 12:37:19 +0200
Subject: [PATCH 13/29] Add Qwen3NextForCausalLM to mamba_like_arch (#1450)

Qwen3Next uses a hybrid GDN+attention architecture that requires
separate KV cache groups for GDN vs standard attention layers. Add it to
the mamba_like_arch list so maybe_set_mamba_kv_cache_groups_ids() sets
up the cache groups correctly.

Signed-off-by: Radoslaw Smyrek <radoslawx.smyrek@intel.com>
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 6aa12158c5..664d88ef03 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -543,7 +543,8 @@ def maybe_set_mamba_kv_cache_groups_ids(model, kv_cache_config: KVCacheConfig):
         model = model.model
 
     mamba_like_arch = [
-        "GraniteMoeHybridForCausalLM", "Qwen3_5MoeForConditionalGeneration", "Qwen3_5ForConditionalGeneration"
+        "GraniteMoeHybridForCausalLM", "Qwen3_5MoeForConditionalGeneration", "Qwen3_5ForConditionalGeneration",
+        "Qwen3NextForCausalLM"
     ]
     if not any(arch in getattr(model.config, 'architectures', []) for arch in mamba_like_arch):
         return

From c0a59cf2734267cc49641c3e662e8aed39028f62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Olejniczak?= <pawelx.olejniczak@intel.com>
Date: Tue, 19 May 2026 13:01:06 +0200
Subject: [PATCH 14/29] 
 [FIX_FOR_VLLM_CUSTOM=dcacdf9a8860a86401127d1c8f93ebf3cfbfd026] Fix
 MultiModelEngineClient, Qwen3.5 compilation, and EPLB refactoring (#1436)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix upstream regressions affecting hourly CI:

1. **MultiModelEngineClient**: Added missing
`notify_kv_transfer_request_rejected` abstract method (upstream PR
vllm-project/vllm#41269)
2. **Qwen3.5 test harness**: Updated `test_common.py` to read
`enforce_eager` from model card config (with env var override), enabling
per-model compilation control
3. **EPLB refactoring**: Removed `EMPTY_EPLB_STATE` import and
`enable_eplb` parameter from `patched_create_fused_moe_router` after
upstream MoE refactor (upstream PR vllm-project/vllm#41055)

Note: The `enforce_eager: true` workaround for Qwen3.5 compilation has
been removed — the root cause (mamba_type str-vs-Enum comparison in
hybrid cache allocation) is properly fixed by #1449, which should merge
first.

Verified on HPU: unit tests pass on Gaudi 3 (MoE, FP8, compressed
tensors).

---------

Signed-off-by: Paweł Olejniczak <pawelx.olejniczak@intel.com>
Signed-off-by: Pawel Olejniczak <pawelx.olejniczak@intel.com>
Co-authored-by: Iryna Boiko <iryna.boiko@intel.com>
---
 .../models/language/generation/test_common.py | 103 +++++++++---------
 .../openai/multi_model_api_server.py          |  37 +++++--
 vllm_gaudi/ops/hpu_fused_moe.py               |  12 +-
 3 files changed, 82 insertions(+), 70 deletions(-)

diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index da03369475..f9a2001da7 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -9,62 +9,66 @@
 
 
 def launch_lm_eval(eval_config):
-    trust_remote_code = eval_config.get('trust_remote_code', False)
-    dtype = eval_config.get('dtype', 'bfloat16')
-    max_num_seqs = eval_config.get('max_num_seqs', 128)
-    tp_size = int(os.environ.get('TP_SIZE', '1'))
-    enable_apc = os.environ.get('ENABLE_APC', 'True').lower() in ['true', '1']
-    enforce_eager = os.environ.get('ENFORCE_EAGER', 'False').lower() in ['true', '1']
-    kv_cache_dtype = os.environ.get('KV_CACHE_DTYPE', None)
-    task = eval_config.get('tasks', 'gsm8k')
-    async_scheduling = os.environ.get('ASYNC_SCHEDULING', 'False').lower() in ['true', '1']
-    max_model_len = eval_config.get('max_model_len', 4096)
-    batch_size = eval_config.get('batch_size', 'auto')
+    trust_remote_code = eval_config.get("trust_remote_code", False)
+    dtype = eval_config.get("dtype", "bfloat16")
+    max_num_seqs = eval_config.get("max_num_seqs", 128)
+    tp_size = int(os.environ.get("TP_SIZE", "1"))
+    enable_apc = os.environ.get("ENABLE_APC", "True").lower() in ["true", "1"]
+    enforce_eager = eval_config.get("enforce_eager", False)
+    if "ENFORCE_EAGER" in os.environ:
+        enforce_eager = os.environ["ENFORCE_EAGER"].lower() in ["true", "1"]
+    kv_cache_dtype = os.environ.get("KV_CACHE_DTYPE", None)
+    task = eval_config.get("tasks", "gsm8k")
+    async_scheduling = os.environ.get("ASYNC_SCHEDULING", "False").lower() in ["true", "1"]
+    max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
     model_args = {
-        'pretrained': eval_config['model_name'],
-        'tensor_parallel_size': tp_size,
-        'async_scheduling': async_scheduling,
-        'enforce_eager': enforce_eager,
-        'enable_prefix_caching': enable_apc,
-        'dtype': dtype,
-        'max_model_len': max_model_len,
-        'max_num_seqs': max_num_seqs,
-        'trust_remote_code': trust_remote_code,
-        'batch_size': batch_size,
-        'enable_expert_parallel': eval_config.get('enable_expert_parallel', False),
-        'chat_template_args': eval_config.get('chat_template_args', {}),
-        'seed': eval_config.get('seed', 42),
+        "pretrained": eval_config["model_name"],
+        "tensor_parallel_size": tp_size,
+        "async_scheduling": async_scheduling,
+        "enforce_eager": enforce_eager,
+        "enable_prefix_caching": enable_apc,
+        "dtype": dtype,
+        "max_model_len": max_model_len,
+        "max_num_seqs": max_num_seqs,
+        "trust_remote_code": trust_remote_code,
+        "batch_size": batch_size,
+        "enable_expert_parallel": eval_config.get("enable_expert_parallel", False),
+        "chat_template_args": eval_config.get("chat_template_args", {}),
+        "seed": eval_config.get("seed", 42),
     }
     if kv_cache_dtype is not None:
-        model_args['kv_cache_dtype'] = kv_cache_dtype
+        model_args["kv_cache_dtype"] = kv_cache_dtype
 
-    if eval_config.get('gpu_memory_utilization') is not None:
-        model_args['gpu_memory_utilization'] = eval_config['gpu_memory_utilization']
-    if eval_config.get('reasoning_parser') is not None:
-        model_args['reasoning_parser'] = eval_config['reasoning_parser']
-    if eval_config.get('max_num_batched_tokens') is not None:
-        model_args['max_num_batched_tokens'] = eval_config['max_num_batched_tokens']
+    if eval_config.get("gpu_memory_utilization") is not None:
+        model_args["gpu_memory_utilization"] = eval_config["gpu_memory_utilization"]
+    if eval_config.get("reasoning_parser") is not None:
+        model_args["reasoning_parser"] = eval_config["reasoning_parser"]
+    if eval_config.get("max_num_batched_tokens") is not None:
+        model_args["max_num_batched_tokens"] = eval_config["max_num_batched_tokens"]
 
     if eval_config.get("inc"):
-        assert os.environ.get('QUANT_CONFIG', None), "must set QUANT_CONFIG environment variable for using INC"
-        model_args['quantization'] = 'inc'  # for both calibration and quantization
+        assert os.environ.get("QUANT_CONFIG", None), "must set QUANT_CONFIG environment variable for using INC"
+        model_args["quantization"] = "inc"  # for both calibration and quantization
         if eval_config.get("fp8"):  # for quantization in fp8
-            model_args['kv_cache_dtype'] = 'fp8_inc'
+            model_args["kv_cache_dtype"] = "fp8_inc"
 
     kwargs = {}
-    if 'fewshot_as_multiturn' in eval_config:
-        kwargs['fewshot_as_multiturn'] = eval_config['fewshot_as_multiturn']
-    if 'apply_chat_template' in eval_config:
-        kwargs['apply_chat_template'] = eval_config['apply_chat_template']
-    if eval_config.get('max_gen_toks') is not None:
-        kwargs['gen_kwargs'] = f"max_gen_toks={eval_config['max_gen_toks']}"
+    if "fewshot_as_multiturn" in eval_config:
+        kwargs["fewshot_as_multiturn"] = eval_config["fewshot_as_multiturn"]
+    if "apply_chat_template" in eval_config:
+        kwargs["apply_chat_template"] = eval_config["apply_chat_template"]
+    if eval_config.get("max_gen_toks") is not None:
+        kwargs["gen_kwargs"] = f"max_gen_toks={eval_config['max_gen_toks']}"
     llm = VLLM(**model_args)
-    results = lm_eval.simple_evaluate(model=llm,
-                                      tasks=[task],
-                                      num_fewshot=eval_config["num_fewshot"],
-                                      limit=eval_config["limit"],
-                                      batch_size="auto",
-                                      **kwargs)
+    results = lm_eval.simple_evaluate(
+        model=llm,
+        tasks=[task],
+        num_fewshot=eval_config["num_fewshot"],
+        limit=eval_config["limit"],
+        batch_size="auto",
+        **kwargs,
+    )
     del llm
     gc.collect()
 
@@ -75,11 +79,11 @@ def test_models(model_card_path, monkeypatch) -> None:
     with open(model_card_path) as f:
         model_card = yaml.safe_load(f)
     print(f"{model_card=}")
-    model_config = model_card['model_card']
+    model_config = model_card["model_card"]
     results = launch_lm_eval(model_config)
     RTOL = 0.03
-    metric = model_card['metrics']
-    task = model_config['tasks']
+    metric = model_card["metrics"]
+    task = model_config["tasks"]
     try:
         measured_value = results["results"][task][metric["name"]]
     except KeyError as e:
@@ -100,6 +104,7 @@ def __main__(args):
 
 if __name__ == "__main__":
     import argparse
+
     parser = argparse.ArgumentParser(description="Test vLLM models with lm-eval")
     parser.add_argument("--model_card_path", type=str, required=True, help="Path to the model card YAML file.")
     args = parser.parse_args()
diff --git a/vllm_gaudi/entrypoints/openai/multi_model_api_server.py b/vllm_gaudi/entrypoints/openai/multi_model_api_server.py
index 5b628aed52..c57c02cc2c 100644
--- a/vllm_gaudi/entrypoints/openai/multi_model_api_server.py
+++ b/vllm_gaudi/entrypoints/openai/multi_model_api_server.py
@@ -10,7 +10,7 @@
 from contextlib import asynccontextmanager
 from collections.abc import AsyncIterator
 from dataclasses import dataclass
-from typing import NamedTuple
+from typing import Any, NamedTuple
 
 import uvloop
 import yaml
@@ -149,6 +149,19 @@ async def sleep(self, level: int = 1, mode: str = "abort") -> None:
     async def wake_up(self, tags: list[str] | None = None) -> None:
         await self._engine.wake_up(tags=tags)
 
+    async def notify_kv_transfer_request_rejected(
+        self,
+        request_id: str,
+        kv_transfer_params: dict[str, Any],
+        *,
+        data_parallel_rank: int | None = None,
+    ) -> None:
+        await self._engine.notify_kv_transfer_request_rejected(
+            request_id=request_id,
+            kv_transfer_params=kv_transfer_params,
+            data_parallel_rank=data_parallel_rank,
+        )
+
     async def is_sleeping(self) -> bool:
         return await self._engine.is_sleeping()
 
@@ -299,7 +312,7 @@ def _resolve_frontend_settings(
                                if model_overrides.enable_auto_tool_choice is not None else args.enable_auto_tool_choice)
     tool_call_parser = (model_overrides.tool_call_parser
                         if model_overrides.tool_call_parser is not None else args.tool_call_parser)
-    chat_template = (model_overrides.chat_template if model_overrides.chat_template is not None else args.chat_template)
+    chat_template = model_overrides.chat_template if model_overrides.chat_template is not None else args.chat_template
     return FrontendSettings(
         enable_auto_tool_choice=enable_auto_tool_choice,
         tool_call_parser=tool_call_parser,
@@ -321,7 +334,7 @@ def _validate_model_frontend_overrides(
         if effective_enable_auto and not effective_tool_parser:
             raise ValueError(f"Model '{model_name}' enables auto tool choice but no tool_call_parser is set.")
 
-        if (effective_enable_auto and effective_tool_parser and effective_tool_parser not in valid_tool_parsers):
+        if effective_enable_auto and effective_tool_parser and effective_tool_parser not in valid_tool_parsers:
             raise ValueError(f"Model '{model_name}' has invalid tool_call_parser='{effective_tool_parser}'. "
                              f"Valid options: {valid_tool_parsers}")
 
@@ -392,8 +405,7 @@ def _load_multi_model_config(path: str, ) -> MultiModelConfigLoadResult:
     if default_model is None:
         default_model = next(iter(model_configs.keys()))
     if default_model not in model_configs:
-        raise ValueError(f"Default model '{default_model}' not found in config models: "
-                         f"{list(model_configs.keys())}")
+        raise ValueError(f"Default model '{default_model}' not found in config models: {list(model_configs.keys())}")
 
     return MultiModelConfigLoadResult(
         model_configs=model_configs,
@@ -418,8 +430,13 @@ async def build_multi_model_engine_client(
     args: Namespace,
     *,
     usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
-) -> AsyncIterator[tuple[MultiModelEngineClient, MultiModelAsyncLLM, dict[str, BaseModelPath], dict[str, int], dict[
-        str, ModelFrontendOverrides]]]:
+) -> AsyncIterator[tuple[
+        MultiModelEngineClient,
+        MultiModelAsyncLLM,
+        dict[str, BaseModelPath],
+        dict[str, int],
+        dict[str, ModelFrontendOverrides],
+]]:
     config_path = _resolve_multi_model_config_path()
     if not config_path:
         raise ValueError("A multi-model config path must be set when multi-model mode is enabled. "
@@ -504,7 +521,8 @@ async def _init_multi_model_state(
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
         default_chat_template_kwargs=args.default_chat_template_kwargs,
-        trust_request_chat_template=args.trust_request_chat_template)
+        trust_request_chat_template=args.trust_request_chat_template,
+    )
 
     if "generate" in supported_tasks:
         from vllm.entrypoints.openai.generate.api_router import init_generate_state
@@ -537,8 +555,7 @@ async def _init_multi_model_state(
 
 def _attach_multi_model_router(app: FastAPI) -> None:
     if not envs.VLLM_SERVER_DEV_MODE:
-        logger.warning("The /v1/models/switch endpoint is disabled. "
-                       "Set VLLM_SERVER_DEV_MODE=1 to enable it.")
+        logger.warning("The /v1/models/switch endpoint is disabled. Set VLLM_SERVER_DEV_MODE=1 to enable it.")
         return
 
     router = APIRouter()
diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py
index 21db0b541a..86a10bdebc 100644
--- a/vllm_gaudi/ops/hpu_fused_moe.py
+++ b/vllm_gaudi/ops/hpu_fused_moe.py
@@ -22,8 +22,6 @@
     FusedTopKRouter, )
 from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import (
     GroupedTopKRouter, )
-from vllm.model_executor.layers.fused_moe.router.router_factory import (
-    EMPTY_EPLB_STATE, )
 from vllm.model_executor.layers.fused_moe.router.routing_simulator_router import (
     RoutingSimulatorRouter, )
 from vllm.model_executor.layers.fused_moe.router.zero_expert_router import (
@@ -388,8 +386,7 @@ def create_fused_moe_router(
     # custom routing parameters
     custom_routing_function: Callable | None = None,
     # eplb parameters
-    enable_eplb: bool = False,
-    eplb_state: EplbLayerState = EMPTY_EPLB_STATE,
+    eplb_state: EplbLayerState | None = None,
     # zero expert parameters
     zero_expert_type: str | None = None,
     num_logical_experts: int | None = None,
@@ -428,7 +425,6 @@ def create_fused_moe_router(
         custom_routing_function: Optional custom routing function
 
     EPLB arguments:
-        enable_eplb: Whether EPLB is enabled
         eplb_state: EPLB (Expert Parallelism Load Balancing) state
 
     Zero expert arguments:
@@ -451,7 +447,6 @@ def create_fused_moe_router(
             top_k=top_k,
             global_num_experts=global_num_experts,
             eplb_state=eplb_state,
-            enable_eplb=enable_eplb,
             indices_type_getter=indices_type_getter,
         )
 
@@ -468,7 +463,6 @@ def create_fused_moe_router(
             scoring_func=scoring_func,
             renormalize=renormalize,
             routed_scaling_factor=routed_scaling_factor,
-            enable_eplb=enable_eplb,
             indices_type_getter=indices_type_getter,
         )
 
@@ -487,7 +481,6 @@ def create_fused_moe_router(
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
             num_fused_shared_experts=num_fused_shared_experts,
-            enable_eplb=enable_eplb,
             indices_type_getter=indices_type_getter,
         )
         return grouped_topk_router
@@ -499,7 +492,6 @@ def create_fused_moe_router(
             eplb_state=eplb_state,
             custom_routing_function=custom_routing_function,
             renormalize=renormalize,
-            enable_eplb=enable_eplb,
             indices_type_getter=indices_type_getter,
         )
 
@@ -514,7 +506,6 @@ def create_fused_moe_router(
             scoring_func=scoring_func,
             renormalize=renormalize,
             routed_scaling_factor=routed_scaling_factor,
-            enable_eplb=enable_eplb,
             indices_type_getter=indices_type_getter,
             hash_indices_table=hash_indices_table,
         )
@@ -525,7 +516,6 @@ def create_fused_moe_router(
         eplb_state=eplb_state,
         renormalize=renormalize,
         scoring_func=scoring_func,
-        enable_eplb=enable_eplb,
         indices_type_getter=indices_type_getter,
     )
 

From 8c5008eb02fe94c7fddc6ad94f3ca36d7ded362c Mon Sep 17 00:00:00 2001
From: Harish Subramony <harish.subramony@intel.com>
Date: Tue, 19 May 2026 06:46:56 -0700
Subject: [PATCH 15/29] Fix patch_hf3fs_mock_client_for_cpu_only (#1439)

1) Supported block sizes (added in
https://github.com/vllm-project/vllm-gaudi/pull/1453):
16 is supported for testing/smaller models; 128 is the standard HPU
kernel block size; 528 is required for Granite 4.0-H
(granitemoehybrid) without prefix caching (16-token FA alignment),
and 768 with prefix caching (chunk-aligned).

2) _patch_hf3fs_mock_client_for_cpu_only
The upstream mock client unconditionally calls
``torch.cuda.current_stream().wait_event(event)`` in ``batch_write``.
In environments where PyTorch is not compiled with CUDA, that call
throws and the method returns ``-1`` for writes, causing connector
unit tests to fail. This patch keeps the same write behavior but skips
CUDA synchronization when CUDA is unavailable.

---------

Signed-off-by: Harish Subramony <harish.subramony@intel.com>
Co-authored-by: Iryna Boiko <iryna.boiko@intel.com>
---
 vllm_gaudi/patches.py | 92 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 70 insertions(+), 22 deletions(-)

diff --git a/vllm_gaudi/patches.py b/vllm_gaudi/patches.py
index ac3931eb73..c9199e9332 100644
--- a/vllm_gaudi/patches.py
+++ b/vllm_gaudi/patches.py
@@ -29,9 +29,7 @@
   trigger a heavy import chain that interferes with platform initialisation.
 """
 
-import functools
 import gc
-from unittest.mock import MagicMock, patch
 
 import torch
 
@@ -53,6 +51,69 @@ def _hpu_accelerator_empty_cache() -> None:
         empty_cache()
 
 
+def _patch_hf3fs_mock_client_for_cpu_only() -> None:
+    """Patch HF3FS mock client to avoid CUDA stream waits on CPU-only builds.
+
+    Upstream mock client unconditionally calls
+    ``torch.cuda.current_stream().wait_event(event)`` in ``batch_write``.
+    In environments where PyTorch is not compiled with CUDA, that path throws
+    and the method returns ``-1`` for writes, causing connector unit tests to
+    fail. This patch keeps the same behavior but skips CUDA synchronization when
+    CUDA is unavailable.
+    """
+    try:
+        from vllm.distributed.kv_transfer.kv_connector.v1.hf3fs.utils import hf3fs_mock_client as _mock_mod
+    except Exception:
+        # Keep plugin load resilient if the module path changes or is missing.
+        return
+
+    client_cls = getattr(_mock_mod, "Hf3fsClient", None)
+    if client_cls is None:
+        return
+
+    original_batch_write = getattr(client_cls, "batch_write", None)
+    if original_batch_write is None:
+        return
+
+    if getattr(original_batch_write, "_vllm_gaudi_cpu_safe_patch", False):
+        return
+
+    def _batch_write_cpu_safe(self, offsets, tensors, event):
+        if torch.cuda.is_available():
+            return original_batch_write(self, offsets, tensors, event)
+
+        results = []
+        try:
+            data_bytes_list = [self._tensor_to_bytes(tensor) for tensor in tensors]
+
+            with open(self._file_path, "r+b") as f:
+                for offset, data_bytes in zip(offsets, data_bytes_list):
+                    if offset < 0 or offset + len(data_bytes) > self._size:
+                        results.append(-1)
+                        continue
+
+                    f.seek(offset)
+                    bytes_written = f.write(data_bytes)
+
+                    if bytes_written == len(data_bytes) == self._bytes_per_page:
+                        results.append(self._bytes_per_page)
+                    else:
+                        _mock_mod.logger.error(
+                            "Write size mismatch: wrote %d, expected %d",
+                            bytes_written,
+                            self._bytes_per_page,
+                        )
+                        results.append(-1)
+        except Exception as e:
+            _mock_mod.logger.error("Batch write error: %s", e)
+            results.extend([-1] * (len(offsets) - len(results)))
+
+        return results
+
+    _batch_write_cpu_safe._vllm_gaudi_cpu_safe_patch = True  # type: ignore[attr-defined]
+    client_cls.batch_write = _batch_write_cpu_safe
+
+
 def _hpu_cleanup_dist_env_and_memory(shutdown_ray: bool = False) -> None:
     """HPU-safe replacement for ``cleanup_dist_env_and_memory``.
 
@@ -61,6 +122,9 @@ def _hpu_cleanup_dist_env_and_memory(shutdown_ray: bool = False) -> None:
     ``torch.accelerator.empty_cache()`` (which is incompatible with the
     HPU allocator).
     """
+    # Re-apply lazy runtime patches that may depend on import timing.
+    _patch_hf3fs_mock_client_for_cpu_only()
+
     # Reset environment variable cache
     envs.disable_envs_cache()
     # Ensure all objects are not frozen before cleanup
@@ -123,6 +187,7 @@ def apply() -> None:
     import vllm.distributed as _vllm_distributed
 
     _vllm_distributed.cleanup_dist_env_and_memory = _hpu_cleanup_dist_env_and_memory
+    _patch_hf3fs_mock_client_for_cpu_only()
 
     # --- batched_count_greater_than (deferred) ---
     # We cannot import the sampler modules here because the import chain
@@ -146,29 +211,12 @@ def patch_hf3fs_mock_client():
 
     The upstream mock client's ``batch_write`` unconditionally calls
     ``torch.cuda.current_stream().wait_event(event)``, which raises
-    ``RuntimeError`` on platforms without CUDA (e.g. HPU).  We wrap
-    ``batch_write`` to stub ``torch.cuda.current_stream`` with a no-op
-    mock for the duration of the call.
+    ``RuntimeError`` on platforms without CUDA (e.g. HPU). This helper
+    installs the CPU-safe replacement for ``batch_write``.
 
     Called from ``register_utils()`` (general plugin) rather than
     ``apply()`` (platform plugin) to avoid circular imports — the mock
     client transitively imports ``vllm.config`` which is not yet fully
     initialized during platform registration.
     """
-    if torch.cuda.is_available():
-        return
-
-    try:
-        from vllm.distributed.kv_transfer.kv_connector.v1.hf3fs.utils import (
-            hf3fs_mock_client, )
-    except ImportError:
-        return
-
-    _orig_batch_write = hf3fs_mock_client.Hf3fsClient.batch_write
-
-    @functools.wraps(_orig_batch_write)
-    def _safe_batch_write(self, offsets, tensors, event):
-        with patch("torch.cuda.current_stream", return_value=MagicMock()):
-            return _orig_batch_write(self, offsets, tensors, event)
-
-    hf3fs_mock_client.Hf3fsClient.batch_write = _safe_batch_write
+    _patch_hf3fs_mock_client_for_cpu_only()

From a4150f5647baa3654aff2df427e42da13242484a Mon Sep 17 00:00:00 2001
From: Bartosz Myrcha <bartosz.myrcha@intel.com>
Date: Tue, 19 May 2026 21:24:30 +0200
Subject: [PATCH 16/29] Increase timeout from default 6h to 12h (#1464)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This pull request updates the `.github/workflows/pre-merge.yaml`
workflow configuration to add an explicit `timeout-minutes: 720`
(12 hours) limit to all jobs, raising the per-job limit from the 6-hour
default. Every job in the pre-merge workflow keeps a bounded runtime, so
stuck or runaway jobs are still cancelled, which keeps the pipeline
reliable.

**CI/CD Workflow Improvements:**

* Added `timeout-minutes: 720` to all jobs in
`.github/workflows/pre-merge.yaml` to enforce a 12-hour maximum runtime
per job. This applies to jobs such as `retrieve_head_sha`, `gatekeeper`,
`discover_runner`, `discover_tests`, `discover_calibration_tests`, test
execution jobs, and finalization/cleanup jobs.

No other logic or behavior changes were made—this is a
configuration-only update to improve CI robustness.

Signed-off-by: Bartosz Myrcha <bartosz.myrcha@intel.com>
---
 .github/workflows/pre-merge.yaml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/.github/workflows/pre-merge.yaml b/.github/workflows/pre-merge.yaml
index 64336abd07..7b782c7e71 100644
--- a/.github/workflows/pre-merge.yaml
+++ b/.github/workflows/pre-merge.yaml
@@ -29,6 +29,7 @@ concurrency:
 jobs:
   retrieve_head_sha:
     runs-on: ubuntu-latest
+    timeout-minutes: 720
     outputs:
       head_sha: ${{ steps.set_sha.outputs.head_sha }}
     steps:
@@ -40,6 +41,7 @@ jobs:
   gatekeeper:
     needs: retrieve_head_sha
     runs-on: ubuntu-latest
+    timeout-minutes: 720
     permissions:
       # Required to read the status of checks and PR details
       checks: read
@@ -136,6 +138,7 @@ jobs:
   discover_runner:
     needs: gatekeeper
     runs-on: ${{ inputs.use_hourly_runner == 'true' && 'hourly-ci' || 'pr-ci' }}
+    timeout-minutes: 720
     outputs:
       runner_name: ${{ steps.get_name.outputs.name }}
     steps:
@@ -150,6 +153,7 @@ jobs:
     needs: [discover_runner, retrieve_head_sha]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
@@ -180,6 +184,7 @@ jobs:
   discover_calibration_tests:
     needs: [discover_runner, retrieve_head_sha]
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
@@ -207,6 +212,7 @@ jobs:
     # This job runs in parallel with the build job
     needs: [gatekeeper, retrieve_head_sha]
     runs-on: ubuntu-latest
+    timeout-minutes: 720
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -235,6 +241,7 @@ jobs:
     if: inputs.skip_tests != 'true'
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     permissions:
       contents: read # Required to checkout code and read history
     outputs:
@@ -354,6 +361,7 @@ jobs:
     needs: [pre_merge_hpu_test_build, discover_runner, retrieve_head_sha]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     steps:
       - name: Run pytest in tests/unit_tests
         run: |
@@ -378,6 +386,7 @@ jobs:
     needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner, retrieve_head_sha]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     steps:
       - name: Run test scripts
         run: |
@@ -408,6 +417,7 @@ jobs:
     needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner, retrieve_head_sha]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     steps:
       - name: Run test scripts
         run: |
@@ -433,6 +443,7 @@ jobs:
     needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner, retrieve_head_sha]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     steps:
       - name: Run test scripts
         run: |
@@ -459,6 +470,7 @@ jobs:
     needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_tests, discover_runner, retrieve_head_sha]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     strategy:
       fail-fast: false
       matrix:
@@ -491,6 +503,7 @@ jobs:
   calibration_tests:
     needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_calibration_tests, discover_runner, retrieve_head_sha]
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     strategy:
       fail-fast: false
       matrix:
@@ -522,6 +535,7 @@ jobs:
   calibration_arg_parsing_tests:
     needs: [pre_merge_hpu_test_build, discover_runner, retrieve_head_sha]
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     steps:
       - name: Run calibration arg parsing tests
         run: |
@@ -544,6 +558,7 @@ jobs:
     needs: [retrieve_head_sha]
     if: inputs.is_merge_group != 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 720
     outputs:
       nixl_changed: ${{ steps.check.outputs.nixl_changed }}
     steps:
@@ -571,6 +586,7 @@ jobs:
     needs: [check_dockerfile_changes, discover_runner, retrieve_head_sha]
     if: needs.check_dockerfile_changes.outputs.nixl_changed == 'true'
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -595,6 +611,7 @@ jobs:
     needs: [hpu_unit_tests, e2e, hpu_perf_tests, calibration_tests, calibration_arg_parsing_tests, discover_runner]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     # This job is required to pass for pre-merge CI. By itself it does nothing, and will only pass if all jobs specified in "needs" list pass.
     steps:
       - name: Succeeded if all previous jobs passed
@@ -605,6 +622,7 @@ jobs:
     # This job runs after hpu-test-suite completes
     needs: [pre_merge_hpu_test, pre_merge_hpu_test_build]
     runs-on: ubuntu-latest
+    timeout-minutes: 720
     permissions:
       # Permissions are required on a per-job basis
       pull-requests: write
@@ -624,6 +642,7 @@ jobs:
     if: always()
     needs: [discover_runner, hpu_unit_tests, hpu_pd_tests, hpu_perf_tests, hpu_dp_tests, e2e, calibration_tests, calibration_arg_parsing_tests]
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    timeout-minutes: 720
     steps:
       - name: Remove Docker image to free up space
         env:

From 56474c1fbeeb5089ed76007849552ec586029b48 Mon Sep 17 00:00:00 2001
From: Iryna Boiko <iryna.boiko@intel.com>
Date: Wed, 20 May 2026 09:45:21 +0200
Subject: [PATCH 17/29] Removal of ray and redundant transformers packages from
 gaudi requirements (#1445)

Signed-off-by: Iryna Boiko <iboiko@habana.ai>
---
 requirements.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 77b97ce24d..c14e18a0f1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,7 @@
 # Dependencies for HPU code
-ray>=2.48.0
 pandas>=2.2.3
 numba>=0.58.0
 numpy>=1.26.0
-transformers >= 4.56.0, != 5.0.*, != 5.1.*, != 5.2.*, != 5.3.*, != 5.4.*, != 5.5.0, != 5.6.*
 kaldi-native-fbank >= 1.18.7
 decord >= 0.6.0
 tblib==3.1.0

From bb9ca736b5398842f3449a2d37b90e5584605f22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Olejniczak?= <pawelx.olejniczak@intel.com>
Date: Thu, 21 May 2026 09:58:47 +0200
Subject: [PATCH 18/29] 
 [FIX_FOR_VLLM_CUSTOM=a78b842d0e85d287176031334f4721cd96b6e47d] Fix
 offloading_connector test flush assertion for load transfers (#1468)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Upstream vLLM PR vllm-project/vllm#42611 ("Flush all pending jobs on
last step") changed `get_flushed_transfers()` to return both store and
load flushes. The vllm-gaudi copy of the offloading_connector unit tests
assumed only store flushes, causing:

1. `AssertionError` in `utils.py` `_parse_transfers`
(`isinstance(src_spec, GPULoadStoreSpec)` assert fails on load
flushes)
2. `flushed_gpu_block_indexes` mismatch in `test_scheduler` tests

**Fix**: Mirror the upstream change — replace the assert with an
`if/else` handling both store and load flush types, and add
`expected_flushed_gpu_block_indexes` to affected tests.

Signed-off-by: Paweł Olejniczak <pawelx.olejniczak@intel.com>
---
 .../offloading_connector/test_scheduler.py           |  8 ++++++++
 .../kv_offload/offloading_connector/utils.py         | 12 ++++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/tests/unit_tests/kv_offload/offloading_connector/test_scheduler.py b/tests/unit_tests/kv_offload/offloading_connector/test_scheduler.py
index e899906e87..3b75ce2394 100644
--- a/tests/unit_tests/kv_offload/offloading_connector/test_scheduler.py
+++ b/tests/unit_tests/kv_offload/offloading_connector/test_scheduler.py
@@ -210,9 +210,12 @@ def test_concurrent_lookups_of_the_same_prefix(request_runner, async_scheduling:
     # store 1 blocks
     runner.new_request(token_ids=[0] * offloaded_block_size)
     runner.manager.prepare_store.side_effect = (lambda block_hashes, req_context: generate_store_output(block_hashes))
+    # With sync scheduling, all-finished flush fires within this run.
+    # With async scheduling, the finish is delayed so flush fires later.
     runner.run(
         decoded_tokens=[EOS_TOKEN_ID],
         expected_stored_gpu_block_indexes=(0, 1, 2),
+        expected_flushed_gpu_block_indexes=(0, 1, 2) if not async_scheduling else (),
     )
 
     # start a request to load the first block, but don't complete
@@ -249,6 +252,9 @@ def test_concurrent_lookups_of_the_same_prefix(request_runner, async_scheduling:
     # second request will use the GPU prefix cache
     assert transfer_jobs == list(runner.offloading_spec.handler.transfer_specs)
 
+    # Fence index drained: stores completed before request_finished ran.
+    assert runner.connector_scheduler._block_id_to_pending_jobs == {}
+
 
 @pytest.mark.parametrize("async_scheduling", [True, False])
 def test_abort_loading_requests(request_runner, async_scheduling: bool):
@@ -269,6 +275,7 @@ def test_abort_loading_requests(request_runner, async_scheduling: bool):
     runner.run(
         decoded_tokens=[EOS_TOKEN_ID],
         expected_stored_gpu_block_indexes=(0, 1, 2),
+        expected_flushed_gpu_block_indexes=(0, 1, 2) if not async_scheduling else (),
     )
 
     # start a request to load the first block, but don't complete
@@ -295,6 +302,7 @@ def test_abort_loading_requests(request_runner, async_scheduling: bool):
     runner.run(
         decoded_tokens=[],
         expected_loaded_gpu_block_indexes=(0, 1, 2),
+        expected_flushed_gpu_block_indexes=(0, 1, 2),
     )
 
     # assert request is deleted
diff --git a/tests/unit_tests/kv_offload/offloading_connector/utils.py b/tests/unit_tests/kv_offload/offloading_connector/utils.py
index aab2a5c64f..91c17b9912 100644
--- a/tests/unit_tests/kv_offload/offloading_connector/utils.py
+++ b/tests/unit_tests/kv_offload/offloading_connector/utils.py
@@ -286,10 +286,14 @@ def new_request(
     def _parse_transfers(self):
         for transfer_spec in self.offloading_spec.get_flushed_transfers():
             src_spec, dst_spec = transfer_spec
-            assert isinstance(src_spec, GPULoadStoreSpec)
-
-            for block_id in src_spec.block_ids:
-                self.flushed_gpu_block_indexes.add(self.gpu_block_index[block_id.item()])
+            if isinstance(src_spec, GPULoadStoreSpec):
+                # store flush
+                for block_id in src_spec.block_ids:
+                    self.flushed_gpu_block_indexes.add(self.gpu_block_index[block_id.item()])
+            else:
+                # load flush
+                for block_id in dst_spec.block_ids:
+                    self.flushed_gpu_block_indexes.add(self.gpu_block_index[block_id.item()])
 
         block_size_factor = self.offloaded_block_size // self.gpu_block_size
 

From dc459b884875e78c60d00ed9600f46dc6a55064c Mon Sep 17 00:00:00 2001
From: Bartosz Myrcha <bartosz.myrcha@intel.com>
Date: Thu, 21 May 2026 12:07:27 +0200
Subject: [PATCH 19/29] Add pre-merge-approval for execute_pre_merge (#1471)

Signed-off-by: Bartosz Myrcha <bartosz.myrcha@intel.com>
---
 .github/workflows/pre-merge-trigger.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/pre-merge-trigger.yaml b/.github/workflows/pre-merge-trigger.yaml
index 8794b31847..0c7f9a1c26 100644
--- a/.github/workflows/pre-merge-trigger.yaml
+++ b/.github/workflows/pre-merge-trigger.yaml
@@ -17,8 +17,14 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  gate:
+     runs-on: ubuntu-latest
+     environment: pre-merge-approval
+     steps:
+       - run: echo "Approved"
   execute_pre_merge:
     runs-on: ubuntu-latest
+    needs: gate
     timeout-minutes: 720
     permissions:
       actions: write       # dispatch workflows, read run status, cancel orphaned runs

From ca2d9527f2098e65ffc7ce199a0849f1289f3773 Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
Date: Thu, 21 May 2026 15:00:55 +0200
Subject: [PATCH 20/29] ci: route HF_TOKEN-using jobs through approved-workflow
 environment (#1473)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Adds `environment: approved-workflow` to every job that consumes
`secrets.HF_TOKEN` across the three CI workflows. Together with the
existing approval gate in `pre-merge-trigger.yaml` (`environment:
pre-merge-approval`, added in #1471), this completes the two-layer
protection model:

```
PR opened
  -> pre-merge-trigger `gate` job: pauses for required reviewer (approval #1)
  -> on approval, pre-merge.yaml is dispatched
  -> downstream secret-using jobs resolve HF_TOKEN from the
     `approved-workflow` environment (no second per-job approval)
```

## Why

With `HF_TOKEN` previously at repo-secret scope, any matrix entry of any
e2e/test job had direct access the moment CI started. The recent
malicious fork PR exfiltrated it via an auto-discovered `run_*`
function. After this change, the token is only released from a GitHub
Environment that a maintainer-controlled deployment-branch rule
restricts to `main` / `releases/**`, and only after the upstream gate
has approved the dispatch.

We deliberately add the environment only on jobs that actually use the
secret (15 jobs). Helper jobs (`gatekeeper`, `discover_*`, `retrieve_*`,
`pre-commit`, `post-comment`, `cleanup_*`, `build_nixl_dockerfile`,
`check_dockerfile_changes`, `prepare-release-branch`,
`summarize_and_notify`, `setup_and_build`,
`store_last_stable_vllm_commit`) do not touch HF_TOKEN and are not
modified, to avoid pointless extra gate evaluations.

## Affected jobs (15)

- `pre-merge.yaml`: `hpu_unit_tests`, `hpu_pd_tests`, `hpu_perf_tests`,
`hpu_dp_tests`, `e2e`, `calibration_tests`
- `hourly-ci.yaml`: `run_unit_tests`, `e2e`, `run_data_parallel_test`,
`run_pd_disaggregate_test`
- `create-release-branch.yaml`: `run_unit_tests`, `e2e`,
`run_data_parallel_test`, `run_pd_disaggregate_test`,
`run_hpu_perf_tests`

## Diff

+15 lines, 0 deletions. Each touched job gets exactly one new line:
`environment: approved-workflow`, inserted immediately after `runs-on:`.

## Required repo configuration (before this PR can be merged safely)

1. Settings → Environments → create environment **`approved-workflow`**.
2. Add **`HF_TOKEN`** as an environment secret (the rotated value).
3. **No required reviewers** on this environment (the upstream
`pre-merge-approval` gate already enforces approval; adding reviewers
here would prompt once per job).
4. **Deployment branches and tags**: Selected branches → `main`,
`releases/**`. Prevents a fork PR from claiming the environment from a
non-trusted ref.
5. **Delete** `HF_TOKEN` from repository-level secrets so the
environment value is the only source.

## Testing

Validated end-to-end against `bmyrcha/vllm-gaudi` first using a benign
fork PR. With the two environments configured as above, the gate paused
as expected, jobs received the secret after approval without a second
prompt, and a deliberately mis-authored downstream PR could not reach
the secret.

Close-cross-ref: builds on #1471.

Signed-off-by: Agata Dobrzyniewicz <adobrzyniewicz@habana.ai>
---
 .github/workflows/create-release-branch.yaml | 5 +++++
 .github/workflows/hourly-ci.yaml             | 4 ++++
 .github/workflows/pre-merge.yaml             | 6 ++++++
 3 files changed, 15 insertions(+)

diff --git a/.github/workflows/create-release-branch.yaml b/.github/workflows/create-release-branch.yaml
index 8f0ce9a2da..9ffc000880 100644
--- a/.github/workflows/create-release-branch.yaml
+++ b/.github/workflows/create-release-branch.yaml
@@ -164,6 +164,7 @@ jobs:
     needs: [prepare-release-branch, setup_and_build, discover_runner]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     steps:
       - name: Run pytest in tests/unit_tests
         run: |
@@ -216,6 +217,7 @@ jobs:
     needs: [prepare-release-branch, setup_and_build, discover_tests, discover_runner]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     strategy:
       fail-fast: false
       matrix:
@@ -248,6 +250,7 @@ jobs:
     needs: [prepare-release-branch, setup_and_build, discover_runner]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     steps:
       - name: Run Data Parallel test
         run: |
@@ -275,6 +278,7 @@ jobs:
     needs: [prepare-release-branch, setup_and_build, discover_runner]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     steps:
       - name: Run PD disaggregate test
         run: |
@@ -305,6 +309,7 @@ jobs:
     needs: [prepare-release-branch, setup_and_build, discover_runner]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     steps:
       - name: Run Sharegpt performance tests with warmup
         run: |
diff --git a/.github/workflows/hourly-ci.yaml b/.github/workflows/hourly-ci.yaml
index 659221c336..dd5c8a65cc 100644
--- a/.github/workflows/hourly-ci.yaml
+++ b/.github/workflows/hourly-ci.yaml
@@ -101,6 +101,7 @@ jobs:
     needs: [setup_and_build, discover_runner]
     # <-- UPDATED: Runs on the specific runner
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     steps:
       - name: Run pytest in tests/unit_tests
         run: |
@@ -157,6 +158,7 @@ jobs:
     needs: [setup_and_build, discover_tests, discover_runner]
     # <-- UPDATED: Runs on the specific runner
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     strategy:
       fail-fast: false
       matrix:
@@ -192,6 +194,7 @@ jobs:
     needs: [setup_and_build, discover_runner]
     # <-- UPDATED: Runs on the specific runner
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     steps:
       - name: Run Data Parallel test
         run: |
@@ -220,6 +223,7 @@ jobs:
     needs: [setup_and_build, discover_runner]
     # <-- UPDATED: Runs on the specific runner
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     steps:
       - name: Run PD disaggregate test
         run: |
diff --git a/.github/workflows/pre-merge.yaml b/.github/workflows/pre-merge.yaml
index 7b782c7e71..687c8ea7db 100644
--- a/.github/workflows/pre-merge.yaml
+++ b/.github/workflows/pre-merge.yaml
@@ -361,6 +361,7 @@ jobs:
     needs: [pre_merge_hpu_test_build, discover_runner, retrieve_head_sha]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     timeout-minutes: 720
     steps:
       - name: Run pytest in tests/unit_tests
@@ -386,6 +387,7 @@ jobs:
     needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner, retrieve_head_sha]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     timeout-minutes: 720
     steps:
       - name: Run test scripts
@@ -417,6 +419,7 @@ jobs:
     needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner, retrieve_head_sha]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     timeout-minutes: 720
     steps:
       - name: Run test scripts
@@ -443,6 +446,7 @@ jobs:
     needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner, retrieve_head_sha]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     timeout-minutes: 720
     steps:
       - name: Run test scripts
@@ -470,6 +474,7 @@ jobs:
     needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_tests, discover_runner, retrieve_head_sha]
     # --- UPDATED: Run on the specific node ---
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     timeout-minutes: 720
     strategy:
       fail-fast: false
@@ -503,6 +508,7 @@ jobs:
   calibration_tests:
     needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_calibration_tests, discover_runner, retrieve_head_sha]
     runs-on: ${{ needs.discover_runner.outputs.runner_name }}
+    environment: approved-workflow
     timeout-minutes: 720
     strategy:
       fail-fast: false

From 7b7bc8f195cf62ff42db61be96ba56d675bb038d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Olejniczak?= <pawelx.olejniczak@intel.com>
Date: Fri, 22 May 2026 12:59:15 +0200
Subject: [PATCH 21/29] 
 [FIX_FOR_VLLM_CUSTOM=0a54df28471be07b3d668ea21c5e411569d3baea] Fix
 DynamicNTKScalingRotaryEmbedding and HPUCompressedTensorsConfig (#1479)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Root cause
Upstream vLLM at SHA 0a54df28 introduced two API changes that broke
vllm-gaudi:
1. PR vllm-project/vllm#41277 added a required `max_trained_positions`
parameter to `DynamicNTKScalingRotaryEmbedding.__init__()`, causing the
unit test to fail with TypeError.
2. PR vllm-project/vllm#43144 removed `sparsity_scheme_map` and
`sparsity_ignore_list` from `CompressedTensorsConfig.__init__()`,
causing `HPUCompressedTensorsConfig` instantiation to fail during e2e
tests.

## Upstream PR
https://github.com/vllm-project/vllm/pull/41277
Added max_trained_positions to DynamicNTKScalingRotaryEmbedding

https://github.com/vllm-project/vllm/pull/43144
Removed sparsity parameters from CompressedTensorsConfig

## Fix
1. Add `max_trained_positions` parameter to the rotary embedding unit
test.
2. Remove stale `sparsity_scheme_map` and `sparsity_ignore_list` from
HPUCompressedTensorsConfig init signature and super() call, plus the
unused SparsityCompressionConfig import.

Signed-off-by: Paweł Olejniczak <pawelx.olejniczak@intel.com>
---
 tests/unit_tests/ops/test_hpu_rotary_embedding.py | 1 +
 vllm_gaudi/ops/hpu_compressed_tensors.py          | 5 -----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/tests/unit_tests/ops/test_hpu_rotary_embedding.py b/tests/unit_tests/ops/test_hpu_rotary_embedding.py
index 2ef23ab4f9..35c2bb5805 100644
--- a/tests/unit_tests/ops/test_hpu_rotary_embedding.py
+++ b/tests/unit_tests/ops/test_hpu_rotary_embedding.py
@@ -201,6 +201,7 @@ def test_dynamic_ntk_scaling_rotary_embedding(
         "head_size": head_size,
         "rotary_dim": rotary_dim,
         "max_position_embeddings": max_position_embeddings,
+        "max_trained_positions": max_position_embeddings,
         "base": base,
         "is_neox_style": is_neox_style,
         "scaling_factor": scaling_factor,
diff --git a/vllm_gaudi/ops/hpu_compressed_tensors.py b/vllm_gaudi/ops/hpu_compressed_tensors.py
index af7916020c..74d91e809b 100644
--- a/vllm_gaudi/ops/hpu_compressed_tensors.py
+++ b/vllm_gaudi/ops/hpu_compressed_tensors.py
@@ -20,7 +20,6 @@
     CompressedTensorsConfig,
     CompressedTensorsMoEMethod,
     CompressedTensorsKVCacheMethod,
-    SparsityCompressionConfig,
 )
 from vllm.model_executor.layers.quantization.compressed_tensors import (compressed_tensors_moe)
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (
@@ -1056,8 +1055,6 @@ def __init__(
         target_scheme_map: dict[str, Any],
         ignore: list[str],
         quant_format: str,
-        sparsity_scheme_map: dict[str, SparsityCompressionConfig],
-        sparsity_ignore_list: list[str],
         kv_cache_scheme: dict[str, Any] | None = None,
         config: dict[str, Any] | None = None,
         transform_config: dict[str, Any] | None = None,
@@ -1068,8 +1065,6 @@ def __init__(
             target_scheme_map,
             ignore,
             quant_format,
-            sparsity_scheme_map,
-            sparsity_ignore_list,
             kv_cache_scheme,
             config,
             transform_config,

From 2cb5d9993ef352ccb8af733b478812139c3d05fe Mon Sep 17 00:00:00 2001
From: Iryna Boiko <iryna.boiko@intel.com>
Date: Fri, 22 May 2026 14:17:36 +0200
Subject: [PATCH 22/29] Fix stale gate ref overriding caller router_logits in
 dp_size==1 MoE fast path (#1469)

PR #1441 added an _hpu_gate_ref fallback in the dp_size==1 fast path
that unconditionally re-invoked a runner-owned gate, overwriting
router_logits supplied by the caller. For SharedFusedMoE models
(Qwen3 MoE, ernie45, ...) the block's mlp.gate(...) has already
produced router_logits and _sync_shared_moe_gates sets
runner.gate=None post-INC; the cached _hpu_gate_ref still points at
the pre-INC module and produced shape/dtype mismatches under fp8.

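Only invoke the runner-owned gate when the caller did not provide
router_logits, preserving the DeepSeek R1 internal-router fast path
from #1441.

In addition, where runner.gate is cleared post-INC, refresh the cached
runner._hpu_gate_ref to the post-INC block-level gate so that the
fast-path fallback no longer reaches the replaced pre-INC module.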

Signed-off-by: Iryna Boiko <iboiko@habana.ai>
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 664d88ef03..d3880fcb33 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -4815,6 +4815,15 @@ def _sync_moe_kernel_flags(module: torch.nn.Module):
                 runner = getattr(experts, "runner", None)
                 if runner is not None and hasattr(runner, "gate"):
                     runner.gate = None
+                    # Refresh the cached gate ref captured at
+                    # FusedMoE.__init__ to the post-INC block-level gate.
+                    # The dp_size==1 fast path (patched_fused_moe_forward)
+                    # falls back to runner._hpu_gate_ref when runner.gate
+                    # is None; the pre-INC reference points at the now-
+                    # replaced module and produced shape/dtype mismatches
+                    # under fp8.
+                    if block_gate is not None:
+                        object.__setattr__(runner, "_hpu_gate_ref", block_gate)
 
                 if id(experts) in self._detached_moe_gates:
                     self._detached_moe_gates.remove(id(experts))

From adce75b46cea1e030099849a41cd57e756172f21 Mon Sep 17 00:00:00 2001
From: Iryna Boiko <iryna.boiko@intel.com>
Date: Mon, 25 May 2026 09:38:04 +0200
Subject: [PATCH 23/29] Update lora tests (#1488)

Signed-off-by: Iryna Boiko <iboiko@habana.ai>
---
 tests/unit_tests/lora/test_llama_tp.py             | 3 ---
 tests/unit_tests/lora/test_llm_with_multi_loras.py | 2 --
 2 files changed, 5 deletions(-)

diff --git a/tests/unit_tests/lora/test_llama_tp.py b/tests/unit_tests/lora/test_llama_tp.py
index 6e8bfa8de8..1f714065d7 100755
--- a/tests/unit_tests/lora/test_llama_tp.py
+++ b/tests/unit_tests/lora/test_llama_tp.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import os
-
 import vllm
 import vllm.config
 from vllm.lora.request import LoRARequest
@@ -103,6 +101,5 @@ def test_llama_lora(llama32_lora_files):
         max_model_len=1024,
         max_loras=4,
         dtype='bfloat16',
-        hf_token=os.environ.get("HF_TOKEN"),
     )
     generate_and_test(llm, llama32_lora_files)
diff --git a/tests/unit_tests/lora/test_llm_with_multi_loras.py b/tests/unit_tests/lora/test_llm_with_multi_loras.py
index ea21d18595..d33befbb5e 100644
--- a/tests/unit_tests/lora/test_llm_with_multi_loras.py
+++ b/tests/unit_tests/lora/test_llm_with_multi_loras.py
@@ -7,7 +7,6 @@
 """
 
 import pytest
-import os
 
 from vllm import LLM
 from vllm.lora.request import LoRARequest
@@ -66,7 +65,6 @@ def test_multiple_lora_requests():
         gpu_memory_utilization=0.5,
         enforce_eager=True,
         dtype='bfloat16',
-        hf_token=os.environ.get("HF_TOKEN"),
     )
     PROMPTS = ["Hello, my name is"] * 2
     LORA_NAME = "Alice"

From 87aef6ca737a70b0afb311b9d84589deca48f1bd Mon Sep 17 00:00:00 2001
From: Yeonsil Yoon <yeon.sil.yoon@intel.com>
Date: Mon, 25 May 2026 05:02:00 -0700
Subject: [PATCH 24/29] Fix HPU prompt_token_ids device placement for penalty
 sampling (#1465)

Move prompt_token_ids to self.device in selective sampling metadata
creation for both skip_copy paths.
This keeps prompt and output penalty masks on the same device and
prevents runtime device mismatch errors during
repetition/presence/frequency penalty application.

Signed-off-by: Yeonsil Yoon <yeon.sil.yoon@intel.com>
Co-authored-by: Iryna Boiko <iryna.boiko@intel.com>
---
 vllm_gaudi/v1/worker/hpu_input_batch.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_input_batch.py b/vllm_gaudi/v1/worker/hpu_input_batch.py
index bf3207cf4e..c2ac2f8f68 100644
--- a/vllm_gaudi/v1/worker/hpu_input_batch.py
+++ b/vllm_gaudi/v1/worker/hpu_input_batch.py
@@ -640,12 +640,14 @@ def make_selective_sampling_metadata(
                 # The prompt tokens are used only for applying penalties during
                 # the sampling process. Hence copy these tensors only when
                 # there are requests which need penalties to be applied.
-                prompt_token_ids = self._make_prompt_token_ids_cpu_tensor()[req_indices]
+                prompt_token_ids = self._make_prompt_token_ids_cpu_tensor()[req_indices].to(device=self.device,
+                                                                                            non_blocking=True)
         else:
             # Even with skip_copy=True, we need prompt_token_ids for penalties
             if not self.no_penalties:
                 cached_tensor = self._get_cached_prompt_token_ids()
-                prompt_token_ids = cached_tensor[req_indices] if cached_tensor is not None else None
+                prompt_token_ids = cached_tensor[req_indices].to(
+                    device=self.device, non_blocking=True) if cached_tensor is not None else None
             else:
                 prompt_token_ids = None
 

From bc4f535eed8f15618739b5f6772096afc392ee9b Mon Sep 17 00:00:00 2001
From: Youlei Yang <youlei.yang@intel.com>
Date: Wed, 27 May 2026 15:54:58 +0800
Subject: [PATCH 25/29] Fix decode bucket generation for hybrid models with
 mismatched block sizes (#1485)

## Problem

For hybrid models like Qwen3.5 (GDN + attention),
`_align_hybrid_block_size()` sets `block_size=640` (unified KV-cache
page for mamba/attention alignment), while HPU kernels use
`attn_block_size=128`.

The decode bucket generation (introduced by f24f3f9d) uses the formula:
```
max_decode_blocks = ceil(max_model_len / block_size) * max_num_seqs
                  = ceil(262144 / 640) * 45 = 18450
```

But the runtime decode path (`_create_decode_input_data`) computes
`num_blocks` using `attn_block_size=128`, producing values up to
`ceil(262144/128) * 45 = 92160`.

This causes hundreds of **"Configuration was not warmed-up"** warnings
and costly HPU graph recompilation on every decode step.

## Root Cause

Two different block_size semantics coexist:
- `self.block_size = 640`: KV-cache management page size (unified for
hybrid mamba/attention)
- `self.attn_block_size = 128`: HPU attention kernel page size (what
hardware actually uses)

Decode bucket generation used `block_size` but should use
`attn_block_size` to match the runtime.

## Fix

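Temporarily scope `bucketing_manager.block_size` to `attn_block_size`
during decode bucket generation in `warmup_model()`, then restore the
original value so prompt fallback paths remain unaffected.

Additionally, `_generate_seq_lengths()` now caps `num_blocks` at the
physical KV-cache size only for contiguous PA, and
`_prepare_dummy_scenario()` clamps the contiguous-PA `block_id` to the
last physical block.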

## Testing

- Verified with Qwen3.5-35B-A3B on 4x Gaudi3 (TP=4,
max_model_len=262144, max_num_seqs=45)
- Decode buckets now correctly cover runtime num_blocks range
- No more "Configuration was not warmed-up" warnings during serving

Signed-off-by: Youlei Yang <youlei.yang@intel.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/unit_tests/test_decode_bucket_hybrid.py | 440 ++++++++++++++++++
 vllm_gaudi/v1/worker/hpu_model_runner.py      |  26 +-
 2 files changed, 462 insertions(+), 4 deletions(-)
 create mode 100644 tests/unit_tests/test_decode_bucket_hybrid.py

diff --git a/tests/unit_tests/test_decode_bucket_hybrid.py b/tests/unit_tests/test_decode_bucket_hybrid.py
new file mode 100644
index 0000000000..1e1d14d09f
--- /dev/null
+++ b/tests/unit_tests/test_decode_bucket_hybrid.py
@@ -0,0 +1,440 @@
+# SPDX-License-Identifier: Apache-2.0
+###############################################################################
+# Copyright (C) 2024-2026 Intel Corporation
+#
+# This source code is licensed under the Apache 2.0 license found in the
+# LICENSE file in the root directory of this source tree.
+###############################################################################
+"""
+Regression tests for decode bucket generation and warmup in hybrid models.
+
+Hybrid models (e.g., Qwen3.5) have block_size != attn_block_size:
+- block_size=640: unified page size for KV cache management
+- attn_block_size=128: HPU kernel page size used by paged attention
+
+The decode path (_create_decode_input_data) computes num_blocks using
+attn_block_size. Therefore:
+1. Decode buckets MUST be generated in attn_block_size units.
+2. Warmup seq_lengths MUST produce the correct sum(num_blocks) to match
+   the target bucket after find_decode_bucket lookup.
+3. For non-contiguous PA, _generate_seq_lengths MUST NOT cap num_blocks
+   at kv_cache_config.num_blocks (physical pool), because runtime can
+   exceed this via prefix-sharing.
+
+Regression: f24f3f9d introduced a formula for max_decode_blocks using
+block_size instead of attn_block_size, and added a physical-pool cap
+that prevented large decode buckets from being warmed.
+"""
+
+import math
+
+import pytest
+from types import SimpleNamespace
+from unittest.mock import patch
+
+from vllm_gaudi.extension.bucketing.common import (
+    HPUBucketingManager,
+    find_equal_or_closest_greater_config,
+)
+from vllm_gaudi.extension.bucketing.exponential import ExponentialBucketingStrategy
+from vllm_gaudi.extension.runtime import get_config, clear_config
+from vllm_gaudi.v1.worker.hpu_model_runner import HPUModelRunner
+
+# --- Qwen3.5 hybrid model parameters ---
+_QWEN35_BLOCK_SIZE = 640  # unified page size (5 * 128)
+_QWEN35_ATTN_BLOCK_SIZE = 128  # HPU kernel page size
+_QWEN35_MAX_MODEL_LEN = 262144
+_QWEN35_MAX_NUM_SEQS = 45
+_QWEN35_NUM_HPU_BLOCKS = 15405  # physical blocks in attn_block_size units
+
+
+@pytest.fixture(autouse=True)
+def default_config(monkeypatch):
+    """Reset singleton and pin bucketing config for deterministic tests."""
+    # Reset singleton to prevent state leakage between tests
+    HPUBucketingManager._instance = None
+    # Pin bucketing strategy to avoid env-dependent behavior in CI
+    monkeypatch.setenv("VLLM_BUCKETING_STRATEGY", "exp")
+    monkeypatch.delenv("VLLM_EXPONENTIAL_BUCKETING", raising=False)
+    clear_config()
+    get_config()
+    yield
+    HPUBucketingManager._instance = None
+    clear_config()
+
+
+class _MockConfig:
+    """Lightweight mock for get_config()."""
+
+    def __init__(self, **kwargs):
+        defaults = dict(
+            prefix_caching=False,
+            use_contiguous_pa=False,
+            merged_prefill=False,
+            VLLM_PROMPT_BS_BUCKET_MIN=None,
+            VLLM_PROMPT_BS_BUCKET_STEP=None,
+            VLLM_PROMPT_BS_BUCKET_MAX=None,
+            VLLM_PROMPT_SEQ_BUCKET_MIN=None,
+            VLLM_PROMPT_SEQ_BUCKET_STEP=None,
+            VLLM_PROMPT_SEQ_BUCKET_MAX=None,
+            VLLM_DECODE_BS_BUCKET_MIN=None,
+            VLLM_DECODE_BS_BUCKET_STEP=None,
+            VLLM_DECODE_BS_BUCKET_MAX=None,
+            VLLM_DECODE_BLOCK_BUCKET_MIN=None,
+            VLLM_DECODE_BLOCK_BUCKET_STEP=None,
+            VLLM_DECODE_BLOCK_BUCKET_MAX=None,
+            VLLM_PROMPT_QUERY_BUCKET_MIN=None,
+        )
+        defaults.update(kwargs)
+        for k, v in defaults.items():
+            object.__setattr__(self, k, v)
+
+
+def _make_bucketing_manager(block_size, max_model_len, max_num_seqs, num_hpu_blocks):
+    """Create a minimally-configured HPUBucketingManager."""
+    mgr = HPUBucketingManager.__new__(HPUBucketingManager)
+    mgr.block_size = block_size
+    mgr.max_model_len = max_model_len
+    mgr.max_num_seqs = max_num_seqs
+    mgr.max_num_prefill_seqs = 1
+    mgr.num_hpu_blocks = num_hpu_blocks
+    mgr.max_num_batched_tokens = 131072
+    mgr.initialized = True
+    mgr.mamba_chunk_size = None
+    mgr.mamba_chunk_size_is_explicit = False
+    mgr.num_speculative_tokens = None
+    mgr.use_sliding_window = False
+    mgr.fallback_bs_base_step = 2
+    mgr.fallback_seq_base_step = 32
+    mgr.fallback_blocks_base_step = 32
+    mgr._fallback_max_ctx = 0
+    return mgr
+
+
+class _MockModelRunner:
+    """Minimal mock of HPUModelRunner for _generate_seq_lengths testing."""
+
+    def __init__(self, use_contiguous_pa, num_blocks, max_model_len, speculative_config=None):
+        self.use_contiguous_pa = use_contiguous_pa
+        self.kv_cache_config = SimpleNamespace(num_blocks=num_blocks)
+        self.max_model_len = max_model_len
+        self.speculative_config = speculative_config
+
+
+def _generate_seq_lengths(runner, num_samples, num_blocks, block_size):
+    """Call HPUModelRunner._generate_seq_lengths via unbound method."""
+    return HPUModelRunner._generate_seq_lengths(runner, num_samples, num_blocks, block_size)
+
+
+# =============================================================================
+# Test 1: Decode bucket generation uses attn_block_size for hybrid models
+# =============================================================================
+
+
+@patch('vllm_gaudi.extension.bucketing.exponential.get_config')
+def test_hybrid_decode_buckets_use_attn_block_size(mock_exp_config):
+    """Decode buckets for hybrid model must be generated using attn_block_size.
+
+    When block_size=640 is incorrectly used:
+      max_decode_blocks = ceil(262144/640)*45 = 18450
+    When attn_block_size=128 is correctly used:
+      max_decode_blocks = ceil(262144/128)*45 = 92160
+
+    The warmup_model() scopes bucketing_manager.block_size to attn_block_size
+    before calling generate_decode_buckets(). This test verifies that using
+    the correct block_size produces the right max.
+    """
+    mock_exp_config.return_value = _MockConfig(use_contiguous_pa=False)
+
+    # With WRONG block_size (640) — the bug
+    mgr_wrong = _make_bucketing_manager(
+        block_size=_QWEN35_BLOCK_SIZE,
+        max_model_len=_QWEN35_MAX_MODEL_LEN,
+        max_num_seqs=_QWEN35_MAX_NUM_SEQS,
+        num_hpu_blocks=_QWEN35_NUM_HPU_BLOCKS,
+    )
+    mgr_wrong.generate_decode_buckets()
+    wrong_max_ctx = max(ctx for _, _, ctx in mgr_wrong.decode_buckets)
+
+    # With CORRECT block_size (128) — the fix
+    mgr_correct = _make_bucketing_manager(
+        block_size=_QWEN35_ATTN_BLOCK_SIZE,
+        max_model_len=_QWEN35_MAX_MODEL_LEN,
+        max_num_seqs=_QWEN35_MAX_NUM_SEQS,
+        num_hpu_blocks=_QWEN35_NUM_HPU_BLOCKS,
+    )
+    mgr_correct.generate_decode_buckets()
+    correct_max_ctx = max(ctx for _, _, ctx in mgr_correct.decode_buckets)
+
+    expected_max = math.ceil(_QWEN35_MAX_MODEL_LEN / _QWEN35_ATTN_BLOCK_SIZE) * _QWEN35_MAX_NUM_SEQS
+    wrong_expected = math.ceil(_QWEN35_MAX_MODEL_LEN / _QWEN35_BLOCK_SIZE) * _QWEN35_MAX_NUM_SEQS
+
+    assert wrong_max_ctx <= wrong_expected, (
+        f"Wrong block_size should produce max_ctx <= {wrong_expected}, got {wrong_max_ctx}")
+    assert correct_max_ctx <= expected_max, (
+        f"Correct block_size should produce max_ctx <= {expected_max}, got {correct_max_ctx}")
+    assert correct_max_ctx > wrong_max_ctx, (
+        f"attn_block_size={_QWEN35_ATTN_BLOCK_SIZE} should produce larger buckets than "
+        f"block_size={_QWEN35_BLOCK_SIZE}: {correct_max_ctx} vs {wrong_max_ctx}")
+
+
+@patch('vllm_gaudi.extension.bucketing.exponential.get_config')
+def test_hybrid_decode_buckets_cover_runtime_scenarios(mock_exp_config):
+    """Decode buckets must cover all runtime-reachable configurations.
+
+    For 28 seqs at max context: 28 * ceil(262144/128) = 28 * 2048 = 57344.
+    A bucket >= 57344 must exist for batch_size=28.
+    """
+    mock_exp_config.return_value = _MockConfig(use_contiguous_pa=False)
+
+    mgr = _make_bucketing_manager(
+        block_size=_QWEN35_ATTN_BLOCK_SIZE,
+        max_model_len=_QWEN35_MAX_MODEL_LEN,
+        max_num_seqs=_QWEN35_MAX_NUM_SEQS,
+        num_hpu_blocks=_QWEN35_NUM_HPU_BLOCKS,
+    )
+    mgr.generate_decode_buckets()
+
+    # For each batch size, the max reachable ctx is bs * max_blocks_per_seq
+    max_blocks_per_seq = math.ceil(_QWEN35_MAX_MODEL_LEN / _QWEN35_ATTN_BLOCK_SIZE)
+
+    # Check that large decode scenarios are covered
+    test_cases = [
+        (28, 37620),  # Real case from the bug report
+        (45, 92160),  # Maximum: all seqs at max_model_len
+        (1, 2048),  # Single seq at max context
+    ]
+    for bs, target_ctx in test_cases:
+        # Verify target is reachable (within the max_model_len limit for the batch)
+        max_reachable = bs * max_blocks_per_seq
+        assert target_ctx <= max_reachable, (f"Test case ({bs}, {target_ctx}) exceeds reachable max {max_reachable}")
+
+        # Verify a covering bucket exists
+        found = find_equal_or_closest_greater_config(mgr.decode_buckets, (bs, 1, target_ctx))
+        assert found is not None, (f"No decode bucket found >= ({bs}, 1, {target_ctx}). "
+                                   f"Max bucket for bs={bs}: "
+                                   f"{max((ctx for b, _, ctx in mgr.decode_buckets if b >= bs), default='NONE')}")
+
+
+# =============================================================================
+# Test 2: _generate_seq_lengths does NOT cap for non-contiguous PA
+# =============================================================================
+
+
+class TestGenerateSeqLengthsNonContiguousPA:
+    """Verify _generate_seq_lengths behavior for non-contiguous PA."""
+
+    def test_no_cap_when_num_blocks_exceeds_physical(self):
+        """num_blocks > kv_cache_config.num_blocks should NOT be capped.
+
+        This is the key regression: capping prevents large decode buckets
+        from being warmed, causing 'not warmed-up' warnings at runtime.
+        """
+        runner = _MockModelRunner(
+            use_contiguous_pa=False,
+            num_blocks=_QWEN35_NUM_HPU_BLOCKS,  # 15405
+            max_model_len=_QWEN35_MAX_MODEL_LEN,
+        )
+        target_blocks = 37620  # Much larger than num_blocks=15405
+
+        seq_lengths = _generate_seq_lengths(runner, 28, target_blocks, _QWEN35_ATTN_BLOCK_SIZE)
+
+        # Verify total blocks from seq_lengths matches target
+        total_blocks = sum(math.ceil((sl + 1) / _QWEN35_ATTN_BLOCK_SIZE) for sl in seq_lengths)
+        assert total_blocks == target_blocks, (
+            f"Expected total_blocks={target_blocks}, got {total_blocks}. "
+            f"Non-contiguous PA must not cap at kv_cache_config.num_blocks={_QWEN35_NUM_HPU_BLOCKS}")
+
+    def test_max_model_len_still_bounds_per_seq(self):
+        """Individual seq_lengths must still be clamped by max_model_len."""
+        runner = _MockModelRunner(
+            use_contiguous_pa=False,
+            num_blocks=_QWEN35_NUM_HPU_BLOCKS,
+            max_model_len=_QWEN35_MAX_MODEL_LEN,
+        )
+        # Large bucket: 1 seq with 92160 blocks (way beyond max_model_len/block_size=2048)
+        seq_lengths = _generate_seq_lengths(runner, 1, 92160, _QWEN35_ATTN_BLOCK_SIZE)
+
+        assert len(seq_lengths) == 1
+        assert seq_lengths[0] <= _QWEN35_MAX_MODEL_LEN - 1, (
+            f"seq_length {seq_lengths[0]} exceeds max_model_len-1={_QWEN35_MAX_MODEL_LEN - 1}")
+
+    @pytest.mark.parametrize("batch_size,target_blocks", [
+        (28, 37620),
+        (45, 92160),
+        (14, 20000),
+        (1, 2048),
+    ])
+    def test_warmup_roundtrip_targets_correct_bucket(self, batch_size, target_blocks):
+        """Verify warmup roundtrip: seq_lengths -> num_blocks -> find_decode_bucket.
+
+        The warmup path generates seq_lengths from the target bucket, then the
+        runtime decode path recomputes num_blocks from those seq_lengths. The
+        resulting sum(num_blocks) must find the same bucket via find_decode_bucket.
+        """
+        runner = _MockModelRunner(
+            use_contiguous_pa=False,
+            num_blocks=_QWEN35_NUM_HPU_BLOCKS,
+            max_model_len=_QWEN35_MAX_MODEL_LEN,
+        )
+        max_blocks_per_seq = math.ceil(_QWEN35_MAX_MODEL_LEN / _QWEN35_ATTN_BLOCK_SIZE)
+
+        # Skip unreachable buckets (can't produce these at runtime either)
+        max_reachable = batch_size * max_blocks_per_seq
+        if target_blocks > max_reachable:
+            pytest.skip(f"Bucket ({batch_size}, 1, {target_blocks}) is unreachable "
+                        f"(max={max_reachable})")
+
+        seq_lengths = _generate_seq_lengths(runner, batch_size, target_blocks, _QWEN35_ATTN_BLOCK_SIZE)
+
+        # Simulate _create_decode_input_data's num_blocks computation
+        num_blocks_per_req = [math.ceil((sl + 1) / _QWEN35_ATTN_BLOCK_SIZE) for sl in seq_lengths]
+        total_blocks_after_roundtrip = sum(num_blocks_per_req)
+
+        # The roundtrip total should equal the target (for reachable buckets)
+        assert total_blocks_after_roundtrip == target_blocks, (
+            f"Roundtrip mismatch for bucket ({batch_size}, 1, {target_blocks}): "
+            f"got sum(num_blocks)={total_blocks_after_roundtrip}. "
+            f"Warmup will target wrong bucket!")
+
+
+# =============================================================================
+# Test 3: _generate_seq_lengths DOES cap for contiguous PA
+# =============================================================================
+
+
+class TestGenerateSeqLengthsContiguousPA:
+    """Verify _generate_seq_lengths caps correctly for contiguous PA."""
+
+    def test_caps_at_physical_blocks(self):
+        """For contiguous PA, num_blocks MUST be capped at kv_cache_config.num_blocks.
+
+        This is because contiguous PA uses block_id = num_blocks - 1 as the
+        contiguous allocation base, which must be a valid physical block.
+        """
+        runner = _MockModelRunner(
+            use_contiguous_pa=True,
+            num_blocks=_QWEN35_NUM_HPU_BLOCKS,  # 15405
+            max_model_len=_QWEN35_MAX_MODEL_LEN,
+        )
+        target_blocks = 37620  # Larger than physical
+
+        seq_lengths = _generate_seq_lengths(runner, 28, target_blocks, _QWEN35_ATTN_BLOCK_SIZE)
+
+        # Total blocks should be capped at num_blocks
+        total_blocks = sum(math.ceil((sl + 1) / _QWEN35_ATTN_BLOCK_SIZE) for sl in seq_lengths)
+        assert total_blocks <= _QWEN35_NUM_HPU_BLOCKS, (
+            f"Contiguous PA: total_blocks={total_blocks} exceeds physical "
+            f"num_blocks={_QWEN35_NUM_HPU_BLOCKS}. block_id would overflow!")
+
+
+# =============================================================================
+# Test 4: End-to-end decode bucket max formula for hybrid models
+# =============================================================================
+
+
+@patch('vllm_gaudi.extension.bucketing.exponential.get_config')
+def test_hybrid_max_decode_blocks_formula(mock_exp_config):
+    """Verify max_decode_blocks = ceil(max_model_len/attn_block_size) * max_num_seqs.
+
+    For Qwen3.5: ceil(262144/128) * 45 = 2048 * 45 = 92160.
+    This must NOT use block_size=640 which gives ceil(262144/640)*45 = 18450.
+    """
+    mock_exp_config.return_value = _MockConfig(use_contiguous_pa=False)
+    strategy = ExponentialBucketingStrategy()
+
+    # Using attn_block_size=128 (correct)
+    _, _, block_cfg = strategy.get_decode_cfgs(
+        max_num_seqs=_QWEN35_MAX_NUM_SEQS,
+        block_size=_QWEN35_ATTN_BLOCK_SIZE,
+        max_num_batched_tokens=131072,
+        max_model_len=_QWEN35_MAX_MODEL_LEN,
+        max_blocks=_QWEN35_NUM_HPU_BLOCKS,
+    )
+    expected_max = math.ceil(_QWEN35_MAX_MODEL_LEN / _QWEN35_ATTN_BLOCK_SIZE) * _QWEN35_MAX_NUM_SEQS
+    assert block_cfg[2] == expected_max, (
+        f"max_decode_blocks should be {expected_max} with attn_block_size={_QWEN35_ATTN_BLOCK_SIZE}, "
+        f"got {block_cfg[2]}")
+
+    # Using block_size=640 (wrong — would produce 18450)
+    _, _, block_cfg_wrong = strategy.get_decode_cfgs(
+        max_num_seqs=_QWEN35_MAX_NUM_SEQS,
+        block_size=_QWEN35_BLOCK_SIZE,
+        max_num_batched_tokens=131072,
+        max_model_len=_QWEN35_MAX_MODEL_LEN,
+        max_blocks=_QWEN35_NUM_HPU_BLOCKS,
+    )
+    wrong_max = math.ceil(_QWEN35_MAX_MODEL_LEN / _QWEN35_BLOCK_SIZE) * _QWEN35_MAX_NUM_SEQS
+    assert block_cfg_wrong[2] == wrong_max, (
+        f"With block_size=640, max_decode_blocks should be {wrong_max}, got {block_cfg_wrong[2]}")
+    assert expected_max > wrong_max, (f"attn_block_size formula ({expected_max}) must produce larger max than "
+                                      f"block_size formula ({wrong_max})")
+
+
+# =============================================================================
+# Test 5: Verify the bug scenario — bucket (28, 1, 37620) IS reachable
+# =============================================================================
+
+
+def test_bucket_37620_reachable_at_runtime():
+    """Bucket (28, 1, 37620) is reachable: 28 seqs averaging ~1344 blocks each.
+
+    Each seq has context_len ≈ 171977 tokens → ceil(171978/128) = 1344 blocks.
+    Sum across 28 seqs ≈ 37620. This is within max_model_len per seq.
+    """
+    attn_block_size = _QWEN35_ATTN_BLOCK_SIZE
+    max_blocks_per_seq = math.ceil(_QWEN35_MAX_MODEL_LEN / attn_block_size)  # 2048
+    batch_size = 28
+    target_blocks = 37620
+
+    # Each seq needs target_blocks/batch_size ≈ 1344 blocks
+    blocks_per_seq = target_blocks / batch_size  # 1343.57
+    tokens_per_seq = blocks_per_seq * attn_block_size  # ~171977
+
+    assert tokens_per_seq < _QWEN35_MAX_MODEL_LEN, (f"Scenario requires {tokens_per_seq:.0f} tokens/seq which exceeds "
+                                                    f"max_model_len={_QWEN35_MAX_MODEL_LEN}")
+    assert blocks_per_seq <= max_blocks_per_seq, (f"Scenario requires {blocks_per_seq:.1f} blocks/seq which exceeds "
+                                                  f"max_blocks_per_seq={max_blocks_per_seq}")
+
+
+# =============================================================================
+# Test 6: Regression test — with old cap, bucket warmup targets wrong bucket
+# =============================================================================
+
+
+def test_old_cap_causes_wrong_bucket_warmup():
+    """Demonstrate that capping at kv_cache_config.num_blocks causes warmup
+    to target the wrong bucket, producing 'not warmed-up' warnings.
+
+    With cap: _generate_seq_lengths(28, min(15405, 37620)=15405, 128)
+    → sum(num_blocks) ≈ 15405 → find_decode_bucket finds a smaller bucket.
+    """
+    runner_capped = _MockModelRunner(
+        use_contiguous_pa=True,  # simulate old buggy behavior (cap always)
+        num_blocks=_QWEN35_NUM_HPU_BLOCKS,
+        max_model_len=_QWEN35_MAX_MODEL_LEN,
+    )
+    target_bucket_ctx = 37620
+
+    # With cap (simulates old behavior)
+    seq_lengths_capped = _generate_seq_lengths(runner_capped, 28, target_bucket_ctx, _QWEN35_ATTN_BLOCK_SIZE)
+    total_capped = sum(math.ceil((sl + 1) / _QWEN35_ATTN_BLOCK_SIZE) for sl in seq_lengths_capped)
+
+    # Without cap (correct behavior for non-contiguous PA)
+    runner_uncapped = _MockModelRunner(
+        use_contiguous_pa=False,
+        num_blocks=_QWEN35_NUM_HPU_BLOCKS,
+        max_model_len=_QWEN35_MAX_MODEL_LEN,
+    )
+    seq_lengths_uncapped = _generate_seq_lengths(runner_uncapped, 28, target_bucket_ctx, _QWEN35_ATTN_BLOCK_SIZE)
+    total_uncapped = sum(math.ceil((sl + 1) / _QWEN35_ATTN_BLOCK_SIZE) for sl in seq_lengths_uncapped)
+
+    # Capped version misses the target
+    assert total_capped < target_bucket_ctx, (f"Capped version should produce fewer blocks than target: "
+                                              f"{total_capped} vs {target_bucket_ctx}")
+    assert total_capped <= _QWEN35_NUM_HPU_BLOCKS, (
+        f"Capped version should be bounded by num_blocks={_QWEN35_NUM_HPU_BLOCKS}")
+
+    # Uncapped version hits the target exactly
+    assert total_uncapped == target_bucket_ctx, (f"Uncapped version should produce exactly {target_bucket_ctx} blocks, "
+                                                 f"got {total_uncapped}")
diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index d3880fcb33..893bd795d7 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -5207,8 +5207,12 @@ def _add_dummy_request(self,
             num_scheduled_tokens[req_id] = scheduled_tokens
 
     def _generate_seq_lengths(self, num_samples, num_blocks, block_size):
-        # ensure the actual number of blocks is less than the KV cache blocks
-        num_blocks = min(self.kv_cache_config.num_blocks, num_blocks)
+        # For contiguous PA, cap num_blocks to physical KV cache size because
+        # block_id = num_blocks - 1 must be a valid physical block.
+        # For non-contiguous PA, block_id=0 is always valid and runtime can
+        # exceed physical blocks via prefix-sharing, so don't cap.
+        if self.use_contiguous_pa:
+            num_blocks = min(self.kv_cache_config.num_blocks, num_blocks)
 
         assert num_samples <= num_blocks
         blocks = [num_blocks // num_samples] * num_samples
@@ -5306,7 +5310,9 @@ def _prepare_dummy_scenario(self, prompt_cfg, decode_cfg):
             decode_block_size = self.attn_block_size
             if self.use_contiguous_pa:
                 decode_seq_lengths = [decode_block_size] * decode_bs
-                block_id = decode_num_blocks - 1
+                # Cap block_id at physical pool — contiguous PA uses
+                # block_id as the allocation base which must be valid.
+                block_id = min(decode_num_blocks - 1, self.kv_cache_config.num_blocks - 1)
             else:
                 decode_seq_lengths = self._generate_seq_lengths(decode_bs, decode_num_blocks, decode_block_size)
                 block_id = 0
@@ -5575,7 +5581,19 @@ def warmup_model(self) -> None:
 
         self.bucketing_manager.generate_prompt_buckets()
         if not self.is_pooling_model:
-            self.bucketing_manager.generate_decode_buckets()
+            # For hybrid models where HPU kernel block size (attn_block_size)
+            # differs from KV-cache block_size, decode buckets must be
+            # generated in attn_block_size units because the runtime decode
+            # path (_create_decode_input_data) computes num_blocks using
+            # attn_block_size.  Scope the mutation to avoid affecting prompt
+            # fallback paths that still need the original block_size.
+            saved_block_size = self.bucketing_manager.block_size
+            if self.attn_block_size != self.block_size:
+                self.bucketing_manager.block_size = self.attn_block_size
+            try:
+                self.bucketing_manager.generate_decode_buckets()
+            finally:
+                self.bucketing_manager.block_size = saved_block_size
         else:
             self.bucketing_manager.decode_buckets = []
 

From bf8dfdf3160255c262862799e0434603ec6e845c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Olejniczak?= <pawelx.olejniczak@intel.com>
Date: Wed, 27 May 2026 11:04:34 +0200
Subject: [PATCH 26/29] 
 [FIX_FOR_VLLM_CUSTOM=b06813e87207e15b133e903d641e03f237d85b17] Fix
 gdn_linear_attn import path after upstream mamba refactor (#1496)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Root cause

Upstream vLLM PR vllm-project/vllm#41126 (commit 7e1b45a092) moved
`vllm.model_executor.layers.mamba.gdn_linear_attn.GatedDeltaNetAttention`
into a `gdn/` subpackage and renamed it to
`vllm.model_executor.layers.mamba.gdn.qwen_gdn_linear_attn.QwenGatedDeltaNetAttention`.

This broke `vllm_gaudi/models/qwen3_5.py`, which still imported the class
from the old path.

## Fix

Updated 6 lines in `vllm_gaudi/models/qwen3_5.py`:
- Changed import path from `gdn_linear_attn` to
`gdn.qwen_gdn_linear_attn`
- Updated class reference from `GatedDeltaNetAttention` to
`QwenGatedDeltaNetAttention`

## Upstream compatibility

Pinned to vLLM SHA: `b06813e87207e15b133e903d641e03f237d85b17`

Signed-off-by: Paweł Olejniczak <pawelx.olejniczak@intel.com>
---
 vllm_gaudi/models/qwen3_5.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vllm_gaudi/models/qwen3_5.py b/vllm_gaudi/models/qwen3_5.py
index d8b3dabf7f..c37b39fa92 100644
--- a/vllm_gaudi/models/qwen3_5.py
+++ b/vllm_gaudi/models/qwen3_5.py
@@ -1,5 +1,5 @@
 import torch
-from vllm.model_executor.layers.mamba.gdn_linear_attn import GatedDeltaNetAttention
+from vllm.model_executor.layers.mamba.gdn.qwen_gdn_linear_attn import QwenGatedDeltaNetAttention
 from vllm.forward_context import get_forward_context
 
 from vllm_gaudi.ops.causal_conv1d_pytorch import (
@@ -26,7 +26,7 @@ def _save_ssm_state(core_attn_out, final_state, ssm_state, state_indices):
     return core_attn_out
 
 
-class HPUGatedDeltaNetAttention(GatedDeltaNetAttention):
+class HPUGatedDeltaNetAttention(QwenGatedDeltaNetAttention):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -295,10 +295,10 @@ def forward(
 
 # Replace the class in the upstream modules so that both Qwen3-Next and
 # Qwen3.5 model definitions instantiate HPUGatedDeltaNetAttention.
-import vllm.model_executor.layers.mamba.gdn_linear_attn as _gdn_module  # noqa: E402
+import vllm.model_executor.layers.mamba.gdn.qwen_gdn_linear_attn as _gdn_module  # noqa: E402
 import vllm.model_executor.models.qwen3_next as _qwen3_next_module  # noqa: E402
 import vllm.model_executor.models.qwen3_5 as _qwen3_5_module  # noqa: E402
 
-_gdn_module.GatedDeltaNetAttention = HPUGatedDeltaNetAttention
-_qwen3_next_module.GatedDeltaNetAttention = HPUGatedDeltaNetAttention
-_qwen3_5_module.GatedDeltaNetAttention = HPUGatedDeltaNetAttention
+_gdn_module.QwenGatedDeltaNetAttention = HPUGatedDeltaNetAttention
+_qwen3_next_module.QwenGatedDeltaNetAttention = HPUGatedDeltaNetAttention
+_qwen3_5_module.QwenGatedDeltaNetAttention = HPUGatedDeltaNetAttention

From d8af506f53190442d84ecf04958f13a639c6ca75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Smyrek?= <radoslawx.smyrek@intel.com>
Date: Wed, 27 May 2026 15:52:33 +0200
Subject: [PATCH 27/29] =?UTF-8?q?Revert=20"Skip=20materialised=20causal=20?=
 =?UTF-8?q?attn=5Fbias=20on=20FSDPA=20for=20non-GDN=20hybri=E2=80=A6=20(#1?=
 =?UTF-8?q?482)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Full title of the reverted change: "Skip materialised causal attn_bias on
FSDPA for non-GDN hybrid models (#1413)"

This reverts commit 808dbfaffad15ad0acbd0c94f4cb081a68b1f68b.

Signed-off-by: Radoslaw Smyrek <radoslawx.smyrek@intel.com>
Co-authored-by: Iryna Boiko <iryna.boiko@intel.com>
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 46 ------------------------
 1 file changed, 46 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 893bd795d7..8da476c5af 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -1137,12 +1137,6 @@ def __init__(
                                              and (getattr(hf_text_config, "mamba_chunk_size", None) is not None
                                                   or getattr(hf_text_config, "chunk_size", None) is not None))
 
-        # Non-GDN hybrid: at least one mamba/linear-style layer and zero GDN
-        # (gdn_attention / linear_attention) layers. Used to gate optimizations
-        # that have only been validated on non-GDN hybrid topologies
-        # (e.g. Granite-4 Mamba2+Transformer).
-        self.is_non_gdn_hybrid = (self.num_mamba_like_layers > 0 and self.num_gdn == 0)
-
         # For HPU GDN, use configured chunk size when explicitly provided;
         # otherwise default to 128 to match bucket alignment.
         if self.num_mamba_like_layers > 0:
@@ -3897,21 +3891,6 @@ def set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype):
                 or not attn_metadata.is_prompt):
             return attn_metadata
 
-        # Extended FSDPA-native causal short-circuit for non-GDN hybrid models
-        # (e.g. Granite-4 Mamba2+Transformer). FusedSDPA can encode a purely
-        # causal mask natively via is_causal=True + valid_seq_lengths, including
-        # chunked prefill where block_list is non-None. Skipping the
-        # materialised [bs, 1, q_len, total_kv_len] attn_bias avoids a large
-        # add_bf16 on the attention critical path (significant at long
-        # context). Conservative scope: only non-GDN hybrid models; GDN /
-        # pure-transformer / other topologies keep the materialised bias path
-        # until validated.
-        if (self.prefill_use_fusedsdpa and self.is_causal and not self.is_pooling_model
-                and not getattr(self, 'sliding_window', None)
-                and not getattr(self, 'model_has_chunked_attention', False)
-                and getattr(self, 'alibi_slopes', None) is None and self.is_non_gdn_hybrid):
-            return attn_metadata
-
         if attn_metadata.attn_bias is not None:
             return attn_metadata
 
@@ -6796,17 +6775,6 @@ def __init__(
         self.interleaved_sliding_window = (is_interleaved(vllm_config.model_config.hf_text_config)
                                            and self.sliding_window)
 
-        # Detect non-GDN hybrid topologies (e.g. Granite-4 Mamba2+Transformer).
-        # Used to gate the FSDPA-native causal short-circuit in _set_attn_bias.
-        # Mirrors the runner's num_mamba_like_layers / num_gdn computation
-        # (HPUModelRunner.__init__) so the same set of models is targeted.
-        get_num_layers = vllm_config.model_config.get_num_layers_by_block_type
-        parallel_config = vllm_config.parallel_config
-        num_mamba_like = sum(
-            get_num_layers(parallel_config, bt) for bt in ("mamba", "gdn_attention", "linear_attention"))
-        num_gdn = sum(get_num_layers(parallel_config, bt) for bt in ("gdn_attention", "linear_attention"))
-        self.is_non_gdn_hybrid = (num_mamba_like > 0 and num_gdn == 0)
-
         if self.interleaved_sliding_window:
             self.use_window_sdpa = with_default(get_config().PT_HPU_SDPA_QKV_SLICE_MODE_FWD, False)
             #os.getenv("PT_HPU_SDPA_QKV_SLICE_MODE_FWD", "false").strip().lower() in ("1", "true")
@@ -6839,20 +6807,6 @@ def _set_attn_bias(self, attn_metadata: HPUAttentionMetadataV1, batch_size: int,
                 or not attn_metadata.is_prompt):
             return attn_metadata
 
-        # Extended FSDPA-native causal short-circuit for non-GDN hybrid models
-        # (e.g. Granite-4 Mamba2+Transformer). FusedSDPA handles a purely
-        # causal mask natively (is_causal=True + valid_seq_lengths). Skip
-        # materialising a [bs, 1, q_len, total_kv_len] attn_bias even during
-        # chunked prefill (block_list is non-None) for these topologies; this
-        # removes a sizable add_bf16 from the attention critical path during
-        # long-context chunked prefill. interleaved_sliding_window and
-        # chunked-attention bias paths (window_attn_bias / chunked_attn_bias)
-        # are populated later in process_metadata and used by hpu_attn
-        # instead. Conservative scope: only non-GDN hybrid models; all other
-        # topologies retain the original behaviour.
-        if (self.prefill_use_fusedsdpa and not self.interleaved_sliding_window and self.is_non_gdn_hybrid):
-            return attn_metadata
-
         if attn_metadata.attn_bias is not None:
             return attn_metadata
 

From 78b3b3de6dd7f3fb831660fbb7959259a2c0d527 Mon Sep 17 00:00:00 2001
From: Soila Kavulya <soila.p.kavulya@intel.com>
Date: Fri, 15 May 2026 17:46:28 -0700
Subject: [PATCH 28/29] Fix accuracy issue in minimax_m2 with TP > 1

Signed-off-by: Soila Kavulya <soila.p.kavulya@intel.com>
---
 vllm_gaudi/models/minimax_m2.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vllm_gaudi/models/minimax_m2.py b/vllm_gaudi/models/minimax_m2.py
index a6d720e00e..2e642e9e36 100644
--- a/vllm_gaudi/models/minimax_m2.py
+++ b/vllm_gaudi/models/minimax_m2.py
@@ -107,9 +107,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # router_logits: (bs * seq_len, n_experts)
         router_logits, _ = self.gate(hidden_states.to(torch.float32))
         final_hidden_states = self.experts(hidden_states=hidden_states, router_logits=router_logits)
-        final_hidden_states = final_hidden_states
-        if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states.view(bs, seq_len, hidden_dim)
 

From 7f2a309fc6a8fbd72ab57935e0b0958d08b86af0 Mon Sep 17 00:00:00 2001
From: Soila Kavulya <soila.p.kavulya@intel.com>
Date: Wed, 27 May 2026 09:48:37 -0700
Subject: [PATCH 29/29] Fix lint format issues

Signed-off-by: Soila Kavulya <soila.p.kavulya@intel.com>
---
 vllm_gaudi/models/minimax_m2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_gaudi/models/minimax_m2.py b/vllm_gaudi/models/minimax_m2.py
index 2e642e9e36..459f480879 100644
--- a/vllm_gaudi/models/minimax_m2.py
+++ b/vllm_gaudi/models/minimax_m2.py
@@ -33,7 +33,7 @@
 from vllm.model_executor.layers.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce)
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP
 from vllm.model_executor.layers.layernorm import RMSNorm