From 1ccccf57b34788e1af549aeceb232d5307dabdc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Olejniczak?= Date: Wed, 15 Apr 2026 14:16:09 +0300 Subject: [PATCH 1/3] Return CompilationTimes from HPU worker compile_or_warm_up_model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Paweł Olejniczak --- vllm_gaudi/v1/worker/hpu_worker.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm_gaudi/v1/worker/hpu_worker.py b/vllm_gaudi/v1/worker/hpu_worker.py index e1305f7e78..634e61950f 100644 --- a/vllm_gaudi/v1/worker/hpu_worker.py +++ b/vllm_gaudi/v1/worker/hpu_worker.py @@ -32,7 +32,7 @@ from vllm.v1.worker.utils import bind_kv_cache from vllm_gaudi.utils import is_fake_hpu from vllm_gaudi.v1.worker.hpu_model_runner import HPUModelRunner -from vllm.v1.worker.worker_base import WorkerBase +from vllm.v1.worker.worker_base import CompilationTimes, WorkerBase from vllm_gaudi.extension.logger import logger as init_logger @@ -521,7 +521,7 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: logger.info(msg) self.compile_or_warm_up_model() - def compile_or_warm_up_model(self) -> float: + def compile_or_warm_up_model(self) -> CompilationTimes: # Don't run the warmup if the model is already warmed up if not getattr(self.model_runner, 'graphed_buckets', None): self.model_runner.warmup_model() # type: ignore[union-attr] @@ -529,7 +529,10 @@ def compile_or_warm_up_model(self) -> float: # the model initialization and profiling. set_random_seed(self.model_config.seed) - return self.vllm_config.compilation_config.compilation_time + return CompilationTimes( + language_model=self.vllm_config.compilation_config.compilation_time, + encoder=self.vllm_config.compilation_config.encoder_compilation_time, + ) def sample_tokens(self, grammar_output: "GrammarOutput|None") -> ModelRunnerOutput | AsyncModelRunnerOutput: return self.model_runner.sample_tokens(grammar_output) # type: ignore[union-attr] From 3006cdaad4320ddf97bed3104fcea78781426950 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Olejniczak?= Date: Wed, 15 Apr 2026 14:55:38 +0300 Subject: [PATCH 2/3] Add zero_expert_type and num_logical_experts to HPU create_fused_moe_router MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Paweł Olejniczak --- vllm_gaudi/ops/hpu_fused_moe.py | 37 +++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py index 6fcd1394b3..0c713ce348 100755 --- a/vllm_gaudi/ops/hpu_fused_moe.py +++ b/vllm_gaudi/ops/hpu_fused_moe.py @@ -28,6 +28,8 @@ EMPTY_EPLB_STATE, ) from vllm.model_executor.layers.fused_moe.router.routing_simulator_router import ( RoutingSimulatorRouter, ) +from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( + ZeroExpertRouter, ) from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOp) from vllm_gaudi.extension.runtime import get_config from vllm.model_executor.utils import set_weight_attrs @@ -335,6 +337,9 @@ def create_fused_moe_router( # eplb parameters enable_eplb: bool = False, eplb_state: EplbLayerState = EMPTY_EPLB_STATE, + # zero expert parameters + zero_expert_type: str | None = None, + num_logical_experts: int | None = None, ) -> FusedMoERouter: """ Factory function to create the appropriate FusedMoERouter subclass based on @@ -342,10 +347,11 @@ def create_fused_moe_router( The selection logic follows this priority order: 1. RoutingSimulatorRouter - if VLLM_MOE_ROUTING_SIMULATION_STRATEGY env var is set - 2. GroupedTopKRouter - if use_grouped_topk is True - 3. CustomRoutingRouter - if custom_routing_function is not None - 4. FusedTopKBiasRouter - if e_score_correction_bias is not None - 5. FusedTopKRouter - default fallback + 2. ZeroExpertRouter - if zero_expert_type is not None + 3. GroupedTopKRouter - if use_grouped_topk is True + 4. CustomRoutingRouter - if custom_routing_function is not None + 5. FusedTopKBiasRouter - if e_score_correction_bias is not None + 6. FusedTopKRouter - default fallback Common arguments: top_k: Number of experts to select per token @@ -371,6 +377,12 @@ def create_fused_moe_router( enable_eplb: Whether EPLB is enabled eplb_state: EPLB (Expert Parallelism Load Balancing) state + Zero expert arguments: + zero_expert_type: Type of zero expert (e.g. identity). If not None, + creates a ZeroExpertRouter. + num_logical_experts: Number of real (non-zero) experts. Required when + zero_expert_type is not None. + Returns: An instance of the appropriate FusedMoERouter subclass """ @@ -385,6 +397,23 @@ def create_fused_moe_router( indices_type_getter=indices_type_getter, ) + if zero_expert_type is not None: + assert num_logical_experts is not None, ("num_logical_experts is required when zero_expert_type is set") + assert e_score_correction_bias is not None, ("e_score_correction_bias is required when zero_expert_type is set") + return ZeroExpertRouter( + top_k=top_k, + global_num_experts=global_num_experts, + eplb_state=eplb_state, + e_score_correction_bias=e_score_correction_bias, + num_logical_experts=num_logical_experts, + zero_expert_type=zero_expert_type, + scoring_func=scoring_func, + renormalize=renormalize, + routed_scaling_factor=routed_scaling_factor, + enable_eplb=enable_eplb, + indices_type_getter=indices_type_getter, + ) + if use_grouped_topk: assert custom_routing_function is None if num_expert_group is None or topk_group is None: From ae652eb326c6a066c8a9eeba3dacd00de77aa155 Mon Sep 17 00:00:00 2001 From: Pawel Olejniczak Date: Thu, 16 Apr 2026 01:29:12 +0300 Subject: [PATCH 3/3] Remove block_size from OffloadingEvent in test_scheduler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Paweł Olejniczak --- .../kv_offload/offloading_connector/test_scheduler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit_tests/kv_offload/offloading_connector/test_scheduler.py b/tests/unit_tests/kv_offload/offloading_connector/test_scheduler.py index 3749026091..cd6cb8fad6 100644 --- a/tests/unit_tests/kv_offload/offloading_connector/test_scheduler.py +++ b/tests/unit_tests/kv_offload/offloading_connector/test_scheduler.py @@ -111,8 +111,8 @@ def to_keys(int_ids: list[int]) -> list[OffloadKey]: return [make_offload_key(str(i).encode(), 0) for i in int_ids] def take_events() -> Iterable[OffloadingEvent]: - yield OffloadingEvent(keys=to_keys([1, 2, 3]), block_size=16, medium="A", removed=False) - yield OffloadingEvent(keys=to_keys([4, 5, 6]), block_size=32, medium="B", removed=True) + yield OffloadingEvent(keys=to_keys([1, 2, 3]), medium="A", removed=False) + yield OffloadingEvent(keys=to_keys([4, 5, 6]), medium="B", removed=True) runner.manager.take_events.side_effect = take_events events = list(runner.scheduler_connector.take_events()) @@ -120,7 +120,7 @@ def take_events() -> Iterable[OffloadingEvent]: event = events[0] assert isinstance(event, BlockStored) assert event.block_hashes == [str(i).encode() for i in [1, 2, 3]] - assert event.block_size == 16 + assert event.block_size == 0 assert event.medium == "A" assert event.token_ids == [] assert event.parent_block_hash is None