Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -111,16 +111,16 @@ def to_keys(int_ids: list[int]) -> list[OffloadKey]:
return [make_offload_key(str(i).encode(), 0) for i in int_ids]

def take_events() -> Iterable[OffloadingEvent]:
yield OffloadingEvent(keys=to_keys([1, 2, 3]), block_size=16, medium="A", removed=False)
yield OffloadingEvent(keys=to_keys([4, 5, 6]), block_size=32, medium="B", removed=True)
yield OffloadingEvent(keys=to_keys([1, 2, 3]), medium="A", removed=False)
yield OffloadingEvent(keys=to_keys([4, 5, 6]), medium="B", removed=True)

runner.manager.take_events.side_effect = take_events
events = list(runner.scheduler_connector.take_events())
assert len(events) == 2
event = events[0]
assert isinstance(event, BlockStored)
assert event.block_hashes == [str(i).encode() for i in [1, 2, 3]]
assert event.block_size == 16
assert event.block_size == 0
assert event.medium == "A"
assert event.token_ids == []
assert event.parent_block_hash is None
Expand Down
37 changes: 33 additions & 4 deletions vllm_gaudi/ops/hpu_fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
EMPTY_EPLB_STATE, )
from vllm.model_executor.layers.fused_moe.router.routing_simulator_router import (
RoutingSimulatorRouter, )
from vllm.model_executor.layers.fused_moe.router.zero_expert_router import (
ZeroExpertRouter, )
from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOp)
from vllm_gaudi.extension.runtime import get_config
from vllm.model_executor.utils import set_weight_attrs
Expand Down Expand Up @@ -335,17 +337,21 @@ def create_fused_moe_router(
# eplb parameters
enable_eplb: bool = False,
eplb_state: EplbLayerState = EMPTY_EPLB_STATE,
# zero expert parameters
zero_expert_type: str | None = None,
num_logical_experts: int | None = None,
) -> FusedMoERouter:
"""
Factory function to create the appropriate FusedMoERouter subclass based on
the provided parameters.

The selection logic follows this priority order:
1. RoutingSimulatorRouter - if VLLM_MOE_ROUTING_SIMULATION_STRATEGY env var is set
2. GroupedTopKRouter - if use_grouped_topk is True
3. CustomRoutingRouter - if custom_routing_function is not None
4. FusedTopKBiasRouter - if e_score_correction_bias is not None
5. FusedTopKRouter - default fallback
2. ZeroExpertRouter - if zero_expert_type is not None
3. GroupedTopKRouter - if use_grouped_topk is True
4. CustomRoutingRouter - if custom_routing_function is not None
5. FusedTopKBiasRouter - if e_score_correction_bias is not None
6. FusedTopKRouter - default fallback

Common arguments:
top_k: Number of experts to select per token
Expand All @@ -371,6 +377,12 @@ def create_fused_moe_router(
enable_eplb: Whether EPLB is enabled
eplb_state: EPLB (Expert Parallelism Load Balancing) state

Zero expert arguments:
zero_expert_type: Type of zero expert (e.g. identity). If not None,
creates a ZeroExpertRouter; in that case both num_logical_experts and
e_score_correction_bias must also be provided.
num_logical_experts: Number of real (non-zero) experts. Required when
zero_expert_type is not None.
Comment thread
pawel-olejniczak marked this conversation as resolved.

Returns:
An instance of the appropriate FusedMoERouter subclass
"""
Expand All @@ -385,6 +397,23 @@ def create_fused_moe_router(
indices_type_getter=indices_type_getter,
)

if zero_expert_type is not None:
assert num_logical_experts is not None, ("num_logical_experts is required when zero_expert_type is set")
assert e_score_correction_bias is not None, ("e_score_correction_bias is required when zero_expert_type is set")
Comment thread
pawel-olejniczak marked this conversation as resolved.
return ZeroExpertRouter(
top_k=top_k,
global_num_experts=global_num_experts,
eplb_state=eplb_state,
e_score_correction_bias=e_score_correction_bias,
num_logical_experts=num_logical_experts,
zero_expert_type=zero_expert_type,
scoring_func=scoring_func,
renormalize=renormalize,
routed_scaling_factor=routed_scaling_factor,
enable_eplb=enable_eplb,
indices_type_getter=indices_type_getter,
)

if use_grouped_topk:
assert custom_routing_function is None
if num_expert_group is None or topk_group is None:
Expand Down
9 changes: 6 additions & 3 deletions vllm_gaudi/v1/worker/hpu_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from vllm.v1.worker.utils import bind_kv_cache
from vllm_gaudi.utils import is_fake_hpu
from vllm_gaudi.v1.worker.hpu_model_runner import HPUModelRunner
from vllm.v1.worker.worker_base import WorkerBase
from vllm.v1.worker.worker_base import CompilationTimes, WorkerBase

from vllm_gaudi.extension.logger import logger as init_logger

Expand Down Expand Up @@ -521,15 +521,18 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
logger.info(msg)
self.compile_or_warm_up_model()

def compile_or_warm_up_model(self) -> float:
def compile_or_warm_up_model(self) -> CompilationTimes:
# Don't run the warmup if the model is already warmed up
if not getattr(self.model_runner, 'graphed_buckets', None):
self.model_runner.warmup_model() # type: ignore[union-attr]
# Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling.
set_random_seed(self.model_config.seed)

return self.vllm_config.compilation_config.compilation_time
return CompilationTimes(
language_model=self.vllm_config.compilation_config.compilation_time,
encoder=self.vllm_config.compilation_config.encoder_compilation_time,
)

def sample_tokens(self, grammar_output: "GrammarOutput|None") -> ModelRunnerOutput | AsyncModelRunnerOutput:
return self.model_runner.sample_tokens(grammar_output) # type: ignore[union-attr]
Expand Down
Loading