Merged
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,6 +3,6 @@ ray>=2.48.0
pandas>=2.2.3
numba>=0.58.0
numpy>=1.26.0
transformers>= 4.56.0, <5
transformers >= 4.56.0, != 5.0.*, != 5.1.*, != 5.2.*, != 5.3.*, != 5.4.*, != 5.5.0, != 5.6.*
kaldi-native-fbank >= 1.18.7
tblib==3.1.0
1 change: 0 additions & 1 deletion tests/models/language/generation/test_common.py
@@ -26,7 +26,6 @@ def launch_lm_eval(eval_config):
'async_scheduling': async_scheduling,
'enforce_eager': enforce_eager,
'enable_prefix_caching': enable_apc,
'add_bos_token': True,
'dtype': dtype,
'max_model_len': max_model_len,
'max_num_seqs': max_num_seqs,
@@ -91,20 +91,19 @@ def test_offloading_connector(request_runner, async_scheduling: bool):
runner.new_request(token_ids=[1] * offloaded_block_size)
runner.manager.prepare_store.side_effect = (lambda block_hashes, req_context: generate_store_output([]))
runner.run(decoded_tokens=[EOS_TOKEN_ID])
runner.manager.lookup.assert_called()
assert len(list(runner.manager.lookup.call_args.args[0])) == 1
runner.manager.lookup.assert_called_once()
Copilot AI Apr 21, 2026

The assertion was weakened from validating the lookup input (previously checking the iterable length) to only checking lookup() was called once. To keep coverage of the signature change, consider asserting that lookup was called with an OffloadKey (and, if applicable, that req_context is passed via args/kwargs) so the test will fail if the old iterable-based call is accidentally reintroduced.

Suggested change
runner.manager.lookup.assert_called_once()
runner.manager.lookup.assert_called_once()
lookup_args, lookup_kwargs = runner.manager.lookup.call_args
assert lookup_args
assert isinstance(lookup_args[0], OffloadKey)
if "req_context" in lookup_kwargs:
    assert lookup_kwargs["req_context"] is not None
else:
    assert len(lookup_args) >= 2
    assert lookup_args[1] is not None


# single block lookup with a hit
runner.scheduler.reset_prefix_cache()
runner.new_request(token_ids=[0] * offloaded_block_size)
runner.manager.prepare_store.side_effect = (lambda block_hashes, req_context: generate_store_output([]))
runner.manager.lookup.return_value = 1
runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
runner.run(decoded_tokens=[EOS_TOKEN_ID], expected_loaded_gpu_block_indexes=(0, 1, 2))
Comment on lines 97 to 101
Copilot AI Apr 21, 2026


These tests now stub the scheduler by overwriting _maximal_prefix_lookup directly. Because this is a private implementation detail, it makes the tests brittle to upstream refactors and bypasses coverage of the new OffloadingManager.lookup(key: OffloadKey) -> bool behavior. Consider mocking manager.lookup with a side_effect that returns True/False based on the provided OffloadKey (or providing a small helper/fake manager) so the test exercises the real prefix-lookup logic.
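A minimal sketch of that approach, reusing the mocked manager the runner already exposes (the fake_lookup helper and stored_keys set are hypothetical names, and OffloadKey being hashable is an assumption, not something this PR guarantees):

# Hypothetical sketch: drive hit/miss through the mocked manager instead of
# overwriting the private _maximal_prefix_lookup, so the connector's real
# prefix-lookup logic is still exercised. Assumes OffloadKey is hashable.
stored_keys = set()

def fake_lookup(key, req_context):
    # Report a hit only for keys that an earlier store marked as offloaded.
    return key in stored_keys

runner.manager.lookup.side_effect = fake_lookup

A prepare_store side_effect could then add the keys it receives to stored_keys, so later requests with the same prefix observe hits without the test reaching into scheduler internals.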


# single block lookup with a hit in a middle block
runner.new_request(token_ids=[0] * offloaded_block_size * 2 + [1] * offloaded_block_size)
runner.manager.prepare_store.side_effect = (lambda block_hashes, req_context: generate_store_output([]))
runner.manager.lookup.return_value = 1
runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
runner.run(decoded_tokens=[EOS_TOKEN_ID], expected_loaded_gpu_block_indexes=(3, 4, 5))

# test take_events
@@ -182,7 +181,7 @@ def test_request_preemption(request_runner, async_scheduling: bool):

# request should now return from preemption
# re-load [0, ..., 8] from the CPU and store [9, 10, 11]
runner.manager.lookup.return_value = 3
runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 3
runner.manager.prepare_store.side_effect = (lambda block_hashes, req_context: generate_store_output(block_hashes))
runner.run(
decoded_tokens=[0] * gpu_block_size,
@@ -219,7 +218,7 @@ def test_concurrent_lookups_of_the_same_prefix(request_runner, async_scheduling:
# start a request to load the first block, but don't complete
runner.scheduler.reset_prefix_cache()
runner.new_request(token_ids=[0] * offloaded_block_size)
runner.manager.lookup.return_value = 1
runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
runner.run(
decoded_tokens=[],
complete_transfers=False,
@@ -231,7 +230,7 @@ def test_concurrent_lookups_of_the_same_prefix(request_runner, async_scheduling:

# start a new request to load the same first block
runner.new_request(token_ids=[0] * offloaded_block_size)
runner.manager.lookup.return_value = 1
runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
runner.run(
decoded_tokens=[],
complete_transfers=False,
@@ -275,7 +274,7 @@ def test_abort_loading_requests(request_runner, async_scheduling: bool):
# start a request to load the first block, but don't complete
runner.scheduler.reset_prefix_cache()
runner.new_request(token_ids=[0] * offloaded_block_size)
runner.manager.lookup.return_value = 1
runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
runner.run(
decoded_tokens=[],
complete_transfers=False,
10 changes: 5 additions & 5 deletions tests/unit_tests/kv_offload/offloading_connector/utils.py
@@ -210,14 +210,14 @@ def __init__(self,
self.scheduler_connector: OffloadingConnector = scheduler_connector

# extract mocked OffloadingManager of scheduler connector
connector_scheduler = scheduler_connector.connector_scheduler
assert connector_scheduler is not None
manager = connector_scheduler.manager
self.connector_scheduler = scheduler_connector.connector_scheduler
assert self.connector_scheduler is not None
manager = self.connector_scheduler.manager
assert isinstance(manager, MagicMock)
self.manager: MagicMock = manager

assert len(connector_scheduler.config.kv_group_configs) == 1
kv_group_config = connector_scheduler.config.kv_group_configs[0]
assert len(self.connector_scheduler.config.kv_group_configs) == 1
kv_group_config = self.connector_scheduler.config.kv_group_configs[0]
assert kv_group_config.gpu_block_size == gpu_block_size
assert kv_group_config.offloaded_block_size == offloaded_block_size

2 changes: 1 addition & 1 deletion tests/unit_tests/ops/test_hpu_compressed_tensors.py
@@ -390,7 +390,7 @@ def test_compressed_tensors_wna16_moe_method(default_vllm_config: None, dist_ini
mock_ctx = MagicMock(spec=["dp_metadata"])
mock_ctx.dp_metadata = None
with override_forward_context(mock_ctx):
out = oot_op.runner.forward_dispatch(oot_op, hidden_states, router_logits, hidden_states)
out = oot_op.runner._forward_dispatch(oot_op, hidden_states, router_logits, hidden_states)

# Check correctness
torch.testing.assert_close(ref_output, out, atol=1e-4, rtol=1e-4)
2 changes: 1 addition & 1 deletion tests/unit_tests/ops/test_hpu_fused_moe.py
@@ -41,7 +41,7 @@ def test_unquantized_fused_moe_method(default_vllm_config: None, dist_init):
mock_ctx = MagicMock(spec=["dp_metadata"])
mock_ctx.dp_metadata = None
with override_forward_context(mock_ctx):
out = oot_op.runner.forward_dispatch(oot_op, hidden_states, router_logits, hidden_states)
out = oot_op.runner._forward_dispatch(oot_op, hidden_states, router_logits, hidden_states)

# Check correctness
torch.testing.assert_close(ref_output, out, atol=1e-4, rtol=1e-4)
1 change: 0 additions & 1 deletion tests/unit_tests/ops/utils.py
@@ -53,7 +53,6 @@ def create_fused_moe(quant_config=None):
hidden_size=512,
intermediate_size=256,
params_dtype=torch.bfloat16,
reduce_results=True,
renormalize=True,
use_grouped_topk=False,
num_expert_group=None,
12 changes: 10 additions & 2 deletions vllm_gaudi/__init__.py
@@ -72,11 +72,15 @@ def register():
def register_utils():
"""Register utility functions for the HPU platform."""
import vllm_gaudi.utils # noqa: F401

vllm_gaudi.utils.patch_nixl_utils_for_hpu()
# Install the in-process EngineCore reconfigure hook only when
# multi-model mode is requested, to avoid heavy imports for all users.
import os

if os.environ.get("VLLM_HPU_MULTI_MODEL_CONFIG"):
from vllm_gaudi.v1.engine.core_patch import install_engine_core_patch

install_engine_core_patch()


@@ -86,7 +90,8 @@ def register_ops():
"""Register custom ops for the HPU platform."""
import vllm_gaudi.v1.sample.hpu_rejection_sampler # noqa: F401
import vllm_gaudi.distributed.kv_transfer.kv_connector.v1.hpu_nixl_connector # noqa: F401
if os.getenv('VLLM_HPU_HETERO_KV_LAYOUT', 'false').lower() == 'true':

if os.getenv("VLLM_HPU_HETERO_KV_LAYOUT", "false").lower() == "true":
import vllm_gaudi.distributed.kv_transfer.kv_connector.v1.hetero_hpu_nixl_connector # noqa: F401
import vllm_gaudi.v1.kv_offload.worker.cpu_hpu # noqa: F401
import vllm_gaudi.ops.hpu_attention # noqa: F401
@@ -107,16 +112,18 @@ def register_ops():

# Conditionally register HPURowParallelLinear only when chunking is enabled
from vllm_gaudi.ops.hpu_row_parallel_linear import register as register_row_parallel

register_row_parallel()

# Register HPU LoRA layers only when row parallel chunking is active
env_value = os.environ.get('VLLM_ROW_PARALLEL_CHUNKS', '1')
env_value = os.environ.get("VLLM_ROW_PARALLEL_CHUNKS", "1")
try:
row_parallel_chunks = int(env_value)
except ValueError:
row_parallel_chunks = 1
if row_parallel_chunks > 1:
from vllm_gaudi.lora.layers.hpu_row_parallel_linear import register_hpu_lora_layers

register_hpu_lora_layers()


@@ -125,4 +132,5 @@ def register_models():
import vllm_gaudi.models.interfaces # noqa: F401
import vllm_gaudi.models.bert # noqa: F401
from .models import register_model

register_model()