Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ numba>=0.58.0
numpy>=1.26.0
transformers>= 4.56.0, <5
kaldi-native-fbank >= 1.18.7
tblib==3.1.0
41 changes: 21 additions & 20 deletions tests/full_tests/ci_e2e_discoverable_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -347,28 +347,29 @@ run_gsm8k_qwen3_30b_test() {
# This test requires newer transformers and huggingface_hub versions for Qwen3.5 model support. Once vLLM supports the latest transformers release,
# we can remove the pip version pinning and restoration in this test and just rely on the environment having the right versions.
run_gsm8k_qwen35_9b_test() {
# Test case is temporarily disabled due to #37975
echo "➡️ Testing GSM8K on Qwen3.5-9B..."
_QWEN35_OLD_TRANSFORMERS_VER=$(pip show transformers | grep Version | awk '{print $2}')
_QWEN35_OLD_HF_HUB_VER=$(pip show huggingface_hub | grep Version | awk '{print $2}')
#_QWEN35_OLD_TRANSFORMERS_VER=$(pip show transformers | grep Version | awk '{print $2}')
#_QWEN35_OLD_HF_HUB_VER=$(pip show huggingface_hub | grep Version | awk '{print $2}')

# Ensure old package versions are restored on exit (even on failure)
_restore_qwen35_deps() {
if [ -n "$_QWEN35_OLD_TRANSFORMERS_VER" ] && [ -n "$_QWEN35_OLD_HF_HUB_VER" ]; then
echo "🔄 Restoring transformers==$_QWEN35_OLD_TRANSFORMERS_VER huggingface_hub==$_QWEN35_OLD_HF_HUB_VER ..."
pip install "transformers==$_QWEN35_OLD_TRANSFORMERS_VER" "huggingface_hub==$_QWEN35_OLD_HF_HUB_VER" --no-deps
else
echo "⚠️ Skipping restore: could not determine original package versions."
fi
trap - EXIT
}
trap _restore_qwen35_deps EXIT

pip install transformers==5.3.0 huggingface_hub==1.7.1 --no-deps

VLLM_SKIP_WARMUP=True ENABLE_APC=False VLLM_FUSED_BLOCK_SOFTMAX_ADJUSTMENT=False VLLM_GRAPH_RESERVED_MEM=0.2 \
pytest -v -s "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/test_common.py" --model_card_path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/qwen3.5-9b.yaml"

_restore_qwen35_deps
#_restore_qwen35_deps() {
# if [ -n "$_QWEN35_OLD_TRANSFORMERS_VER" ] && [ -n "$_QWEN35_OLD_HF_HUB_VER" ]; then
# echo "🔄 Restoring transformers==$_QWEN35_OLD_TRANSFORMERS_VER huggingface_hub==$_QWEN35_OLD_HF_HUB_VER ..."
# pip install "transformers==$_QWEN35_OLD_TRANSFORMERS_VER" "huggingface_hub==$_QWEN35_OLD_HF_HUB_VER" --no-deps
# else
# echo "⚠️ Skipping restore: could not determine original package versions."
# fi
# trap - EXIT
#}
#trap _restore_qwen35_deps EXIT

#pip install transformers==5.3.0 huggingface_hub==1.7.1 --no-deps

#VLLM_SKIP_WARMUP=True ENABLE_APC=False VLLM_FUSED_BLOCK_SOFTMAX_ADJUSTMENT=False VLLM_GRAPH_RESERVED_MEM=0.2 \
#pytest -v -s "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/test_common.py" --model_card_path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/qwen3.5-9b.yaml"

#_restore_qwen35_deps
echo "✅ Test with Qwen3.5-9B passed."
}

Expand Down Expand Up @@ -457,7 +458,7 @@ run_cpu_offloading_test() {
run_offloading_connector_test() {
echo "➡️ Testing OffloadingConnector."
VLLM_SKIP_WARMUP=True VLLM_USE_V1=1 \
pytest -v -s "${VLLM_GAUDI_PREFIX}/tests/unit_tests/kv_offload/test_offloading_connector.py"
pytest -v -s "${VLLM_GAUDI_PREFIX}/tests/unit_tests/kv_offload/offloading_connector"
echo "✅ Test OffloadingConnector passed."
}

Expand Down
4 changes: 1 addition & 3 deletions tests/models/language/generation/generation_mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.utils import encode_image_url, encode_video_url
from dataclasses import asdict
from typing import Union, Any
from PIL import Image
from dataclasses import dataclass
Expand Down Expand Up @@ -89,8 +88,7 @@ def run_model(model_name: str, inputs: Union[dict, list[dict]], modality: str, *

engine_args = EngineArgs(model=model_name, **extra_engine_args)

engine_args = asdict(engine_args)
llm = LLM(**engine_args)
llm = LLM.from_engine_args(engine_args)

outputs = llm.chat(
inputs,
Expand Down
4 changes: 1 addition & 3 deletions tests/models/language/generation/generation_mm_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from vllm.assets.image import ImageAsset, ImageAssetName
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
from dataclasses import asdict
from typing import Union, get_args
from PIL import Image
from dataclasses import dataclass
Expand Down Expand Up @@ -170,8 +169,7 @@ def run_model(model_name: str, inputs: Union[dict, list[dict]], modality: str, *

engine_args = EngineArgs(model=model_name, **extra_engine_args)

engine_args = asdict(engine_args)
llm = LLM(**engine_args)
llm = LLM.from_engine_args(engine_args)

outputs = llm.generate(
inputs,
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from tests.unit_tests.kv_offload.offloading_connector.utils import (
request_runner, )
Copy link

Copilot AI Mar 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These parenthesized imports include whitespace before the closing ) (e.g. request_runner, )), which typically triggers pycodestyle/ruff E202. Please reformat the import (or run the formatter).

Suggested change
request_runner, )
request_runner,
)

Copilot uses AI. Check for mistakes.

__all__ = ["request_runner"]
242 changes: 242 additions & 0 deletions tests/unit_tests/kv_offload/offloading_connector/test_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm.distributed.kv_transfer.kv_connector.v1.offloading_connector import (
OffloadingConnector,
OffloadingConnectorStats,
)


def test_build_kv_connector_stats_with_none():
    """Building connector stats from ``None`` yields an empty stats object."""
    built = OffloadingConnector.build_kv_connector_stats(data=None)

    # A concrete stats instance of the expected type must come back ...
    assert built is not None
    assert isinstance(built, OffloadingConnectorStats)
    # ... and it must not carry any recorded entries.
    assert len(built.data) == 0
    assert built.is_empty()


def test_build_kv_connector_stats_with_empty_dict():
    """Building connector stats from an empty dict yields empty stats."""
    empty_payload = {}
    built = OffloadingConnector.build_kv_connector_stats(data=empty_payload)

    # A concrete stats instance of the expected type must come back ...
    assert built is not None
    assert isinstance(built, OffloadingConnectorStats)
    # ... and it must not carry any recorded entries.
    assert len(built.data) == 0
    assert built.is_empty()


def test_build_kv_connector_stats_reconstructs_offload_stats():
    """Serialized per-direction records come back unchanged from
    ``build_kv_connector_stats``."""
    payload = {
        "CPU_to_GPU": [
            {"op_size": 16, "op_time": 1.0},
            {"op_size": 8, "op_time": 0.5},
        ],
        "GPU_to_CPU": [
            {"op_size": 1, "op_time": 0.1},
            {"op_size": 2, "op_time": 0.2},
        ],
    }

    # Expected values are written as independent literals (not aliases of
    # ``payload``) so the comparison does not depend on the input objects.
    expected_cpu_to_gpu = [
        {"op_size": 16, "op_time": 1.0},
        {"op_size": 8, "op_time": 0.5},
    ]
    expected_gpu_to_cpu = [
        {"op_size": 1, "op_time": 0.1},
        {"op_size": 2, "op_time": 0.2},
    ]

    rebuilt = OffloadingConnector.build_kv_connector_stats(data=payload)

    assert isinstance(rebuilt, OffloadingConnectorStats)
    assert rebuilt.data["CPU_to_GPU"] == expected_cpu_to_gpu
    assert rebuilt.data["GPU_to_CPU"] == expected_gpu_to_cpu


def test_aggregate_same_connector():
    """Aggregating two stats objects merges the second into the first."""
    first = OffloadingConnectorStats(
        data={
            "CPU_to_GPU": [
                {"op_size": 16, "op_time": 1.0},
                {"op_size": 8, "op_time": 0.5},
            ],
            "GPU_to_CPU": [
                {"op_size": 1, "op_time": 0.1},
                {"op_size": 2, "op_time": 0.2},
            ],
        })
    second = OffloadingConnectorStats(
        data={
            "CPU_to_GPU": [
                {"op_size": 3, "op_time": 0.2},
                {"op_size": 7, "op_time": 0.9},
            ],
            "GPU_to_CPU": [{"op_size": 16, "op_time": 2}],
        })

    # Per direction, the merged list is the first object's records followed
    # by the second object's records.
    expected_cpu_to_gpu = [
        {"op_size": 16, "op_time": 1.0},
        {"op_size": 8, "op_time": 0.5},
        {"op_size": 3, "op_time": 0.2},
        {"op_size": 7, "op_time": 0.9},
    ]
    expected_gpu_to_cpu = [
        {"op_size": 1, "op_time": 0.1},
        {"op_size": 2, "op_time": 0.2},
        {"op_size": 16, "op_time": 2},
    ]

    merged = first.aggregate(second)

    assert merged is first  # Should return self
    assert merged.data["CPU_to_GPU"] == expected_cpu_to_gpu
    assert merged.data["GPU_to_CPU"] == expected_gpu_to_cpu


def test_reduce():
    """Test that reduce() collapses per-operation records into totals.

    For each transfer direction the reduced dict is expected to expose a
    ``<direction>_total_bytes`` and a ``<direction>_total_time`` entry.

    Byte totals are sums of integers and are compared exactly. Time totals
    are sums of floats, so they are compared with ``math.isclose``: an exact
    ``==`` would only pass for one particular summation order and would make
    the test break on harmless implementation changes.
    """
    import math  # local import: only this test needs it

    stats = OffloadingConnectorStats(
        data={
            "CPU_to_GPU": [
                {"op_size": 16, "op_time": 1.0},
                {"op_size": 8, "op_time": 0.5},
                {"op_size": 3, "op_time": 0.2},
                {"op_size": 7, "op_time": 0.9},
            ],
            "GPU_to_CPU": [
                {"op_size": 1, "op_time": 0.1},
                {"op_size": 2, "op_time": 0.2},
                {"op_size": 16, "op_time": 2},
            ],
        })

    reduced = stats.reduce()

    assert isinstance(reduced, dict)
    # Check key presence first so a missing key fails as an assertion
    # rather than as a KeyError in the value checks below.
    assert "CPU_to_GPU_total_bytes" in reduced
    assert "CPU_to_GPU_total_time" in reduced
    assert "GPU_to_CPU_total_bytes" in reduced
    assert "GPU_to_CPU_total_time" in reduced

    # Integer totals: exact comparison is safe.
    assert reduced["CPU_to_GPU_total_bytes"] == 34
    assert reduced["GPU_to_CPU_total_bytes"] == 19

    # Float totals: tolerate rounding differences from summation order.
    assert math.isclose(reduced["CPU_to_GPU_total_time"], 2.6)
    assert math.isclose(reduced["GPU_to_CPU_total_time"], 2.3)


def test_reset():
    """After reset(), a populated stats object reports itself as empty."""
    initial_records = {
        "CPU_to_GPU": [
            {"op_size": 3, "op_time": 0.2},
            {"op_size": 7, "op_time": 0.9},
        ],
        "GPU_to_CPU": [{"op_size": 16, "op_time": 2}],
    }
    populated = OffloadingConnectorStats(data=initial_records)

    # Precondition: the object starts out holding data.
    assert not populated.is_empty()

    populated.reset()

    # After reset, stats should be empty
    assert populated.is_empty()
    assert len(populated.data) == 0
assert len(offload_connector_stats.data) == 0
Loading
Loading