diff --git a/tests/worker/test_cudagraph_wrapper_perf.py b/tests/worker/test_cudagraph_wrapper_perf.py deleted file mode 100644 index d73fe46c903..00000000000 --- a/tests/worker/test_cudagraph_wrapper_perf.py +++ /dev/null @@ -1,185 +0,0 @@ -"""Tests for CUDAGraphWrapper.__getattr__ performance optimization. - -This module tests that the patched CUDAGraphWrapper avoids expensive __repr__ -calls when hasattr() is used for non-existent attributes. The original vLLM -implementation includes {self.runnable} in the AttributeError message, which -triggers model tree traversal and can take ~6ms on large models. -""" - -import time - -import pytest -import torch -import torch.nn as nn - -from vllm_omni.worker.gpu_model_runner import CUDAGraphWrapper - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -class SlowReprModel(nn.Module): - """A mock model with artificially slow __repr__ to detect unwanted calls.""" - - def __init__(self, repr_delay_ms: float = 10.0): - super().__init__() - self.linear = nn.Linear(16, 16) - self.repr_delay_ms = repr_delay_ms - self.repr_call_count = 0 - - def forward(self, x): - return self.linear(x) - - def __repr__(self): - self.repr_call_count += 1 - # Simulate expensive repr by sleeping - time.sleep(self.repr_delay_ms / 1000.0) - return f"SlowReprModel(delay={self.repr_delay_ms}ms)" - - -class MockCUDAGraphWrapper: - """A minimal mock that mimics CUDAGraphWrapper structure for CPU testing.""" - - def __init__(self, runnable): - # Store in __dict__ directly to avoid triggering __getattr__ - object.__setattr__(self, "runnable", runnable) - - def __getattr__(self, key: str): - # This is the optimized implementation we're testing - runnable = object.__getattribute__(self, "runnable") - if hasattr(runnable, key): - return getattr(runnable, key) - # Key optimization: DO NOT include {self.runnable} in error message - # as it triggers expensive __repr__ on large models - raise AttributeError(f"Attribute {key} not exists in the runnable of cudagraph wrapper") - - -def test_hasattr_nonexistent_does_not_trigger_repr(): - """Verify that hasattr for non-existent attributes doesn't call __repr__.""" - model = SlowReprModel(repr_delay_ms=100.0) # Very slow repr - wrapper = MockCUDAGraphWrapper(model) - - # Reset counter - model.repr_call_count = 0 - - # Call hasattr for non-existent attribute multiple times - for _ in range(10): - result = hasattr(wrapper, "nonexistent_attribute_xyz") - assert result is False - - # __repr__ should never have been called - assert model.repr_call_count == 0, ( - f"__repr__ was called {model.repr_call_count} times when checking " - "for non-existent attributes. This indicates the AttributeError " - "message contains {self.runnable} which triggers expensive repr." - ) - - -def test_hasattr_nonexistent_is_fast(): - """Verify that hasattr for non-existent attributes is fast (<1ms per call).""" - model = SlowReprModel(repr_delay_ms=100.0) - wrapper = MockCUDAGraphWrapper(model) - - num_iterations = 100 - start = time.perf_counter() - for _ in range(num_iterations): - hasattr(wrapper, "nonexistent_attribute_xyz") - elapsed_ms = (time.perf_counter() - start) * 1000 - - avg_ms = elapsed_ms / num_iterations - # If __repr__ were being called, each would take ~100ms - # We expect <1ms per call with the fix - assert avg_ms < 1.0, ( - f"hasattr for non-existent attribute took {avg_ms:.2f}ms on average. " - "Expected <1ms. This suggests __repr__ is being triggered." - ) - - -def test_hasattr_existing_attribute_works(): - """Verify that hasattr for existing attributes returns True and works correctly.""" - model = SlowReprModel() - wrapper = MockCUDAGraphWrapper(model) - - # 'forward' exists on nn.Module - assert hasattr(wrapper, "forward") is True - - # 'linear' exists on our model - assert hasattr(wrapper, "linear") is True - - # Can actually access the attribute - linear = wrapper.linear - assert isinstance(linear, nn.Linear) - - -def test_getattr_existing_attribute_returns_value(): - """Verify that getattr for existing attributes returns the correct value.""" - model = SlowReprModel() - wrapper = MockCUDAGraphWrapper(model) - - # Access forward method - forward_method = wrapper.forward - assert callable(forward_method) - - # Access linear layer - linear = wrapper.linear - assert isinstance(linear, nn.Linear) - assert linear.in_features == 16 - assert linear.out_features == 16 - - -def test_getattr_nonexistent_raises_attribute_error(): - """Verify that getattr for non-existent attributes raises AttributeError.""" - model = SlowReprModel() - wrapper = MockCUDAGraphWrapper(model) - - with pytest.raises(AttributeError) as exc_info: - _ = wrapper.nonexistent_attribute - - # Verify error message format (should NOT contain model repr) - error_msg = str(exc_info.value) - assert "nonexistent_attribute" in error_msg - assert "cudagraph wrapper" in error_msg - # Should NOT contain the slow repr output - assert "SlowReprModel(delay=" not in error_msg - - -def test_attribute_error_message_does_not_contain_runnable_repr(): - """Explicitly verify the error message doesn't trigger runnable repr.""" - model = SlowReprModel(repr_delay_ms=100.0) - wrapper = MockCUDAGraphWrapper(model) - model.repr_call_count = 0 - - try: - _ = wrapper.nonexistent_attr - except AttributeError: - pass - - # __repr__ should not have been called during error construction - assert model.repr_call_count == 0, ( - "AttributeError message construction triggered __repr__. The error message should not include {self.runnable}." - ) - - -@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") -def test_real_cudagraph_wrapper_hasattr_performance(): - """Test the actual CUDAGraphWrapper from vllm_omni (requires CUDA).""" - from vllm.config import CUDAGraphMode - - model = SlowReprModel(repr_delay_ms=50.0).cuda() - model.repr_call_count = 0 - - # Create actual CUDAGraphWrapper - try: - wrapper = CUDAGraphWrapper(model, runtime_mode=CUDAGraphMode.NONE) - except Exception: - pytest.skip("Could not create CUDAGraphWrapper") - - # Test hasattr performance - num_iterations = 50 - start = time.perf_counter() - for _ in range(num_iterations): - hasattr(wrapper, "nonexistent_xyz") - elapsed_ms = (time.perf_counter() - start) * 1000 - - avg_ms = elapsed_ms / num_iterations - assert avg_ms < 1.0, f"Real CUDAGraphWrapper hasattr took {avg_ms:.2f}ms avg. Expected <1ms with the optimization." - assert model.repr_call_count == 0, f"__repr__ called {model.repr_call_count} times" diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index a7abaf7b62a..35e15984355 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -1,9 +1,8 @@ -import sys from typing import TYPE_CHECKING, Any, cast import numpy as np import torch -from vllm.compilation.cuda_graph import CUDAGraphWrapper as _OriginalCUDAGraphWrapper +from vllm.compilation.cuda_graph import CUDAGraphWrapper from vllm.config import CUDAGraphMode from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import set_forward_context @@ -38,22 +37,6 @@ logger = init_logger(__name__) -class CUDAGraphWrapper(_OriginalCUDAGraphWrapper): - def __getattr__(self, key: str) -> Any: - # allow accessing the attributes of the runnable. - if hasattr(self.runnable, key): - return getattr(self.runnable, key) - raise AttributeError(f"Attribute {key} not exists in the runnable of cudagraph wrapper") - - -# Patch vLLM's CUDAGraphWrapper with our optimized version -for _module_name, _module in sys.modules.items(): - if "vllm" not in _module_name: - continue - if hasattr(_module, "CUDAGraphWrapper") and _module.CUDAGraphWrapper is _OriginalCUDAGraphWrapper: - _module.CUDAGraphWrapper = CUDAGraphWrapper - - class OmniGPUModelRunner(GPUModelRunner): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs)