Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_lora_resolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ async def mock_generate(*args, **kwargs):
mock_engine.add_lora.reset_mock()

mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

models = OpenAIServingModels(
Expand Down
14 changes: 7 additions & 7 deletions tests/entrypoints/openai/test_serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ async def test_serving_chat_returns_correct_model_name():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

serving_chat = _build_serving_chat(mock_engine)
Expand Down Expand Up @@ -459,7 +459,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

serving_chat = _build_serving_chat(mock_engine)
Expand Down Expand Up @@ -492,7 +492,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

# Initialize the serving chat
Expand Down Expand Up @@ -537,7 +537,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

# Initialize the serving chat
Expand Down Expand Up @@ -583,7 +583,7 @@ async def test_serving_chat_could_load_correct_generation_config():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

# Initialize the serving chat
Expand Down Expand Up @@ -629,7 +629,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

serving_chat = _build_serving_chat(mock_engine)
Expand Down Expand Up @@ -662,7 +662,7 @@ async def test_serving_chat_data_parallel_rank_extraction():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

# Mock the generate method to return an async generator
Expand Down
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_serving_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def serving() -> OpenAIServing:
model_config.max_model_len = 32768
models = Mock(spec=OpenAIServingModels)
models.model_config = model_config
models.processor = Mock()
models.input_processor = Mock()
models.io_processor = Mock()

serving = OpenAIServing(
Expand Down
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_serving_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ async def _async_serving_models_init() -> OpenAIServingModels:
mock_model_config = MagicMock(spec=ModelConfig)
mock_model_config.max_model_len = 2048
mock_engine_client.model_config = mock_model_config
mock_engine_client.processor = MagicMock()
mock_engine_client.input_processor = MagicMock()
mock_engine_client.io_processor = MagicMock()

serving_models = OpenAIServingModels(
Expand Down
4 changes: 2 additions & 2 deletions tests/entrypoints/openai/test_serving_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ async def serving_responses_instance(self):
model_config.get_diff_sampling_param.return_value = {}
engine_client.model_config = model_config

engine_client.processor = MagicMock()
engine_client.input_processor = MagicMock()
engine_client.io_processor = MagicMock()

models = MagicMock()
Expand Down Expand Up @@ -213,7 +213,7 @@ async def serving_responses_instance(self):
model_config.get_diff_sampling_param.return_value = {}
engine_client.model_config = model_config

engine_client.processor = MagicMock()
engine_client.input_processor = MagicMock()
engine_client.io_processor = MagicMock()

models = MagicMock()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,17 @@
from vllm.assets.video import VideoAsset
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import processor as processor_mod
from vllm.v1.engine.processor import Processor
from vllm.v1.engine import input_processor as input_processor_mod
from vllm.v1.engine.input_processor import InputProcessor

cherry_pil_image = ImageAsset("cherry_blossom").pil_image
stop_pil_image = ImageAsset("stop_sign").pil_image
baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays


# Mock input processor for testing
def _mk_processor(
def _mock_input_processor(
monkeypatch, *, mm_cache_gb: float = 4.0, enable_prefix_caching: bool = True
) -> Processor:
) -> InputProcessor:
"""
    Create an InputProcessor instance with minimal configuration suitable for unit
tests without accessing external resources.
Expand All @@ -36,7 +35,7 @@ def _mk_processor(
raising=True,
)
monkeypatch.setattr(
processor_mod,
input_processor_mod,
"processor_cache_from_config",
lambda vllm_config, mm_registry: None,
raising=True,
Expand Down Expand Up @@ -65,11 +64,11 @@ def __init__(self, gb: float):
device_config=DeviceConfig(device="cpu"),
)

return Processor(vllm_config, tokenizer=None)
return InputProcessor(vllm_config, tokenizer=None)


def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
processor = _mk_processor(monkeypatch)
input_processor = _mock_input_processor(monkeypatch)

prompt = {
"prompt": "USER: <image>\nDescribe\nASSISTANT:",
Expand All @@ -79,15 +78,15 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
}

with pytest.raises(ValueError, match="must have same length as data"):
processor.process_inputs(
input_processor.process_inputs(
request_id="req-1",
prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(),
)


def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
processor = _mk_processor(monkeypatch)
input_processor = _mock_input_processor(monkeypatch)

prompt = {
"prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
Expand All @@ -101,7 +100,7 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
}

with pytest.raises(ValueError, match="must be provided if multi_modal_data"):
processor.process_inputs(
input_processor.process_inputs(
request_id="req-2",
prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(),
Expand All @@ -119,7 +118,7 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
def test_multi_modal_uuids_accepts_none_and_passes_through(
monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool
):
processor = _mk_processor(
input_processor = _mock_input_processor(
monkeypatch,
mm_cache_gb=mm_cache_gb,
enable_prefix_caching=enable_prefix_caching,
Expand All @@ -137,7 +136,7 @@ def fake_preprocess(

# Monkeypatch only the bound preprocess method on this instance
monkeypatch.setattr(
processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
input_processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
)

# Use a consistent two-image scenario across all configurations
Expand All @@ -151,7 +150,7 @@ def fake_preprocess(
"multi_modal_uuids": mm_uuids,
}

processor.process_inputs(
input_processor.process_inputs(
request_id="req-3",
prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(),
Expand All @@ -163,7 +162,9 @@ def fake_preprocess(
def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
# When both processor cache is 0 and prefix caching disabled, the
    # input processor builds overrides from request id instead of using user UUIDs.
processor = _mk_processor(monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False)
input_processor = _mock_input_processor(
monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
)

captured: dict[str, object] = {}

Expand All @@ -174,7 +175,7 @@ def fake_preprocess(
return {"type": "token", "prompt_token_ids": [1]}

monkeypatch.setattr(
processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
input_processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
)

request_id = "req-42"
Expand All @@ -188,7 +189,7 @@ def fake_preprocess(
"multi_modal_uuids": mm_uuids,
}

processor.process_inputs(
input_processor.process_inputs(
request_id=request_id,
prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(),
Expand Down
4 changes: 2 additions & 2 deletions vllm/engine/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@
from vllm.tasks import SupportedTask
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.processor import Processor
from vllm.v1.engine.input_processor import InputProcessor


class EngineClient(ABC):
"""Protocol class for Clients to Engine"""

vllm_config: VllmConfig
model_config: ModelConfig
processor: Processor
input_processor: InputProcessor
io_processor: IOProcessor | None

@property
Expand Down
6 changes: 3 additions & 3 deletions vllm/entrypoints/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def __init__(
self.supported_tasks = supported_tasks

self.model_config = self.llm_engine.model_config
self.processor = self.llm_engine.processor
self.input_processor = self.llm_engine.input_processor
self.io_processor = self.llm_engine.io_processor

def get_tokenizer(self) -> AnyTokenizer:
Expand All @@ -364,7 +364,7 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)

def reset_mm_cache(self) -> None:
self.processor.clear_mm_cache()
self.input_processor.clear_mm_cache()
self.llm_engine.reset_mm_cache()

def get_default_sampling_params(self) -> SamplingParams:
Expand Down Expand Up @@ -1674,7 +1674,7 @@ def _process_inputs(
tokenization_kwargs,
)

engine_request = self.processor.process_inputs(
engine_request = self.input_processor.process_inputs(
request_id,
engine_prompt,
params,
Expand Down
10 changes: 5 additions & 5 deletions vllm/entrypoints/openai/serving_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def __init__(
self._async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] = {}
self.log_error_stack = log_error_stack

self.processor = self.models.processor
self.input_processor = self.models.input_processor
self.io_processor = self.models.io_processor
self.model_config = self.models.model_config
self.max_model_len = self.model_config.max_model_len
Expand Down Expand Up @@ -330,7 +330,7 @@ def _get_reasoning_parser(
return parser

async def reset_mm_cache(self) -> None:
self.processor.clear_mm_cache()
self.input_processor.clear_mm_cache()
await self.engine_client.reset_mm_cache()

async def beam_search(
Expand All @@ -348,8 +348,8 @@ async def beam_search(
length_penalty = params.length_penalty
include_stop_str_in_output = params.include_stop_str_in_output

processor = self.processor
tokenizer = processor.tokenizer
input_processor = self.input_processor
tokenizer = input_processor.tokenizer
if tokenizer is None:
raise ValueError(
"You cannot use beam search when `skip_tokenizer_init` is True"
Expand Down Expand Up @@ -1214,7 +1214,7 @@ async def _process_inputs(
self.max_model_len, params.truncate_prompt_tokens, tokenization_kwargs
)

engine_request = self.processor.process_inputs(
engine_request = self.input_processor.process_inputs(
request_id,
engine_prompt,
params,
Expand Down
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/serving_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def __init__(
)
self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)

self.processor = self.engine_client.processor
self.input_processor = self.engine_client.input_processor
self.io_processor = self.engine_client.io_processor
self.model_config = self.engine_client.model_config
self.max_model_len = self.model_config.max_model_len
Expand Down
Loading