From e92a8cf7d6e6b45a4e3c95c479a1c6ca746231f9 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 02:55:33 +0000 Subject: [PATCH 01/17] [V0 Deprecation] Remove AsyncLLMEngine Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 2 - tests/async_engine/__init__.py | 0 tests/async_engine/api_server_async_engine.py | 54 - tests/async_engine/conftest.py | 12 - tests/async_engine/test_api_server.py | 139 --- tests/async_engine/test_request_tracker.py | 71 -- tests/v1/test_oracle.py | 18 - vllm/engine/async_llm_engine.py | 1042 +---------------- vllm/entrypoints/launcher.py | 2 - vllm/entrypoints/openai/api_server.py | 115 +- 10 files changed, 4 insertions(+), 1451 deletions(-) delete mode 100644 tests/async_engine/__init__.py delete mode 100644 tests/async_engine/api_server_async_engine.py delete mode 100644 tests/async_engine/conftest.py delete mode 100644 tests/async_engine/test_api_server.py delete mode 100644 tests/async_engine/test_request_tracker.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6f06099edd53..1ac03ad6348a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -47,7 +47,6 @@ steps: source_file_dependencies: - vllm/ - tests/mq_llm_engine - - tests/async_engine - tests/test_inputs.py - tests/test_outputs.py - tests/multimodal @@ -58,7 +57,6 @@ steps: commands: - python3 standalone_tests/lazy_imports.py - pytest -v -s mq_llm_engine # MQLLMEngine - - pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s multimodal diff --git a/tests/async_engine/__init__.py b/tests/async_engine/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py deleted file mode 100644 index ec6b20f5e04b..000000000000 --- a/tests/async_engine/api_server_async_engine.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""vllm.entrypoints.api_server with some extra logging for testing.""" -from collections.abc import Iterable -from typing import Any - -import uvicorn -from fastapi.responses import JSONResponse, Response - -import vllm.entrypoints.api_server -import vllm.envs as envs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.utils import FlexibleArgumentParser - -app = vllm.entrypoints.api_server.app - - -class AsyncLLMEngineWithStats(AsyncLLMEngine): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._num_aborts = 0 - - async def _engine_abort(self, request_ids: Iterable[str]): - ids = list(request_ids) - self._num_aborts += len(ids) - await super()._engine_abort(ids) - - def testing_stats(self) -> dict[str, Any]: - return {"num_aborted_requests": self._num_aborts} - - -@app.get("/stats") -def stats() -> Response: - """Get the statistics of the engine.""" - return JSONResponse(engine.testing_stats()) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser = AsyncEngineArgs.add_cli_args(parser) - args = parser.parse_args() - - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) - vllm.entrypoints.api_server.engine = engine - uvicorn.run(app, - 
host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE) diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py deleted file mode 100644 index 375b248ebeda..000000000000 --- a/tests/async_engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py deleted file mode 100644 index 07370a880329..000000000000 --- a/tests/async_engine/test_api_server.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import copyreg -import os -import subprocess -import sys -import time -from multiprocessing import Pool -from pathlib import Path - -import pytest -import requests -import urllib3.exceptions - - -def _pickle_new_connection_error(obj): - """Custom pickler for NewConnectionError to fix tblib compatibility.""" - # Extract the original message by removing the "conn: " prefix - full_message = obj.args[0] if obj.args else "" - if ': ' in full_message: - # Split off the connection part and keep the actual message - _, actual_message = full_message.split(': ', 1) - else: - actual_message = full_message - return _unpickle_new_connection_error, (actual_message, ) - - -def _unpickle_new_connection_error(message): - """Custom unpickler for NewConnectionError.""" - # Create with None as conn and the actual message - return urllib3.exceptions.NewConnectionError(None, message) - - -# Register the custom pickle/unpickle functions for tblib compatibility -copyreg.pickle(urllib3.exceptions.NewConnectionError, - _pickle_new_connection_error) - - -def _query_server(prompt: str, max_tokens: int = 5) -> dict: - response = requests.post("http://localhost:8000/generate", - json={ - "prompt": prompt, - "max_tokens": max_tokens, - "temperature": 0, - "ignore_eos": True - }) - response.raise_for_status() - return response.json() - - -def _query_server_long(prompt: str) -> dict: - return _query_server(prompt, max_tokens=500) - - -@pytest.fixture -def api_server(distributed_executor_backend: str): - script_path = Path(__file__).parent.joinpath( - "api_server_async_engine.py").absolute() - commands = [ - sys.executable, - "-u", - str(script_path), - "--model", - "facebook/opt-125m", - "--host", - "127.0.0.1", - "--distributed-executor-backend", - distributed_executor_backend, - ] - - # API Server Test Requires V0. - my_env = os.environ.copy() - my_env["VLLM_USE_V1"] = "0" - uvicorn_process = subprocess.Popen(commands, env=my_env) - yield - uvicorn_process.terminate() - - -@pytest.mark.timeout(300) -@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) -def test_api_server(api_server, distributed_executor_backend: str): - """ - Run the API server and test it. - - We run both the server and requests in separate processes. - - We test that the server can handle incoming requests, including - multiple requests at the same time, and that it can handle requests - being cancelled without crashing. 
- """ - with Pool(32) as pool: - # Wait until the server is ready - prompts = ["warm up"] * 1 - result = None - while not result: - try: - for r in pool.map(_query_server, prompts): - result = r - break - except requests.exceptions.ConnectionError: - time.sleep(1) - - # Actual tests start here - # Try with 1 prompt - for result in pool.map(_query_server, prompts): - assert result - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests == 0 - - # Try with 100 prompts - prompts = ["test prompt"] * 100 - for result in pool.map(_query_server, prompts): - assert result - - with Pool(32) as pool: - # Cancel requests - prompts = ["canceled requests"] * 100 - pool.map_async(_query_server_long, prompts) - time.sleep(0.01) - pool.terminate() - pool.join() - - # check cancellation stats - # give it some time to update the stats - time.sleep(1) - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests > 0 - - # check that server still runs after cancellations - with Pool(32) as pool: - # Try with 100 prompts - prompts = ["test prompt after canceled"] * 100 - for result in pool.map(_query_server, prompts): - assert result diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py deleted file mode 100644 index 1851eeeda790..000000000000 --- a/tests/async_engine/test_request_tracker.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.async_llm_engine import RequestTracker -from vllm.outputs import RequestOutput - - -@pytest.mark.asyncio -async def test_request_tracker(): - tracker = RequestTracker() - stream_1 = tracker.add_request("1") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 1 - assert new[0]["request_id"] == "1" - assert not aborted - assert not stream_1.finished - - stream_2 = tracker.add_request("2") - stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 2 - assert new[0]["request_id"] == "2" - assert new[1]["request_id"] == "3" - assert not aborted - assert not stream_2.finished - assert not stream_3.finished - - # request_ids must be unique - with pytest.raises(KeyError): - tracker.add_request("1") - assert not tracker.new_requests_event.is_set() - - tracker.abort_request("1") - new, aborted = tracker.get_new_and_aborted_requests() - assert len(aborted) == 1 - assert "1" in aborted - assert not new - assert stream_1.finished - - stream_4 = tracker.add_request("4") - tracker.abort_request("4") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - # aborted new requests will cancel each other out - - # there's no need for them to propagate into the - # engine - assert not aborted - assert not new - assert stream_4.finished - - stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.is_set() - tracker.process_request_output( - RequestOutput("2", "output", [], [], [], finished=True)) - await 
tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert not aborted - assert len(new) == 1 - assert new[0]["request_id"] == "5" - assert stream_2.finished - assert not stream_5.finished diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 794c1f68f147..28c24f62895a 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -7,7 +7,6 @@ import vllm.envs as envs from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine MODEL = "meta-llama/Llama-3.2-1B-Instruct" @@ -96,20 +95,3 @@ def test_v1_attn_backend(monkeypatch): _ = AsyncEngineArgs(model=MODEL).create_engine_config() assert envs.VLLM_USE_V1 m.delenv("VLLM_USE_V1") - - -def test_reject_using_constructor_directly(monkeypatch): - with monkeypatch.context() as m: - if os.getenv("VLLM_USE_V1", None): - m.delenv("VLLM_USE_V1") - - # Sets VLLM_USE_V1=1. - vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config() - - # This uses the V0 constructor directly. - with pytest.raises(ValueError): - AsyncLLMEngine(vllm_config, - AsyncLLMEngine._get_executor_cls(vllm_config), - log_stats=True) - - m.delenv("VLLM_USE_V1") diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index c53ece18964c..ede027759a8b 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,1044 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import asyncio -import time -import weakref -from functools import partial -from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, - Mapping, Optional, Set, Tuple, Type, Union) -from weakref import ReferenceType +from vllm.v1.engine.async_llm import AsyncLLM -import vllm.envs as envs -from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig, - SchedulerConfig, VllmConfig) -from vllm.config.lora import LoRAConfig -from vllm.core.scheduler import SchedulerOutputs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_timeout import asyncio_timeout -from vllm.engine.llm_engine import LLMEngine -from vllm.engine.metrics_types import StatLoggerBase -from vllm.engine.protocol import EngineClient -from vllm.executor.executor_base import ExecutorBase -from vllm.inputs import PromptType -from vllm.inputs.preprocess import InputPreprocessor -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import PoolingRequestOutput, RequestOutput -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import SamplingParams -from vllm.sequence import ExecuteModelRequest -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, deprecate_kwargs, weak_bind - -logger = init_logger(__name__) -ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S - - -class AsyncEngineDeadError(RuntimeError): - pass - - -def _log_task_completion(task: asyncio.Task, - error_callback: Callable[[Exception], None]) -> None: - """This function is only intended for the `engine.run_engine_loop()` task. - - In particular, that task runs a `while True` loop that can only exit if - there is an exception. 
- """ - - exception = None - try: - return_value = task.result() - raise AssertionError( - f"The engine background task should never finish without an " - f"exception. {return_value}") - except asyncio.exceptions.CancelledError: - # We assume that if the task is cancelled, we are gracefully shutting - # down. This should only happen on program exit. - logger.info("Engine is gracefully shutting down.") - except Exception as e: - exception = e - logger.error("Engine background task failed", exc_info=e) - error_callback(exception) - raise AsyncEngineDeadError( - "Task finished unexpectedly. This should never happen! " - "Please open an issue on GitHub. See stack trace above for the " - "actual cause.") from e - - -STOP_ITERATION = Exception() # Sentinel - - -class AsyncStream: - """A stream of RequestOutputs for a request that can be iterated over - asynchronously via an async generator.""" - - def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: - self.request_id = request_id - self._cancel = cancel - self._queue: asyncio.Queue = asyncio.Queue() - self._finished = False - - def put(self, item: Union[RequestOutput, Exception]) -> None: - if not self._finished: - self._queue.put_nowait(item) - - def finish( - self, - exception: Optional[Union[BaseException, Type[BaseException]]] = None, - ) -> None: - if not self._finished: - self._finished = True - self._queue.put_nowait( - exception if self._is_raisable(exception) else STOP_ITERATION) - - @property - def finished(self) -> bool: - return self._finished - - async def generator(self) -> AsyncGenerator[RequestOutput, None]: - try: - while True: - result = await self._queue.get() - if self._is_raisable(result): - if result == STOP_ITERATION: - return - raise result - yield result - except GeneratorExit: - self._cancel(self.request_id) - raise asyncio.CancelledError from None - - @staticmethod - def _is_raisable(value: Any): - return isinstance(value, BaseException) or \ - (isinstance(value, type) and \ - issubclass(value, BaseException)) - - -class RequestTracker: - """Synchronous abstraction for tracking requests.""" - - def __init__(self) -> None: - self._request_streams: Dict[str, AsyncStream] = {} - self._aborted_requests: asyncio.Queue[str] = asyncio.Queue() - self._new_requests: asyncio.Queue[Tuple[AsyncStream, - dict]] = asyncio.Queue() - self.new_requests_event = asyncio.Event() - - def __contains__(self, item): - return item in self._request_streams - - def __len__(self) -> int: - return len(self._request_streams) - - def propagate_exception(self, - exc: Exception, - request_id: Optional[str] = None) -> None: - """Propagate an exception to request streams - (all if request_id is None).""" - if request_id is not None: - self.abort_request(request_id, exception=exc) - else: - # NB: tuple() used here because self.abort_request pops the stream - # out of self._request_streams, so we can't iterate on it directly - for rid in tuple(self._request_streams.keys()): - self.abort_request(rid, exception=exc) - - def process_request_output(self, - request_output: RequestOutput, - *, - verbose: bool = False) -> None: - """Process a request output from the engine.""" - request_id = request_output.request_id - finished = request_output.finished - - if finished: - stream = self._request_streams.pop(request_id, None) - else: - stream = self._request_streams.get(request_id) - # Guard against a KeyError which can occur if the request was aborted - # while the output was generated - if stream is not None: - stream.put(request_output) - 
if finished: - stream.finish() - - if verbose and finished: - logger.info("Finished request %s.", request_id) - - def process_exception(self, - request_id: str, - exception: BaseException, - *, - verbose: bool = False) -> None: - """Propagate an exception from the engine.""" - if verbose: - logger.info("Finished request %s.", request_id) - self.abort_request(request_id, exception=exception) - - def add_request(self, - request_id: str, - *, - verbose: bool = False, - **engine_add_request_kwargs) -> AsyncStream: - """Add a request to be sent to the engine on the next background - loop iteration.""" - if request_id in self._request_streams: - raise KeyError(f"Request {request_id} already exists.") - - abort_request = partial(self.abort_request, verbose=verbose) - stream = AsyncStream(request_id, abort_request) - self._new_requests.put_nowait((stream, { - "request_id": request_id, - **engine_add_request_kwargs - })) - - self.new_requests_event.set() - - if verbose: - logger.info("Added request %s.", request_id) - - return stream - - def abort_request(self, - request_id: str, - *, - exception: Optional[Union[BaseException, - Type[BaseException]]] = None, - verbose: bool = False) -> None: - """Abort a request during next background loop iteration.""" - if verbose: - logger.info("Aborted request %s.", request_id) - - self._aborted_requests.put_nowait(request_id) - - stream = self._request_streams.pop(request_id, None) - if stream is not None: - stream.finish(exception=exception) - - def get_new_and_aborted_requests(self) -> Tuple[List[Dict], Set[str]]: - """Get the new requests and finished requests to be - sent to the engine.""" - new_requests: List[Dict] = [] - finished_requests: Set[str] = set() - - while not self._aborted_requests.empty(): - request_id = self._aborted_requests.get_nowait() - finished_requests.add(request_id) - - while not self._new_requests.empty(): - stream, new_request = self._new_requests.get_nowait() - request_id = stream.request_id - if request_id in finished_requests: - # The request has already been aborted. - stream.finish(asyncio.CancelledError) - finished_requests.discard(request_id) - else: - self._request_streams[request_id] = stream - new_requests.append(new_request) - - return new_requests, finished_requests - - async def wait_for_new_requests(self): - if not self.has_new_requests(): - await self.new_requests_event.wait() - self.new_requests_event.clear() - - def has_new_requests(self): - return not self._new_requests.empty() - - -class _AsyncLLMEngine(LLMEngine): - """Extension of LLMEngine to add async methods.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - async def step_async(self, virtual_engine: int) -> List[RequestOutput]: - """Performs one decoding iteration and returns newly generated results. - The workers are ran asynchronously if possible. - - This function performs one decoding iteration of the engine. It first - schedules the sequences to be executed in the next iteration and the - token blocks to be swapped in/out/copy. Then, it executes the model - and updates the scheduler with the model outputs. Finally, it decodes - the sequences and returns the newly generated results. - """ - # these are cached outputs from previous iterations. 
None if on first - # iteration - cached_outputs = self.cached_scheduler_outputs[virtual_engine] - seq_group_metadata_list = cached_outputs.seq_group_metadata_list - scheduler_outputs = cached_outputs.scheduler_outputs - allow_async_output_proc = cached_outputs.allow_async_output_proc - - ctx = self.scheduler_contexts[virtual_engine] - - # Clear outputs for each new scheduler iteration - ctx.request_outputs.clear() - - # skip the scheduler if there are any remaining steps in the seq groups. - # This ensures that the scheduler is only called again when the current - # batch has completed. - if not self._has_remaining_steps(seq_group_metadata_list): - - # Schedule iteration - (seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc - ) = self.scheduler[virtual_engine].schedule() - - ctx.seq_group_metadata_list = seq_group_metadata_list - ctx.scheduler_outputs = scheduler_outputs - - if not scheduler_outputs.is_empty(): - # this will cause mamba_cache/minimax_cache failed - # to release finished_requests_ids of the last steps - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() - - # Maybe switch from async mode to sync mode - if not allow_async_output_proc and len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - - else: - finished_requests_ids = list() - - assert seq_group_metadata_list is not None - assert scheduler_outputs is not None - - if not scheduler_outputs.is_empty(): - - # Check if we have a cached last_output from the previous iteration. - # For supporting PP this is probably the best way to pass the - # sampled_token_ids, as a separate broadcast over all the PP stages - # will cause one virtual engine's microbatch to block the pipeline. - last_sampled_token_ids = \ - self._get_last_sampled_token_ids(virtual_engine) - - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, - blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, - blocks_to_copy=scheduler_outputs.blocks_to_copy, - virtual_engine=virtual_engine, - num_lookahead_slots=scheduler_outputs.num_lookahead_slots, - running_queue_size=scheduler_outputs.running_queue_size, - finished_requests_ids=finished_requests_ids, - # We use ExecuteModelRequest to pass the last sampled_token_ids - # to each of the non-last PP stages for in-place prepare_input. - last_sampled_token_ids=last_sampled_token_ids) - - if allow_async_output_proc: - execute_model_req.async_callback = self.async_callbacks[ - virtual_engine] - - # Execute the model. - outputs = await self.model_executor.execute_model_async( - execute_model_req) - - else: - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - outputs = [] - - if not self._has_remaining_steps(seq_group_metadata_list): - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. 
- is_first_step_output: bool = False if not seq_group_metadata_list \ - else seq_group_metadata_list[0].state.num_steps == 1 - - ctx.append_output(outputs=outputs, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - is_async=allow_async_output_proc, - is_last_step=True, - is_first_step_output=is_first_step_output) - - if outputs and allow_async_output_proc: - assert len( - outputs - ) == 1, "Async postprocessor expects only a single output set" - self._advance_to_next_step( - outputs[0], seq_group_metadata_list, - scheduler_outputs.scheduled_seq_groups) - - if not allow_async_output_proc: - self._process_model_outputs(ctx=ctx) - - # Log stats. - self.do_log_stats(scheduler_outputs, outputs) - - # Tracing - self.do_tracing(scheduler_outputs) - - else: - # Multi-step case - return ctx.request_outputs - - if not self.has_unfinished_requests(): - # Drain async postprocessor (if exists) - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - assert len(ctx.output_queue) == 0 - - return ctx.request_outputs - - async def stop_remote_worker_execution_loop_async(self) -> None: - """Stop the remote worker execution loop.""" - await self.model_executor.stop_remote_worker_execution_loop_async() - - async def get_tokenizer_async(self, - lora_request: Optional[LoRARequest] = None - ) -> AnyTokenizer: - return await ( - self.get_tokenizer_group().get_lora_tokenizer_async(lora_request)) - - async def add_request_async( - self, - request_id: str, - prompt: PromptType, - params: SamplingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> None: - """ - Async version of - [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]. - """ - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - if priority != 0 and not self.scheduler_config.policy == "priority": - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - if arrival_time is None: - arrival_time = time.time() - - if data_parallel_rank is not None: - raise ValueError("Targeting data_parallel_rank only supported " - "in v1 client.") - - if (isinstance(prompt, dict) - and prompt.get("prompt_embeds", None) is not None - and not prompt.get("prompt_token_ids", None)): - # We use the -2 dimension (instead of 0) in case a batched input - # of batch size 1 is passed in. - prompt["prompt_token_ids"] = [0 - ] * prompt["prompt_embeds"].shape[-2] - - processed_inputs = await self.input_preprocessor.preprocess_async( - prompt, - lora_request=lora_request, - tokenization_kwargs=tokenization_kwargs, - ) - - self._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - - async def check_health_async(self) -> None: - self.model_executor.check_health() - - async def collective_rpc_async(self, - method: str, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None): - raise NotImplementedError - - -class AsyncLLMEngine(EngineClient): - """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine]. 
- - This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to - make it asynchronous. It uses asyncio to create a background loop that keeps - processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked - by the generate method when there are requests in the waiting queue. The - generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine] - to the caller. - - Args: - log_requests: Whether to log the requests. - start_engine_loop: If True, the background task to run the engine - will be automatically started in the generate call. - *args: Arguments for [`LLMEngine`][vllm.LLMEngine]. - **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine]. - """ - - _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine - - def __init__(self, - *args: Any, - log_requests: bool = True, - start_engine_loop: bool = True, - **kwargs: Any) -> None: - if envs.VLLM_USE_V1: - raise ValueError( - "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. " - "This should not happen. As a workaround, try using " - "AsyncLLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github.") - - self.log_requests = log_requests - self.engine = self._engine_class(*args, **kwargs) - - # This ensures quick processing of request outputs - # so the append to asyncio queues is not delayed, - # especially for multi-step. - self.use_process_request_outputs_callback = ( - self.engine.model_config.use_async_output_proc) - - if self.use_process_request_outputs_callback: - self.engine.process_request_outputs_callback = \ - weak_bind(self.process_request_outputs) - - self.background_loop: Optional[asyncio.Future] = None - # We need to keep a reference to unshielded - # task as well to prevent it from being garbage - # collected - self._background_loop_unshielded: Optional[asyncio.Task] = None - self.start_engine_loop = start_engine_loop - self._errored_with: Optional[BaseException] = None - - # Lazy initialized fields - self._request_tracker: RequestTracker - - def __del__(self): - if rt := getattr(self, "request_tracker", None): - # Wake up engine loop so that it will exit cleanly - rt.new_requests_event.set() - - @classmethod - def _get_executor_cls(cls, - engine_config: VllmConfig) -> Type[ExecutorBase]: - return LLMEngine._get_executor_cls(engine_config) - - @classmethod - @deprecate_kwargs( - "disable_log_requests", - additional_message=("This argument will have no effect. 
" - "Use `enable_log_requests` instead."), - ) - def from_vllm_config( - cls, - vllm_config: VllmConfig, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, - enable_log_requests: bool = False, - disable_log_stats: bool = False, - disable_log_requests: bool = True, # Deprecated, will be removed - ) -> "AsyncLLMEngine": - """Create an AsyncLLMEngine from the EngineArgs.""" - - return cls( - vllm_config=vllm_config, - executor_class=cls._get_executor_cls(vllm_config), - start_engine_loop=start_engine_loop, - log_requests=enable_log_requests, - log_stats=not disable_log_stats, - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: AsyncEngineArgs, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "AsyncLLMEngine": - """Creates an async LLM engine from the engine arguments.""" - - vllm_config = engine_args.create_engine_config(usage_context) - - async_engine_cls = cls - if envs.VLLM_USE_V1: - from vllm.v1.engine.async_llm import AsyncLLM as V1AsyncLLMEngine - async_engine_cls = V1AsyncLLMEngine - - return async_engine_cls.from_vllm_config( - vllm_config=vllm_config, - start_engine_loop=start_engine_loop, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - enable_log_requests=engine_args.enable_log_requests, - ) - - @property - def is_running(self) -> bool: - return (self.background_loop is not None - and self._background_loop_unshielded is not None - and not self._background_loop_unshielded.done()) - - @property - def is_stopped(self) -> bool: - return self.errored or (self.background_loop is not None and - self._background_loop_unshielded is not None - and self._background_loop_unshielded.done()) - - @property - def errored(self) -> bool: - return self._errored_with is not None - - @property - def dead_error(self) -> BaseException: - return AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - def set_errored(self, exc: Exception) -> None: - self._errored_with = exc - - def _error_callback(self, exc: Exception) -> None: - self.set_errored(exc) - self._request_tracker.propagate_exception(exc) - - async def get_input_preprocessor(self) -> InputPreprocessor: - return self.engine.input_preprocessor - - async def get_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - return await self.engine.get_tokenizer_async(lora_request) - - def start_background_loop(self) -> None: - """Start the background loop.""" - if self.errored: - raise AsyncEngineDeadError( - "Background loop has errored already.") from self._errored_with - if self.is_running: - raise RuntimeError("Background loop is already running.") - # Initialize the RequestTracker here so it uses the right event loop. 
- self._request_tracker = RequestTracker() - - self._background_loop_unshielded = asyncio.get_event_loop( - ).create_task(self.run_engine_loop(weakref.ref(self))) - self._background_loop_unshielded.add_done_callback( - partial(_log_task_completion, error_callback=self._error_callback)) - self.background_loop = asyncio.shield(self._background_loop_unshielded) - - def shutdown_background_loop(self) -> None: - """ - Shut down the background loop. - - This method needs to be called during cleanup to remove - references to `self` and properly GC the resources held - by the async LLM engine (e.g., the executors as well as - their resources). - """ - if self._background_loop_unshielded is not None: - self._background_loop_unshielded.cancel() - self._background_loop_unshielded = None - self.background_loop = None - - async def engine_step(self, virtual_engine: int) -> bool: - """Kick the engine to process the waiting requests. - - Returns True if there are in-progress requests.""" - - new_requests, aborted_requests = ( - self._request_tracker.get_new_and_aborted_requests()) - - for new_request in new_requests: - # Add the request into the vLLM engine's waiting queue. - try: - await self.engine.add_request_async(**new_request) - except ValueError as e: - # TODO: use a vLLM specific error for failed validation - self._request_tracker.process_exception( - new_request["request_id"], - e, - verbose=self.log_requests, - ) - - if aborted_requests: - await self._engine_abort(aborted_requests) - - request_outputs = await self.engine.step_async(virtual_engine) - - # Put the outputs into the corresponding streams. - # If used as a callback, then already invoked inside - # LLMEngine's _process_model_outputs - if not self.use_process_request_outputs_callback: - all_finished = self.process_request_outputs(request_outputs) - else: - # For callback case, we only need to detect when all - # requests are finished - all_finished = all(request_output.finished - for request_output in request_outputs) - - return not all_finished - - def process_request_outputs(self, request_outputs) -> bool: - # Put the outputs into the corresponding streams. - all_finished = True - for request_output in request_outputs: - self._request_tracker.process_request_output( - request_output, verbose=self.log_requests) - all_finished = all_finished and request_output.finished - - return all_finished - - async def _engine_abort(self, request_ids: Iterable[str]): - self.engine.abort_request(request_ids) - - @staticmethod - async def run_engine_loop(engine_ref: ReferenceType): - """We use a weakref to the engine so that the running loop - doesn't prevent the engine being garbage collected.""" - engine: Optional[AsyncLLMEngine] = engine_ref() - if not engine: - return - - pipeline_parallel_size = \ - engine.engine.parallel_config.pipeline_parallel_size - has_requests_in_progress = [False] * pipeline_parallel_size - while True: - if not any(has_requests_in_progress): - logger.debug("Waiting for new requests...") - # Stop the execute model loop in parallel workers until there - # are more requests to process. This avoids waiting - # indefinitely in torch.distributed ops which may otherwise - # time out, and unblocks the RPC thread in the workers so that - # they can process any other queued control plane messages, - # such as add/remove lora adapters. 
- await engine.engine.stop_remote_worker_execution_loop_async() - request_tracker = engine._request_tracker - # Allow engine to be garbage collected while - # waiting for new requests - del engine - await asyncio.sleep(0) - if engine_ref() is None: - return - await request_tracker.wait_for_new_requests() - engine = engine_ref() - if not engine: - return - logger.debug("Got new requests!") - requests_in_progress = [ - asyncio.create_task(engine.engine_step(ve)) - for ve in range(pipeline_parallel_size) - ] - has_requests_in_progress = [True] * pipeline_parallel_size - - # Abort if iteration takes too long due to unrecoverable errors - # (eg. NCCL timeouts). - try: - async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S): - done, _ = await asyncio.wait( - requests_in_progress, - return_when=asyncio.FIRST_COMPLETED) - for _ in range(pipeline_parallel_size): - await asyncio.sleep(0) - for task in done: - result = task.result() - virtual_engine = requests_in_progress.index(task) - has_unfinished_requests = ( - engine.engine. - has_unfinished_requests_for_virtual_engine( - virtual_engine)) - if result or has_unfinished_requests: - requests_in_progress[virtual_engine] = ( - asyncio.create_task( - engine.engine_step(virtual_engine))) - has_requests_in_progress[virtual_engine] = True - else: - has_requests_in_progress[virtual_engine] = False - except asyncio.TimeoutError as exc: - logger.error( - "Engine iteration timed out. This should never happen!") - engine.set_errored(exc) - raise - await asyncio.sleep(0) - - async def add_request( - self, - request_id: str, - prompt: PromptType, - params: SamplingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[RequestOutput, None]: - if not self.is_running: - if self.start_engine_loop: - self.start_background_loop() - else: - raise AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - if (priority != 0 - and not self.engine.scheduler_config.policy == "priority"): - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - - stream = self._request_tracker.add_request( - request_id, - verbose=self.log_requests, - prompt=prompt, - params=params, - arrival_time=arrival_time or time.time(), - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - data_parallel_rank=data_parallel_rank, - tokenization_kwargs=tokenization_kwargs, - ) - - return stream.generator() - - async def generate( - self, - prompt: PromptType, - sampling_params: SamplingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - ) -> AsyncGenerator[RequestOutput, None]: - """Generate outputs for a request. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt to the LLM. See - [`PromptType`][vllm.inputs.PromptType] for more details about - the format of each input. - sampling_params: The sampling parameters of the request. 
- request_id: The unique id of the request. - lora_request: LoRA request to use for generation, if any. - trace_headers: OpenTelemetry trace headers. - priority: The priority of the request. - Only applicable with priority scheduling. - data_parallel_rank: The (global) data parallel rank that must - handle this request. Only applicable if DP is enabled. - Yields: - The output `RequestOutput` objects from the LLMEngine - for the request. - - Details: - - If the engine is not running, start the background loop, - which iteratively invokes - [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step] - to process the waiting requests. - - Add the request to the engine's `RequestTracker`. - On the next background loop, this request will be sent to - the underlying engine. - Also, a corresponding `AsyncStream` will be created. - - Wait for the request outputs from `AsyncStream` and yield them. - - Example: - >>> # Please refer to entrypoints/api_server.py for - >>> # the complete example. - >>> - >>> # initialize the engine and the example input - >>> # note that engine_args here is AsyncEngineArgs instance - >>> engine = AsyncLLMEngine.from_engine_args(engine_args) - >>> example_input = { - >>> "prompt": "What is LLM?", - >>> "stream": False, # assume the non-streaming case - >>> "temperature": 0.0, - >>> "request_id": 0, - >>> } - >>> - >>> # start the generation - >>> results_generator = engine.generate( - >>> example_input["prompt"], - >>> SamplingParams(temperature=example_input["temperature"]), - >>> example_input["request_id"]) - >>> - >>> # get the results - >>> final_output = None - >>> async for request_output in results_generator: - >>> if await request.is_disconnected(): - >>> # Abort the request if the client disconnects. - >>> await engine.abort(request_id) - >>> # Return or raise an error - >>> ... - >>> final_output = request_output - >>> - >>> # Process and return the final output - >>> ... - """ - try: - async for output in await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - data_parallel_rank=data_parallel_rank, - ): - yield LLMEngine.validate_output(output, RequestOutput) - except asyncio.CancelledError: - await self.abort(request_id) - raise - - def encode( - self, - prompt: PromptType, - pooling_params: PoolingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[PoolingRequestOutput, None]: - raise NotImplementedError( - "Pooling models are not supported in vLLM V0") - - async def abort(self, request_id: Union[str, Iterable[str]]) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. - - Args: - request_id: The unique id of the request. - """ - if not isinstance(request_id, str): - raise RuntimeError("Only single-request abort supported in" - " deprecated V0") - if not self.is_running: - raise AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - return self._abort(request_id) - - def _abort(self, request_id: str) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. 
- - Args: - request_id: The unique id of the request. - """ - self._request_tracker.abort_request(request_id, - exception=asyncio.CancelledError, - verbose=self.log_requests) - - async def get_vllm_config(self) -> VllmConfig: - """Get the vllm configuration of the vLLM engine.""" - return self.engine.get_vllm_config() - - async def get_model_config(self) -> ModelConfig: - """Get the model configuration of the vLLM engine.""" - return self.engine.get_model_config() - - async def get_parallel_config(self) -> ParallelConfig: - """Get the parallel configuration of the vLLM engine.""" - return self.engine.get_parallel_config() - - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - return self.engine.get_decoding_config() - - async def get_scheduler_config(self) -> SchedulerConfig: - """Get the scheduling configuration of the vLLM engine.""" - return self.engine.get_scheduler_config() - - async def get_lora_config(self) -> LoRAConfig: - """Get the lora configuration of the vLLM engine.""" - return self.engine.get_lora_config() - - async def do_log_stats( - self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None) -> None: - self.engine.do_log_stats() - - async def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - t = time.perf_counter() - logger.debug("Starting health check...") - if self.is_stopped: - raise AsyncEngineDeadError("Background loop is stopped.") - - await self.engine.check_health_async() - logger.debug("Health check took %fs", time.perf_counter() - t) - - async def is_tracing_enabled(self) -> bool: - return self.engine.is_tracing_enabled() - - def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: - self.engine.add_logger(logger_name=logger_name, logger=logger) - - def remove_logger(self, logger_name: str) -> None: - self.engine.remove_logger(logger_name=logger_name) - - async def start_profile(self) -> None: - self.engine.start_profile() - - async def stop_profile(self) -> None: - self.engine.stop_profile() - - async def reset_mm_cache(self) -> None: - self.engine.reset_mm_cache() - - async def reset_prefix_cache(self, - device: Optional[Device] = None) -> None: - self.engine.reset_prefix_cache(device) - - async def sleep(self, level: int = 1) -> None: - await self.reset_prefix_cache() - self.engine.sleep(level) - - async def wake_up(self, tags: Optional[list[str]] = None) -> None: - self.engine.wake_up(tags) - - async def is_sleeping(self) -> bool: - return self.engine.is_sleeping() - - async def add_lora(self, lora_request: LoRARequest) -> bool: - return self.engine.add_lora(lora_request) - - async def collective_rpc(self, - method: str, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None): - """ - Perform a collective RPC call to the given path. - """ - return await self.engine.collective_rpc_async(method, timeout, args, - kwargs) - - -# TODO(v1): Remove this class proxy when V1 goes default. 
-if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: - from vllm.v1.engine.async_llm import AsyncLLM - - AsyncLLMEngine = AsyncLLM # type: ignore +AsyncLLMEngine = AsyncLLM # type: ignore diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 887e27710924..657190543269 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -11,7 +11,6 @@ from fastapi import FastAPI, Request, Response from vllm import envs -from vllm.engine.async_llm_engine import AsyncEngineDeadError from vllm.engine.multiprocessing import MQEngineDeadError from vllm.engine.protocol import EngineClient from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, @@ -155,7 +154,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: """ @app.exception_handler(RuntimeError) - @app.exception_handler(AsyncEngineDeadError) @app.exception_handler(MQEngineDeadError) @app.exception_handler(EngineDeadError) @app.exception_handler(EngineGenerateError) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2e4aa7f3d5a6..85706738835e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio -import atexit import gc import importlib import inspect @@ -17,7 +16,6 @@ from argparse import Namespace from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from contextlib import asynccontextmanager -from functools import partial from http import HTTPStatus from typing import Annotated, Any, Callable, Optional @@ -41,9 +39,6 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore -from vllm.engine.multiprocessing.client import MQLLMEngineClient -from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (load_chat_template, resolve_hf_chat_template, @@ -102,13 +97,10 @@ log_non_default_args, with_cancellation) from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager -from vllm.transformers_utils.config import ( - maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs, - get_open_zmq_ipc_path, is_valid_ipv6_address, - set_ulimit) + is_valid_ipv6_address, set_ulimit) from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -236,111 +228,8 @@ async def build_async_engine_client_from_engine_args( if async_llm: async_llm.shutdown() - # V0 AsyncLLM. - elif (MQLLMEngineClient.is_unsupported_config(vllm_config) - or disable_frontend_multiprocessing): - - engine_client: Optional[EngineClient] = None - try: - engine_client = AsyncLLMEngine.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, - disable_log_stats=engine_args.disable_log_stats) - yield engine_client - finally: - if engine_client and hasattr(engine_client, "shutdown"): - engine_client.shutdown() - - # V0MQLLMEngine. 
else: - if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: - # Make TemporaryDirectory for prometheus multiprocessing - # Note: global TemporaryDirectory will be automatically - # cleaned up upon exit. - global prometheus_multiproc_dir - prometheus_multiproc_dir = tempfile.TemporaryDirectory() - os.environ[ - "PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name - else: - logger.warning( - "Found PROMETHEUS_MULTIPROC_DIR was set by user. " - "This directory must be wiped between vLLM runs or " - "you will find inaccurate metrics. Unset the variable " - "and vLLM will properly handle cleanup.") - - # Select random path for IPC. - ipc_path = get_open_zmq_ipc_path() - logger.debug("Multiprocessing frontend to use %s for IPC Path.", - ipc_path) - - # Start RPCServer in separate process (holds the LLMEngine). - # the current process might have CUDA context, - # so we need to spawn a new process - context = multiprocessing.get_context("spawn") - - # Ensure we can serialize transformer config before spawning - maybe_register_config_serialize_by_value() - - # The Process can raise an exception during startup, which may - # not actually result in an exitcode being reported. As a result - # we use a shared variable to communicate the information. - engine_alive = multiprocessing.Value('b', True, lock=False) - engine_process = context.Process( - target=run_mp_engine, - args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path, - engine_args.disable_log_stats, - engine_args.enable_log_requests, engine_alive)) - engine_process.start() - engine_pid = engine_process.pid - assert engine_pid is not None, "Engine process failed to start." - logger.info("Started engine process with PID %d", engine_pid) - - def _cleanup_ipc_path(): - socket_path = ipc_path.replace("ipc://", "") - if os.path.exists(socket_path): - os.remove(socket_path) - - # Ensure we clean up the local IPC socket file on exit. - atexit.register(_cleanup_ipc_path) - - # Build RPCClient, which conforms to EngineClient Protocol. - build_client = partial(MQLLMEngineClient, ipc_path, vllm_config, - engine_pid) - mq_engine_client = await asyncio.get_running_loop().run_in_executor( - None, build_client) - try: - while True: - try: - await mq_engine_client.setup() - break - except TimeoutError: - if (not engine_process.is_alive() - or not engine_alive.value): - raise RuntimeError( - "Engine process failed to start. See stack " - "trace for the root cause.") from None - - yield mq_engine_client # type: ignore[misc] - finally: - # Ensure rpc server process was terminated - engine_process.terminate() - - # Close all open connections to the backend - mq_engine_client.close() - - # Wait for engine process to join - engine_process.join(4) - if engine_process.exitcode is None: - # Kill if taking longer than 5 seconds to stop - engine_process.kill() - - # Lazy import for prometheus multiprocessing. - # We need to set PROMETHEUS_MULTIPROC_DIR environment variable - # before prometheus_client is imported. 
- # See https://prometheus.github.io/client_python/multiprocess/ - from prometheus_client import multiprocess - multiprocess.mark_process_dead(engine_process.pid) + assert False async def validate_json_request(raw_request: Request): From 6c89e6248de1d42803d7fe18cda5acd7dd917038 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 16 Sep 2025 21:16:58 -0700 Subject: [PATCH 02/17] fix assert false Signed-off-by: Woosuk Kwon --- vllm/entrypoints/openai/api_server.py | 58 +++++++++++++-------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 85706738835e..361e9dbbb743 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -198,38 +198,34 @@ async def build_async_engine_client_from_engine_args( vllm_config = engine_args.create_engine_config(usage_context=usage_context) # V1 AsyncLLM. - if envs.VLLM_USE_V1: - if disable_frontend_multiprocessing: - logger.warning( - "V1 is enabled, but got --disable-frontend-multiprocessing. " - "To disable frontend multiprocessing, set VLLM_USE_V1=0.") - - from vllm.v1.engine.async_llm import AsyncLLM - async_llm: Optional[AsyncLLM] = None - client_count = client_config.pop( - "client_count") if client_config else 1 - client_index = client_config.pop( - "client_index") if client_config else 0 - try: - async_llm = AsyncLLM.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, - disable_log_stats=engine_args.disable_log_stats, - client_addresses=client_config, - client_count=client_count, - client_index=client_index) - - # Don't keep the dummy data in memory - await async_llm.reset_mm_cache() - - yield async_llm - finally: - if async_llm: - async_llm.shutdown() + assert envs.VLLM_USE_V1 - else: - assert False + if disable_frontend_multiprocessing: + logger.warning( + "V1 is enabled, but got --disable-frontend-multiprocessing. 
" + "To disable frontend multiprocessing, set VLLM_USE_V1=0.") + + from vllm.v1.engine.async_llm import AsyncLLM + async_llm: Optional[AsyncLLM] = None + client_count = client_config.pop("client_count") if client_config else 1 + client_index = client_config.pop("client_index") if client_config else 0 + try: + async_llm = AsyncLLM.from_vllm_config( + vllm_config=vllm_config, + usage_context=usage_context, + enable_log_requests=engine_args.enable_log_requests, + disable_log_stats=engine_args.disable_log_stats, + client_addresses=client_config, + client_count=client_count, + client_index=client_index) + + # Don't keep the dummy data in memory + await async_llm.reset_mm_cache() + + yield async_llm + finally: + if async_llm: + async_llm.shutdown() async def validate_json_request(raw_request: Request): From 51a326de9a35098548ef402166322d4f20c0c91b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 11:31:50 -0700 Subject: [PATCH 03/17] fix Signed-off-by: Woosuk Kwon --- tests/distributed/test_pipeline_parallel.py | 12 - tests/engine/conftest.py | 12 - tests/engine/test_computed_prefix_blocks.py | 37 - tests/engine/test_executor.py | 111 -- tests/engine/test_multiproc_workers.py | 179 --- tests/engine/test_options.py | 58 - tests/engine/test_short_mm_context.py | 1 + tests/engine/test_stop_checker.py | 225 ---- .../openai/correctness/test_lmeval.py | 10 - tests/v1/engine/test_output_processor.py | 1000 ----------------- 10 files changed, 1 insertion(+), 1644 deletions(-) delete mode 100644 tests/engine/conftest.py delete mode 100644 tests/engine/test_computed_prefix_blocks.py delete mode 100644 tests/engine/test_executor.py delete mode 100644 tests/engine/test_multiproc_workers.py delete mode 100644 tests/engine/test_options.py delete mode 100644 tests/engine/test_stop_checker.py delete mode 100644 tests/v1/engine/test_output_processor.py diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 9da9672d9597..76b105e8a8ec 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -26,18 +26,6 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - For PP, we fall back to V0 by default. This means - that the TP baseline runs with V1 while the PP engine - runs with V0. This gives divergent results with dummy - weights. Once we enable V1 by default for PP, we can - remove this. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - class ParallelSetup(NamedTuple): tp_size: int pp_size: int diff --git a/tests/engine/conftest.py b/tests/engine/conftest.py deleted file mode 100644 index 375b248ebeda..000000000000 --- a/tests/engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py deleted file mode 100644 index ac5a1f957dfe..000000000000 --- a/tests/engine/test_computed_prefix_blocks.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine -from vllm.sampling_params import SamplingParams - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -@pytest.mark.parametrize("block_size", [16]) -def test_computed_prefix_blocks(model: str, block_size: int): - # This test checks if we are able to run the engine to completion - # without triggering asserts. - # We are in a scenario where all blocks from the second request's prompt - # are full and already computed when the second request arrives. - prompt = ( - "You are a helpful assistant. How do I build a car from cardboard and " - "paper clips? Is there an easy to follow video tutorial available " - "online for free?") - prompt2 = ( - " Please recommend to me some resources where I can learn not only to " - "handle technical difficulties of building a car, but also " - "decoration.") - - engine_args = EngineArgs(model=model, - block_size=block_size, - enable_prefix_caching=True) - - engine = LLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams() - - engine.add_request("0", prompt + prompt2, sampling_params) - engine.step() - engine.add_request("1", prompt, sampling_params) - engine.step() diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py deleted file mode 100644 index 67064aff3ae9..000000000000 --- a/tests/engine/test_executor.py +++ /dev/null @@ -1,111 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -from typing import Any, Callable, Optional, Union - -import pytest - -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.llm_engine import LLMEngine -from vllm.executor.uniproc_executor import UniProcExecutor -from vllm.sampling_params import SamplingParams - - -class Mock: - ... - - -class CustomUniExecutor(UniProcExecutor): - - def collective_rpc(self, - method: Union[str, Callable], - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None) -> list[Any]: - # Drop marker to show that this was run - with open(".marker", "w"): - ... 
- return super().collective_rpc(method, timeout, args, kwargs) - - -CustomUniExecutorAsync = CustomUniExecutor - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor_type_checking(model): - with pytest.raises(ValueError): - engine_args = EngineArgs(model=model, - distributed_executor_backend=Mock) - LLMEngine.from_engine_args(engine_args) - with pytest.raises(ValueError): - engine_args = AsyncEngineArgs(model=model, - distributed_executor_backend=Mock) - AsyncLLMEngine.from_engine_args(engine_args) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor(model, tmp_path): - cwd = os.path.abspath(".") - os.chdir(tmp_path) - try: - assert not os.path.exists(".marker") - - engine_args = EngineArgs( - model=model, - distributed_executor_backend=CustomUniExecutor, - enforce_eager=True, # reduce test time - ) - engine = LLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams(max_tokens=1) - - engine.add_request("0", "foo", sampling_params) - engine.step() - - assert os.path.exists(".marker") - finally: - os.chdir(cwd) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor_async(model, tmp_path): - cwd = os.path.abspath(".") - os.chdir(tmp_path) - try: - assert not os.path.exists(".marker") - - engine_args = AsyncEngineArgs( - model=model, - distributed_executor_backend=CustomUniExecutorAsync, - enforce_eager=True, # reduce test time - ) - engine = AsyncLLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams(max_tokens=1) - - async def t(): - stream = await engine.add_request("0", "foo", sampling_params) - async for x in stream: - ... - - asyncio.run(t()) - - assert os.path.exists(".marker") - finally: - os.chdir(cwd) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_respect_ray(model): - # even for TP=1 and PP=1, - # if users specify ray, we should use ray. - # users might do this if they want to manage the - # resources using ray. 
- engine_args = EngineArgs( - model=model, - distributed_executor_backend="ray", - enforce_eager=True, # reduce test time - ) - engine = LLMEngine.from_engine_args(engine_args) - assert engine.model_executor.uses_ray diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py deleted file mode 100644 index b5381b61a020..000000000000 --- a/tests/engine/test_multiproc_workers.py +++ /dev/null @@ -1,179 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -from concurrent.futures import ThreadPoolExecutor -from functools import partial -from time import sleep -from typing import Any - -import pytest - -from vllm.config import VllmConfig -from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, - ResultHandler, WorkerMonitor) -from vllm.worker.worker_base import WorkerWrapperBase - - -class DummyWorkerWrapper(WorkerWrapperBase): - """Dummy version of vllm.worker.worker.Worker""" - - def worker_method(self, worker_input: Any) -> tuple[int, Any]: - sleep(0.05) - - if isinstance(worker_input, Exception): - # simulate error case - raise worker_input - - return self.rpc_rank, input - - -def _start_workers() -> tuple[list[ProcessWorkerWrapper], WorkerMonitor]: - result_handler = ResultHandler() - vllm_config = VllmConfig() - workers = [ - ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config, - rank) for rank in range(8) - ] - - worker_monitor = WorkerMonitor(workers, result_handler) - assert not worker_monitor.is_alive() - - result_handler.start() - worker_monitor.start() - assert worker_monitor.is_alive() - - return workers, worker_monitor - - -def test_local_workers() -> None: - """Test workers with sync task submission""" - - workers, worker_monitor = _start_workers() - - def execute_workers(worker_input: str) -> None: - worker_outputs = [ - worker.execute_method("worker_method", worker_input) - for worker in workers - ] - - for rank, output in enumerate(worker_outputs): - assert output.get() == (rank, input) - - executor = ThreadPoolExecutor(max_workers=4) - - # Test concurrent submission from different threads - futures = [ - executor.submit(partial(execute_workers, f"thread {thread_num}")) - for thread_num in range(4) - ] - - for future in futures: - future.result() - - # Test error case - exception = ValueError("fake error") - result = workers[0].execute_method("worker_method", exception) - try: - result.get() - pytest.fail("task should have failed") - except Exception as e: - assert isinstance(e, ValueError) - assert str(e) == "fake error" - - # Test cleanup when a worker fails - assert worker_monitor.is_alive() - workers[3].process.kill() - - # Other workers should get shut down here - worker_monitor.join(20) - - # Ensure everything is stopped - assert not worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = workers[0].execute_method("worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) - - -def test_local_workers_clean_shutdown() -> None: - """Test clean shutdown""" - - workers, worker_monitor = _start_workers() - - assert worker_monitor.is_alive() - assert all(worker.process.is_alive() for worker in workers) - - # Clean shutdown - worker_monitor.close() - - worker_monitor.join(20) - - # Ensure everything is stopped - assert not 
worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = workers[0].execute_method("worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) - - -@pytest.mark.asyncio -async def test_local_workers_async() -> None: - """Test local workers with async task submission""" - - workers, worker_monitor = _start_workers() - - async def execute_workers(worker_input: str) -> None: - worker_coros = [ - worker.execute_method_async("worker_method", worker_input) - for worker in workers - ] - - results = await asyncio.gather(*worker_coros) - for rank, result in enumerate(results): - assert result == (rank, input) - - tasks = [ - asyncio.create_task(execute_workers(f"task {task_num}")) - for task_num in range(4) - ] - - for task in tasks: - await task - - # Test error case - exception = ValueError("fake error") - try: - _result = await workers[0].execute_method_async( - "worker_method", exception) - pytest.fail("task should have failed") - except Exception as e: - assert isinstance(e, ValueError) - assert str(e) == "fake error" - - # Test cleanup when a worker fails - assert worker_monitor.is_alive() - workers[3].process.kill() - - # Other workers should get shut down here - worker_monitor.join(20) - - # Ensure everything is stopped - assert not worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = await workers[0].execute_method_async( - "worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) diff --git a/tests/engine/test_options.py b/tests/engine/test_options.py deleted file mode 100644 index 42e88e84770a..000000000000 --- a/tests/engine/test_options.py +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from contextlib import nullcontext - -import pytest - -from vllm.entrypoints.llm import LLM -from vllm.sampling_params import SamplingParams - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_skip_tokenizer_initialization(model: str): - # This test checks if the flag skip_tokenizer_init skips the initialization - # of tokenizer and detokenizer. The generated output is expected to contain - # token ids. 
- llm = LLM( - model=model, - skip_tokenizer_init=True, - enforce_eager=True, - ) - sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) - - with pytest.raises(ValueError, match="cannot pass text prompts when"): - llm.generate("abc", sampling_params) - - outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, - sampling_params=sampling_params) - assert len(outputs) > 0 - completions = outputs[0].outputs - assert len(completions) > 0 - assert completions[0].text == "" - assert completions[0].token_ids - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -@pytest.mark.parametrize("enable_prompt_embeds", [True, False]) -def test_enable_prompt_embeds(hf_runner, model: str, - enable_prompt_embeds: bool): - prompt = "abc" - - with hf_runner(model) as hf_model: - token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids - token_ids = token_ids.to(hf_model.model.device) - - embed_layer = hf_model.model.get_input_embeddings() - prompt_embeds = embed_layer(token_ids).squeeze(0) - - ctx = (nullcontext() if enable_prompt_embeds else pytest.raises( - ValueError, match="set `--enable-prompt-embeds`")) - - llm = LLM( - model=model, - enable_prompt_embeds=enable_prompt_embeds, - enforce_eager=True, - ) - - with ctx: - llm.generate({"prompt_embeds": prompt_embeds}) diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py index 9c62761d78af..9eb3dfc09224 100644 --- a/tests/engine/test_short_mm_context.py +++ b/tests/engine/test_short_mm_context.py @@ -25,6 +25,7 @@ def test_context_length_too_short(vllm_runner, image_assets, model): model, max_model_len=128, # LLaVA has a feature size of 576 enforce_eager=True, + load_format="dummy", ) with vllm_model: diff --git a/tests/engine/test_stop_checker.py b/tests/engine/test_stop_checker.py deleted file mode 100644 index 34f4cb13ab0a..000000000000 --- a/tests/engine/test_stop_checker.py +++ /dev/null @@ -1,225 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -from transformers import AutoTokenizer - -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.reasoning import ReasoningParser -from vllm.sampling_params import SamplingParams -from vllm.sequence import Sequence, SequenceStatus - -REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" - - -class MockReasoningParser(ReasoningParser): - """Mock reasoning parser for testing purposes.""" - - def __init__(self, - tokenizer: AutoTokenizer, - reasoning_active: bool = False): - super().__init__(tokenizer) - self.reasoning_active = reasoning_active - - def is_reasoning_end(self, input_ids: list[int]) -> bool: - return not self.reasoning_active - - def extract_content_ids(self, input_ids: list[int]) -> list[int]: - return input_ids - - -class MockSequence(Sequence): - """Mock sequence for testing purposes.""" - - def __init__(self, token_ids, output_text="test_output", eos_token_id=0): - self.token_ids = token_ids - self.output_text = output_text - self.eos_token_id = eos_token_id - self.status = SequenceStatus.RUNNING - self.stop_reason = None - - def get_token_ids(self): - return self.token_ids - - def get_last_token_id(self): - return self.token_ids[-1] if self.token_ids else None - - def get_len(self): - return len(self.token_ids) - - def get_output_len(self): - return len(self.token_ids) - 1 # Simulating prompt + outputs - - -@pytest.fixture -def deepseek_r1_qwen_tokenizer(): - return 
AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) - - -@pytest.fixture -def stop_checker(): - return StopChecker(max_model_len=10) - - -@pytest.fixture -def stop_checker_with_reasoner(): - reasoner = MockReasoningParser(deepseek_r1_qwen_tokenizer) - return StopChecker(max_model_len=10, reasoner=reasoner) - - -def test_eos_token_stopping(stop_checker): - """Test sequence stopping when EOS token is encountered.""" - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams() - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - - -def test_ignore_eos(stop_checker): - """Test sequence continuing when EOS token is ignored.""" - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams(ignore_eos=True) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.RUNNING - - -def test_min_tokens(stop_checker): - """Test min_tokens prevents early stopping.""" - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams(min_tokens=3) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.RUNNING - - -def test_stop_token_ids(stop_checker): - """Test sequence stopping with custom stop token IDs.""" - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(stop_token_ids=[3]) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.stop_reason == 3 - - -def test_stop_strings(stop_checker): - """Test sequence stopping with stop strings.""" - seq = MockSequence(token_ids=[1, 2, 3], - output_text="test output with STOP", - eos_token_id=0) - sampling_params = SamplingParams(stop=["STOP"]) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.stop_reason == "STOP" - assert "STOP" not in seq.output_text # Default behavior removes stop string - - -def test_include_stop_str_in_output(stop_checker): - """Test keeping stop strings in output.""" - seq = MockSequence(token_ids=[1, 2, 3], - output_text="test output with STOP", - eos_token_id=0) - sampling_params = SamplingParams(stop=["STOP"], - include_stop_str_in_output=True) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=5, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert "STOP" in seq.output_text - - -def test_max_tokens(stop_checker): - """Test sequence stopping at max_tokens.""" - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(max_tokens=2) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_LENGTH_CAPPED - - -def test_max_model_len(stop_checker): - """Test sequence stopping at max_model_len.""" - seq = MockSequence(token_ids=list(range(11)), - eos_token_id=0) # 11 tokens, max is 10 - sampling_params = SamplingParams() - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_LENGTH_CAPPED - - -def test_reasoning_skip_stops(stop_checker_with_reasoner): - """Test that 
stop tokens and strings are ignored during reasoning.""" - # Set reasoning_active to True to simulate being in reasoning mode - stop_checker_with_reasoner.reasoner.reasoning_active = True - - # Test with stop token - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(stop_token_ids=[3]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=1, sampling_params=sampling_params) - assert seq.status == SequenceStatus.RUNNING - - # Test with stop string - seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP") - sampling_params = SamplingParams(stop=["STOP"]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=4, sampling_params=sampling_params) - assert seq.status == SequenceStatus.RUNNING - - # But EOS token still stops the sequence - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams() - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=1, sampling_params=sampling_params) - assert seq.status == SequenceStatus.FINISHED_STOPPED - - -def test_reasoning_end_enables_stops(stop_checker_with_reasoner): - """Test that stop tokens work after reasoning ends.""" - # Set reasoning_active to False to simulate being out of reasoning mode - stop_checker_with_reasoner.reasoner.reasoning_active = False - - # Test with stop token - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(stop_token_ids=[3]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=1, sampling_params=sampling_params) - assert seq.status == SequenceStatus.FINISHED_STOPPED - - # Test with stop string - seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP") - sampling_params = SamplingParams(stop=["STOP"]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=4, sampling_params=sampling_params) - assert seq.status == SequenceStatus.FINISHED_STOPPED diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index 684407cd6ee9..624acd5ffde7 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -81,13 +81,3 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): more_args = ["--max-num-seqs", "64"] run_test(more_args) - - -@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, - more_args): - """Run with the V0 Engine.""" - - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - run_test(more_args) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py deleted file mode 100644 index a9632ce54eac..000000000000 --- a/tests/v1/engine/test_output_processor.py +++ /dev/null @@ -1,1000 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -import time -from typing import Optional - -import pytest - -from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST, - NUM_SAMPLE_LOGPROBS_UNDER_TEST, - STOP_STRINGS, - DummyOutputProcessorTestVectors, - MockEngineCore) -from vllm.outputs import CompletionOutput, RequestOutput -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.sequence import PromptLogprobs, SampleLogprobs -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.v1.engine import EngineCoreRequest -from 
vllm.v1.engine.output_processor import (OutputProcessor, - RequestOutputCollector) -from vllm.v1.metrics.stats import IterationStats - - -def _ref_convert_id_to_token( - tokenizer: AnyTokenizer, - token_id: int, -) -> str: - """Reference impl of logprobs detokenization. - - Args: - tokenizer: tokenizer used by the model under test - token_id: convert this token id - - Returns: - String representation of input token id - """ - return tokenizer.decode([token_id]) or "" - - -@pytest.mark.parametrize( - "request_output_kind", - [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -def test_incremental_detokenization(request_output_kind: RequestOutputKind, - dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, - log_stats=False) - engine_core = MockEngineCore( - tokens_list=dummy_test_vectors.generation_tokens) - - # Make N requests. - requests = [ - EngineCoreRequest(request_id=f"request-{idx}", - prompt_token_ids=prompt_tokens, - mm_features=None, - eos_token_id=None, - arrival_time=0, - lora_request=None, - cache_salt=None, - data_parallel_rank=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=request_output_kind, - stop=[], - include_stop_str_in_output=False, - ), - pooling_params=None) - for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) - ] - - # Add requests to the detokenizer. - for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): - output_processor.add_request(request, prompt) - - gen_strings = {} - gen_tokens = {} - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - processed_outputs = output_processor.process_outputs(outputs) - request_outputs = processed_outputs.request_outputs - requests_to_abort = processed_outputs.reqs_to_abort - assert len(requests_to_abort) == 0 - - # Update tracking. - for request_output in request_outputs: - request_id = request_output.request_id - new_text = request_output.outputs[0].text - new_tokens = request_output.outputs[0].token_ids - if request_id not in gen_strings: - gen_strings[request_id] = new_text - gen_tokens[request_id] = new_tokens - else: - gen_strings[request_id] += new_text - gen_tokens[request_id].extend(new_tokens) - - # Confirmed tracked values matches what we expected. 
- for idx, (ref_gen_str, ref_gen_toks) in enumerate( - zip(dummy_test_vectors.generation_strings, - dummy_test_vectors.generation_tokens)): - gen_str = gen_strings[f"request-{idx}"] - gen_toks = gen_tokens[f"request-{idx}"] - - assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" - assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" - - assert output_processor.get_num_unfinished_requests() == 0 - assert not output_processor.has_unfinished_requests() - - -def _validate_logprobs( - gen_tokens: dict[str, list[int]], - gen_logprobs: dict[str, Optional[SampleLogprobs]], - gen_prompt_logprobs: dict[str, Optional[PromptLogprobs]], - gen_cumulative_logprob: dict[str, float], - dtv: DummyOutputProcessorTestVectors, - request_id_list: list[str], - num_sample_logprobs: Optional[int], - num_prompt_logprobs: Optional[int], -) -> None: - for req_idx, req_id in enumerate(request_id_list): - new_tokens = gen_tokens[req_id] - logprobs = gen_logprobs[req_id] - prompt_logprobs = gen_prompt_logprobs[req_id] - cumulative_logprob = gen_cumulative_logprob[req_id] - prompt_token_ids = dtv.prompt_tokens[req_idx] - ref_logprobs = dtv.generation_logprobs[req_idx] - ref_prompt_logprobs = dtv.prompt_logprobs[req_idx] - if num_sample_logprobs is not None: - # Validate sample logprobs - assert logprobs is not None, (f"Request {req_id} requires sample" - " logprobs but sample logprobs are" - " None.") - # Require num sampled tokens to match num - # sampled logprobs - especially important - # to check since the detokenizer can cause - # a request to finish early due to a stop - # string being hit - num_new_tokens = len(new_tokens) - len_sample_logprobs = len(logprobs) - assert num_new_tokens == len_sample_logprobs, ( - f"Request {req_id} has {num_new_tokens}" - " completion tokens but has" - f" {len_sample_logprobs} sample logprobs.") - ref_cumulative_logprob = 0.0 - for idx, (sampled_token, - pos_logprob_dict) in enumerate(zip(new_tokens, - logprobs)): - # Break out the reference log probability value & - # logprob token id tensors associated with this - # position in the completion. Also break out the - # sampled token ranks - (ref_pos_logprob_toks, ref_pos_logprob_vals, - ref_sampled_token_rank) = ref_logprobs[idx] - # For each position in the completion sequence, - # ensure the actual sampled token is among the - # logprobs - assert sampled_token in pos_logprob_dict, ( - f"Sampled token {sampled_token} not" - f" present in logprob at index {idx}") - - # Validate number of sample logprobs - num_lp_toks = len(pos_logprob_dict) - assert (num_lp_toks == num_sample_logprobs - or num_lp_toks == num_sample_logprobs + - 1), ("Valid numbers of sample logprobs are" - f" {num_sample_logprobs} or" - f" {num_sample_logprobs+1} but" - f" {num_lp_toks} logprobs found at" - f" position {idx}. Logprobs dict:" - f" {pos_logprob_dict}") - - # Validate sampled token logprob rank - smp_lp = pos_logprob_dict[sampled_token] - smp_lp_rank = smp_lp.rank - assert (ref_sampled_token_rank == smp_lp_rank), ( - "Sampled token logprob rank" - f" {smp_lp_rank} does not match" - " correct value" - f" {ref_sampled_token_rank}" - f" in Logprob {smp_lp}") - - # Validate that the logprob processor yields - # the correct log probabilities and valid - # rankings - rank_one_appears = False - for jdx in range(1, len(ref_pos_logprob_toks)): - # Iterate over the (logprob val,logprob tok id) - # pairs expected by the test fixture at this - # position in the completion. 
- ref_lp_val = ref_pos_logprob_vals[jdx] - ref_tok_id = ref_pos_logprob_toks[jdx] - assert ref_tok_id in pos_logprob_dict, ( - f"Expected token {ref_tok_id} to be" - f" in logprob dict but it is not.") - - # Extract actually-generated logprob - # info - lp = pos_logprob_dict[ref_tok_id] - lp_val = lp.logprob - lp_rank = lp.rank - - # A "top" (rank 1) logprob must be - # present - rank_one_appears = (True - if lp_rank == 1 else rank_one_appears) - - # Rank must be >= 1 - assert lp_rank >= 1, (f"Logprob {lp} has invalid" - f" rank {lp_rank} < 1." - f" Logprob dict: {pos_logprob_dict}") - - # Validate log probability - assert math.isclose(lp_val, ref_lp_val), ( - f"Token id {ref_tok_id} appears in logprobs dict" - f" at position {idx} in completion with log" - f" probability {lp_val} but {ref_lp_val} was" - f" expected. Logprob: {lp}") - - assert rank_one_appears, (f"No Logprob has rank 1" - " in the following Logprob" - f" dict: {pos_logprob_dict}") - - # Validate logprobs detokenization - for lp_tok in pos_logprob_dict: - # Confirm that sample logprob decoded token matches - # the logprob token id at this sequence position - decoded_token = pos_logprob_dict[lp_tok].decoded_token - ref_decoded_token = _ref_convert_id_to_token( - dtv.tokenizer, lp_tok) - assert decoded_token == ref_decoded_token, ( - f"Sampled logprob token id {lp_tok} decodes to" - f" {ref_decoded_token} but Logprob decoded" - f" token is {decoded_token} instead" - f" (at position {idx})") - - ref_cumulative_logprob += pos_logprob_dict[ - sampled_token].logprob - # Assert that cumulative logprobs are correct - assert math.isclose(cumulative_logprob, ref_cumulative_logprob) - else: - # Sample logprobs disabled for this request - assert logprobs is None - assert cumulative_logprob is None - - if num_prompt_logprobs is not None: - # Validate prompt logprobs - assert prompt_logprobs is not None, ( - f"Request {req_id} requires prompt" - " logprobs but prompt logprobs are" - " None.") - # Require num prompt tokens to match num - # prompt logprobs - num_prompt_tokens = len(prompt_token_ids) - len_prompt_logprobs = len(prompt_logprobs) - assert num_prompt_tokens == len_prompt_logprobs, ( - f"Request {req_id} has {num_prompt_tokens}" - " prompt tokens but has" - f" {len_prompt_logprobs} prompt logprobs.") - # First prompt logprob is None - first_plp_dict = prompt_logprobs[0] - assert first_plp_dict is None, ( - f"Request {req_id} first prompt logprob" - f" should be None but has following value" - f" instead: {first_plp_dict}") - # Break out the reference prompt log prob value & - # logprob token id matrices for the whole prompt. - # Also break out the prompt token rank vector - (ref_prompt_logprob_toks, ref_prompt_logprob_vals, - ref_prompt_token_ranks) = ref_prompt_logprobs - for idx, (prompt_token, pos_logprob_dict) in enumerate( - zip(prompt_token_ids[1:], prompt_logprobs[1:])): - - # Break out the reference prompt log prob value - # vector, prompt logprob token id vector, and - # prompt token rank at the current position. 
- (ref_pos_prompt_logprob_toks, ref_pos_prompt_logprob_vals, - ref_pos_prompt_token_rank) = (ref_prompt_logprob_toks[idx, :], - ref_prompt_logprob_vals[idx, :], - ref_prompt_token_ranks[idx]) - - # For each position in the prompt sequence, - # ensure the actual prompt token is among the - # logprobs - assert prompt_token in pos_logprob_dict, ( - f"Prompt token {prompt_token} not" - f" present in logprob at index {idx}") - # Validate number of prompt logprobs - num_plp_toks = len(pos_logprob_dict) - assert (num_plp_toks == num_prompt_logprobs - or num_plp_toks == num_prompt_logprobs + - 1), ("Valid numbers of prompt logprobs are" - f" {num_prompt_logprobs} or" - f" {num_prompt_logprobs+1} but" - f" {num_plp_toks} logprobs found at" - f" position {idx}. Logprobs dict:" - f" {pos_logprob_dict}") - - # Validate prompt token logprob rank - prmpt_tok_lp = pos_logprob_dict[prompt_token] - prmpt_tok_lp_rank = prmpt_tok_lp.rank - ref_prmpt_tok_lp_rank = ref_pos_prompt_token_rank - assert (ref_prmpt_tok_lp_rank == prmpt_tok_lp_rank), ( - "Prompt token logprob rank" - f" {prmpt_tok_lp_rank} does not match" - " correct value" - f" {ref_prmpt_tok_lp_rank}" - f" in Logprob {prmpt_tok_lp}") - - # Validate that the logprob processor yields - # the correct prompt log probs and valid - # rankings - rank_one_appears = False - for jdx in range(1, len(ref_pos_prompt_logprob_toks)): - # Iterate over the (logprob val,logprob tok id) - # pairs expected by the test fixture at this - # position in the completion. - ref_plp_val = float(ref_pos_prompt_logprob_vals[jdx]) - ref_tok_id = int(ref_pos_prompt_logprob_toks[jdx]) - assert ref_tok_id in pos_logprob_dict, ( - f"Expected token {ref_tok_id} to be" - f" in logprob dict but it is not.") - - # Extract actually-generated logprob - # info - plp = pos_logprob_dict[ref_tok_id] - plp_val = plp.logprob - plp_rank = plp.rank - - # A "top" (rank 1) logprob must be - # present - rank_one_appears = (True - if plp_rank == 1 else rank_one_appears) - - # Rank must be >= 1 - assert plp_rank >= 1, ( - f"Logprob {plp} has invalid" - f" rank {plp_rank} < 1." - f" Logprob dict: {pos_logprob_dict}") - - # Validate log probability - assert math.isclose(plp_val, ref_plp_val), ( - f"Token id {ref_tok_id} appears in logprobs dict" - f" at position {idx} in completion with log" - f" probability {plp_val} but {ref_plp_val} was" - f" expected. 
Logprob: {plp}") - - assert rank_one_appears, (f"No Logprob has rank 1" - " in the following Logprob" - f" dict: {pos_logprob_dict}") - - # Validate prompt logprob detokenization - for plp_tok in pos_logprob_dict: - # Confirm that prompt logprob decoded token matches - # the logprob token id at this sequence position - decoded_token = pos_logprob_dict[plp_tok].decoded_token - ref_decoded_token = _ref_convert_id_to_token( - dtv.tokenizer, plp_tok) - assert decoded_token == ref_decoded_token, ( - f"Prompt logprob token id {plp_tok} decodes to" - f" {ref_decoded_token} but Logprob decoded" - f" token is {decoded_token} instead" - f" (at position {idx})") - else: - # Prompt logprobs disabled for this request - assert prompt_logprobs is None - - -@pytest.mark.parametrize( - "request_output_kind", - [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -@pytest.mark.parametrize("num_sample_logprobs", - [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST]) -@pytest.mark.parametrize("num_prompt_logprobs", - [None, NUM_PROMPT_LOGPROBS_UNDER_TEST]) -def test_logprobs_processor(request_output_kind: RequestOutputKind, - num_sample_logprobs: Optional[int], - num_prompt_logprobs: Optional[int], - dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, - log_stats=False) - engine_core = MockEngineCore( - tokens_list=dummy_test_vectors.generation_tokens, - generated_logprobs_raw=None if num_sample_logprobs is None else - dummy_test_vectors.generation_logprobs, - prompt_logprobs_raw=None - if num_prompt_logprobs is None else dummy_test_vectors.prompt_logprobs) - - # Make N requests. - request_id_list = [ - f"request-{idx}" - for idx in range(len(dummy_test_vectors.prompt_strings)) - ] - requests = [ - EngineCoreRequest(request_id=request_id_list[idx], - prompt_token_ids=prompt_tokens, - mm_features=None, - eos_token_id=None, - arrival_time=0, - lora_request=None, - cache_salt=None, - data_parallel_rank=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=request_output_kind, - stop=[], - include_stop_str_in_output=False, - logprobs=num_sample_logprobs, - prompt_logprobs=num_prompt_logprobs, - ), - pooling_params=None) - for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) - ] - - # Add requests to the detokenizer. - for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): - output_processor.add_request(request, prompt) - - gen_tokens = {} - gen_logprobs = {} - gen_prompt_logprobs = {} - gen_cumulative_logprobs = {} - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the logprobs processor. - processed_outputs = output_processor.process_outputs(outputs) - request_outputs = processed_outputs.request_outputs - requests_to_abort = processed_outputs.reqs_to_abort - assert len(requests_to_abort) == 0 - - # Update tracking. 
- for request_output in request_outputs: - request_id = request_output.request_id - new_tokens = request_output.outputs[0].token_ids - prompt_logprobs = request_output.prompt_logprobs - logprobs = request_output.outputs[0].logprobs - gen_cumulative_logprobs[request_id] = request_output.outputs[ - 0].cumulative_logprob - if request_id not in gen_logprobs: - # Start tracking sample and prompt logprobs for this request - gen_tokens[request_id] = new_tokens - gen_logprobs[request_id] = logprobs - gen_prompt_logprobs[request_id] = prompt_logprobs - else: - # Extend logprobs tracker - gen_tokens[request_id].extend(new_tokens) - lp = gen_logprobs[request_id] - plp = gen_prompt_logprobs[request_id] - if lp: - lp.extend(logprobs) - if plp: - plp.extend(prompt_logprobs) - - # Confirmed tracked logprobs match what we expect - _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs, - gen_cumulative_logprobs, dummy_test_vectors, - request_id_list, num_sample_logprobs, - num_prompt_logprobs) - - assert output_processor.get_num_unfinished_requests() == 0 - assert not output_processor.has_unfinished_requests() - - -@pytest.mark.parametrize( - "include_stop_str_in_output,stop_token_type,ignore_eos,num_sample_logprobs", - [(False, "stop_token_ids", False, None), - (True, "stop_token_ids", False, None), - (False, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST), - (True, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST), - (False, "eos_token_id", False, None), (True, "eos_token_id", False, None), - (False, "eos_token_id", True, None)]) -def test_stop_token(include_stop_str_in_output: bool, - num_sample_logprobs: Optional[int], stop_token_type: str, - ignore_eos: bool, dummy_test_vectors): - """Test output processor EOS/stop token handling. - - Send mock engine core request to mock engine core and pass core outputs - to output processor. Validate output processor tokens, text and - (if enabled) sample logprobs. Batch-size one. - - The test emulates a scenario where a model outputs text tokens followed - by two identical control tokens: - ... - - If EOS is under test, the control tokens are EOS; otherwise, they are - some other token id. - - Test behavior: - - * If EOS is under test and `ignore_eos=True`, the detokenized string - should be ... and the finish - reason should be "length" (i.e. no stop occurs) - - * else, if `include_stop_str_in_output==True`, the detokenized - string should be ... and the finish - reason should be "stop" (i.e. first control token causes stop - and is represented in output text) - - * else, the detokenized string should be - ... and the finish reason should be "stop" - (i.e. first control token causes stop but is not represented - in output text.) - - Note: some test details are tuned for meta-llama/Llama-3.2-1B, - another model should work only if the test is modified. 
- - Args: - include_stop_str_in_output: stop token str appears in output text - num_sample_logprobs: number of sample logprobs (`None` for no logprobs) - stop_token_type: "eos_token_id" for EOS, "stop_token_ids" for stop token - ignore_eos: if True, EOS stops are disabled - dummy_test_vectors: dummy engine core outputs and other data structures - """ - model_id = dummy_test_vectors.tokenizer.name_or_path - if model_id != 'meta-llama/Llama-3.2-1B': - raise AssertionError("Test requires meta-llama/Llama-3.2-1B but " - f"{model_id} is in use.") - do_logprobs = num_sample_logprobs is not None - # EOS under test; if False, stop_token_ids under test - is_eos_test = stop_token_type == "eos_token_id" - # EOS under test but ignore_eos enabled - is_eos_ignore_test = is_eos_test and ignore_eos - eos_token_id = ( - dummy_test_vectors.tokenizer.eos_token_id if is_eos_test else None - ) # '<|end_of_text|>' - stop_token_ids = [128009] if not is_eos_test else None # '<|eot_id|>' - - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, - log_stats=False) - # Dummy engine core outputs, with control tokens suffixed to test stops - suffix_token = ([eos_token_id] if is_eos_test else stop_token_ids) - assert suffix_token is not None and isinstance(suffix_token[0], int) - generation_string = dummy_test_vectors.generation_strings[0] - generation_tokens = (dummy_test_vectors.generation_tokens[0] + - 2 * suffix_token) - if do_logprobs: - generation_logprobs = ( - dummy_test_vectors.generation_logprobs[0] + - 2 * [dummy_test_vectors.generation_logprobs[0][-1]]) - prompt_string = dummy_test_vectors.prompt_strings[0] - prompt_tokens = dummy_test_vectors.prompt_tokens[0] - engine_core = MockEngineCore( - tokens_list=[generation_tokens], - generated_logprobs_raw=[generation_logprobs] if do_logprobs else None, - prompt_logprobs_raw=None, - eos_token_id=eos_token_id, - stop_token_ids=stop_token_ids, - ignore_eos=ignore_eos) - - # Make request. - request_id = "request-0" - request = EngineCoreRequest( - request_id=request_id, - prompt_token_ids=prompt_tokens, - mm_features=None, - eos_token_id=eos_token_id, - arrival_time=0, - lora_request=None, - cache_salt=None, - data_parallel_rank=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=[], - stop_token_ids=stop_token_ids, - include_stop_str_in_output=include_stop_str_in_output, - logprobs=num_sample_logprobs, - prompt_logprobs=None, - ignore_eos=ignore_eos, - ), - pooling_params=None) - - # Add request to the detokenizer. - output_processor.add_request(request, prompt_string) - - # Loop over engine core steps; run output processor - gen_string = "" - gen_tokens = [] - gen_logprobs = [] - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - processed_outputs = output_processor.process_outputs(outputs) - request_outputs = processed_outputs.request_outputs - assert len(request_outputs) == 1 - # Stop token does not rely on abort - assert not processed_outputs.reqs_to_abort - - # Update tracking. 
- request_output = request_outputs[0] - if request_output.finished: - finish_reason = ("length" if is_eos_ignore_test else "stop") - assert request_output.outputs[0].finish_reason == finish_reason - - gen_string += request_output.outputs[0].text - gen_tokens.extend(request_output.outputs[0].token_ids) - if do_logprobs: - gen_logprobs.extend(request_output.outputs[0].logprobs) - - # Validate generated text - control_token = '<|end_of_text|>' if is_eos_test else '<|eot_id|>' - if is_eos_ignore_test: - # Length-based stop; expect full string - ref_str = generation_string + 2 * control_token - elif include_stop_str_in_output: - # Stop token triggered; include in output - ref_str = generation_string + control_token - else: - # Stop token triggered but not in output - ref_str = generation_string - assert gen_string == ref_str, (f"{gen_string=}, {ref_str=}") - - if do_logprobs: - # Validate number of sample logprobs - num_tokens = len(gen_tokens) - num_logprobs = len(gen_logprobs) - assert num_tokens == num_logprobs, ( - f"Token count ({num_tokens}) != logprobs count ({num_logprobs})") - - # Check requests are finished - assert output_processor.get_num_unfinished_requests() == 0 - assert not output_processor.has_unfinished_requests() - - -@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -@pytest.mark.parametrize("num_sample_logprobs", - [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST]) -def test_stop_string(include_stop_str_in_output: bool, - num_sample_logprobs: Optional[int], dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, - log_stats=False) - engine_core = MockEngineCore( - tokens_list=dummy_test_vectors.generation_tokens, - generated_logprobs_raw=dummy_test_vectors.generation_logprobs - if num_sample_logprobs else None, - prompt_logprobs_raw=None) - - # Make N requests. - request_id_list = [ - f"request-{idx}" - for idx in range(len(dummy_test_vectors.prompt_strings)) - ] - requests = [ - EngineCoreRequest( - request_id=request_id_list[idx], - prompt_token_ids=prompt_tokens, - mm_features=None, - eos_token_id=None, - arrival_time=0, - lora_request=None, - cache_salt=None, - data_parallel_rank=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=STOP_STRINGS, - include_stop_str_in_output=include_stop_str_in_output, - logprobs=num_sample_logprobs, - prompt_logprobs=None, - ), - pooling_params=None) - for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) - ] - - # Add requests to the detokenizer. - for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): - output_processor.add_request(request, prompt) - - gen_strings = {} - gen_tokens = {} - gen_logprobs = {} - gen_prompt_logprobs = {} - gen_cumulative_logprobs = {} - aborted = [] - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - processed_outputs = output_processor.process_outputs(outputs) - request_outputs = processed_outputs.request_outputs - requests_to_abort = processed_outputs.reqs_to_abort - for request_output in request_outputs: - # If aborted, we should not get a request output. - assert request_output.request_id not in aborted - aborted.extend(requests_to_abort) - - # Update tracking. 
- for request_output in request_outputs: - if request_output.finished: - assert request_output.outputs[0].finish_reason == "stop" - - request_id = request_output.request_id - new_text = request_output.outputs[0].text - new_tokens = request_output.outputs[0].token_ids - prompt_logprobs = request_output.prompt_logprobs - logprobs = request_output.outputs[0].logprobs - gen_cumulative_logprobs[request_id] = request_output.outputs[ - 0].cumulative_logprob - if request_id not in gen_strings: - gen_strings[request_id] = new_text - gen_tokens[request_id] = new_tokens - gen_logprobs[request_id] = logprobs - gen_prompt_logprobs[request_id] = prompt_logprobs - else: - gen_strings[request_id] += new_text - gen_tokens[request_id].extend(new_tokens) - lp = gen_logprobs[request_id] - plp = gen_prompt_logprobs[request_id] - if lp: - lp.extend(logprobs) - if plp: - plp.extend(prompt_logprobs) - - # Confirmed tracked values matches what we expected. - for idx, (ref_gen_str, stop_str) in enumerate( - zip(dummy_test_vectors.generation_strings, STOP_STRINGS)): - - # Request should be aborted. - request_id = f"request-{idx}" - assert request_id in aborted - - # Collected values that were generated. - gen_str = gen_strings[request_id] - - # Construct reference strings. - stop_str_idx = ref_gen_str.find(stop_str) - ref_str_exc_stop = ref_gen_str[:stop_str_idx] - ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str - - if include_stop_str_in_output: - assert gen_str == ref_str_inc_stop, ( - f"{gen_str=}, {ref_str_inc_stop=}") - else: - assert gen_str == ref_str_exc_stop, ( - f"{gen_str=}, {ref_str_exc_stop=}") - - # Confirmed tracked logprobs match what we expect - _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs, - gen_cumulative_logprobs, dummy_test_vectors, - request_id_list, num_sample_logprobs, None) - - assert output_processor.get_num_unfinished_requests() == 0 - assert not output_processor.has_unfinished_requests() - - -def test_iteration_stats(dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, - log_stats=True) - engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) - engine_core_timestamp = time.monotonic() - - # Make N requests. - requests = [ - EngineCoreRequest( - request_id=f"request-{idx}", - prompt_token_ids=prompt_tokens, - mm_features=None, - eos_token_id=None, - arrival_time=0, - lora_request=None, - cache_salt=None, - data_parallel_rank=None, - sampling_params=SamplingParams(), - pooling_params=None, - ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) - ] - - # Add all requests except one to the OutputProcessor. - num_active = len(dummy_test_vectors.generation_tokens) - 1 - for request in requests[:num_active]: - output_processor.add_request(request, None) - inactive_request = requests[num_active] - - # First iteration has 2 prefills. - outputs = engine_core.get_outputs()[:num_active] - iteration_stats = IterationStats() - output_processor.process_outputs(outputs, engine_core_timestamp, - iteration_stats) - total_prompt_tokens = sum([ - len(prompt_tokens) - for prompt_tokens in dummy_test_vectors.prompt_tokens[:num_active] - ]) - - assert iteration_stats.num_prompt_tokens == total_prompt_tokens - assert iteration_stats.num_generation_tokens == num_active - - # Just decodes in this step. 
- outputs = engine_core.get_outputs()[:num_active] - iteration_stats = IterationStats() - output_processor.process_outputs(outputs, engine_core_timestamp, - iteration_stats) - - assert iteration_stats.num_prompt_tokens == 0 - assert iteration_stats.num_generation_tokens == num_active - - # Add a new request - prefill and 2 decodes in this step. - output_processor.add_request(inactive_request, None) - num_active += 1 - outputs = engine_core.get_outputs()[:num_active] - iteration_stats = IterationStats() - output_processor.process_outputs(outputs, engine_core_timestamp, - iteration_stats) - total_prompt_tokens = len(dummy_test_vectors.prompt_tokens[num_active - 1]) - - assert iteration_stats.num_prompt_tokens == total_prompt_tokens - assert iteration_stats.num_generation_tokens == num_active - - # Just decodes in this step. - outputs = engine_core.get_outputs()[:num_active] - iteration_stats = IterationStats() - output_processor.process_outputs(outputs, engine_core_timestamp, - iteration_stats) - - assert iteration_stats.num_prompt_tokens == 0 - assert iteration_stats.num_generation_tokens == num_active - - -@pytest.mark.asyncio -async def test_request_output_collector(): - NUM_REQS = 3 - TEXT = "a" - - def make_outputs() -> list[RequestOutput]: - return [ - RequestOutput( - request_id="my-request-id", - prompt=None, - prompt_token_ids=[1, 2, 3], - prompt_logprobs=None, - outputs=[ - CompletionOutput( - index=0, - text=TEXT, - token_ids=[idx], - cumulative_logprob=(idx + 1 * 1.0), - logprobs=[{ - "a": idx, - "b": idx - }], - finish_reason="length" if - (idx == NUM_REQS - 1) else None, - ) - ], - finished=(idx == NUM_REQS - 1), - ) for idx in range(NUM_REQS) - ] - - collector = RequestOutputCollector(RequestOutputKind.DELTA) - - # CASE 1: Put then get. - outputs = make_outputs() - collector.put(outputs[0]) - output = await collector.get() - assert not collector.ready.is_set() - assert collector.output is None - assert output.outputs[0].text == "a" - assert output.outputs[0].token_ids == [0] - - # CASE 2: 2 puts then get. - num_to_put = 2 - outputs = make_outputs() - for i in range(num_to_put): - collector.put(outputs[i]) - output = await collector.get() - assert not collector.ready.is_set() - assert collector.output is None - - assert not output.finished - # Text, token_ids, and logprobs should get merged. - assert output.outputs[0].text == TEXT * num_to_put - for tok_0, tok_1 in zip(output.outputs[0].token_ids, - list(range(num_to_put))): - assert tok_0 == tok_1 - assert len(output.outputs[0].logprobs) == num_to_put - - # Cumulative logprobs should be the last one. - cumulative_logprob_expected = 1.0 * num_to_put - assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected - - # CASE 3: Put all 3 (including a finished). - num_to_put = 3 - outputs = make_outputs() - for i in range(num_to_put): - collector.put(outputs[i]) - output = await collector.get() - assert not collector.ready.is_set() - assert collector.output is None - - assert output.finished - assert output.outputs[0].finish_reason == "length" - # Text, token_ids, and logprobs should get merged. - assert output.outputs[0].text == TEXT * num_to_put - for tok_0, tok_1 in zip(output.outputs[0].token_ids, - list(range(num_to_put))): - assert tok_0 == tok_1 - assert len(output.outputs[0].logprobs) == num_to_put - - # Cumulative logprobs should be the last one. 
- cumulative_logprob_expected = 1.0 * num_to_put - assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected - - -@pytest.mark.asyncio -async def test_cumulative_output_collector_n(): - """Test collector correctly handles multiple outputs by index.""" - collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE) - outputs = [ - RequestOutput( - request_id="my-request-id", - prompt=None, - prompt_token_ids=[1, 2, 3], - prompt_logprobs=None, - outputs=[ - CompletionOutput( - index=0, - text="a", - token_ids=[0], - cumulative_logprob=None, - logprobs=None, - finish_reason=None, - ), - CompletionOutput( - index=1, - text="b", - token_ids=[1], - cumulative_logprob=None, - logprobs=None, - finish_reason=None, - ), - ], - finished=False, - ), - RequestOutput( - request_id="my-request-id", - prompt=None, - prompt_token_ids=[1, 2, 3], - prompt_logprobs=None, - outputs=[ - CompletionOutput( - index=0, - text="ab", - token_ids=[0, 1], - cumulative_logprob=None, - logprobs=None, - finish_reason=None, - ), - CompletionOutput( - index=2, - text="c", - token_ids=[2], - cumulative_logprob=None, - logprobs=None, - finish_reason=None, - ), - ], - finished=False, - ), - ] - for output in outputs: - collector.put(output) - - # Get the output and check that the text and token_ids are correct. - result = await collector.get() - # We are expecting - # [{index: 0, text: "ab"}, {index: 1, text: "b"}, {index: 2, text: "c"}] - assert len(result.outputs) == 3 - # First is the one where index is 0 - first = [k for k in result.outputs if k.index == 0] - assert len(first) == 1 - assert first[0].text == "ab" - - # Second is the one where index is 1 - second = [k for k in result.outputs if k.index == 1] - assert len(second) == 1 - assert second[0].text == "b" - assert second[0].token_ids == [1] - - # Third is the one where index is 2 - third = [k for k in result.outputs if k.index == 2] - assert len(third) == 1 - assert third[0].text == "c" From 8c2eb56bff4ef0da36d695dcb4f0247a212c3f5e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 11:33:04 -0700 Subject: [PATCH 04/17] revert Signed-off-by: Woosuk Kwon --- tests/v1/engine/test_output_processor.py | 1000 ++++++++++++++++++++++ 1 file changed, 1000 insertions(+) create mode 100644 tests/v1/engine/test_output_processor.py diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py new file mode 100644 index 000000000000..a9632ce54eac --- /dev/null +++ b/tests/v1/engine/test_output_processor.py @@ -0,0 +1,1000 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +import time +from typing import Optional + +import pytest + +from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST, + NUM_SAMPLE_LOGPROBS_UNDER_TEST, + STOP_STRINGS, + DummyOutputProcessorTestVectors, + MockEngineCore) +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.sequence import PromptLogprobs, SampleLogprobs +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.output_processor import (OutputProcessor, + RequestOutputCollector) +from vllm.v1.metrics.stats import IterationStats + + +def _ref_convert_id_to_token( + tokenizer: AnyTokenizer, + token_id: int, +) -> str: + """Reference impl of logprobs detokenization. 
+ + Args: + tokenizer: tokenizer used by the model under test + token_id: convert this token id + + Returns: + String representation of input token id + """ + return tokenizer.decode([token_id]) or "" + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +def test_incremental_detokenization(request_output_kind: RequestOutputKind, + dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, + log_stats=False) + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens) + + # Make N requests. + requests = [ + EngineCoreRequest(request_id=f"request-{idx}", + prompt_token_ids=prompt_tokens, + mm_features=None, + eos_token_id=None, + arrival_time=0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False, + ), + pooling_params=None) + for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) + ] + + # Add requests to the detokenizer. + for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): + output_processor.add_request(request, prompt) + + gen_strings = {} + gen_tokens = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort + assert len(requests_to_abort) == 0 + + # Update tracking. + for request_output in request_outputs: + request_id = request_output.request_id + new_text = request_output.outputs[0].text + new_tokens = request_output.outputs[0].token_ids + if request_id not in gen_strings: + gen_strings[request_id] = new_text + gen_tokens[request_id] = new_tokens + else: + gen_strings[request_id] += new_text + gen_tokens[request_id].extend(new_tokens) + + # Confirmed tracked values matches what we expected. 
+ for idx, (ref_gen_str, ref_gen_toks) in enumerate( + zip(dummy_test_vectors.generation_strings, + dummy_test_vectors.generation_tokens)): + gen_str = gen_strings[f"request-{idx}"] + gen_toks = gen_tokens[f"request-{idx}"] + + assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" + assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" + + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + +def _validate_logprobs( + gen_tokens: dict[str, list[int]], + gen_logprobs: dict[str, Optional[SampleLogprobs]], + gen_prompt_logprobs: dict[str, Optional[PromptLogprobs]], + gen_cumulative_logprob: dict[str, float], + dtv: DummyOutputProcessorTestVectors, + request_id_list: list[str], + num_sample_logprobs: Optional[int], + num_prompt_logprobs: Optional[int], +) -> None: + for req_idx, req_id in enumerate(request_id_list): + new_tokens = gen_tokens[req_id] + logprobs = gen_logprobs[req_id] + prompt_logprobs = gen_prompt_logprobs[req_id] + cumulative_logprob = gen_cumulative_logprob[req_id] + prompt_token_ids = dtv.prompt_tokens[req_idx] + ref_logprobs = dtv.generation_logprobs[req_idx] + ref_prompt_logprobs = dtv.prompt_logprobs[req_idx] + if num_sample_logprobs is not None: + # Validate sample logprobs + assert logprobs is not None, (f"Request {req_id} requires sample" + " logprobs but sample logprobs are" + " None.") + # Require num sampled tokens to match num + # sampled logprobs - especially important + # to check since the detokenizer can cause + # a request to finish early due to a stop + # string being hit + num_new_tokens = len(new_tokens) + len_sample_logprobs = len(logprobs) + assert num_new_tokens == len_sample_logprobs, ( + f"Request {req_id} has {num_new_tokens}" + " completion tokens but has" + f" {len_sample_logprobs} sample logprobs.") + ref_cumulative_logprob = 0.0 + for idx, (sampled_token, + pos_logprob_dict) in enumerate(zip(new_tokens, + logprobs)): + # Break out the reference log probability value & + # logprob token id tensors associated with this + # position in the completion. Also break out the + # sampled token ranks + (ref_pos_logprob_toks, ref_pos_logprob_vals, + ref_sampled_token_rank) = ref_logprobs[idx] + # For each position in the completion sequence, + # ensure the actual sampled token is among the + # logprobs + assert sampled_token in pos_logprob_dict, ( + f"Sampled token {sampled_token} not" + f" present in logprob at index {idx}") + + # Validate number of sample logprobs + num_lp_toks = len(pos_logprob_dict) + assert (num_lp_toks == num_sample_logprobs + or num_lp_toks == num_sample_logprobs + + 1), ("Valid numbers of sample logprobs are" + f" {num_sample_logprobs} or" + f" {num_sample_logprobs+1} but" + f" {num_lp_toks} logprobs found at" + f" position {idx}. Logprobs dict:" + f" {pos_logprob_dict}") + + # Validate sampled token logprob rank + smp_lp = pos_logprob_dict[sampled_token] + smp_lp_rank = smp_lp.rank + assert (ref_sampled_token_rank == smp_lp_rank), ( + "Sampled token logprob rank" + f" {smp_lp_rank} does not match" + " correct value" + f" {ref_sampled_token_rank}" + f" in Logprob {smp_lp}") + + # Validate that the logprob processor yields + # the correct log probabilities and valid + # rankings + rank_one_appears = False + for jdx in range(1, len(ref_pos_logprob_toks)): + # Iterate over the (logprob val,logprob tok id) + # pairs expected by the test fixture at this + # position in the completion. 
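+                    # (The checks below confirm each expected token is present,
+                    # carries a rank >= 1, and matches the reference logprob.)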
+ ref_lp_val = ref_pos_logprob_vals[jdx] + ref_tok_id = ref_pos_logprob_toks[jdx] + assert ref_tok_id in pos_logprob_dict, ( + f"Expected token {ref_tok_id} to be" + f" in logprob dict but it is not.") + + # Extract actually-generated logprob + # info + lp = pos_logprob_dict[ref_tok_id] + lp_val = lp.logprob + lp_rank = lp.rank + + # A "top" (rank 1) logprob must be + # present + rank_one_appears = (True + if lp_rank == 1 else rank_one_appears) + + # Rank must be >= 1 + assert lp_rank >= 1, (f"Logprob {lp} has invalid" + f" rank {lp_rank} < 1." + f" Logprob dict: {pos_logprob_dict}") + + # Validate log probability + assert math.isclose(lp_val, ref_lp_val), ( + f"Token id {ref_tok_id} appears in logprobs dict" + f" at position {idx} in completion with log" + f" probability {lp_val} but {ref_lp_val} was" + f" expected. Logprob: {lp}") + + assert rank_one_appears, (f"No Logprob has rank 1" + " in the following Logprob" + f" dict: {pos_logprob_dict}") + + # Validate logprobs detokenization + for lp_tok in pos_logprob_dict: + # Confirm that sample logprob decoded token matches + # the logprob token id at this sequence position + decoded_token = pos_logprob_dict[lp_tok].decoded_token + ref_decoded_token = _ref_convert_id_to_token( + dtv.tokenizer, lp_tok) + assert decoded_token == ref_decoded_token, ( + f"Sampled logprob token id {lp_tok} decodes to" + f" {ref_decoded_token} but Logprob decoded" + f" token is {decoded_token} instead" + f" (at position {idx})") + + ref_cumulative_logprob += pos_logprob_dict[ + sampled_token].logprob + # Assert that cumulative logprobs are correct + assert math.isclose(cumulative_logprob, ref_cumulative_logprob) + else: + # Sample logprobs disabled for this request + assert logprobs is None + assert cumulative_logprob is None + + if num_prompt_logprobs is not None: + # Validate prompt logprobs + assert prompt_logprobs is not None, ( + f"Request {req_id} requires prompt" + " logprobs but prompt logprobs are" + " None.") + # Require num prompt tokens to match num + # prompt logprobs + num_prompt_tokens = len(prompt_token_ids) + len_prompt_logprobs = len(prompt_logprobs) + assert num_prompt_tokens == len_prompt_logprobs, ( + f"Request {req_id} has {num_prompt_tokens}" + " prompt tokens but has" + f" {len_prompt_logprobs} prompt logprobs.") + # First prompt logprob is None + first_plp_dict = prompt_logprobs[0] + assert first_plp_dict is None, ( + f"Request {req_id} first prompt logprob" + f" should be None but has following value" + f" instead: {first_plp_dict}") + # Break out the reference prompt log prob value & + # logprob token id matrices for the whole prompt. + # Also break out the prompt token rank vector + (ref_prompt_logprob_toks, ref_prompt_logprob_vals, + ref_prompt_token_ranks) = ref_prompt_logprobs + for idx, (prompt_token, pos_logprob_dict) in enumerate( + zip(prompt_token_ids[1:], prompt_logprobs[1:])): + + # Break out the reference prompt log prob value + # vector, prompt logprob token id vector, and + # prompt token rank at the current position. 
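+                # (The zip starts at position 1 because the first prompt token
+                # has no logprob, as asserted above.)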
+ (ref_pos_prompt_logprob_toks, ref_pos_prompt_logprob_vals, + ref_pos_prompt_token_rank) = (ref_prompt_logprob_toks[idx, :], + ref_prompt_logprob_vals[idx, :], + ref_prompt_token_ranks[idx]) + + # For each position in the prompt sequence, + # ensure the actual prompt token is among the + # logprobs + assert prompt_token in pos_logprob_dict, ( + f"Prompt token {prompt_token} not" + f" present in logprob at index {idx}") + # Validate number of prompt logprobs + num_plp_toks = len(pos_logprob_dict) + assert (num_plp_toks == num_prompt_logprobs + or num_plp_toks == num_prompt_logprobs + + 1), ("Valid numbers of prompt logprobs are" + f" {num_prompt_logprobs} or" + f" {num_prompt_logprobs+1} but" + f" {num_plp_toks} logprobs found at" + f" position {idx}. Logprobs dict:" + f" {pos_logprob_dict}") + + # Validate prompt token logprob rank + prmpt_tok_lp = pos_logprob_dict[prompt_token] + prmpt_tok_lp_rank = prmpt_tok_lp.rank + ref_prmpt_tok_lp_rank = ref_pos_prompt_token_rank + assert (ref_prmpt_tok_lp_rank == prmpt_tok_lp_rank), ( + "Prompt token logprob rank" + f" {prmpt_tok_lp_rank} does not match" + " correct value" + f" {ref_prmpt_tok_lp_rank}" + f" in Logprob {prmpt_tok_lp}") + + # Validate that the logprob processor yields + # the correct prompt log probs and valid + # rankings + rank_one_appears = False + for jdx in range(1, len(ref_pos_prompt_logprob_toks)): + # Iterate over the (logprob val,logprob tok id) + # pairs expected by the test fixture at this + # position in the completion. + ref_plp_val = float(ref_pos_prompt_logprob_vals[jdx]) + ref_tok_id = int(ref_pos_prompt_logprob_toks[jdx]) + assert ref_tok_id in pos_logprob_dict, ( + f"Expected token {ref_tok_id} to be" + f" in logprob dict but it is not.") + + # Extract actually-generated logprob + # info + plp = pos_logprob_dict[ref_tok_id] + plp_val = plp.logprob + plp_rank = plp.rank + + # A "top" (rank 1) logprob must be + # present + rank_one_appears = (True + if plp_rank == 1 else rank_one_appears) + + # Rank must be >= 1 + assert plp_rank >= 1, ( + f"Logprob {plp} has invalid" + f" rank {plp_rank} < 1." + f" Logprob dict: {pos_logprob_dict}") + + # Validate log probability + assert math.isclose(plp_val, ref_plp_val), ( + f"Token id {ref_tok_id} appears in logprobs dict" + f" at position {idx} in completion with log" + f" probability {plp_val} but {ref_plp_val} was" + f" expected. 
Logprob: {plp}") + + assert rank_one_appears, (f"No Logprob has rank 1" + " in the following Logprob" + f" dict: {pos_logprob_dict}") + + # Validate prompt logprob detokenization + for plp_tok in pos_logprob_dict: + # Confirm that prompt logprob decoded token matches + # the logprob token id at this sequence position + decoded_token = pos_logprob_dict[plp_tok].decoded_token + ref_decoded_token = _ref_convert_id_to_token( + dtv.tokenizer, plp_tok) + assert decoded_token == ref_decoded_token, ( + f"Prompt logprob token id {plp_tok} decodes to" + f" {ref_decoded_token} but Logprob decoded" + f" token is {decoded_token} instead" + f" (at position {idx})") + else: + # Prompt logprobs disabled for this request + assert prompt_logprobs is None + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +@pytest.mark.parametrize("num_sample_logprobs", + [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST]) +@pytest.mark.parametrize("num_prompt_logprobs", + [None, NUM_PROMPT_LOGPROBS_UNDER_TEST]) +def test_logprobs_processor(request_output_kind: RequestOutputKind, + num_sample_logprobs: Optional[int], + num_prompt_logprobs: Optional[int], + dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, + log_stats=False) + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + generated_logprobs_raw=None if num_sample_logprobs is None else + dummy_test_vectors.generation_logprobs, + prompt_logprobs_raw=None + if num_prompt_logprobs is None else dummy_test_vectors.prompt_logprobs) + + # Make N requests. + request_id_list = [ + f"request-{idx}" + for idx in range(len(dummy_test_vectors.prompt_strings)) + ] + requests = [ + EngineCoreRequest(request_id=request_id_list[idx], + prompt_token_ids=prompt_tokens, + mm_features=None, + eos_token_id=None, + arrival_time=0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False, + logprobs=num_sample_logprobs, + prompt_logprobs=num_prompt_logprobs, + ), + pooling_params=None) + for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) + ] + + # Add requests to the detokenizer. + for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): + output_processor.add_request(request, prompt) + + gen_tokens = {} + gen_logprobs = {} + gen_prompt_logprobs = {} + gen_cumulative_logprobs = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the logprobs processor. + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort + assert len(requests_to_abort) == 0 + + # Update tracking. 
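+        # (Per request: accumulate tokens, sample logprobs and prompt
+        # logprobs, and keep the latest cumulative logprob for validation.)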
+ for request_output in request_outputs: + request_id = request_output.request_id + new_tokens = request_output.outputs[0].token_ids + prompt_logprobs = request_output.prompt_logprobs + logprobs = request_output.outputs[0].logprobs + gen_cumulative_logprobs[request_id] = request_output.outputs[ + 0].cumulative_logprob + if request_id not in gen_logprobs: + # Start tracking sample and prompt logprobs for this request + gen_tokens[request_id] = new_tokens + gen_logprobs[request_id] = logprobs + gen_prompt_logprobs[request_id] = prompt_logprobs + else: + # Extend logprobs tracker + gen_tokens[request_id].extend(new_tokens) + lp = gen_logprobs[request_id] + plp = gen_prompt_logprobs[request_id] + if lp: + lp.extend(logprobs) + if plp: + plp.extend(prompt_logprobs) + + # Confirmed tracked logprobs match what we expect + _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs, + gen_cumulative_logprobs, dummy_test_vectors, + request_id_list, num_sample_logprobs, + num_prompt_logprobs) + + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + +@pytest.mark.parametrize( + "include_stop_str_in_output,stop_token_type,ignore_eos,num_sample_logprobs", + [(False, "stop_token_ids", False, None), + (True, "stop_token_ids", False, None), + (False, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST), + (True, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST), + (False, "eos_token_id", False, None), (True, "eos_token_id", False, None), + (False, "eos_token_id", True, None)]) +def test_stop_token(include_stop_str_in_output: bool, + num_sample_logprobs: Optional[int], stop_token_type: str, + ignore_eos: bool, dummy_test_vectors): + """Test output processor EOS/stop token handling. + + Send mock engine core request to mock engine core and pass core outputs + to output processor. Validate output processor tokens, text and + (if enabled) sample logprobs. Batch-size one. + + The test emulates a scenario where a model outputs text tokens followed + by two identical control tokens: + ... + + If EOS is under test, the control tokens are EOS; otherwise, they are + some other token id. + + Test behavior: + + * If EOS is under test and `ignore_eos=True`, the detokenized string + should be ... and the finish + reason should be "length" (i.e. no stop occurs) + + * else, if `include_stop_str_in_output==True`, the detokenized + string should be ... and the finish + reason should be "stop" (i.e. first control token causes stop + and is represented in output text) + + * else, the detokenized string should be + ... and the finish reason should be "stop" + (i.e. first control token causes stop but is not represented + in output text.) + + Note: some test details are tuned for meta-llama/Llama-3.2-1B, + another model should work only if the test is modified. 
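+
+    Schematically, writing T for a text token and C for the control token,
+    the engine emits T T ... T C C; the expected output text for the three
+    cases above is the text plus both control tokens, the text plus one
+    control token, or the text alone, respectively.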
+ + Args: + include_stop_str_in_output: stop token str appears in output text + num_sample_logprobs: number of sample logprobs (`None` for no logprobs) + stop_token_type: "eos_token_id" for EOS, "stop_token_ids" for stop token + ignore_eos: if True, EOS stops are disabled + dummy_test_vectors: dummy engine core outputs and other data structures + """ + model_id = dummy_test_vectors.tokenizer.name_or_path + if model_id != 'meta-llama/Llama-3.2-1B': + raise AssertionError("Test requires meta-llama/Llama-3.2-1B but " + f"{model_id} is in use.") + do_logprobs = num_sample_logprobs is not None + # EOS under test; if False, stop_token_ids under test + is_eos_test = stop_token_type == "eos_token_id" + # EOS under test but ignore_eos enabled + is_eos_ignore_test = is_eos_test and ignore_eos + eos_token_id = ( + dummy_test_vectors.tokenizer.eos_token_id if is_eos_test else None + ) # '<|end_of_text|>' + stop_token_ids = [128009] if not is_eos_test else None # '<|eot_id|>' + + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, + log_stats=False) + # Dummy engine core outputs, with control tokens suffixed to test stops + suffix_token = ([eos_token_id] if is_eos_test else stop_token_ids) + assert suffix_token is not None and isinstance(suffix_token[0], int) + generation_string = dummy_test_vectors.generation_strings[0] + generation_tokens = (dummy_test_vectors.generation_tokens[0] + + 2 * suffix_token) + if do_logprobs: + generation_logprobs = ( + dummy_test_vectors.generation_logprobs[0] + + 2 * [dummy_test_vectors.generation_logprobs[0][-1]]) + prompt_string = dummy_test_vectors.prompt_strings[0] + prompt_tokens = dummy_test_vectors.prompt_tokens[0] + engine_core = MockEngineCore( + tokens_list=[generation_tokens], + generated_logprobs_raw=[generation_logprobs] if do_logprobs else None, + prompt_logprobs_raw=None, + eos_token_id=eos_token_id, + stop_token_ids=stop_token_ids, + ignore_eos=ignore_eos) + + # Make request. + request_id = "request-0" + request = EngineCoreRequest( + request_id=request_id, + prompt_token_ids=prompt_tokens, + mm_features=None, + eos_token_id=eos_token_id, + arrival_time=0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=[], + stop_token_ids=stop_token_ids, + include_stop_str_in_output=include_stop_str_in_output, + logprobs=num_sample_logprobs, + prompt_logprobs=None, + ignore_eos=ignore_eos, + ), + pooling_params=None) + + # Add request to the detokenizer. + output_processor.add_request(request, prompt_string) + + # Loop over engine core steps; run output processor + gen_string = "" + gen_tokens = [] + gen_logprobs = [] + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + assert len(request_outputs) == 1 + # Stop token does not rely on abort + assert not processed_outputs.reqs_to_abort + + # Update tracking. 
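+        # (Batch size is one, so the single request output is appended to the
+        # running text, token, and logprob buffers on every step.)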
+ request_output = request_outputs[0] + if request_output.finished: + finish_reason = ("length" if is_eos_ignore_test else "stop") + assert request_output.outputs[0].finish_reason == finish_reason + + gen_string += request_output.outputs[0].text + gen_tokens.extend(request_output.outputs[0].token_ids) + if do_logprobs: + gen_logprobs.extend(request_output.outputs[0].logprobs) + + # Validate generated text + control_token = '<|end_of_text|>' if is_eos_test else '<|eot_id|>' + if is_eos_ignore_test: + # Length-based stop; expect full string + ref_str = generation_string + 2 * control_token + elif include_stop_str_in_output: + # Stop token triggered; include in output + ref_str = generation_string + control_token + else: + # Stop token triggered but not in output + ref_str = generation_string + assert gen_string == ref_str, (f"{gen_string=}, {ref_str=}") + + if do_logprobs: + # Validate number of sample logprobs + num_tokens = len(gen_tokens) + num_logprobs = len(gen_logprobs) + assert num_tokens == num_logprobs, ( + f"Token count ({num_tokens}) != logprobs count ({num_logprobs})") + + # Check requests are finished + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + +@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) +@pytest.mark.parametrize("num_sample_logprobs", + [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST]) +def test_stop_string(include_stop_str_in_output: bool, + num_sample_logprobs: Optional[int], dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, + log_stats=False) + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + generated_logprobs_raw=dummy_test_vectors.generation_logprobs + if num_sample_logprobs else None, + prompt_logprobs_raw=None) + + # Make N requests. + request_id_list = [ + f"request-{idx}" + for idx in range(len(dummy_test_vectors.prompt_strings)) + ] + requests = [ + EngineCoreRequest( + request_id=request_id_list[idx], + prompt_token_ids=prompt_tokens, + mm_features=None, + eos_token_id=None, + arrival_time=0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + logprobs=num_sample_logprobs, + prompt_logprobs=None, + ), + pooling_params=None) + for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) + ] + + # Add requests to the detokenizer. + for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): + output_processor.add_request(request, prompt) + + gen_strings = {} + gen_tokens = {} + gen_logprobs = {} + gen_prompt_logprobs = {} + gen_cumulative_logprobs = {} + aborted = [] + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort + for request_output in request_outputs: + # If aborted, we should not get a request output. + assert request_output.request_id not in aborted + aborted.extend(requests_to_abort) + + # Update tracking. 
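+        # (Requests that hit a stop string finish with reason "stop" and are
+        # collected in `aborted` for the checks below.)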
+ for request_output in request_outputs: + if request_output.finished: + assert request_output.outputs[0].finish_reason == "stop" + + request_id = request_output.request_id + new_text = request_output.outputs[0].text + new_tokens = request_output.outputs[0].token_ids + prompt_logprobs = request_output.prompt_logprobs + logprobs = request_output.outputs[0].logprobs + gen_cumulative_logprobs[request_id] = request_output.outputs[ + 0].cumulative_logprob + if request_id not in gen_strings: + gen_strings[request_id] = new_text + gen_tokens[request_id] = new_tokens + gen_logprobs[request_id] = logprobs + gen_prompt_logprobs[request_id] = prompt_logprobs + else: + gen_strings[request_id] += new_text + gen_tokens[request_id].extend(new_tokens) + lp = gen_logprobs[request_id] + plp = gen_prompt_logprobs[request_id] + if lp: + lp.extend(logprobs) + if plp: + plp.extend(prompt_logprobs) + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, stop_str) in enumerate( + zip(dummy_test_vectors.generation_strings, STOP_STRINGS)): + + # Request should be aborted. + request_id = f"request-{idx}" + assert request_id in aborted + + # Collected values that were generated. + gen_str = gen_strings[request_id] + + # Construct reference strings. + stop_str_idx = ref_gen_str.find(stop_str) + ref_str_exc_stop = ref_gen_str[:stop_str_idx] + ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str + + if include_stop_str_in_output: + assert gen_str == ref_str_inc_stop, ( + f"{gen_str=}, {ref_str_inc_stop=}") + else: + assert gen_str == ref_str_exc_stop, ( + f"{gen_str=}, {ref_str_exc_stop=}") + + # Confirmed tracked logprobs match what we expect + _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs, + gen_cumulative_logprobs, dummy_test_vectors, + request_id_list, num_sample_logprobs, None) + + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + +def test_iteration_stats(dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, + log_stats=True) + engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) + engine_core_timestamp = time.monotonic() + + # Make N requests. + requests = [ + EngineCoreRequest( + request_id=f"request-{idx}", + prompt_token_ids=prompt_tokens, + mm_features=None, + eos_token_id=None, + arrival_time=0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + sampling_params=SamplingParams(), + pooling_params=None, + ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) + ] + + # Add all requests except one to the OutputProcessor. + num_active = len(dummy_test_vectors.generation_tokens) - 1 + for request in requests[:num_active]: + output_processor.add_request(request, None) + inactive_request = requests[num_active] + + # First iteration has 2 prefills. + outputs = engine_core.get_outputs()[:num_active] + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) + total_prompt_tokens = sum([ + len(prompt_tokens) + for prompt_tokens in dummy_test_vectors.prompt_tokens[:num_active] + ]) + + assert iteration_stats.num_prompt_tokens == total_prompt_tokens + assert iteration_stats.num_generation_tokens == num_active + + # Just decodes in this step. 
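+    # (The same requests step again, so no prompt tokens are counted and each
+    # active request contributes exactly one generation token.)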
+ outputs = engine_core.get_outputs()[:num_active] + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) + + assert iteration_stats.num_prompt_tokens == 0 + assert iteration_stats.num_generation_tokens == num_active + + # Add a new request - prefill and 2 decodes in this step. + output_processor.add_request(inactive_request, None) + num_active += 1 + outputs = engine_core.get_outputs()[:num_active] + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) + total_prompt_tokens = len(dummy_test_vectors.prompt_tokens[num_active - 1]) + + assert iteration_stats.num_prompt_tokens == total_prompt_tokens + assert iteration_stats.num_generation_tokens == num_active + + # Just decodes in this step. + outputs = engine_core.get_outputs()[:num_active] + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) + + assert iteration_stats.num_prompt_tokens == 0 + assert iteration_stats.num_generation_tokens == num_active + + +@pytest.mark.asyncio +async def test_request_output_collector(): + NUM_REQS = 3 + TEXT = "a" + + def make_outputs() -> list[RequestOutput]: + return [ + RequestOutput( + request_id="my-request-id", + prompt=None, + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[ + CompletionOutput( + index=0, + text=TEXT, + token_ids=[idx], + cumulative_logprob=(idx + 1 * 1.0), + logprobs=[{ + "a": idx, + "b": idx + }], + finish_reason="length" if + (idx == NUM_REQS - 1) else None, + ) + ], + finished=(idx == NUM_REQS - 1), + ) for idx in range(NUM_REQS) + ] + + collector = RequestOutputCollector(RequestOutputKind.DELTA) + + # CASE 1: Put then get. + outputs = make_outputs() + collector.put(outputs[0]) + output = await collector.get() + assert not collector.ready.is_set() + assert collector.output is None + assert output.outputs[0].text == "a" + assert output.outputs[0].token_ids == [0] + + # CASE 2: 2 puts then get. + num_to_put = 2 + outputs = make_outputs() + for i in range(num_to_put): + collector.put(outputs[i]) + output = await collector.get() + assert not collector.ready.is_set() + assert collector.output is None + + assert not output.finished + # Text, token_ids, and logprobs should get merged. + assert output.outputs[0].text == TEXT * num_to_put + for tok_0, tok_1 in zip(output.outputs[0].token_ids, + list(range(num_to_put))): + assert tok_0 == tok_1 + assert len(output.outputs[0].logprobs) == num_to_put + + # Cumulative logprobs should be the last one. + cumulative_logprob_expected = 1.0 * num_to_put + assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected + + # CASE 3: Put all 3 (including a finished). + num_to_put = 3 + outputs = make_outputs() + for i in range(num_to_put): + collector.put(outputs[i]) + output = await collector.get() + assert not collector.ready.is_set() + assert collector.output is None + + assert output.finished + assert output.outputs[0].finish_reason == "length" + # Text, token_ids, and logprobs should get merged. + assert output.outputs[0].text == TEXT * num_to_put + for tok_0, tok_1 in zip(output.outputs[0].token_ids, + list(range(num_to_put))): + assert tok_0 == tok_1 + assert len(output.outputs[0].logprobs) == num_to_put + + # Cumulative logprobs should be the last one. 
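+    # (Merging keeps the most recent value: the third output was created with
+    # cumulative_logprob = idx + 1.0 = 3.0, which equals 1.0 * num_to_put.)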
+ cumulative_logprob_expected = 1.0 * num_to_put + assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected + + +@pytest.mark.asyncio +async def test_cumulative_output_collector_n(): + """Test collector correctly handles multiple outputs by index.""" + collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE) + outputs = [ + RequestOutput( + request_id="my-request-id", + prompt=None, + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[ + CompletionOutput( + index=0, + text="a", + token_ids=[0], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ), + CompletionOutput( + index=1, + text="b", + token_ids=[1], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ), + ], + finished=False, + ), + RequestOutput( + request_id="my-request-id", + prompt=None, + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[ + CompletionOutput( + index=0, + text="ab", + token_ids=[0, 1], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ), + CompletionOutput( + index=2, + text="c", + token_ids=[2], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ), + ], + finished=False, + ), + ] + for output in outputs: + collector.put(output) + + # Get the output and check that the text and token_ids are correct. + result = await collector.get() + # We are expecting + # [{index: 0, text: "ab"}, {index: 1, text: "b"}, {index: 2, text: "c"}] + assert len(result.outputs) == 3 + # First is the one where index is 0 + first = [k for k in result.outputs if k.index == 0] + assert len(first) == 1 + assert first[0].text == "ab" + + # Second is the one where index is 1 + second = [k for k in result.outputs if k.index == 1] + assert len(second) == 1 + assert second[0].text == "b" + assert second[0].token_ids == [1] + + # Third is the one where index is 2 + third = [k for k in result.outputs if k.index == 2] + assert len(third) == 1 + assert third[0].text == "c" From 7a92f1791bcf75063c6a7b2b77d2e08ec021c1e8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 12:56:03 -0700 Subject: [PATCH 05/17] fix test_chat Signed-off-by: Woosuk Kwon --- tests/entrypoints/openai/test_chat.py | 49 ++++++++++----------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d5924b7b3ae3..be62586f9741 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -28,11 +28,9 @@ def monkeypatch_module(): mpatch.undo() -@pytest.fixture(scope="module", params=[False, True]) -def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811 - - use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') +@pytest.fixture(scope="module") +def server(monkeypatch_module, zephyr_lora_files): #noqa: F811 + monkeypatch_module.setenv('VLLM_USE_V1', '1') args = [ # use half precision for speed and memory savings in CI environment @@ -57,13 +55,6 @@ def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811 yield remote_server -@pytest.fixture -def is_v1_server(server): - import os - assert os.environ['VLLM_USE_V1'] in ['0', '1'] - return os.environ['VLLM_USE_V1'] == '1' - - @pytest_asyncio.fixture async def client(server): async with server.get_async_client() as async_client: @@ -481,9 +472,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio async def test_guided_choice_chat(client: openai.AsyncOpenAI, - 
sample_guided_choice, is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + sample_guided_choice): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -519,10 +508,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") +async def test_guided_json_chat( + client: openai.AsyncOpenAI, + sample_json_schema, +): messages = [{ "role": "system", @@ -565,10 +554,10 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, @pytest.mark.asyncio -async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") +async def test_guided_regex_chat( + client: openai.AsyncOpenAI, + sample_regex, +): messages = [{ "role": "system", @@ -653,10 +642,10 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Tool use is only supported in v1 engine") +async def test_named_tool_use( + client: openai.AsyncOpenAI, + sample_json_schema, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -826,11 +815,7 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_response_format_json_schema(client: openai.AsyncOpenAI, - is_v1_server: bool): - if not is_v1_server: - pytest.skip( - "JSON schema response format is only supported in v1 engine") +async def test_response_format_json_schema(client: openai.AsyncOpenAI): prompt = 'what is 1+1? The format is "result": 2' # Check that this prompt cannot lead to a valid JSON without json_schema for _ in range(2): From d80a45534ff96c7e04994cf1cea3d68b69cb754e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 13:03:29 -0700 Subject: [PATCH 06/17] fix pp test Signed-off-by: Woosuk Kwon --- tests/distributed/test_pipeline_parallel.py | 75 ++++++--------------- tests/metrics/test_metrics.py | 9 --- 2 files changed, 19 insertions(+), 65 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 76b105e8a8ec..efc15a00f0c7 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -41,23 +41,10 @@ class PPTestOptions(NamedTuple): @dataclass class PPTestSettings: parallel_setups: list[ParallelSetup] - # NOTE: the length of distributed_backends and - # vllm_major_versions should be the same, and they - # are first zipped together to iterate over all - # test settings. 
distributed_backends: list[str] - # vllm major version: "0" for V0, "1" for V1 - vllm_major_versions: list[str] runner: RunnerOption test_options: PPTestOptions - def __post_init__(self): - if len(self.distributed_backends) != len(self.vllm_major_versions): - raise ValueError( - f"Length mismatch: distributed_backends " - f"({len(self.distributed_backends)}) != " - f"vllm_major_versions ({len(self.vllm_major_versions)})") - @staticmethod def detailed( *, @@ -90,8 +77,7 @@ def detailed( eager_mode=True, chunked_prefill=False), ], - distributed_backends=["mp", "mp", "ray", "ray"], - vllm_major_versions=["0", "1", "0", "1"], + distributed_backends=["mp", "ray"], runner=runner, test_options=PPTestOptions(multi_node_only=multi_node_only, load_format=load_format), @@ -106,7 +92,6 @@ def fast( multi_node_only: bool = False, load_format: Optional[str] = None, ): - vllm_major_versions = ["1"] if runner == "pooling" else ["0"] return PPTestSettings( parallel_setups=[ @@ -116,7 +101,6 @@ def fast( chunked_prefill=False), ], distributed_backends=["mp"], - vllm_major_versions=vllm_major_versions, runner=runner, test_options=PPTestOptions(multi_node_only=multi_node_only, load_format=load_format), @@ -126,10 +110,8 @@ def iter_params(self, model_id: str): opts = self.test_options for parallel_setup in self.parallel_setups: - for backend, vllm_major_version in zip(self.distributed_backends, - self.vllm_major_versions): - yield (model_id, parallel_setup, backend, vllm_major_version, - self.runner, opts) + for backend in self.distributed_backends: + yield (model_id, parallel_setup, backend, self.runner, opts) # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU @@ -257,7 +239,6 @@ def _compare_tp( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available: int, @@ -341,14 +322,11 @@ def _compare_tp( if max_num_seqs: common_args.extend(["--max-num-seqs", f"{max_num_seqs}"]) - specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill - testing_ray_compiled_graph = False - if distributed_backend == "ray" and (vllm_major_version == "1" - or specific_case): + if distributed_backend == "ray": # For V1, test Ray Compiled Graph for all the tests # For V0, test Ray Compiled Graph for a subset of the tests pp_env = { - "VLLM_USE_V1": vllm_major_version, + "VLLM_USE_V1": "1", "VLLM_USE_RAY_COMPILED_DAG": "1", "VLLM_USE_RAY_SPMD_WORKER": "1", "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1", @@ -356,17 +334,16 @@ def _compare_tp( # Temporary. Currently when zeromq + SPMD is used, it does not properly # terminate because of a Ray Compiled Graph issue. 
common_args.append("--disable-frontend-multiprocessing") - testing_ray_compiled_graph = True elif distributed_backend == "mp": # Both V0/V1 of multiprocessing executor support PP pp_env = { - "VLLM_USE_V1": vllm_major_version, + "VLLM_USE_V1": "1", } else: pp_env = None tp_env = { - "VLLM_USE_V1": vllm_major_version, + "VLLM_USE_V1": "1", } pp_args = [ @@ -392,25 +369,17 @@ def _compare_tp( "mp", ] - try: - compare_two_settings(model_id, - pp_args, - tp_args, - pp_env, - tp_env, - method=method) - except Exception: - if testing_ray_compiled_graph and vllm_major_version == "0": - # Ray Compiled Graph tests are flaky for V0, - # so we don't want to fail the test - logger.exception("Ray Compiled Graph tests failed") - else: - raise + compare_two_settings(model_id, + pp_args, + tp_args, + pp_env, + tp_env, + method=method) @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "runner", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "runner", + "test_options"), [ params for model_id, settings in TEXT_GENERATION_MODELS.items() for params in settings.iter_params(model_id) if model_id in TEST_MODELS @@ -421,7 +390,6 @@ def test_tp_language_generation( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available, @@ -429,7 +397,6 @@ def test_tp_language_generation( _compare_tp(model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, @@ -438,8 +405,8 @@ def test_tp_language_generation( @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "runner", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "runner", + "test_options"), [ params for model_id, settings in EMBEDDING_MODELS.items() for params in settings.iter_params(model_id) if model_id in TEST_MODELS @@ -450,7 +417,6 @@ def test_tp_language_embedding( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available, @@ -458,7 +424,6 @@ def test_tp_language_embedding( _compare_tp(model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, @@ -467,8 +432,8 @@ def test_tp_language_embedding( @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "runner", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "runner", + "test_options"), [ params for model_id, settings in MULTIMODAL_MODELS.items() for params in settings.iter_params(model_id) if model_id in TEST_MODELS @@ -479,7 +444,6 @@ def test_tp_multimodal_generation( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available, @@ -487,7 +451,6 @@ def test_tp_multimodal_generation( _compare_tp(model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index dbd9c518e020..3c0de9782fc9 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -13,15 +13,6 @@ from vllm.sampling_params import SamplingParams from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET - -@pytest.fixture(scope="function", autouse=True) -def 
use_v0_only(monkeypatch): - """ - This module tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - MODELS = [ "distilbert/distilgpt2", ] From 9bb81febb4f898ac7808d2fd9f7af50258f7cb2e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 13:03:58 -0700 Subject: [PATCH 07/17] fix Signed-off-by: Woosuk Kwon --- tests/distributed/test_pipeline_parallel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index efc15a00f0c7..aa4b4ac7fe52 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -324,7 +324,6 @@ def _compare_tp( if distributed_backend == "ray": # For V1, test Ray Compiled Graph for all the tests - # For V0, test Ray Compiled Graph for a subset of the tests pp_env = { "VLLM_USE_V1": "1", "VLLM_USE_RAY_COMPILED_DAG": "1", @@ -335,7 +334,6 @@ def _compare_tp( # terminate because of a Ray Compiled Graph issue. common_args.append("--disable-frontend-multiprocessing") elif distributed_backend == "mp": - # Both V0/V1 of multiprocessing executor support PP pp_env = { "VLLM_USE_V1": "1", } From c855f921be48196cbad92a2751ae83def5364ec8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 17:16:33 -0700 Subject: [PATCH 08/17] rm more tests Signed-off-by: Woosuk Kwon --- tests/basic_correctness/test_preemption.py | 189 ----- tests/entrypoints/openai/test_completion.py | 831 -------------------- tests/metrics/__init__.py | 0 tests/metrics/test_metrics.py | 259 ------ tests/tracing/__init__.py | 0 tests/tracing/test_tracing.py | 237 ------ 6 files changed, 1516 deletions(-) delete mode 100644 tests/basic_correctness/test_preemption.py delete mode 100644 tests/entrypoints/openai/test_completion.py delete mode 100644 tests/metrics/__init__.py delete mode 100644 tests/metrics/test_metrics.py delete mode 100644 tests/tracing/__init__.py delete mode 100644 tests/tracing/test_tracing.py diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py deleted file mode 100644 index db2fa2f6bef6..000000000000 --- a/tests/basic_correctness/test_preemption.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the short outputs of HF and vLLM when using greedy sampling. - -VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. - -Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 -pytest tests/basic_correctness/test_preemption.py`. -""" -import pytest -from prometheus_client import REGISTRY - -import vllm.envs as envs -from vllm import SamplingParams -from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, - ENABLE_ARTIFICIAL_PREEMPT) - -from ..models.utils import check_outputs_equal - -MODELS = [ - "distilbert/distilgpt2", -] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT, - so use VLLM_USE_V1=0 for all tests in the file. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.fixture(scope="module", autouse=True) -def check_settings(): - assert ENABLE_ARTIFICIAL_PREEMPT is True, ( - "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1." 
- "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 " - "pytest tests/basic_correctness/test_preemption.py`") - - -@pytest.fixture -def distributed_executor_backend() -> str: - # When SPMD worker is used, use distributed_executor_backend="ray" - # to test delta input optimization works with preemption. - return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [96]) -@pytest.mark.parametrize("chunked_prefill_token_size", [16]) -def test_chunked_prefill_recompute( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - chunked_prefill_token_size: int, - distributed_executor_backend: str, -) -> None: - """Ensure that chunked prefill works with preemption.""" - max_num_seqs = min(chunked_prefill_token_size, 256) - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - max_num_seqs=max_num_seqs, - distributed_executor_backend=distributed_executor_backend, - disable_log_stats=False, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - for i in range(len(example_prompts)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_outputs[i] - assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") - assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption( - caplog_vllm, - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """By default, recompute preemption is enabled""" - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - assert ("is preempted by PreemptionMode.RECOMPUTE mode because there " - "is not enough KV cache space." 
in caplog_vllm.text) - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - preemption_metrics = None - for m in REGISTRY.collect(): - if m.name == "vllm:num_preemptions": - preemption_metrics = m - assert preemption_metrics is not None - total_recorded_preemption = 0 - for sample in preemption_metrics.samples: - total_recorded_preemption += sample.value - assert total_preemption == total_recorded_preemption - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption_infeasible( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """Verify infeasible preemption request will be ignored.""" - BLOCK_SIZE = 16 - prefill_blocks = 2 - decode_blocks = max_tokens // BLOCK_SIZE - with vllm_runner( - model, - dtype=dtype, - block_size=BLOCK_SIZE, - # Not enough gpu blocks to complete a single sequence. - # preemption should happen, and the sequence should be - # ignored instead of hanging forever. - num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, - max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - sampling_params = SamplingParams(max_tokens=max_tokens, - ignore_eos=True) - req_outputs = vllm_model.llm.generate( - example_prompts, - sampling_params=sampling_params, - ) - - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - # Verify the request is ignored and not hang. - for req_output in req_outputs: - outputs = req_output.outputs - assert len(outputs) == 1 - assert outputs[0].finish_reason == "length" diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py deleted file mode 100644 index 3650b1579257..000000000000 --- a/tests/entrypoints/openai/test_completion.py +++ /dev/null @@ -1,831 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# imports for guided decoding tests -import json -import os -from typing import Optional - -import jsonschema -import openai # use the official client for correctness check -import pytest -import pytest_asyncio -import regex as re -import requests -# downloading lora to test lora requests -from openai import BadRequestError - -from vllm.transformers_utils.tokenizer import get_tokenizer - -from ...utils import RemoteOpenAIServer - -# any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically these adapters use a different base model, -# but we're not testing generation quality here - -GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"] - - -@pytest.fixture(scope="module") -def default_server_args(zephyr_lora_files): - return [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--max-num-seqs", - "128", - "--enforce-eager", - # lora config - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - ] - - -@pytest.fixture(scope="module", - params=["", "--disable-frontend-multiprocessing"]) -def server(default_server_args, request): - if request.param: - default_server_args.append(request.param) - - 
original_value = os.environ.get('VLLM_USE_V1') - os.environ['VLLM_USE_V1'] = '0' - try: - with RemoteOpenAIServer(MODEL_NAME, - default_server_args) as remote_server: - yield remote_server - finally: - # Restore original env value - if original_value is None: - os.environ.pop('VLLM_USE_V1', None) - else: - os.environ['VLLM_USE_V1'] = original_value - - -@pytest.fixture -def is_v1_server(server): - import os - - # For completion tests, we assume v0 since there's no explicit v1 setup - return os.environ.get('VLLM_USE_V1', '0') == '1' - - -@pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: - yield async_client - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 1 - assert completion.choices[0].prompt_logprobs is None - - -@pytest.mark.asyncio -async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): - # test using token IDs - with pytest.raises(openai.BadRequestError, match="out of vocabulary"): - # Added tokens should be rejected by the base model - await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 32000, 32001, 32002], - echo=True, - max_tokens=5, - temperature=0.0, - ) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=None, - ) - choice = completion.choices[0] - assert choice.logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=0, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert len(choice.logprobs.top_logprobs[0]) == 1 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=5, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - 
assert choice.logprobs.top_logprobs is not None - assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, - model_name: str): - - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=21, - ) - ... - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - stream = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=30, - stream=True, - ) - async for chunk in stream: - ... - - # the server should still work afterwards - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1), - (MODEL_NAME, 0), - (MODEL_NAME, 1), - (MODEL_NAME, None)]) -async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, - model_name: str, - prompt_logprobs: Optional[int]): - params: dict = { - "prompt": ["A robot may not injure another robot", "My name is"], - "model": model_name, - } - if prompt_logprobs is not None: - params["extra_body"] = {"prompt_logprobs": prompt_logprobs} - - if prompt_logprobs is not None and prompt_logprobs < 0: - with pytest.raises(BadRequestError): - await client.completions.create(**params) - else: - completion = await client.completions.create(**params) - if prompt_logprobs is not None: - assert completion.choices[0].prompt_logprobs is not None - assert len(completion.choices[0].prompt_logprobs) > 0 - - assert completion.choices[1].prompt_logprobs is not None - assert len(completion.choices[1].prompt_logprobs) > 0 - - else: - assert completion.choices[0].prompt_logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_streaming(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is an LLM?" - - single_completion = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - ) - single_output = single_completion.choices[0].text - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True) - chunks: list[str] = [] - finish_reason_count = 0 - async for chunk in stream: - chunks.append(chunk.choices[0].text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - assert chunk.choices[0].finish_reason == "length" - assert chunk.choices[0].text - assert "".join(chunks) == single_output - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): - """Streaming for parallel sampling. 
- The tokens from multiple samples, are flattened into a single stream, - with an index to indicate which sample the token belongs to. - """ - - prompt = "What is an LLM?" - n = 3 - max_tokens = 5 - - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=max_tokens, - n=n, - stream=True) - chunks: list[list[str]] = [[] for i in range(n)] - finish_reason_count = 0 - async for chunk in stream: - index = chunk.choices[0].index - text = chunk.choices[0].text - chunks[index].append(text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - assert finish_reason_count == n - for chunk in chunks: - assert len(chunk) == max_tokens - print("".join(chunk)) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_stream_options(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is the capital of France?" - - # Test stream=True, stream_options= - # {"include_usage": False, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - False, - }) - - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options= - # {"include_usage": False, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - True, - }) - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options= - # {"include_usage": True, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - False, - }) - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - else: - assert chunk.usage is None - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=True, stream_options= - # {"include_usage": True, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - True, - }) - async for chunk in stream: - assert chunk.usage is not None - assert chunk.usage.prompt_tokens > 0 - assert chunk.usage.completion_tokens > 0 - assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + - chunk.usage.completion_tokens) - if chunk.choices[0].finish_reason is not None: - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=False, stream_options= - # {"include_usage": None} - with pytest.raises(BadRequestError): - await 
client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options= - # {"include_usage": True} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - - # Test stream=False, stream_options= - # {"continuous_usage_stats": None} - with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"continuous_usage_stats": None}) - - # Test stream=False, stream_options= - # {"continuous_usage_stats": True} - with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"continuous_usage_stats": True}) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): - # test both text and token IDs - for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): - # test simple list - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=prompts, - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but - # not necessary for official client. - use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" - - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] - - -@pytest.mark.asyncio -async def test_logits_bias(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 5 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - token_id = 1000 - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token_id): 100}, - seed=42, - ) - assert len(completion.choices[0].text) >= 5 - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), - add_special_tokens=False)["input_ids"] - assert all([ - response == expected - for response, expected in zip(response_tokens, expected_tokens) - ]) - - # Test ban - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - ) - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - first_response = completion.choices[0].text - 
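# Ban step: the follow-up request applies a -100 logit bias to every token of
# the unbiased baseline captured above, so the greedy continuation is
# expected to differ from first_response.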
completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token): -100 - for token in response_tokens}, - ) - assert first_response != completion.choices[0].text - - -@pytest.mark.asyncio -async def test_allowed_token_ids(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 1 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - allowed_ids = [21555, 21557, 21558] - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - seed=42, - extra_body=dict(allowed_token_ids=allowed_ids), - logprobs=1, - ) - response_tokens = completion.choices[0].logprobs.tokens - assert len(response_tokens) == 1 - assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_json_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema, is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example JSON for an employee profile " - f"that fits this schema: {sample_json_schema}", - n=3, - temperature=1.0, - max_tokens=500, - extra_body=dict(guided_json=sample_json_schema, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - output_json = json.loads(completion.choices[i].text) - jsonschema.validate(instance=output_json, schema=sample_json_schema) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_regex_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_regex, is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example IPv4 address with this regex: {sample_regex}", - n=3, - temperature=1.0, - max_tokens=20, - extra_body=dict(guided_regex=sample_regex, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - assert re.fullmatch(sample_regex, - completion.choices[i].text) is not None - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_choice_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_guided_choice, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt="The best language for type-safe systems programming is ", - n=2, - temperature=1.0, - max_tokens=10, - extra_body=dict(guided_choice=sample_guided_choice, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 2 - for i in range(2): - assert completion.choices[i].text in sample_guided_choice - - -@pytest.mark.asyncio -async def test_guided_grammar(client: openai.AsyncOpenAI, - sample_sql_statements, is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided grammar is only supported in v1 engine") 
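# The grammar-constrained output below is validated two ways: Lark parses the
# text with the same SQL grammar, and the text is compared against the
# expected SELECT statement with spaces stripped (spaces were removed in the
# grammar itself).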
- - completion = await client.completions.create( - model=MODEL_NAME, - prompt=("Generate a sql state that select col_1 from " - "table_1 where it is equals to 1"), - temperature=1.0, - max_tokens=500, - extra_body=dict(guided_grammar=sample_sql_statements)) - - content = completion.choices[0].text - - # use Lark to parse the output, and make sure it's a valid parse tree - from lark import Lark - parser = Lark(sample_sql_statements) - parser.parse(content) - - # remove spaces for comparison b/c we removed them in the grammar - ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "") - - assert content.strip() == ground_truth - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -@pytest.mark.parametrize("logprobs_arg", [1, 0]) -async def test_echo_logprob_completion(client: openai.AsyncOpenAI, - model_name: str, logprobs_arg: int): - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - # test using text and token IDs - for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]): - completion = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - echo=True, - logprobs=logprobs_arg) - - prompt_text = tokenizer.decode(prompt) if isinstance(prompt, - list) else prompt - assert re.search(r"^" + prompt_text, completion.choices[0].text) - logprobs = completion.choices[0].logprobs - assert logprobs is not None - assert len(logprobs.text_offset) > 5 - assert (len(logprobs.token_logprobs) > 5 - and logprobs.token_logprobs[0] is None) - assert (len(logprobs.top_logprobs) > 5 - and logprobs.top_logprobs[0] is None) - for top_logprobs in logprobs.top_logprobs[1:]: - assert max(logprobs_arg, - 1) <= len(top_logprobs) <= logprobs_arg + 1 - assert len(logprobs.tokens) > 5 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema, sample_regex, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") - - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example JSON that fits this schema: 42", - extra_body=dict(guided_json=42, - guided_decoding_backend=guided_decoding_backend)) - - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example string that fits this regex", - extra_body=dict(guided_regex=sample_regex, - guided_json=sample_json_schema)) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name,stream,echo", - [ - (MODEL_NAME, False, False), - (MODEL_NAME, False, True), - (MODEL_NAME, True, False), - (MODEL_NAME, True, True) # should not raise BadRequestError error - ], -) -async def test_echo_stream_completion(client: openai.AsyncOpenAI, - model_name: str, stream: bool, - echo: bool): - saying: str = "Hello, my name is" - result = await client.completions.create(model=model_name, - prompt=saying, - max_tokens=10, - temperature=0.0, - echo=echo, - stream=stream) - - stop_reason = "length" - - if not stream: - completion = result - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == stop_reason - - if echo: - assert 
choice.text is not None and saying in choice.text - else: - assert choice.text is not None and saying not in choice.text - - else: - chunks: list[str] = [] - final_finish_reason = None - async for chunk in result: - if chunk.choices and chunk.choices[0].text: - chunks.append(chunk.choices[0].text) - if chunk.choices and chunk.choices[0].finish_reason: - final_finish_reason = chunk.choices[0].finish_reason - - assert final_finish_reason == stop_reason - content = "".join(chunks) - if echo: - assert content is not None and saying in content - else: - assert content is not None and saying not in content - - -@pytest.mark.asyncio -async def test_invocations(server: RemoteOpenAIServer, - client: openai.AsyncOpenAI): - request_args = { - "model": MODEL_NAME, - "prompt": "Hello, my name is", - "max_tokens": 5, - "temperature": 0.0, - "logprobs": None, - } - - completion = await client.completions.create(**request_args) - - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) - invocation_response.raise_for_status() - - completion_output = completion.model_dump() - invocation_output = invocation_response.json() - - assert completion_output.keys() == invocation_output.keys() - assert completion_output["choices"] == invocation_output["choices"] diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py deleted file mode 100644 index 3c0de9782fc9..000000000000 --- a/tests/metrics/test_metrics.py +++ /dev/null @@ -1,259 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import ray -from prometheus_client import REGISTRY - -import vllm.envs as envs -from vllm import EngineArgs, LLMEngine -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.metrics import RayPrometheusStatLogger -from vllm.sampling_params import SamplingParams -from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET - -MODELS = [ - "distilbert/distilgpt2", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_metric_counter_prompt_tokens( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) as vllm_model: - tokenizer = vllm_model.llm.get_tokenizer() - prompt_token_counts = [ - len(tokenizer.encode(p)) for p in example_prompts - ] - # This test needs at least 2 prompts in a batch of different lengths to - # verify their token count is correct despite padding. 
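# The expected value is the sum of the tokenized prompt lengths; it is then
# compared against the prompt-token counter read from the engine's
# 'prometheus' stat logger after a greedy generation pass.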
- assert len(example_prompts) > 1, "at least 2 prompts are required" - assert prompt_token_counts[0] != prompt_token_counts[1], ( - "prompts of different lengths are required") - vllm_prompt_token_count = sum(prompt_token_counts) - - _ = vllm_model.generate_greedy(example_prompts, max_tokens) - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_prompt_tokens.labels( - **stat_logger.labels)._value.get() - - assert vllm_prompt_token_count == metric_count, ( - f"prompt token count: {vllm_prompt_token_count!r}\n" - f"metric: {metric_count!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_metric_counter_generation_tokens( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.llm.get_tokenizer() - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() - vllm_generation_count = 0 - for i in range(len(example_prompts)): - vllm_output_ids, vllm_output_str = vllm_outputs[i] - prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. - # We're interested only in the count of the generation tokens. - vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) - - assert vllm_generation_count == metric_count, ( - f"generation token count: {vllm_generation_count!r}\n" - f"metric: {metric_count!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize( - "served_model_name", - [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]]) -def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, - served_model_name: list[str]) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.3, - served_model_name=served_model_name) as vllm_model: - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metrics_tag_content = stat_logger.labels["model_name"] - - if envs.VLLM_CI_USE_S3: - model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}" - if served_model_name is None or served_model_name == []: - assert metrics_tag_content == model, ( - f"Metrics tag model_name is wrong! expect: {model!r}\n" - f"actual: {metrics_tag_content!r}") - else: - assert metrics_tag_content == served_model_name[0], ( - f"Metrics tag model_name is wrong! 
expect: " - f"{served_model_name[0]!r}\n" - f"actual: {metrics_tag_content!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("disable_log_stats", [True, False]) -@pytest.mark.asyncio -async def test_async_engine_log_metrics_regression( - example_prompts, - model: str, - dtype: str, - max_tokens: int, - disable_log_stats: bool, -) -> None: - """ - Regression test ensuring async engine generates metrics - when disable_log_stats=False - (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678) - """ - engine_args = AsyncEngineArgs( - model=model, - dtype=dtype, - disable_log_stats=disable_log_stats, - ) - async_engine = AsyncLLMEngine.from_engine_args(engine_args) - for i, prompt in enumerate(example_prompts): - results = async_engine.generate( - prompt, - SamplingParams(max_tokens=max_tokens), - f"request-id-{i}", - ) - # Exhaust the async iterator to make the async engine work - async for _ in results: - pass - - assert_metrics(model, async_engine.engine, disable_log_stats, - len(example_prompts)) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("disable_log_stats", [True, False]) -def test_engine_log_metrics_regression( - example_prompts, - model: str, - dtype: str, - max_tokens: int, - disable_log_stats: bool, -) -> None: - engine_args = EngineArgs( - model=model, - dtype=dtype, - disable_log_stats=disable_log_stats, - ) - engine = LLMEngine.from_engine_args(engine_args) - for i, prompt in enumerate(example_prompts): - engine.add_request( - f"request-id-{i}", - prompt, - SamplingParams(max_tokens=max_tokens), - ) - while engine.has_unfinished_requests(): - engine.step() - - if envs.VLLM_CI_USE_S3: - model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}" - assert_metrics(model, engine, disable_log_stats, len(example_prompts)) - - -def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool, - num_requests: int) -> None: - if disable_log_stats: - with pytest.raises(AttributeError): - _ = engine.stat_loggers - else: - assert (engine.stat_loggers - is not None), "engine.stat_loggers should be set" - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - labels = {'model_name': model} - request_histogram_metrics = [ - "vllm:e2e_request_latency_seconds", - "vllm:request_prompt_tokens", - "vllm:request_generation_tokens", - "vllm:request_params_n", - "vllm:request_params_max_tokens", - ] - for metric_name in request_histogram_metrics: - metric_value = REGISTRY.get_sample_value(f"{metric_name}_count", - labels) - assert ( - metric_value == num_requests), "Metrics should be collected" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [16]) -def test_engine_log_metrics_ray( - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - # This test is quite weak - it only checks that we can use - # RayPrometheusStatLogger without exceptions. - # Checking whether the metrics are actually emitted is unfortunately - # non-trivial. 
- - # We have to run in a Ray task for Ray metrics to be emitted correctly - @ray.remote(num_gpus=1) - def _inner(): - - class _RayPrometheusStatLogger(RayPrometheusStatLogger): - - def __init__(self, *args, **kwargs): - self._i = 0 - super().__init__(*args, **kwargs) - - def log(self, *args, **kwargs): - self._i += 1 - return super().log(*args, **kwargs) - - engine_args = EngineArgs( - model=model, - dtype=dtype, - disable_log_stats=False, - ) - engine = LLMEngine.from_engine_args(engine_args) - logger = _RayPrometheusStatLogger( - local_interval=0.5, - labels=dict(model_name=engine.model_config.served_model_name), - vllm_config=engine.vllm_config) - engine.add_logger("ray", logger) - for i, prompt in enumerate(example_prompts): - engine.add_request( - f"request-id-{i}", - prompt, - SamplingParams(max_tokens=max_tokens), - ) - while engine.has_unfinished_requests(): - engine.step() - assert logger._i > 0, ".log must be called at least once" - - ray.get(_inner.remote()) diff --git a/tests/tracing/__init__.py b/tests/tracing/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py deleted file mode 100644 index 4dbae7c15de3..000000000000 --- a/tests/tracing/test_tracing.py +++ /dev/null @@ -1,237 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa -# type: ignore -from __future__ import annotations - -import threading -from collections.abc import Iterable -from concurrent import futures -from typing import Callable, Generator, Literal - -import grpc -import pytest -from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ( - ExportTraceServiceResponse) -from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import ( - TraceServiceServicer, add_TraceServiceServicer_to_server) -from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue -from opentelemetry.sdk.environment_variables import ( - OTEL_EXPORTER_OTLP_TRACES_INSECURE) - -from vllm import LLM, SamplingParams -from vllm.tracing import SpanAttributes - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch: pytest.MonkeyPatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - with monkeypatch.context() as m: - m.setenv('VLLM_USE_V1', '0') - yield - - -FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" - -FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value', - 'array_value'] - - -def decode_value(value: AnyValue): - field_decoders: dict[FieldName, Callable] = { - "bool_value": (lambda v: v.bool_value), - "string_value": (lambda v: v.string_value), - "int_value": (lambda v: v.int_value), - "double_value": (lambda v: v.double_value), - "array_value": - (lambda v: [decode_value(item) for item in v.array_value.values]), - } - for field, decoder in field_decoders.items(): - if value.HasField(field): - return decoder(value) - raise ValueError(f"Couldn't decode value: {value}") - - -def decode_attributes(attributes: Iterable[KeyValue]): - return {kv.key: decode_value(kv.value) for kv in attributes} - - -class FakeTraceService(TraceServiceServicer): - - def __init__(self): - self.request = None - self.evt = threading.Event() - - def Export(self, request, context): - self.request = request - self.evt.set() - return ExportTraceServiceResponse() - - -@pytest.fixture -def trace_service() -> Generator[FakeTraceService, None, None]: - """Fixture to set up a fake gRPC trace service""" - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) - service = FakeTraceService() - add_TraceServiceServicer_to_server(service, server) - server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS) - server.start() - - yield service - - server.stop(None) - - -def test_traces( - monkeypatch: pytest.MonkeyPatch, - trace_service: FakeTraceService, -): - with monkeypatch.context() as m: - m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - - sampling_params = SamplingParams( - temperature=0.01, - top_p=0.1, - max_tokens=256, - ) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) - - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") - - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS - ) == sampling_params.max_tokens - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - 
metrics = outputs[0].metrics - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE - ) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - # Model forward and model execute should be none, since detailed traces is - # not enabled. - assert metrics.model_forward_time is None - assert metrics.model_execute_time is None - - -def test_traces_with_detailed_steps( - monkeypatch: pytest.MonkeyPatch, - trace_service: FakeTraceService, -): - with monkeypatch.context() as m: - m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - - sampling_params = SamplingParams( - temperature=0.01, - top_p=0.1, - max_tokens=256, - ) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - collect_detailed_traces=["all"], - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) - - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") - - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS - ) == sampling_params.max_tokens - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - metrics = outputs[0].metrics - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE - ) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - assert metrics.model_forward_time > 0 - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD - ) == pytest.approx(metrics.model_forward_time / 1000) - assert 
metrics.model_execute_time > 0 - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE - ) == metrics.model_execute_time - assert metrics.model_forward_time < 1000 * metrics.model_execute_time From c12bc3e5b595ef55e9046f9a100a996bf5ea8e5f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 17:18:11 -0700 Subject: [PATCH 09/17] fix Signed-off-by: Woosuk Kwon --- tests/v1/tracing/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/v1/tracing/__init__.py diff --git a/tests/v1/tracing/__init__.py b/tests/v1/tracing/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 From 3d7c3612eb6db4d2718835044b6435bd910b17de Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 17:20:04 -0700 Subject: [PATCH 10/17] fix Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d7aae4e1c71a..8c5599591663 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -215,16 +215,14 @@ steps: num_gpus: 2 source_file_dependencies: - vllm/ - - tests/metrics - tests/v1/tracing commands: - - pytest -v -s metrics - "pip install \ 'opentelemetry-sdk>=1.26.0' \ 'opentelemetry-api>=1.26.0' \ 'opentelemetry-exporter-otlp>=1.26.0' \ 'opentelemetry-semantic-conventions-ai>=0.4.1'" - - pytest -v -s tracing + - pytest -v -s v1/tracing ##### fast check tests ##### ##### 1 GPU test ##### From c17fb8fccad29802da708fd75fd017d464c9db23 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 17:37:16 -0700 Subject: [PATCH 11/17] [V0 Deprecation] Remove more V0 tests Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 4 - tests/async_engine/__init__.py | 0 tests/async_engine/api_server_async_engine.py | 54 -- tests/async_engine/conftest.py | 12 - tests/async_engine/test_api_server.py | 139 ------ tests/async_engine/test_request_tracker.py | 71 --- tests/basic_correctness/test_preemption.py | 189 ------- tests/detokenizer/conftest.py | 11 - tests/detokenizer/test_stop_checker.py | 83 ---- .../openai/correctness/test_lmeval.py | 10 - tests/samplers/test_logprobs.py | 182 ------- tests/worker/__init__.py | 0 tests/worker/conftest.py | 11 - tests/worker/test_model_input.py | 113 ----- tests/worker/test_model_runner.py | 462 ------------------ tests/worker/test_profile.py | 68 --- tests/worker/test_swap.py | 87 ---- 17 files changed, 1496 deletions(-) delete mode 100644 tests/async_engine/__init__.py delete mode 100644 tests/async_engine/api_server_async_engine.py delete mode 100644 tests/async_engine/conftest.py delete mode 100644 tests/async_engine/test_api_server.py delete mode 100644 tests/async_engine/test_request_tracker.py delete mode 100644 tests/basic_correctness/test_preemption.py delete mode 100644 tests/detokenizer/conftest.py delete mode 100644 tests/detokenizer/test_stop_checker.py delete mode 100644 tests/samplers/test_logprobs.py delete mode 100644 tests/worker/__init__.py delete mode 100644 tests/worker/conftest.py delete mode 100644 tests/worker/test_model_input.py delete mode 100644 tests/worker/test_model_runner.py delete mode 100644 tests/worker/test_profile.py delete mode 100644 tests/worker/test_swap.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 08c10180fc22..b7e9746bb745 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -46,22 +46,18 @@ steps: mirror_hardwares: 
[amdexperimental] source_file_dependencies: - vllm/ - - tests/async_engine - tests/test_inputs.py - tests/test_outputs.py - tests/multimodal - tests/utils_ - - tests/worker - tests/standalone_tests/lazy_imports.py - tests/transformers_utils commands: - python3 standalone_tests/lazy_imports.py - - pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s multimodal - pytest -v -s utils_ # Utils - - pytest -v -s worker # Worker - pytest -v -s transformers_utils # transformers_utils - label: Python-only Installation Test # 10min diff --git a/tests/async_engine/__init__.py b/tests/async_engine/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py deleted file mode 100644 index ec6b20f5e04b..000000000000 --- a/tests/async_engine/api_server_async_engine.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""vllm.entrypoints.api_server with some extra logging for testing.""" -from collections.abc import Iterable -from typing import Any - -import uvicorn -from fastapi.responses import JSONResponse, Response - -import vllm.entrypoints.api_server -import vllm.envs as envs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.utils import FlexibleArgumentParser - -app = vllm.entrypoints.api_server.app - - -class AsyncLLMEngineWithStats(AsyncLLMEngine): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._num_aborts = 0 - - async def _engine_abort(self, request_ids: Iterable[str]): - ids = list(request_ids) - self._num_aborts += len(ids) - await super()._engine_abort(ids) - - def testing_stats(self) -> dict[str, Any]: - return {"num_aborted_requests": self._num_aborts} - - -@app.get("/stats") -def stats() -> Response: - """Get the statistics of the engine.""" - return JSONResponse(engine.testing_stats()) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser = AsyncEngineArgs.add_cli_args(parser) - args = parser.parse_args() - - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) - vllm.entrypoints.api_server.engine = engine - uvicorn.run(app, - host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE) diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py deleted file mode 100644 index 375b248ebeda..000000000000 --- a/tests/async_engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py deleted file mode 100644 index 07370a880329..000000000000 --- a/tests/async_engine/test_api_server.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import copyreg -import os -import subprocess -import sys -import time -from multiprocessing import Pool -from pathlib import Path - -import pytest -import requests -import urllib3.exceptions - - -def _pickle_new_connection_error(obj): - """Custom pickler for NewConnectionError to fix tblib compatibility.""" - # Extract the original message by removing the "conn: " prefix - full_message = obj.args[0] if obj.args else "" - if ': ' in full_message: - # Split off the connection part and keep the actual message - _, actual_message = full_message.split(': ', 1) - else: - actual_message = full_message - return _unpickle_new_connection_error, (actual_message, ) - - -def _unpickle_new_connection_error(message): - """Custom unpickler for NewConnectionError.""" - # Create with None as conn and the actual message - return urllib3.exceptions.NewConnectionError(None, message) - - -# Register the custom pickle/unpickle functions for tblib compatibility -copyreg.pickle(urllib3.exceptions.NewConnectionError, - _pickle_new_connection_error) - - -def _query_server(prompt: str, max_tokens: int = 5) -> dict: - response = requests.post("http://localhost:8000/generate", - json={ - "prompt": prompt, - "max_tokens": max_tokens, - "temperature": 0, - "ignore_eos": True - }) - response.raise_for_status() - return response.json() - - -def _query_server_long(prompt: str) -> dict: - return _query_server(prompt, max_tokens=500) - - -@pytest.fixture -def api_server(distributed_executor_backend: str): - script_path = Path(__file__).parent.joinpath( - "api_server_async_engine.py").absolute() - commands = [ - sys.executable, - "-u", - str(script_path), - "--model", - "facebook/opt-125m", - "--host", - "127.0.0.1", - "--distributed-executor-backend", - distributed_executor_backend, - ] - - # API Server Test Requires V0. - my_env = os.environ.copy() - my_env["VLLM_USE_V1"] = "0" - uvicorn_process = subprocess.Popen(commands, env=my_env) - yield - uvicorn_process.terminate() - - -@pytest.mark.timeout(300) -@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) -def test_api_server(api_server, distributed_executor_backend: str): - """ - Run the API server and test it. - - We run both the server and requests in separate processes. - - We test that the server can handle incoming requests, including - multiple requests at the same time, and that it can handle requests - being cancelled without crashing. 
- """ - with Pool(32) as pool: - # Wait until the server is ready - prompts = ["warm up"] * 1 - result = None - while not result: - try: - for r in pool.map(_query_server, prompts): - result = r - break - except requests.exceptions.ConnectionError: - time.sleep(1) - - # Actual tests start here - # Try with 1 prompt - for result in pool.map(_query_server, prompts): - assert result - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests == 0 - - # Try with 100 prompts - prompts = ["test prompt"] * 100 - for result in pool.map(_query_server, prompts): - assert result - - with Pool(32) as pool: - # Cancel requests - prompts = ["canceled requests"] * 100 - pool.map_async(_query_server_long, prompts) - time.sleep(0.01) - pool.terminate() - pool.join() - - # check cancellation stats - # give it some time to update the stats - time.sleep(1) - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests > 0 - - # check that server still runs after cancellations - with Pool(32) as pool: - # Try with 100 prompts - prompts = ["test prompt after canceled"] * 100 - for result in pool.map(_query_server, prompts): - assert result diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py deleted file mode 100644 index 1851eeeda790..000000000000 --- a/tests/async_engine/test_request_tracker.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.async_llm_engine import RequestTracker -from vllm.outputs import RequestOutput - - -@pytest.mark.asyncio -async def test_request_tracker(): - tracker = RequestTracker() - stream_1 = tracker.add_request("1") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 1 - assert new[0]["request_id"] == "1" - assert not aborted - assert not stream_1.finished - - stream_2 = tracker.add_request("2") - stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 2 - assert new[0]["request_id"] == "2" - assert new[1]["request_id"] == "3" - assert not aborted - assert not stream_2.finished - assert not stream_3.finished - - # request_ids must be unique - with pytest.raises(KeyError): - tracker.add_request("1") - assert not tracker.new_requests_event.is_set() - - tracker.abort_request("1") - new, aborted = tracker.get_new_and_aborted_requests() - assert len(aborted) == 1 - assert "1" in aborted - assert not new - assert stream_1.finished - - stream_4 = tracker.add_request("4") - tracker.abort_request("4") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - # aborted new requests will cancel each other out - - # there's no need for them to propagate into the - # engine - assert not aborted - assert not new - assert stream_4.finished - - stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.is_set() - tracker.process_request_output( - RequestOutput("2", "output", [], [], [], finished=True)) - await 
tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert not aborted - assert len(new) == 1 - assert new[0]["request_id"] == "5" - assert stream_2.finished - assert not stream_5.finished diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py deleted file mode 100644 index db2fa2f6bef6..000000000000 --- a/tests/basic_correctness/test_preemption.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the short outputs of HF and vLLM when using greedy sampling. - -VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. - -Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 -pytest tests/basic_correctness/test_preemption.py`. -""" -import pytest -from prometheus_client import REGISTRY - -import vllm.envs as envs -from vllm import SamplingParams -from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, - ENABLE_ARTIFICIAL_PREEMPT) - -from ..models.utils import check_outputs_equal - -MODELS = [ - "distilbert/distilgpt2", -] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT, - so use VLLM_USE_V1=0 for all tests in the file. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.fixture(scope="module", autouse=True) -def check_settings(): - assert ENABLE_ARTIFICIAL_PREEMPT is True, ( - "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1." - "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 " - "pytest tests/basic_correctness/test_preemption.py`") - - -@pytest.fixture -def distributed_executor_backend() -> str: - # When SPMD worker is used, use distributed_executor_backend="ray" - # to test delta input optimization works with preemption. 
- return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [96]) -@pytest.mark.parametrize("chunked_prefill_token_size", [16]) -def test_chunked_prefill_recompute( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - chunked_prefill_token_size: int, - distributed_executor_backend: str, -) -> None: - """Ensure that chunked prefill works with preemption.""" - max_num_seqs = min(chunked_prefill_token_size, 256) - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - max_num_seqs=max_num_seqs, - distributed_executor_backend=distributed_executor_backend, - disable_log_stats=False, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - for i in range(len(example_prompts)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_outputs[i] - assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") - assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption( - caplog_vllm, - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """By default, recompute preemption is enabled""" - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - assert ("is preempted by PreemptionMode.RECOMPUTE mode because there " - "is not enough KV cache space." 
in caplog_vllm.text) - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - preemption_metrics = None - for m in REGISTRY.collect(): - if m.name == "vllm:num_preemptions": - preemption_metrics = m - assert preemption_metrics is not None - total_recorded_preemption = 0 - for sample in preemption_metrics.samples: - total_recorded_preemption += sample.value - assert total_preemption == total_recorded_preemption - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption_infeasible( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """Verify infeasible preemption request will be ignored.""" - BLOCK_SIZE = 16 - prefill_blocks = 2 - decode_blocks = max_tokens // BLOCK_SIZE - with vllm_runner( - model, - dtype=dtype, - block_size=BLOCK_SIZE, - # Not enough gpu blocks to complete a single sequence. - # preemption should happen, and the sequence should be - # ignored instead of hanging forever. - num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, - max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - sampling_params = SamplingParams(max_tokens=max_tokens, - ignore_eos=True) - req_outputs = vllm_model.llm.generate( - example_prompts, - sampling_params=sampling_params, - ) - - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - # Verify the request is ignored and not hang. - for req_output in req_outputs: - outputs = req_output.outputs - assert len(outputs) == 1 - assert outputs[0].finish_reason == "length" diff --git a/tests/detokenizer/conftest.py b/tests/detokenizer/conftest.py deleted file mode 100644 index f2c125355c83..000000000000 --- a/tests/detokenizer/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass diff --git a/tests/detokenizer/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py deleted file mode 100644 index 2ca10c072b34..000000000000 --- a/tests/detokenizer/test_stop_checker.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.inputs import token_inputs -from vllm.sampling_params import SamplingParams -from vllm.sequence import Logprob, Sequence, SequenceStatus - - -def sequence_with_eos(text: str, eos_token: str, - eos_token_id: int) -> Sequence: - """ - Create a Sequence that ends with an EOS token. 
- """ - seq = Sequence( - seq_id=0, - inputs=token_inputs([]), - block_size=16, - eos_token_id=eos_token_id, - ) - seq.output_text = text + eos_token - - offset = eos_token_id + 1 - for i in range(offset, len(text) + offset): - seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)}) - seq.append_token_id(token_id=eos_token_id, - logprobs={eos_token_id: Logprob(0.0)}) - - seq.status = SequenceStatus.RUNNING - - return seq - - -@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [ - ("This text ends with EOS token", "", 2), -]) -@pytest.mark.parametrize("ignore_eos", [True, False]) -@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -@pytest.mark.skip_global_cleanup -def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int, - ignore_eos: bool, include_stop_str_in_output: bool): - """ - Test the behavior of the StopChecker's maybe_stop_sequence method - when an EOS token is encountered. - - This test covers: - - When the EOS token should stop the sequence and be removed from the output - - When the EOS token should stop the sequence and be included in the output - - When the EOS token should be ignored, and the sequence continues - """ - - stop_checker = StopChecker(max_model_len=1024) - - seq = sequence_with_eos( - text=text_wo_eos, - eos_token=eos_token, - eos_token_id=eos_token_id, - ) - new_char_count = len(eos_token) - - # Note that `stop` and `stop_token_ids` are not specified - sampling_params = SamplingParams( - min_tokens=1, - ignore_eos=ignore_eos, - include_stop_str_in_output=include_stop_str_in_output) - - stop_checker.maybe_stop_sequence( - seq=seq, - new_char_count=new_char_count, - sampling_params=sampling_params, - ) - - if ignore_eos: - assert seq.status == SequenceStatus.RUNNING - assert seq.output_text == text_wo_eos + eos_token - elif include_stop_str_in_output: - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.output_text == text_wo_eos + eos_token - else: - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.output_text == text_wo_eos diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index 684407cd6ee9..624acd5ffde7 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -81,13 +81,3 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): more_args = ["--max-num-seqs", "64"] run_test(more_args) - - -@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, - more_args): - """Run with the V0 Engine.""" - - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - run_test(more_args) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py deleted file mode 100644 index 87f40b100531..000000000000 --- a/tests/samplers/test_logprobs.py +++ /dev/null @@ -1,182 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm import SamplingParams - -from ..conftest import VllmRunner - -MODELS = ["distilbert/distilgpt2"] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module is V0 only since it uses dtype=float, so - set VLLM_USE_V1=0 for all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", - ["float"]) # needed for comparing logprobs with HF -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) -@pytest.mark.parametrize("num_top_logprobs", [0, 6]) # 32000 == vocab_size -@pytest.mark.parametrize("detokenize", [True, False]) -def test_get_prompt_logprobs( - hf_runner, - vllm_runner, - model, - dtype, - chunked_prefill_token_size: int, - num_top_logprobs: int, - detokenize: bool, - example_prompts, -): - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - - max_tokens = 5 - with hf_runner(model, dtype=dtype) as hf_model: - hf_logprobs = hf_model.generate_greedy_logprobs( - example_prompts, - max_tokens=max_tokens, - ) - - with vllm_runner( - model, - dtype=dtype, - max_logprobs=num_top_logprobs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) as vllm_model: - vllm_sampling_params = SamplingParams(max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_top_logprobs, - temperature=0.0, - detokenize=detokenize) - vllm_results = vllm_model.llm.generate( - example_prompts, sampling_params=vllm_sampling_params) - - # Test whether logprobs are included in the results. - for result in vllm_results: - assert result.prompt_logprobs is not None - assert result.outputs[0].logprobs is not None - assert len(result.outputs[0].logprobs) == max_tokens - for logprobs in result.outputs[0].logprobs: - # If the output token is not included in the top X - # logprob, it can return 1 more data - assert (len(logprobs) == num_top_logprobs - or len(logprobs) == num_top_logprobs + 1) - output_text = result.outputs[0].text - output_string_from_most_likely_tokens_lst: list[str] = [] - for top_logprobs in result.outputs[0].logprobs: - top_logprob = next(iter(top_logprobs.values())) - output_string_from_most_likely_tokens_lst.append( - top_logprob.decoded_token) - - if detokenize: - output_string_from_most_likely_tokens = "".join( - output_string_from_most_likely_tokens_lst) - assert output_text == output_string_from_most_likely_tokens, ( - "The output text from the top logprob for each token position " - "should be the same as the output text in the result.") - else: - assert output_text == '' - assert output_string_from_most_likely_tokens_lst == ([None] * - max_tokens) - - # The first prompt logprob is always None - assert result.prompt_logprobs[0] is None - for prompt_logprobs in result.prompt_logprobs[1:]: - # If the prompt token is not included in the top X - # logprob, it can return 1 more data - assert (len(prompt_logprobs) == num_top_logprobs - or len(prompt_logprobs) == num_top_logprobs + 1) - - # Test whether prompt logprobs are consistent with HF - for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs): - # Check prompt logprobs - # The first prompt logprob is always None, so we compare it from 1:. 
- vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] - for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): - for token_id, logprob in vllm_prompt_logprob_dict.items(): - torch.testing.assert_close(logprob.logprob, - hf_logprob[0][i][token_id].item(), - atol=1e-2, - rtol=1e-2) - vllm_sample_logprobs = vllm_result.outputs[0].logprobs - for i, top_logprobs in enumerate(vllm_sample_logprobs): - for token_id, sample_logprob in top_logprobs.items(): - logprob = sample_logprob.logprob - torch.testing.assert_close(logprob, - hf_logprob[i][-1][token_id].item(), - atol=1e-2, - rtol=1e-2) - if detokenize: - assert isinstance(sample_logprob.decoded_token, str), ( - "The token should be decoded by the time it is returned" - " to the user.") - - # Test if prompt logprobs are correctly set. - for vllm_result in vllm_results: - token_ids = vllm_result.prompt_token_ids - prompt_logprobs = vllm_result.prompt_logprobs - - # The first token doesn't have logprob. - assert prompt_logprobs[0] is None - - for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]): - assert token_id in logprob_dict - - -def test_max_logprobs(): - runner = VllmRunner("facebook/opt-125m", max_logprobs=1) - vllm_sampling_params = SamplingParams(logprobs=1) - # should pass - runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - - bad_sampling_params = SamplingParams(logprobs=2) - with pytest.raises(ValueError): - runner.generate(["Hello world"], sampling_params=bad_sampling_params) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) -@pytest.mark.parametrize("detokenize", [True, False]) -def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, - detokenize: bool, example_prompts): - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - max_tokens = 5 - - with vllm_runner( - model, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) as vllm_model: - sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, - logprobs=None, - temperature=0.0, - detokenize=detokenize) - results_logprobs_none = vllm_model.llm.generate( - example_prompts, sampling_params=sampling_params_logprobs_none) - - for i in range(len(results_logprobs_none)): - assert results_logprobs_none[i].outputs[0].logprobs is None - assert results_logprobs_none[i].outputs[0].cumulative_logprob is None diff --git a/tests/worker/__init__.py b/tests/worker/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/worker/conftest.py b/tests/worker/conftest.py deleted file mode 100644 index 3f202d4dbe94..000000000000 --- a/tests/worker/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module tests V0 internals, so set VLLM_USE_V1=0. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') \ No newline at end of file diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py deleted file mode 100644 index 0f28ef2ba857..000000000000 --- a/tests/worker/test_model_input.py +++ /dev/null @@ -1,113 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses - -import torch - -from vllm.attention import AttentionMetadata, AttentionMetadataBuilder -from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.backends.utils import CommonAttentionState -from vllm.model_executor import SamplingMetadata -from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata - - -class MockAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - raise NotImplementedError - - @staticmethod - def get_impl_cls(): - raise NotImplementedError - - @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: - return AttentionMetadata - - @staticmethod - def get_builder_cls() -> type["AttentionMetadataBuilder"]: - return AttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> tuple[int, ...]: - raise NotImplementedError - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - pass - - @staticmethod - def copy_blocks( - kv_caches: list[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - pass - - -def test_model_runner_input(): - sampling_metadata = SamplingMetadata( - ["seq_group"], - "selected_token_indices", - "categorized_sample_indices", - "num_prompts", - ) - attn_metadata = AttentionMetadata( - num_prefills=1, - num_prefill_tokens=2, - num_decode_tokens=3, - slot_mapping=torch.zeros(1), - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - ) - model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens=torch.ones(10), - input_positions=torch.ones(10), - sampling_metadata=sampling_metadata, - attn_metadata=attn_metadata) - - assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata) - - # Test round trip serialization. - tensor_dict = model_input.as_broadcastable_tensor_dict() - attn_backend = MockAttentionBackend() - received_model_input = ( - ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=attn_backend)) - # Check that received copy has correct values. 
- assert isinstance(received_model_input, - ModelInputForGPUWithSamplingMetadata) - assert received_model_input.input_tokens is not None - assert ( - received_model_input.input_tokens == model_input.input_tokens).all() - assert received_model_input.input_positions is not None - assert (received_model_input.input_positions == model_input.input_positions - ).all() - assert received_model_input.multi_modal_kwargs is None - assert (received_model_input.multi_modal_kwargs == - model_input.multi_modal_kwargs) - assert received_model_input.lora_requests is None - assert received_model_input.lora_requests == model_input.lora_requests - assert received_model_input.lora_mapping is None - assert received_model_input.lora_mapping == model_input.lora_mapping - for field in dataclasses.fields(AttentionMetadata): - assert getattr(received_model_input.attn_metadata, field.name, - None) == getattr(attn_metadata, field.name, None) - # For sampling metadata, only selected_token_indices is copied. - assert (received_model_input.sampling_metadata.selected_token_indices == - sampling_metadata.selected_token_indices) - assert received_model_input.sampling_metadata.seq_groups is None diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py deleted file mode 100644 index 0be25aa2fc35..000000000000 --- a/tests/worker/test_model_runner.py +++ /dev/null @@ -1,462 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import get_open_port -from vllm.worker.model_runner import ModelRunner - - -def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - model_runner = ModelRunner( - vllm_config=engine_config, - is_driver_worker=True, - ) - return model_runner - - -def test_deepseek_mla_attn_backend_module(): - model_runner = _create_model_runner( - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", - trust_remote_code=True, - enable_chunked_prefill=False, - ) - assert model_runner.attn_backend.__name__ == "TritonMLABackend" - - -@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) -@pytest.mark.parametrize("use_prompt_embeds", [True, False]) -def test_prepare_prompt(batch_size, use_prompt_embeds, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enable_prompt_embeds=True, - ) - - seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - expected_input_embeds_len = 0 - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * seq_len, - prompt_embeds=torch.rand(seq_len, 10), - ) - expected_input_embeds_len += seq_len - else: - seq_data = SequenceData.from_seqs(prompt_token_ids=range(seq_len)) - - 
seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - - expected_selected_token_indices = [] - selected_token_start_idx = 0 - for seq_len in seq_lens: - expected_selected_token_indices.append(selected_token_start_idx + - seq_len - 1) - selected_token_start_idx += seq_len - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - assert return_seq_lens == seq_lens - assert len(slot_mapping) == len(input_tokens) - - # Verify input metadata is correct for prompts. - device = model_runner.device - assert attn_metadata.num_prefills > 0 - assert attn_metadata.num_decode_tokens == 0 - torch.testing.assert_close( - attn_metadata.seq_lens_tensor, - torch.tensor(seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.max_prefill_seq_len == max(seq_lens) - assert attn_metadata.max_decode_seq_len == 0 - - # Test subquery start locs. - start_idx = 0 - start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - - # Test seq start locs. Note that for normal prefill it is - # equivalent to query_start_loc. - start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - - torch.testing.assert_close( - attn_metadata.seq_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - torch.testing.assert_close( - attn_metadata.context_lens_tensor, - torch.zeros(attn_metadata.context_lens_tensor.shape[0], - dtype=torch.int, - device=device)) - - expected = torch.tensor([[] for _ in range(len(seq_group_metadata_list))], - dtype=torch.int32, - device=model_runner.device) - torch.testing.assert_close(attn_metadata.block_tables, expected) - # Cuda graph should not be used for prerill. 
- assert attn_metadata.use_cuda_graph is False - - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - if expected_input_embeds_len == 0: - torch.testing.assert_close(input_tokens, input_positions) - assert input_embeds is None - else: - assert len(input_embeds) == expected_input_embeds_len - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=model_runner.device, - pin_memory=model_runner.pin_memory) - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - torch.allclose(input_tokens, input_positions) - - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - - -@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) -@pytest.mark.parametrize("use_prompt_embeds", [True, False]) -def test_prepare_decode_cuda_graph(batch_size, use_prompt_embeds, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=False, - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enable_prompt_embeds=True, - ) - - context_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - # Assume each seq group finishes prefill. - for i in range(batch_size): - # make sure all tokens fit into one block - context_len = i % (model_runner.block_size - 1) + 1 - context_lens.append(context_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * context_len, - prompt_embeds=torch.rand(context_len, 10), - ) - output_embed = torch.rand(10) - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(context_len)) - output_embed = None - seq_data.update_num_computed_tokens(context_len) - # Append one token ID since prefill is finished. - seq_data.append_token_id(1, 0, output_embed) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables={0: [1]}, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - slot_mapping = attn_metadata.slot_mapping - - assert len(slot_mapping) == len(input_tokens) - - expected_bs = model_runner.vllm_config.pad_for_cudagraph( - len(seq_group_metadata_list)) - # Verify input metadata is correct for prompts. 
- device = model_runner.device - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_prefill_tokens == 0 - seq_lens = [context_len + 1 for context_len in context_lens] - # seq_lens are padded to expected_bs - for _ in range(expected_bs - len(seq_lens)): - seq_lens.append(1) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.num_decode_tokens == len(seq_lens) - start_idx = 0 - start_loc = [start_idx] - for _ in context_lens: - # decode has only 1 token for query. - start_idx += 1 - start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - - start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.seq_start_loc, - torch.tensor(seq_start_loc, dtype=torch.int32, device=device)) - - torch.testing.assert_close( - attn_metadata.context_lens_tensor, - torch.tensor(context_lens, dtype=torch.int, device=device)) - assert attn_metadata.max_decode_seq_len == max(seq_lens) - torch.testing.assert_close( - attn_metadata.seq_lens_tensor[:len(seq_lens)], - torch.tensor(seq_lens, dtype=torch.int, device=device)) - - # block table's first index corresponds to each batch, meaning in - # decoding it is each token. - assert attn_metadata.block_tables.shape[0] == len(input_tokens) - # Block table's second dim corresponds to each token's block number. - # It is padded up to - assert attn_metadata.block_tables.shape[1] == ( - model_runner.get_max_block_per_batch()) - assert attn_metadata.use_cuda_graph is True - - assert len(input_tokens) == expected_bs - assert len(input_positions) == expected_bs - if use_prompt_embeds: - expected_input_embeds_length = start_loc[-1] - assert len(input_embeds) == expected_input_embeds_length - assert expected_input_embeds_length <= expected_bs - else: - assert input_embeds is None - - # Verify Sampling - expected_selected_token_indices = [] - for selected_token_start_idx, _ in enumerate(context_lens): - expected_selected_token_indices.append(selected_token_start_idx) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query lens is all 1 for decode. 
- query_lens=[1 for _ in range(len(context_lens))], - device=model_runner.device, - pin_memory=model_runner.pin_memory) - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - - -def test_empty_seq_group(): - """Verify prepare prompt and decode returns empty output.""" - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=False, - ) - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - - assert input_tokens is None - assert input_positions is None - assert attn_metadata is None - - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - - assert input_tokens is None - assert input_positions is None - assert input_embeds is None - assert attn_metadata is None - assert return_seq_lens is None - - -@pytest.fixture -def distributed_init(): - init_distributed_environment( - world_size=1, - rank=0, - distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}", - local_rank=0) - ensure_model_parallel_initialized(1, 1) - - -@pytest.mark.parametrize("batch_size", list(range(2, 128, 3))) -@pytest.mark.parametrize("enforce_eager", [True, False]) -@pytest.mark.parametrize('use_prompt_embeds', [True, False]) -def test_hybrid_batches(batch_size, enforce_eager, use_prompt_embeds, - distributed_init, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=enforce_eager, - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=True, - enable_prompt_embeds=True, - ) - - # Add prefill requests. 
- seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - prefill_metadata_list: list[SequenceGroupMetadata] = [] - decode_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - prefill_batch_size = batch_size // 2 - decode_batch_size = batch_size - prefill_batch_size - expected_input_embeds_len = 0 - for i in range(prefill_batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * seq_len, - prompt_embeds=torch.rand(seq_len, 10), - ) - expected_input_embeds_len += seq_len - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(seq_len), ) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - prefill_metadata_list.append(seq_group_metadata) - - # Add decode requests - for i in range(prefill_batch_size, batch_size): - # make sure all tokens fit into one block - context_len = i % (model_runner.block_size - 1) + 1 - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * context_len, - prompt_embeds=torch.rand(context_len, 10), - ) - output_embed = torch.rand(10) - # This also iterates the expected input_embeds, because the model - # needs both the input and output embeddings passed into together - expected_input_embeds_len += 1 - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(context_len), ) - output_embed = None - assert len(seq_data.prompt_token_ids) == context_len - seq_data.append_token_id(1, 0, output_embed) - seq_data.update_num_computed_tokens(context_len) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables={0: [1]}, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - decode_metadata_list.append(seq_group_metadata) - - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - - prefill_meta_actual = attn_metadata.prefill_metadata - decode_meta_actual = attn_metadata.decode_metadata - - assert len(attn_metadata.slot_mapping) == len(input_tokens) - assert len(input_positions) == len(input_tokens) - assert attn_metadata.num_prefills == prefill_batch_size - assert attn_metadata.num_decode_tokens == decode_batch_size - assert attn_metadata.num_prefill_tokens == sum(seq_lens) - if expected_input_embeds_len == 0: - assert input_embeds is None - else: - assert len(input_embeds) == expected_input_embeds_len - - # Verify attn metadata is consistent. We don't need to test individual - # values here because they are tested above. 
- attn_metadata = model_runner._prepare_model_input_tensors( - seq_group_metadata_list).attn_metadata - - for attr_expected, attr_actual in zip(vars(attn_metadata.prefill_metadata), - vars(prefill_meta_actual)): - assert attr_expected[1] == attr_actual[1] - for attr_expected, attr_actual in zip(vars(attn_metadata.decode_metadata), - vars(decode_meta_actual)): - assert attr_expected[1] == attr_actual[1] diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py deleted file mode 100644 index d8767f700b57..000000000000 --- a/tests/worker/test_profile.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.worker import Worker - - -def test_gpu_memory_profiling(): - # Tests the gpu profiling that happens in order to determine the number of - # KV cache blocks that we can allocate on the GPU. - # This test mocks the maximum available gpu memory so that it can run on - # any gpu setup. - - # Set up engine args to build a worker. - engine_args = EngineArgs(model="facebook/opt-125m", - dtype="half", - load_format="dummy") - engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 1000 - engine_config.cache_config.num_cpu_blocks = 1000 - - # Create the worker. - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - worker = Worker( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=True, - ) - - # Set 10GiB as the total gpu ram to be device-agnostic - def mock_mem_info(): - current_usage = torch.cuda.memory_stats( - )["allocated_bytes.all.current"] - mock_total_bytes = 10 * 1024**3 - free = mock_total_bytes - current_usage - - return (free, mock_total_bytes) - - from unittest.mock import patch - with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info): - # Load the model so we can profile it - worker.init_device() - worker.load_model() - gpu_blocks, _ = worker.determine_num_available_blocks() - - # Peak vram usage by torch should be 0.47 GiB - # Model weights take 0.25 GiB - # No memory should be allocated outside of torch - # 9.0 GiB should be the utilization target - # 8.28 GiB should be available for the KV cache - block_size = CacheEngine.get_cache_block_size( - engine_config.cache_config, engine_config.model_config, - engine_config.parallel_config) - - expected_blocks = (8.28 * 1024**3) // block_size - - # Check within a small tolerance for portability - # Hardware, kernel, or dependency changes could all affect memory - # utilization. - # A 100 block tolerance here should be about 60MB of wiggle room. - assert abs(gpu_blocks - expected_blocks) < 100 diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py deleted file mode 100644 index 6d9f404ac207..000000000000 --- a/tests/worker/test_swap.py +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.worker import Worker - - -def test_swap() -> None: - # Configure the engine. 
- engine_args = EngineArgs(model="distilbert/distilgpt2", - dtype="half", - load_format="dummy") - engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 1000 - engine_config.cache_config.num_cpu_blocks = 1000 - - # Create the worker. - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - worker = Worker( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=True, - ) - - # Initialize the worker. - worker.init_device() - worker.load_model() - worker.initialize_cache( - num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, - num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) - - # Randomly initialize the cache. - gpu_cache = worker.cache_engine[0].gpu_cache - cpu_cache = worker.cache_engine[0].cpu_cache - num_layers = len(gpu_cache) - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - gpu_key_cache.random_() - gpu_value_cache.random_() - cpu_key_cache, cpu_value_cache = cpu_cache[i] - cpu_key_cache.random_() - cpu_value_cache.random_() - - allclose = lambda a, b: torch.allclose( - a.cuda(), b.cuda(), rtol=0.0, atol=0.0) - - # Test swap out. - blocks_to_swap_out = [(3, 72), (56, 35), (84, 34)] - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=[], - blocks_to_swap_in=[], - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=[], - ) - worker.execute_model(execute_model_req=execute_model_req) - - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - cpu_key_cache, cpu_value_cache = cpu_cache[i] - for src, dst in blocks_to_swap_out: - assert allclose(gpu_key_cache[src], cpu_key_cache[dst]) - assert allclose(gpu_value_cache[src], cpu_value_cache[dst]) - - # Test swap in. 
- execute_model_req.blocks_to_swap_out = [] - execute_model_req.blocks_to_swap_in = [ - (19, 45), - (67, 23), - (12, 78), - (40, 99), - (1, 71), - ] - worker.execute_model(execute_model_req=execute_model_req) - - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - cpu_key_cache, cpu_value_cache = cpu_cache[i] - for src, dst in execute_model_req.blocks_to_swap_in: - assert allclose(gpu_key_cache[dst], cpu_key_cache[src]) - assert allclose(gpu_value_cache[dst], cpu_value_cache[src]) From 9011ad269ecab49b0a3f60533f84f57faff803fc Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 17:39:01 -0700 Subject: [PATCH 12/17] minor Signed-off-by: Woosuk Kwon --- .github/CODEOWNERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 771dd2e17258..b8d6db06548d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -41,7 +41,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Test ownership /.buildkite/lm-eval-harness @mgoin @simon-mo -/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao @@ -50,7 +49,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche -/tests/prefix_caching @comaniac @KuntaiDu /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm From 2d60e15342e207b6c3e56aaea150f73d61240630 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 19:39:42 -0700 Subject: [PATCH 13/17] fix Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b7e9746bb745..82edb9745544 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -78,14 +78,12 @@ steps: - vllm/ - tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_preemption - tests/basic_correctness/test_cumem.py commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py - - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - label: Entrypoints Unit Tests # 5min timeout_in_minutes: 10 From 4de8edaf45aca1ae9e18c4cba87134cc86084eec Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 19:49:59 -0700 Subject: [PATCH 14/17] update Signed-off-by: Woosuk Kwon --- .../openai/test_return_tokens_as_ids.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index 5f43fdc9588f..ef9d5234f231 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -10,8 +10,30 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer -from .test_completion import default_server_args # noqa: F401 -from .test_completion import MODEL_NAME + +MODEL_NAME = 
"HuggingFaceH4/zephyr-7b-beta" + + +@pytest.fixture(scope="module") +def default_server_args(zephyr_lora_files): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--max-num-seqs", + "128", + "--enforce-eager", + # lora config + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + ] @pytest.fixture(scope="module") From 4f6be9c758673ea5730cb0482234a6f7f71de46a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 21:35:44 -0700 Subject: [PATCH 15/17] fix Signed-off-by: Woosuk Kwon --- .../openai/test_completion_with_prompt_embeds.py | 3 +++ tests/entrypoints/openai/test_metrics.py | 2 +- tests/entrypoints/openai/test_skip_tokenizer.py | 8 -------- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index dbfb1b024f7c..b78589e96ba3 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -14,6 +14,9 @@ from ...utils import RemoteOpenAIServer +pytest.skip("Skipping prompt_embeds test until V1 supports it.", + allow_module_level=True) + # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 0c9e0f3a5142..8917aa5a5efb 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -22,7 +22,7 @@ PREV_MINOR_VERSION = version._prev_minor_version() -@pytest.fixture(scope="module", params=[True, False]) +@pytest.fixture(scope="module", params=[True]) def use_v1(request): # Module-scoped variant of run_with_both_engines # diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_skip_tokenizer.py index 840e0dac81c9..b469fc76fc7a 100644 --- a/tests/entrypoints/openai/test_skip_tokenizer.py +++ b/tests/entrypoints/openai/test_skip_tokenizer.py @@ -15,14 +15,6 @@ DTYPE = "float16" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def server(): args = [ From a27dabc3b7d6d05af29e4a19a92c2cbccf3591f7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 18 Sep 2025 08:34:17 -0700 Subject: [PATCH 16/17] fix Signed-off-by: Woosuk Kwon --- tests/entrypoints/openai/test_lora_adapters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 10c0cb5f4d15..6f2addd3649d 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -53,12 +53,13 @@ def monkeypatch_module(): mpatch.undo() -@pytest.fixture(scope="module", params=[False, True]) +@pytest.fixture(scope="module", params=[True]) def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files): use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + assert use_v1 + monkeypatch_module.setenv('VLLM_USE_V1', '1') # Define the json format LoRA module configurations lora_module_1 = { From 
f2d9c622e9e9a29ee95c31c05bc44680473f347a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 18 Sep 2025 08:37:00 -0700 Subject: [PATCH 17/17] rm is_v1_server Signed-off-by: Woosuk Kwon --- tests/entrypoints/openai/test_chat.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index cbd8fa38a27f..3bdfef7b4adb 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -472,10 +472,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio async def test_structured_outputs_choice_chat( - client: openai.AsyncOpenAI, sample_structured_outputs_choices, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") + client: openai.AsyncOpenAI, + sample_structured_outputs_choices, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -513,12 +512,10 @@ async def test_structured_outputs_choice_chat( @pytest.mark.asyncio -async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, - sample_json_schema, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") - +async def test_structured_outputs_json_chat( + client: openai.AsyncOpenAI, + sample_json_schema, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -560,10 +557,10 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI, - sample_regex, is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") +async def test_structured_outputs_regex_chat( + client: openai.AsyncOpenAI, + sample_regex, +): messages = [{ "role": "system",