From e92a8cf7d6e6b45a4e3c95c479a1c6ca746231f9 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 02:55:33 +0000 Subject: [PATCH 01/17] [V0 Deprecation] Remove AsyncLLMEngine Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 2 - tests/async_engine/__init__.py | 0 tests/async_engine/api_server_async_engine.py | 54 - tests/async_engine/conftest.py | 12 - tests/async_engine/test_api_server.py | 139 --- tests/async_engine/test_request_tracker.py | 71 -- tests/v1/test_oracle.py | 18 - vllm/engine/async_llm_engine.py | 1042 +---------------- vllm/entrypoints/launcher.py | 2 - vllm/entrypoints/openai/api_server.py | 115 +- 10 files changed, 4 insertions(+), 1451 deletions(-) delete mode 100644 tests/async_engine/__init__.py delete mode 100644 tests/async_engine/api_server_async_engine.py delete mode 100644 tests/async_engine/conftest.py delete mode 100644 tests/async_engine/test_api_server.py delete mode 100644 tests/async_engine/test_request_tracker.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6f06099edd53..1ac03ad6348a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -47,7 +47,6 @@ steps: source_file_dependencies: - vllm/ - tests/mq_llm_engine - - tests/async_engine - tests/test_inputs.py - tests/test_outputs.py - tests/multimodal @@ -58,7 +57,6 @@ steps: commands: - python3 standalone_tests/lazy_imports.py - pytest -v -s mq_llm_engine # MQLLMEngine - - pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s multimodal diff --git a/tests/async_engine/__init__.py b/tests/async_engine/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py deleted file mode 100644 index ec6b20f5e04b..000000000000 --- a/tests/async_engine/api_server_async_engine.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""vllm.entrypoints.api_server with some extra logging for testing.""" -from collections.abc import Iterable -from typing import Any - -import uvicorn -from fastapi.responses import JSONResponse, Response - -import vllm.entrypoints.api_server -import vllm.envs as envs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.utils import FlexibleArgumentParser - -app = vllm.entrypoints.api_server.app - - -class AsyncLLMEngineWithStats(AsyncLLMEngine): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._num_aborts = 0 - - async def _engine_abort(self, request_ids: Iterable[str]): - ids = list(request_ids) - self._num_aborts += len(ids) - await super()._engine_abort(ids) - - def testing_stats(self) -> dict[str, Any]: - return {"num_aborted_requests": self._num_aborts} - - -@app.get("/stats") -def stats() -> Response: - """Get the statistics of the engine.""" - return JSONResponse(engine.testing_stats()) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser = AsyncEngineArgs.add_cli_args(parser) - args = parser.parse_args() - - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) - vllm.entrypoints.api_server.engine = engine - uvicorn.run(app, - 
host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE) diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py deleted file mode 100644 index 375b248ebeda..000000000000 --- a/tests/async_engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py deleted file mode 100644 index 07370a880329..000000000000 --- a/tests/async_engine/test_api_server.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import copyreg -import os -import subprocess -import sys -import time -from multiprocessing import Pool -from pathlib import Path - -import pytest -import requests -import urllib3.exceptions - - -def _pickle_new_connection_error(obj): - """Custom pickler for NewConnectionError to fix tblib compatibility.""" - # Extract the original message by removing the "conn: " prefix - full_message = obj.args[0] if obj.args else "" - if ': ' in full_message: - # Split off the connection part and keep the actual message - _, actual_message = full_message.split(': ', 1) - else: - actual_message = full_message - return _unpickle_new_connection_error, (actual_message, ) - - -def _unpickle_new_connection_error(message): - """Custom unpickler for NewConnectionError.""" - # Create with None as conn and the actual message - return urllib3.exceptions.NewConnectionError(None, message) - - -# Register the custom pickle/unpickle functions for tblib compatibility -copyreg.pickle(urllib3.exceptions.NewConnectionError, - _pickle_new_connection_error) - - -def _query_server(prompt: str, max_tokens: int = 5) -> dict: - response = requests.post("http://localhost:8000/generate", - json={ - "prompt": prompt, - "max_tokens": max_tokens, - "temperature": 0, - "ignore_eos": True - }) - response.raise_for_status() - return response.json() - - -def _query_server_long(prompt: str) -> dict: - return _query_server(prompt, max_tokens=500) - - -@pytest.fixture -def api_server(distributed_executor_backend: str): - script_path = Path(__file__).parent.joinpath( - "api_server_async_engine.py").absolute() - commands = [ - sys.executable, - "-u", - str(script_path), - "--model", - "facebook/opt-125m", - "--host", - "127.0.0.1", - "--distributed-executor-backend", - distributed_executor_backend, - ] - - # API Server Test Requires V0. - my_env = os.environ.copy() - my_env["VLLM_USE_V1"] = "0" - uvicorn_process = subprocess.Popen(commands, env=my_env) - yield - uvicorn_process.terminate() - - -@pytest.mark.timeout(300) -@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) -def test_api_server(api_server, distributed_executor_backend: str): - """ - Run the API server and test it. - - We run both the server and requests in separate processes. - - We test that the server can handle incoming requests, including - multiple requests at the same time, and that it can handle requests - being cancelled without crashing. 
- """ - with Pool(32) as pool: - # Wait until the server is ready - prompts = ["warm up"] * 1 - result = None - while not result: - try: - for r in pool.map(_query_server, prompts): - result = r - break - except requests.exceptions.ConnectionError: - time.sleep(1) - - # Actual tests start here - # Try with 1 prompt - for result in pool.map(_query_server, prompts): - assert result - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests == 0 - - # Try with 100 prompts - prompts = ["test prompt"] * 100 - for result in pool.map(_query_server, prompts): - assert result - - with Pool(32) as pool: - # Cancel requests - prompts = ["canceled requests"] * 100 - pool.map_async(_query_server_long, prompts) - time.sleep(0.01) - pool.terminate() - pool.join() - - # check cancellation stats - # give it some time to update the stats - time.sleep(1) - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests > 0 - - # check that server still runs after cancellations - with Pool(32) as pool: - # Try with 100 prompts - prompts = ["test prompt after canceled"] * 100 - for result in pool.map(_query_server, prompts): - assert result diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py deleted file mode 100644 index 1851eeeda790..000000000000 --- a/tests/async_engine/test_request_tracker.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.async_llm_engine import RequestTracker -from vllm.outputs import RequestOutput - - -@pytest.mark.asyncio -async def test_request_tracker(): - tracker = RequestTracker() - stream_1 = tracker.add_request("1") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 1 - assert new[0]["request_id"] == "1" - assert not aborted - assert not stream_1.finished - - stream_2 = tracker.add_request("2") - stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 2 - assert new[0]["request_id"] == "2" - assert new[1]["request_id"] == "3" - assert not aborted - assert not stream_2.finished - assert not stream_3.finished - - # request_ids must be unique - with pytest.raises(KeyError): - tracker.add_request("1") - assert not tracker.new_requests_event.is_set() - - tracker.abort_request("1") - new, aborted = tracker.get_new_and_aborted_requests() - assert len(aborted) == 1 - assert "1" in aborted - assert not new - assert stream_1.finished - - stream_4 = tracker.add_request("4") - tracker.abort_request("4") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - # aborted new requests will cancel each other out - - # there's no need for them to propagate into the - # engine - assert not aborted - assert not new - assert stream_4.finished - - stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.is_set() - tracker.process_request_output( - RequestOutput("2", "output", [], [], [], finished=True)) - await 
tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert not aborted - assert len(new) == 1 - assert new[0]["request_id"] == "5" - assert stream_2.finished - assert not stream_5.finished diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 794c1f68f147..28c24f62895a 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -7,7 +7,6 @@ import vllm.envs as envs from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine MODEL = "meta-llama/Llama-3.2-1B-Instruct" @@ -96,20 +95,3 @@ def test_v1_attn_backend(monkeypatch): _ = AsyncEngineArgs(model=MODEL).create_engine_config() assert envs.VLLM_USE_V1 m.delenv("VLLM_USE_V1") - - -def test_reject_using_constructor_directly(monkeypatch): - with monkeypatch.context() as m: - if os.getenv("VLLM_USE_V1", None): - m.delenv("VLLM_USE_V1") - - # Sets VLLM_USE_V1=1. - vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config() - - # This uses the V0 constructor directly. - with pytest.raises(ValueError): - AsyncLLMEngine(vllm_config, - AsyncLLMEngine._get_executor_cls(vllm_config), - log_stats=True) - - m.delenv("VLLM_USE_V1") diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index c53ece18964c..ede027759a8b 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,1044 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import asyncio -import time -import weakref -from functools import partial -from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, - Mapping, Optional, Set, Tuple, Type, Union) -from weakref import ReferenceType +from vllm.v1.engine.async_llm import AsyncLLM -import vllm.envs as envs -from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig, - SchedulerConfig, VllmConfig) -from vllm.config.lora import LoRAConfig -from vllm.core.scheduler import SchedulerOutputs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_timeout import asyncio_timeout -from vllm.engine.llm_engine import LLMEngine -from vllm.engine.metrics_types import StatLoggerBase -from vllm.engine.protocol import EngineClient -from vllm.executor.executor_base import ExecutorBase -from vllm.inputs import PromptType -from vllm.inputs.preprocess import InputPreprocessor -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import PoolingRequestOutput, RequestOutput -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import SamplingParams -from vllm.sequence import ExecuteModelRequest -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, deprecate_kwargs, weak_bind - -logger = init_logger(__name__) -ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S - - -class AsyncEngineDeadError(RuntimeError): - pass - - -def _log_task_completion(task: asyncio.Task, - error_callback: Callable[[Exception], None]) -> None: - """This function is only intended for the `engine.run_engine_loop()` task. - - In particular, that task runs a `while True` loop that can only exit if - there is an exception. 
- """ - - exception = None - try: - return_value = task.result() - raise AssertionError( - f"The engine background task should never finish without an " - f"exception. {return_value}") - except asyncio.exceptions.CancelledError: - # We assume that if the task is cancelled, we are gracefully shutting - # down. This should only happen on program exit. - logger.info("Engine is gracefully shutting down.") - except Exception as e: - exception = e - logger.error("Engine background task failed", exc_info=e) - error_callback(exception) - raise AsyncEngineDeadError( - "Task finished unexpectedly. This should never happen! " - "Please open an issue on GitHub. See stack trace above for the " - "actual cause.") from e - - -STOP_ITERATION = Exception() # Sentinel - - -class AsyncStream: - """A stream of RequestOutputs for a request that can be iterated over - asynchronously via an async generator.""" - - def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: - self.request_id = request_id - self._cancel = cancel - self._queue: asyncio.Queue = asyncio.Queue() - self._finished = False - - def put(self, item: Union[RequestOutput, Exception]) -> None: - if not self._finished: - self._queue.put_nowait(item) - - def finish( - self, - exception: Optional[Union[BaseException, Type[BaseException]]] = None, - ) -> None: - if not self._finished: - self._finished = True - self._queue.put_nowait( - exception if self._is_raisable(exception) else STOP_ITERATION) - - @property - def finished(self) -> bool: - return self._finished - - async def generator(self) -> AsyncGenerator[RequestOutput, None]: - try: - while True: - result = await self._queue.get() - if self._is_raisable(result): - if result == STOP_ITERATION: - return - raise result - yield result - except GeneratorExit: - self._cancel(self.request_id) - raise asyncio.CancelledError from None - - @staticmethod - def _is_raisable(value: Any): - return isinstance(value, BaseException) or \ - (isinstance(value, type) and \ - issubclass(value, BaseException)) - - -class RequestTracker: - """Synchronous abstraction for tracking requests.""" - - def __init__(self) -> None: - self._request_streams: Dict[str, AsyncStream] = {} - self._aborted_requests: asyncio.Queue[str] = asyncio.Queue() - self._new_requests: asyncio.Queue[Tuple[AsyncStream, - dict]] = asyncio.Queue() - self.new_requests_event = asyncio.Event() - - def __contains__(self, item): - return item in self._request_streams - - def __len__(self) -> int: - return len(self._request_streams) - - def propagate_exception(self, - exc: Exception, - request_id: Optional[str] = None) -> None: - """Propagate an exception to request streams - (all if request_id is None).""" - if request_id is not None: - self.abort_request(request_id, exception=exc) - else: - # NB: tuple() used here because self.abort_request pops the stream - # out of self._request_streams, so we can't iterate on it directly - for rid in tuple(self._request_streams.keys()): - self.abort_request(rid, exception=exc) - - def process_request_output(self, - request_output: RequestOutput, - *, - verbose: bool = False) -> None: - """Process a request output from the engine.""" - request_id = request_output.request_id - finished = request_output.finished - - if finished: - stream = self._request_streams.pop(request_id, None) - else: - stream = self._request_streams.get(request_id) - # Guard against a KeyError which can occur if the request was aborted - # while the output was generated - if stream is not None: - stream.put(request_output) - 
if finished: - stream.finish() - - if verbose and finished: - logger.info("Finished request %s.", request_id) - - def process_exception(self, - request_id: str, - exception: BaseException, - *, - verbose: bool = False) -> None: - """Propagate an exception from the engine.""" - if verbose: - logger.info("Finished request %s.", request_id) - self.abort_request(request_id, exception=exception) - - def add_request(self, - request_id: str, - *, - verbose: bool = False, - **engine_add_request_kwargs) -> AsyncStream: - """Add a request to be sent to the engine on the next background - loop iteration.""" - if request_id in self._request_streams: - raise KeyError(f"Request {request_id} already exists.") - - abort_request = partial(self.abort_request, verbose=verbose) - stream = AsyncStream(request_id, abort_request) - self._new_requests.put_nowait((stream, { - "request_id": request_id, - **engine_add_request_kwargs - })) - - self.new_requests_event.set() - - if verbose: - logger.info("Added request %s.", request_id) - - return stream - - def abort_request(self, - request_id: str, - *, - exception: Optional[Union[BaseException, - Type[BaseException]]] = None, - verbose: bool = False) -> None: - """Abort a request during next background loop iteration.""" - if verbose: - logger.info("Aborted request %s.", request_id) - - self._aborted_requests.put_nowait(request_id) - - stream = self._request_streams.pop(request_id, None) - if stream is not None: - stream.finish(exception=exception) - - def get_new_and_aborted_requests(self) -> Tuple[List[Dict], Set[str]]: - """Get the new requests and finished requests to be - sent to the engine.""" - new_requests: List[Dict] = [] - finished_requests: Set[str] = set() - - while not self._aborted_requests.empty(): - request_id = self._aborted_requests.get_nowait() - finished_requests.add(request_id) - - while not self._new_requests.empty(): - stream, new_request = self._new_requests.get_nowait() - request_id = stream.request_id - if request_id in finished_requests: - # The request has already been aborted. - stream.finish(asyncio.CancelledError) - finished_requests.discard(request_id) - else: - self._request_streams[request_id] = stream - new_requests.append(new_request) - - return new_requests, finished_requests - - async def wait_for_new_requests(self): - if not self.has_new_requests(): - await self.new_requests_event.wait() - self.new_requests_event.clear() - - def has_new_requests(self): - return not self._new_requests.empty() - - -class _AsyncLLMEngine(LLMEngine): - """Extension of LLMEngine to add async methods.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - async def step_async(self, virtual_engine: int) -> List[RequestOutput]: - """Performs one decoding iteration and returns newly generated results. - The workers are ran asynchronously if possible. - - This function performs one decoding iteration of the engine. It first - schedules the sequences to be executed in the next iteration and the - token blocks to be swapped in/out/copy. Then, it executes the model - and updates the scheduler with the model outputs. Finally, it decodes - the sequences and returns the newly generated results. - """ - # these are cached outputs from previous iterations. 
None if on first - # iteration - cached_outputs = self.cached_scheduler_outputs[virtual_engine] - seq_group_metadata_list = cached_outputs.seq_group_metadata_list - scheduler_outputs = cached_outputs.scheduler_outputs - allow_async_output_proc = cached_outputs.allow_async_output_proc - - ctx = self.scheduler_contexts[virtual_engine] - - # Clear outputs for each new scheduler iteration - ctx.request_outputs.clear() - - # skip the scheduler if there are any remaining steps in the seq groups. - # This ensures that the scheduler is only called again when the current - # batch has completed. - if not self._has_remaining_steps(seq_group_metadata_list): - - # Schedule iteration - (seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc - ) = self.scheduler[virtual_engine].schedule() - - ctx.seq_group_metadata_list = seq_group_metadata_list - ctx.scheduler_outputs = scheduler_outputs - - if not scheduler_outputs.is_empty(): - # this will cause mamba_cache/minimax_cache failed - # to release finished_requests_ids of the last steps - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() - - # Maybe switch from async mode to sync mode - if not allow_async_output_proc and len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - - else: - finished_requests_ids = list() - - assert seq_group_metadata_list is not None - assert scheduler_outputs is not None - - if not scheduler_outputs.is_empty(): - - # Check if we have a cached last_output from the previous iteration. - # For supporting PP this is probably the best way to pass the - # sampled_token_ids, as a separate broadcast over all the PP stages - # will cause one virtual engine's microbatch to block the pipeline. - last_sampled_token_ids = \ - self._get_last_sampled_token_ids(virtual_engine) - - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, - blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, - blocks_to_copy=scheduler_outputs.blocks_to_copy, - virtual_engine=virtual_engine, - num_lookahead_slots=scheduler_outputs.num_lookahead_slots, - running_queue_size=scheduler_outputs.running_queue_size, - finished_requests_ids=finished_requests_ids, - # We use ExecuteModelRequest to pass the last sampled_token_ids - # to each of the non-last PP stages for in-place prepare_input. - last_sampled_token_ids=last_sampled_token_ids) - - if allow_async_output_proc: - execute_model_req.async_callback = self.async_callbacks[ - virtual_engine] - - # Execute the model. - outputs = await self.model_executor.execute_model_async( - execute_model_req) - - else: - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - outputs = [] - - if not self._has_remaining_steps(seq_group_metadata_list): - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. 
- is_first_step_output: bool = False if not seq_group_metadata_list \ - else seq_group_metadata_list[0].state.num_steps == 1 - - ctx.append_output(outputs=outputs, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - is_async=allow_async_output_proc, - is_last_step=True, - is_first_step_output=is_first_step_output) - - if outputs and allow_async_output_proc: - assert len( - outputs - ) == 1, "Async postprocessor expects only a single output set" - self._advance_to_next_step( - outputs[0], seq_group_metadata_list, - scheduler_outputs.scheduled_seq_groups) - - if not allow_async_output_proc: - self._process_model_outputs(ctx=ctx) - - # Log stats. - self.do_log_stats(scheduler_outputs, outputs) - - # Tracing - self.do_tracing(scheduler_outputs) - - else: - # Multi-step case - return ctx.request_outputs - - if not self.has_unfinished_requests(): - # Drain async postprocessor (if exists) - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - assert len(ctx.output_queue) == 0 - - return ctx.request_outputs - - async def stop_remote_worker_execution_loop_async(self) -> None: - """Stop the remote worker execution loop.""" - await self.model_executor.stop_remote_worker_execution_loop_async() - - async def get_tokenizer_async(self, - lora_request: Optional[LoRARequest] = None - ) -> AnyTokenizer: - return await ( - self.get_tokenizer_group().get_lora_tokenizer_async(lora_request)) - - async def add_request_async( - self, - request_id: str, - prompt: PromptType, - params: SamplingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> None: - """ - Async version of - [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]. - """ - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - if priority != 0 and not self.scheduler_config.policy == "priority": - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - if arrival_time is None: - arrival_time = time.time() - - if data_parallel_rank is not None: - raise ValueError("Targeting data_parallel_rank only supported " - "in v1 client.") - - if (isinstance(prompt, dict) - and prompt.get("prompt_embeds", None) is not None - and not prompt.get("prompt_token_ids", None)): - # We use the -2 dimension (instead of 0) in case a batched input - # of batch size 1 is passed in. - prompt["prompt_token_ids"] = [0 - ] * prompt["prompt_embeds"].shape[-2] - - processed_inputs = await self.input_preprocessor.preprocess_async( - prompt, - lora_request=lora_request, - tokenization_kwargs=tokenization_kwargs, - ) - - self._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - - async def check_health_async(self) -> None: - self.model_executor.check_health() - - async def collective_rpc_async(self, - method: str, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None): - raise NotImplementedError - - -class AsyncLLMEngine(EngineClient): - """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine]. 
- - This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to - make it asynchronous. It uses asyncio to create a background loop that keeps - processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked - by the generate method when there are requests in the waiting queue. The - generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine] - to the caller. - - Args: - log_requests: Whether to log the requests. - start_engine_loop: If True, the background task to run the engine - will be automatically started in the generate call. - *args: Arguments for [`LLMEngine`][vllm.LLMEngine]. - **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine]. - """ - - _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine - - def __init__(self, - *args: Any, - log_requests: bool = True, - start_engine_loop: bool = True, - **kwargs: Any) -> None: - if envs.VLLM_USE_V1: - raise ValueError( - "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. " - "This should not happen. As a workaround, try using " - "AsyncLLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github.") - - self.log_requests = log_requests - self.engine = self._engine_class(*args, **kwargs) - - # This ensures quick processing of request outputs - # so the append to asyncio queues is not delayed, - # especially for multi-step. - self.use_process_request_outputs_callback = ( - self.engine.model_config.use_async_output_proc) - - if self.use_process_request_outputs_callback: - self.engine.process_request_outputs_callback = \ - weak_bind(self.process_request_outputs) - - self.background_loop: Optional[asyncio.Future] = None - # We need to keep a reference to unshielded - # task as well to prevent it from being garbage - # collected - self._background_loop_unshielded: Optional[asyncio.Task] = None - self.start_engine_loop = start_engine_loop - self._errored_with: Optional[BaseException] = None - - # Lazy initialized fields - self._request_tracker: RequestTracker - - def __del__(self): - if rt := getattr(self, "request_tracker", None): - # Wake up engine loop so that it will exit cleanly - rt.new_requests_event.set() - - @classmethod - def _get_executor_cls(cls, - engine_config: VllmConfig) -> Type[ExecutorBase]: - return LLMEngine._get_executor_cls(engine_config) - - @classmethod - @deprecate_kwargs( - "disable_log_requests", - additional_message=("This argument will have no effect. 
" - "Use `enable_log_requests` instead."), - ) - def from_vllm_config( - cls, - vllm_config: VllmConfig, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, - enable_log_requests: bool = False, - disable_log_stats: bool = False, - disable_log_requests: bool = True, # Deprecated, will be removed - ) -> "AsyncLLMEngine": - """Create an AsyncLLMEngine from the EngineArgs.""" - - return cls( - vllm_config=vllm_config, - executor_class=cls._get_executor_cls(vllm_config), - start_engine_loop=start_engine_loop, - log_requests=enable_log_requests, - log_stats=not disable_log_stats, - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: AsyncEngineArgs, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "AsyncLLMEngine": - """Creates an async LLM engine from the engine arguments.""" - - vllm_config = engine_args.create_engine_config(usage_context) - - async_engine_cls = cls - if envs.VLLM_USE_V1: - from vllm.v1.engine.async_llm import AsyncLLM as V1AsyncLLMEngine - async_engine_cls = V1AsyncLLMEngine - - return async_engine_cls.from_vllm_config( - vllm_config=vllm_config, - start_engine_loop=start_engine_loop, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - enable_log_requests=engine_args.enable_log_requests, - ) - - @property - def is_running(self) -> bool: - return (self.background_loop is not None - and self._background_loop_unshielded is not None - and not self._background_loop_unshielded.done()) - - @property - def is_stopped(self) -> bool: - return self.errored or (self.background_loop is not None and - self._background_loop_unshielded is not None - and self._background_loop_unshielded.done()) - - @property - def errored(self) -> bool: - return self._errored_with is not None - - @property - def dead_error(self) -> BaseException: - return AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - def set_errored(self, exc: Exception) -> None: - self._errored_with = exc - - def _error_callback(self, exc: Exception) -> None: - self.set_errored(exc) - self._request_tracker.propagate_exception(exc) - - async def get_input_preprocessor(self) -> InputPreprocessor: - return self.engine.input_preprocessor - - async def get_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - return await self.engine.get_tokenizer_async(lora_request) - - def start_background_loop(self) -> None: - """Start the background loop.""" - if self.errored: - raise AsyncEngineDeadError( - "Background loop has errored already.") from self._errored_with - if self.is_running: - raise RuntimeError("Background loop is already running.") - # Initialize the RequestTracker here so it uses the right event loop. 
- self._request_tracker = RequestTracker() - - self._background_loop_unshielded = asyncio.get_event_loop( - ).create_task(self.run_engine_loop(weakref.ref(self))) - self._background_loop_unshielded.add_done_callback( - partial(_log_task_completion, error_callback=self._error_callback)) - self.background_loop = asyncio.shield(self._background_loop_unshielded) - - def shutdown_background_loop(self) -> None: - """ - Shut down the background loop. - - This method needs to be called during cleanup to remove - references to `self` and properly GC the resources held - by the async LLM engine (e.g., the executors as well as - their resources). - """ - if self._background_loop_unshielded is not None: - self._background_loop_unshielded.cancel() - self._background_loop_unshielded = None - self.background_loop = None - - async def engine_step(self, virtual_engine: int) -> bool: - """Kick the engine to process the waiting requests. - - Returns True if there are in-progress requests.""" - - new_requests, aborted_requests = ( - self._request_tracker.get_new_and_aborted_requests()) - - for new_request in new_requests: - # Add the request into the vLLM engine's waiting queue. - try: - await self.engine.add_request_async(**new_request) - except ValueError as e: - # TODO: use a vLLM specific error for failed validation - self._request_tracker.process_exception( - new_request["request_id"], - e, - verbose=self.log_requests, - ) - - if aborted_requests: - await self._engine_abort(aborted_requests) - - request_outputs = await self.engine.step_async(virtual_engine) - - # Put the outputs into the corresponding streams. - # If used as a callback, then already invoked inside - # LLMEngine's _process_model_outputs - if not self.use_process_request_outputs_callback: - all_finished = self.process_request_outputs(request_outputs) - else: - # For callback case, we only need to detect when all - # requests are finished - all_finished = all(request_output.finished - for request_output in request_outputs) - - return not all_finished - - def process_request_outputs(self, request_outputs) -> bool: - # Put the outputs into the corresponding streams. - all_finished = True - for request_output in request_outputs: - self._request_tracker.process_request_output( - request_output, verbose=self.log_requests) - all_finished = all_finished and request_output.finished - - return all_finished - - async def _engine_abort(self, request_ids: Iterable[str]): - self.engine.abort_request(request_ids) - - @staticmethod - async def run_engine_loop(engine_ref: ReferenceType): - """We use a weakref to the engine so that the running loop - doesn't prevent the engine being garbage collected.""" - engine: Optional[AsyncLLMEngine] = engine_ref() - if not engine: - return - - pipeline_parallel_size = \ - engine.engine.parallel_config.pipeline_parallel_size - has_requests_in_progress = [False] * pipeline_parallel_size - while True: - if not any(has_requests_in_progress): - logger.debug("Waiting for new requests...") - # Stop the execute model loop in parallel workers until there - # are more requests to process. This avoids waiting - # indefinitely in torch.distributed ops which may otherwise - # time out, and unblocks the RPC thread in the workers so that - # they can process any other queued control plane messages, - # such as add/remove lora adapters. 
- await engine.engine.stop_remote_worker_execution_loop_async() - request_tracker = engine._request_tracker - # Allow engine to be garbage collected while - # waiting for new requests - del engine - await asyncio.sleep(0) - if engine_ref() is None: - return - await request_tracker.wait_for_new_requests() - engine = engine_ref() - if not engine: - return - logger.debug("Got new requests!") - requests_in_progress = [ - asyncio.create_task(engine.engine_step(ve)) - for ve in range(pipeline_parallel_size) - ] - has_requests_in_progress = [True] * pipeline_parallel_size - - # Abort if iteration takes too long due to unrecoverable errors - # (eg. NCCL timeouts). - try: - async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S): - done, _ = await asyncio.wait( - requests_in_progress, - return_when=asyncio.FIRST_COMPLETED) - for _ in range(pipeline_parallel_size): - await asyncio.sleep(0) - for task in done: - result = task.result() - virtual_engine = requests_in_progress.index(task) - has_unfinished_requests = ( - engine.engine. - has_unfinished_requests_for_virtual_engine( - virtual_engine)) - if result or has_unfinished_requests: - requests_in_progress[virtual_engine] = ( - asyncio.create_task( - engine.engine_step(virtual_engine))) - has_requests_in_progress[virtual_engine] = True - else: - has_requests_in_progress[virtual_engine] = False - except asyncio.TimeoutError as exc: - logger.error( - "Engine iteration timed out. This should never happen!") - engine.set_errored(exc) - raise - await asyncio.sleep(0) - - async def add_request( - self, - request_id: str, - prompt: PromptType, - params: SamplingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[RequestOutput, None]: - if not self.is_running: - if self.start_engine_loop: - self.start_background_loop() - else: - raise AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - if (priority != 0 - and not self.engine.scheduler_config.policy == "priority"): - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - - stream = self._request_tracker.add_request( - request_id, - verbose=self.log_requests, - prompt=prompt, - params=params, - arrival_time=arrival_time or time.time(), - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - data_parallel_rank=data_parallel_rank, - tokenization_kwargs=tokenization_kwargs, - ) - - return stream.generator() - - async def generate( - self, - prompt: PromptType, - sampling_params: SamplingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - ) -> AsyncGenerator[RequestOutput, None]: - """Generate outputs for a request. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt to the LLM. See - [`PromptType`][vllm.inputs.PromptType] for more details about - the format of each input. - sampling_params: The sampling parameters of the request. 
- request_id: The unique id of the request. - lora_request: LoRA request to use for generation, if any. - trace_headers: OpenTelemetry trace headers. - priority: The priority of the request. - Only applicable with priority scheduling. - data_parallel_rank: The (global) data parallel rank that must - handle this request. Only applicable if DP is enabled. - Yields: - The output `RequestOutput` objects from the LLMEngine - for the request. - - Details: - - If the engine is not running, start the background loop, - which iteratively invokes - [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step] - to process the waiting requests. - - Add the request to the engine's `RequestTracker`. - On the next background loop, this request will be sent to - the underlying engine. - Also, a corresponding `AsyncStream` will be created. - - Wait for the request outputs from `AsyncStream` and yield them. - - Example: - >>> # Please refer to entrypoints/api_server.py for - >>> # the complete example. - >>> - >>> # initialize the engine and the example input - >>> # note that engine_args here is AsyncEngineArgs instance - >>> engine = AsyncLLMEngine.from_engine_args(engine_args) - >>> example_input = { - >>> "prompt": "What is LLM?", - >>> "stream": False, # assume the non-streaming case - >>> "temperature": 0.0, - >>> "request_id": 0, - >>> } - >>> - >>> # start the generation - >>> results_generator = engine.generate( - >>> example_input["prompt"], - >>> SamplingParams(temperature=example_input["temperature"]), - >>> example_input["request_id"]) - >>> - >>> # get the results - >>> final_output = None - >>> async for request_output in results_generator: - >>> if await request.is_disconnected(): - >>> # Abort the request if the client disconnects. - >>> await engine.abort(request_id) - >>> # Return or raise an error - >>> ... - >>> final_output = request_output - >>> - >>> # Process and return the final output - >>> ... - """ - try: - async for output in await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - data_parallel_rank=data_parallel_rank, - ): - yield LLMEngine.validate_output(output, RequestOutput) - except asyncio.CancelledError: - await self.abort(request_id) - raise - - def encode( - self, - prompt: PromptType, - pooling_params: PoolingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[PoolingRequestOutput, None]: - raise NotImplementedError( - "Pooling models are not supported in vLLM V0") - - async def abort(self, request_id: Union[str, Iterable[str]]) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. - - Args: - request_id: The unique id of the request. - """ - if not isinstance(request_id, str): - raise RuntimeError("Only single-request abort supported in" - " deprecated V0") - if not self.is_running: - raise AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - return self._abort(request_id) - - def _abort(self, request_id: str) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. 
- - Args: - request_id: The unique id of the request. - """ - self._request_tracker.abort_request(request_id, - exception=asyncio.CancelledError, - verbose=self.log_requests) - - async def get_vllm_config(self) -> VllmConfig: - """Get the vllm configuration of the vLLM engine.""" - return self.engine.get_vllm_config() - - async def get_model_config(self) -> ModelConfig: - """Get the model configuration of the vLLM engine.""" - return self.engine.get_model_config() - - async def get_parallel_config(self) -> ParallelConfig: - """Get the parallel configuration of the vLLM engine.""" - return self.engine.get_parallel_config() - - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - return self.engine.get_decoding_config() - - async def get_scheduler_config(self) -> SchedulerConfig: - """Get the scheduling configuration of the vLLM engine.""" - return self.engine.get_scheduler_config() - - async def get_lora_config(self) -> LoRAConfig: - """Get the lora configuration of the vLLM engine.""" - return self.engine.get_lora_config() - - async def do_log_stats( - self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None) -> None: - self.engine.do_log_stats() - - async def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - t = time.perf_counter() - logger.debug("Starting health check...") - if self.is_stopped: - raise AsyncEngineDeadError("Background loop is stopped.") - - await self.engine.check_health_async() - logger.debug("Health check took %fs", time.perf_counter() - t) - - async def is_tracing_enabled(self) -> bool: - return self.engine.is_tracing_enabled() - - def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: - self.engine.add_logger(logger_name=logger_name, logger=logger) - - def remove_logger(self, logger_name: str) -> None: - self.engine.remove_logger(logger_name=logger_name) - - async def start_profile(self) -> None: - self.engine.start_profile() - - async def stop_profile(self) -> None: - self.engine.stop_profile() - - async def reset_mm_cache(self) -> None: - self.engine.reset_mm_cache() - - async def reset_prefix_cache(self, - device: Optional[Device] = None) -> None: - self.engine.reset_prefix_cache(device) - - async def sleep(self, level: int = 1) -> None: - await self.reset_prefix_cache() - self.engine.sleep(level) - - async def wake_up(self, tags: Optional[list[str]] = None) -> None: - self.engine.wake_up(tags) - - async def is_sleeping(self) -> bool: - return self.engine.is_sleeping() - - async def add_lora(self, lora_request: LoRARequest) -> bool: - return self.engine.add_lora(lora_request) - - async def collective_rpc(self, - method: str, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None): - """ - Perform a collective RPC call to the given path. - """ - return await self.engine.collective_rpc_async(method, timeout, args, - kwargs) - - -# TODO(v1): Remove this class proxy when V1 goes default. 
-if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: - from vllm.v1.engine.async_llm import AsyncLLM - - AsyncLLMEngine = AsyncLLM # type: ignore +AsyncLLMEngine = AsyncLLM # type: ignore diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 887e27710924..657190543269 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -11,7 +11,6 @@ from fastapi import FastAPI, Request, Response from vllm import envs -from vllm.engine.async_llm_engine import AsyncEngineDeadError from vllm.engine.multiprocessing import MQEngineDeadError from vllm.engine.protocol import EngineClient from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, @@ -155,7 +154,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: """ @app.exception_handler(RuntimeError) - @app.exception_handler(AsyncEngineDeadError) @app.exception_handler(MQEngineDeadError) @app.exception_handler(EngineDeadError) @app.exception_handler(EngineGenerateError) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2e4aa7f3d5a6..85706738835e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio -import atexit import gc import importlib import inspect @@ -17,7 +16,6 @@ from argparse import Namespace from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from contextlib import asynccontextmanager -from functools import partial from http import HTTPStatus from typing import Annotated, Any, Callable, Optional @@ -41,9 +39,6 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore -from vllm.engine.multiprocessing.client import MQLLMEngineClient -from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (load_chat_template, resolve_hf_chat_template, @@ -102,13 +97,10 @@ log_non_default_args, with_cancellation) from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager -from vllm.transformers_utils.config import ( - maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs, - get_open_zmq_ipc_path, is_valid_ipv6_address, - set_ulimit) + is_valid_ipv6_address, set_ulimit) from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -236,111 +228,8 @@ async def build_async_engine_client_from_engine_args( if async_llm: async_llm.shutdown() - # V0 AsyncLLM. - elif (MQLLMEngineClient.is_unsupported_config(vllm_config) - or disable_frontend_multiprocessing): - - engine_client: Optional[EngineClient] = None - try: - engine_client = AsyncLLMEngine.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, - disable_log_stats=engine_args.disable_log_stats) - yield engine_client - finally: - if engine_client and hasattr(engine_client, "shutdown"): - engine_client.shutdown() - - # V0MQLLMEngine. 
else: - if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: - # Make TemporaryDirectory for prometheus multiprocessing - # Note: global TemporaryDirectory will be automatically - # cleaned up upon exit. - global prometheus_multiproc_dir - prometheus_multiproc_dir = tempfile.TemporaryDirectory() - os.environ[ - "PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name - else: - logger.warning( - "Found PROMETHEUS_MULTIPROC_DIR was set by user. " - "This directory must be wiped between vLLM runs or " - "you will find inaccurate metrics. Unset the variable " - "and vLLM will properly handle cleanup.") - - # Select random path for IPC. - ipc_path = get_open_zmq_ipc_path() - logger.debug("Multiprocessing frontend to use %s for IPC Path.", - ipc_path) - - # Start RPCServer in separate process (holds the LLMEngine). - # the current process might have CUDA context, - # so we need to spawn a new process - context = multiprocessing.get_context("spawn") - - # Ensure we can serialize transformer config before spawning - maybe_register_config_serialize_by_value() - - # The Process can raise an exception during startup, which may - # not actually result in an exitcode being reported. As a result - # we use a shared variable to communicate the information. - engine_alive = multiprocessing.Value('b', True, lock=False) - engine_process = context.Process( - target=run_mp_engine, - args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path, - engine_args.disable_log_stats, - engine_args.enable_log_requests, engine_alive)) - engine_process.start() - engine_pid = engine_process.pid - assert engine_pid is not None, "Engine process failed to start." - logger.info("Started engine process with PID %d", engine_pid) - - def _cleanup_ipc_path(): - socket_path = ipc_path.replace("ipc://", "") - if os.path.exists(socket_path): - os.remove(socket_path) - - # Ensure we clean up the local IPC socket file on exit. - atexit.register(_cleanup_ipc_path) - - # Build RPCClient, which conforms to EngineClient Protocol. - build_client = partial(MQLLMEngineClient, ipc_path, vllm_config, - engine_pid) - mq_engine_client = await asyncio.get_running_loop().run_in_executor( - None, build_client) - try: - while True: - try: - await mq_engine_client.setup() - break - except TimeoutError: - if (not engine_process.is_alive() - or not engine_alive.value): - raise RuntimeError( - "Engine process failed to start. See stack " - "trace for the root cause.") from None - - yield mq_engine_client # type: ignore[misc] - finally: - # Ensure rpc server process was terminated - engine_process.terminate() - - # Close all open connections to the backend - mq_engine_client.close() - - # Wait for engine process to join - engine_process.join(4) - if engine_process.exitcode is None: - # Kill if taking longer than 5 seconds to stop - engine_process.kill() - - # Lazy import for prometheus multiprocessing. - # We need to set PROMETHEUS_MULTIPROC_DIR environment variable - # before prometheus_client is imported. 
- # See https://prometheus.github.io/client_python/multiprocess/ - from prometheus_client import multiprocess - multiprocess.mark_process_dead(engine_process.pid) + assert False async def validate_json_request(raw_request: Request): From 6c89e6248de1d42803d7fe18cda5acd7dd917038 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 16 Sep 2025 21:16:58 -0700 Subject: [PATCH 02/17] fix assert false Signed-off-by: Woosuk Kwon --- vllm/entrypoints/openai/api_server.py | 58 +++++++++++++-------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 85706738835e..361e9dbbb743 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -198,38 +198,34 @@ async def build_async_engine_client_from_engine_args( vllm_config = engine_args.create_engine_config(usage_context=usage_context) # V1 AsyncLLM. - if envs.VLLM_USE_V1: - if disable_frontend_multiprocessing: - logger.warning( - "V1 is enabled, but got --disable-frontend-multiprocessing. " - "To disable frontend multiprocessing, set VLLM_USE_V1=0.") - - from vllm.v1.engine.async_llm import AsyncLLM - async_llm: Optional[AsyncLLM] = None - client_count = client_config.pop( - "client_count") if client_config else 1 - client_index = client_config.pop( - "client_index") if client_config else 0 - try: - async_llm = AsyncLLM.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, - disable_log_stats=engine_args.disable_log_stats, - client_addresses=client_config, - client_count=client_count, - client_index=client_index) - - # Don't keep the dummy data in memory - await async_llm.reset_mm_cache() - - yield async_llm - finally: - if async_llm: - async_llm.shutdown() + assert envs.VLLM_USE_V1 - else: - assert False + if disable_frontend_multiprocessing: + logger.warning( + "V1 is enabled, but got --disable-frontend-multiprocessing. 
" + "To disable frontend multiprocessing, set VLLM_USE_V1=0.") + + from vllm.v1.engine.async_llm import AsyncLLM + async_llm: Optional[AsyncLLM] = None + client_count = client_config.pop("client_count") if client_config else 1 + client_index = client_config.pop("client_index") if client_config else 0 + try: + async_llm = AsyncLLM.from_vllm_config( + vllm_config=vllm_config, + usage_context=usage_context, + enable_log_requests=engine_args.enable_log_requests, + disable_log_stats=engine_args.disable_log_stats, + client_addresses=client_config, + client_count=client_count, + client_index=client_index) + + # Don't keep the dummy data in memory + await async_llm.reset_mm_cache() + + yield async_llm + finally: + if async_llm: + async_llm.shutdown() async def validate_json_request(raw_request: Request): From 51a326de9a35098548ef402166322d4f20c0c91b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 11:31:50 -0700 Subject: [PATCH 03/17] fix Signed-off-by: Woosuk Kwon --- tests/distributed/test_pipeline_parallel.py | 12 - tests/engine/conftest.py | 12 - tests/engine/test_computed_prefix_blocks.py | 37 - tests/engine/test_executor.py | 111 -- tests/engine/test_multiproc_workers.py | 179 --- tests/engine/test_options.py | 58 - tests/engine/test_short_mm_context.py | 1 + tests/engine/test_stop_checker.py | 225 ---- .../openai/correctness/test_lmeval.py | 10 - tests/v1/engine/test_output_processor.py | 1000 ----------------- 10 files changed, 1 insertion(+), 1644 deletions(-) delete mode 100644 tests/engine/conftest.py delete mode 100644 tests/engine/test_computed_prefix_blocks.py delete mode 100644 tests/engine/test_executor.py delete mode 100644 tests/engine/test_multiproc_workers.py delete mode 100644 tests/engine/test_options.py delete mode 100644 tests/engine/test_stop_checker.py delete mode 100644 tests/v1/engine/test_output_processor.py diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 9da9672d9597..76b105e8a8ec 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -26,18 +26,6 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - For PP, we fall back to V0 by default. This means - that the TP baseline runs with V1 while the PP engine - runs with V0. This gives divergent results with dummy - weights. Once we enable V1 by default for PP, we can - remove this. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - class ParallelSetup(NamedTuple): tp_size: int pp_size: int diff --git a/tests/engine/conftest.py b/tests/engine/conftest.py deleted file mode 100644 index 375b248ebeda..000000000000 --- a/tests/engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py deleted file mode 100644 index ac5a1f957dfe..000000000000 --- a/tests/engine/test_computed_prefix_blocks.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine -from vllm.sampling_params import SamplingParams - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -@pytest.mark.parametrize("block_size", [16]) -def test_computed_prefix_blocks(model: str, block_size: int): - # This test checks if we are able to run the engine to completion - # without triggering asserts. - # We are in a scenario where all blocks from the second request's prompt - # are full and already computed when the second request arrives. - prompt = ( - "You are a helpful assistant. How do I build a car from cardboard and " - "paper clips? Is there an easy to follow video tutorial available " - "online for free?") - prompt2 = ( - " Please recommend to me some resources where I can learn not only to " - "handle technical difficulties of building a car, but also " - "decoration.") - - engine_args = EngineArgs(model=model, - block_size=block_size, - enable_prefix_caching=True) - - engine = LLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams() - - engine.add_request("0", prompt + prompt2, sampling_params) - engine.step() - engine.add_request("1", prompt, sampling_params) - engine.step() diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py deleted file mode 100644 index 67064aff3ae9..000000000000 --- a/tests/engine/test_executor.py +++ /dev/null @@ -1,111 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -from typing import Any, Callable, Optional, Union - -import pytest - -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.llm_engine import LLMEngine -from vllm.executor.uniproc_executor import UniProcExecutor -from vllm.sampling_params import SamplingParams - - -class Mock: - ... - - -class CustomUniExecutor(UniProcExecutor): - - def collective_rpc(self, - method: Union[str, Callable], - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None) -> list[Any]: - # Drop marker to show that this was run - with open(".marker", "w"): - ... 
- return super().collective_rpc(method, timeout, args, kwargs) - - -CustomUniExecutorAsync = CustomUniExecutor - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor_type_checking(model): - with pytest.raises(ValueError): - engine_args = EngineArgs(model=model, - distributed_executor_backend=Mock) - LLMEngine.from_engine_args(engine_args) - with pytest.raises(ValueError): - engine_args = AsyncEngineArgs(model=model, - distributed_executor_backend=Mock) - AsyncLLMEngine.from_engine_args(engine_args) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor(model, tmp_path): - cwd = os.path.abspath(".") - os.chdir(tmp_path) - try: - assert not os.path.exists(".marker") - - engine_args = EngineArgs( - model=model, - distributed_executor_backend=CustomUniExecutor, - enforce_eager=True, # reduce test time - ) - engine = LLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams(max_tokens=1) - - engine.add_request("0", "foo", sampling_params) - engine.step() - - assert os.path.exists(".marker") - finally: - os.chdir(cwd) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor_async(model, tmp_path): - cwd = os.path.abspath(".") - os.chdir(tmp_path) - try: - assert not os.path.exists(".marker") - - engine_args = AsyncEngineArgs( - model=model, - distributed_executor_backend=CustomUniExecutorAsync, - enforce_eager=True, # reduce test time - ) - engine = AsyncLLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams(max_tokens=1) - - async def t(): - stream = await engine.add_request("0", "foo", sampling_params) - async for x in stream: - ... - - asyncio.run(t()) - - assert os.path.exists(".marker") - finally: - os.chdir(cwd) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_respect_ray(model): - # even for TP=1 and PP=1, - # if users specify ray, we should use ray. - # users might do this if they want to manage the - # resources using ray. 
- engine_args = EngineArgs( - model=model, - distributed_executor_backend="ray", - enforce_eager=True, # reduce test time - ) - engine = LLMEngine.from_engine_args(engine_args) - assert engine.model_executor.uses_ray diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py deleted file mode 100644 index b5381b61a020..000000000000 --- a/tests/engine/test_multiproc_workers.py +++ /dev/null @@ -1,179 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -from concurrent.futures import ThreadPoolExecutor -from functools import partial -from time import sleep -from typing import Any - -import pytest - -from vllm.config import VllmConfig -from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, - ResultHandler, WorkerMonitor) -from vllm.worker.worker_base import WorkerWrapperBase - - -class DummyWorkerWrapper(WorkerWrapperBase): - """Dummy version of vllm.worker.worker.Worker""" - - def worker_method(self, worker_input: Any) -> tuple[int, Any]: - sleep(0.05) - - if isinstance(worker_input, Exception): - # simulate error case - raise worker_input - - return self.rpc_rank, input - - -def _start_workers() -> tuple[list[ProcessWorkerWrapper], WorkerMonitor]: - result_handler = ResultHandler() - vllm_config = VllmConfig() - workers = [ - ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config, - rank) for rank in range(8) - ] - - worker_monitor = WorkerMonitor(workers, result_handler) - assert not worker_monitor.is_alive() - - result_handler.start() - worker_monitor.start() - assert worker_monitor.is_alive() - - return workers, worker_monitor - - -def test_local_workers() -> None: - """Test workers with sync task submission""" - - workers, worker_monitor = _start_workers() - - def execute_workers(worker_input: str) -> None: - worker_outputs = [ - worker.execute_method("worker_method", worker_input) - for worker in workers - ] - - for rank, output in enumerate(worker_outputs): - assert output.get() == (rank, input) - - executor = ThreadPoolExecutor(max_workers=4) - - # Test concurrent submission from different threads - futures = [ - executor.submit(partial(execute_workers, f"thread {thread_num}")) - for thread_num in range(4) - ] - - for future in futures: - future.result() - - # Test error case - exception = ValueError("fake error") - result = workers[0].execute_method("worker_method", exception) - try: - result.get() - pytest.fail("task should have failed") - except Exception as e: - assert isinstance(e, ValueError) - assert str(e) == "fake error" - - # Test cleanup when a worker fails - assert worker_monitor.is_alive() - workers[3].process.kill() - - # Other workers should get shut down here - worker_monitor.join(20) - - # Ensure everything is stopped - assert not worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = workers[0].execute_method("worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) - - -def test_local_workers_clean_shutdown() -> None: - """Test clean shutdown""" - - workers, worker_monitor = _start_workers() - - assert worker_monitor.is_alive() - assert all(worker.process.is_alive() for worker in workers) - - # Clean shutdown - worker_monitor.close() - - worker_monitor.join(20) - - # Ensure everything is stopped - assert not 
worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = workers[0].execute_method("worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) - - -@pytest.mark.asyncio -async def test_local_workers_async() -> None: - """Test local workers with async task submission""" - - workers, worker_monitor = _start_workers() - - async def execute_workers(worker_input: str) -> None: - worker_coros = [ - worker.execute_method_async("worker_method", worker_input) - for worker in workers - ] - - results = await asyncio.gather(*worker_coros) - for rank, result in enumerate(results): - assert result == (rank, input) - - tasks = [ - asyncio.create_task(execute_workers(f"task {task_num}")) - for task_num in range(4) - ] - - for task in tasks: - await task - - # Test error case - exception = ValueError("fake error") - try: - _result = await workers[0].execute_method_async( - "worker_method", exception) - pytest.fail("task should have failed") - except Exception as e: - assert isinstance(e, ValueError) - assert str(e) == "fake error" - - # Test cleanup when a worker fails - assert worker_monitor.is_alive() - workers[3].process.kill() - - # Other workers should get shut down here - worker_monitor.join(20) - - # Ensure everything is stopped - assert not worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = await workers[0].execute_method_async( - "worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) diff --git a/tests/engine/test_options.py b/tests/engine/test_options.py deleted file mode 100644 index 42e88e84770a..000000000000 --- a/tests/engine/test_options.py +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from contextlib import nullcontext - -import pytest - -from vllm.entrypoints.llm import LLM -from vllm.sampling_params import SamplingParams - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_skip_tokenizer_initialization(model: str): - # This test checks if the flag skip_tokenizer_init skips the initialization - # of tokenizer and detokenizer. The generated output is expected to contain - # token ids. 
- llm = LLM( - model=model, - skip_tokenizer_init=True, - enforce_eager=True, - ) - sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) - - with pytest.raises(ValueError, match="cannot pass text prompts when"): - llm.generate("abc", sampling_params) - - outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, - sampling_params=sampling_params) - assert len(outputs) > 0 - completions = outputs[0].outputs - assert len(completions) > 0 - assert completions[0].text == "" - assert completions[0].token_ids - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -@pytest.mark.parametrize("enable_prompt_embeds", [True, False]) -def test_enable_prompt_embeds(hf_runner, model: str, - enable_prompt_embeds: bool): - prompt = "abc" - - with hf_runner(model) as hf_model: - token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids - token_ids = token_ids.to(hf_model.model.device) - - embed_layer = hf_model.model.get_input_embeddings() - prompt_embeds = embed_layer(token_ids).squeeze(0) - - ctx = (nullcontext() if enable_prompt_embeds else pytest.raises( - ValueError, match="set `--enable-prompt-embeds`")) - - llm = LLM( - model=model, - enable_prompt_embeds=enable_prompt_embeds, - enforce_eager=True, - ) - - with ctx: - llm.generate({"prompt_embeds": prompt_embeds}) diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py index 9c62761d78af..9eb3dfc09224 100644 --- a/tests/engine/test_short_mm_context.py +++ b/tests/engine/test_short_mm_context.py @@ -25,6 +25,7 @@ def test_context_length_too_short(vllm_runner, image_assets, model): model, max_model_len=128, # LLaVA has a feature size of 576 enforce_eager=True, + load_format="dummy", ) with vllm_model: diff --git a/tests/engine/test_stop_checker.py b/tests/engine/test_stop_checker.py deleted file mode 100644 index 34f4cb13ab0a..000000000000 --- a/tests/engine/test_stop_checker.py +++ /dev/null @@ -1,225 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -from transformers import AutoTokenizer - -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.reasoning import ReasoningParser -from vllm.sampling_params import SamplingParams -from vllm.sequence import Sequence, SequenceStatus - -REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" - - -class MockReasoningParser(ReasoningParser): - """Mock reasoning parser for testing purposes.""" - - def __init__(self, - tokenizer: AutoTokenizer, - reasoning_active: bool = False): - super().__init__(tokenizer) - self.reasoning_active = reasoning_active - - def is_reasoning_end(self, input_ids: list[int]) -> bool: - return not self.reasoning_active - - def extract_content_ids(self, input_ids: list[int]) -> list[int]: - return input_ids - - -class MockSequence(Sequence): - """Mock sequence for testing purposes.""" - - def __init__(self, token_ids, output_text="test_output", eos_token_id=0): - self.token_ids = token_ids - self.output_text = output_text - self.eos_token_id = eos_token_id - self.status = SequenceStatus.RUNNING - self.stop_reason = None - - def get_token_ids(self): - return self.token_ids - - def get_last_token_id(self): - return self.token_ids[-1] if self.token_ids else None - - def get_len(self): - return len(self.token_ids) - - def get_output_len(self): - return len(self.token_ids) - 1 # Simulating prompt + outputs - - -@pytest.fixture -def deepseek_r1_qwen_tokenizer(): - return 
AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) - - -@pytest.fixture -def stop_checker(): - return StopChecker(max_model_len=10) - - -@pytest.fixture -def stop_checker_with_reasoner(): - reasoner = MockReasoningParser(deepseek_r1_qwen_tokenizer) - return StopChecker(max_model_len=10, reasoner=reasoner) - - -def test_eos_token_stopping(stop_checker): - """Test sequence stopping when EOS token is encountered.""" - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams() - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - - -def test_ignore_eos(stop_checker): - """Test sequence continuing when EOS token is ignored.""" - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams(ignore_eos=True) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.RUNNING - - -def test_min_tokens(stop_checker): - """Test min_tokens prevents early stopping.""" - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams(min_tokens=3) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.RUNNING - - -def test_stop_token_ids(stop_checker): - """Test sequence stopping with custom stop token IDs.""" - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(stop_token_ids=[3]) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.stop_reason == 3 - - -def test_stop_strings(stop_checker): - """Test sequence stopping with stop strings.""" - seq = MockSequence(token_ids=[1, 2, 3], - output_text="test output with STOP", - eos_token_id=0) - sampling_params = SamplingParams(stop=["STOP"]) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.stop_reason == "STOP" - assert "STOP" not in seq.output_text # Default behavior removes stop string - - -def test_include_stop_str_in_output(stop_checker): - """Test keeping stop strings in output.""" - seq = MockSequence(token_ids=[1, 2, 3], - output_text="test output with STOP", - eos_token_id=0) - sampling_params = SamplingParams(stop=["STOP"], - include_stop_str_in_output=True) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=5, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert "STOP" in seq.output_text - - -def test_max_tokens(stop_checker): - """Test sequence stopping at max_tokens.""" - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(max_tokens=2) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_LENGTH_CAPPED - - -def test_max_model_len(stop_checker): - """Test sequence stopping at max_model_len.""" - seq = MockSequence(token_ids=list(range(11)), - eos_token_id=0) # 11 tokens, max is 10 - sampling_params = SamplingParams() - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_LENGTH_CAPPED - - -def test_reasoning_skip_stops(stop_checker_with_reasoner): - """Test that 
stop tokens and strings are ignored during reasoning.""" - # Set reasoning_active to True to simulate being in reasoning mode - stop_checker_with_reasoner.reasoner.reasoning_active = True - - # Test with stop token - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(stop_token_ids=[3]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=1, sampling_params=sampling_params) - assert seq.status == SequenceStatus.RUNNING - - # Test with stop string - seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP") - sampling_params = SamplingParams(stop=["STOP"]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=4, sampling_params=sampling_params) - assert seq.status == SequenceStatus.RUNNING - - # But EOS token still stops the sequence - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams() - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=1, sampling_params=sampling_params) - assert seq.status == SequenceStatus.FINISHED_STOPPED - - -def test_reasoning_end_enables_stops(stop_checker_with_reasoner): - """Test that stop tokens work after reasoning ends.""" - # Set reasoning_active to False to simulate being out of reasoning mode - stop_checker_with_reasoner.reasoner.reasoning_active = False - - # Test with stop token - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(stop_token_ids=[3]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=1, sampling_params=sampling_params) - assert seq.status == SequenceStatus.FINISHED_STOPPED - - # Test with stop string - seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP") - sampling_params = SamplingParams(stop=["STOP"]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=4, sampling_params=sampling_params) - assert seq.status == SequenceStatus.FINISHED_STOPPED diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index 684407cd6ee9..624acd5ffde7 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -81,13 +81,3 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): more_args = ["--max-num-seqs", "64"] run_test(more_args) - - -@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, - more_args): - """Run with the V0 Engine.""" - - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - run_test(more_args) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py deleted file mode 100644 index a9632ce54eac..000000000000 --- a/tests/v1/engine/test_output_processor.py +++ /dev/null @@ -1,1000 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -import time -from typing import Optional - -import pytest - -from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST, - NUM_SAMPLE_LOGPROBS_UNDER_TEST, - STOP_STRINGS, - DummyOutputProcessorTestVectors, - MockEngineCore) -from vllm.outputs import CompletionOutput, RequestOutput -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.sequence import PromptLogprobs, SampleLogprobs -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.v1.engine import EngineCoreRequest -from 
vllm.v1.engine.output_processor import (OutputProcessor, - RequestOutputCollector) -from vllm.v1.metrics.stats import IterationStats - - -def _ref_convert_id_to_token( - tokenizer: AnyTokenizer, - token_id: int, -) -> str: - """Reference impl of logprobs detokenization. - - Args: - tokenizer: tokenizer used by the model under test - token_id: convert this token id - - Returns: - String representation of input token id - """ - return tokenizer.decode([token_id]) or "" - - -@pytest.mark.parametrize( - "request_output_kind", - [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -def test_incremental_detokenization(request_output_kind: RequestOutputKind, - dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, - log_stats=False) - engine_core = MockEngineCore( - tokens_list=dummy_test_vectors.generation_tokens) - - # Make N requests. - requests = [ - EngineCoreRequest(request_id=f"request-{idx}", - prompt_token_ids=prompt_tokens, - mm_features=None, - eos_token_id=None, - arrival_time=0, - lora_request=None, - cache_salt=None, - data_parallel_rank=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=request_output_kind, - stop=[], - include_stop_str_in_output=False, - ), - pooling_params=None) - for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) - ] - - # Add requests to the detokenizer. - for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): - output_processor.add_request(request, prompt) - - gen_strings = {} - gen_tokens = {} - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - processed_outputs = output_processor.process_outputs(outputs) - request_outputs = processed_outputs.request_outputs - requests_to_abort = processed_outputs.reqs_to_abort - assert len(requests_to_abort) == 0 - - # Update tracking. - for request_output in request_outputs: - request_id = request_output.request_id - new_text = request_output.outputs[0].text - new_tokens = request_output.outputs[0].token_ids - if request_id not in gen_strings: - gen_strings[request_id] = new_text - gen_tokens[request_id] = new_tokens - else: - gen_strings[request_id] += new_text - gen_tokens[request_id].extend(new_tokens) - - # Confirmed tracked values matches what we expected. 
- for idx, (ref_gen_str, ref_gen_toks) in enumerate( - zip(dummy_test_vectors.generation_strings, - dummy_test_vectors.generation_tokens)): - gen_str = gen_strings[f"request-{idx}"] - gen_toks = gen_tokens[f"request-{idx}"] - - assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" - assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" - - assert output_processor.get_num_unfinished_requests() == 0 - assert not output_processor.has_unfinished_requests() - - -def _validate_logprobs( - gen_tokens: dict[str, list[int]], - gen_logprobs: dict[str, Optional[SampleLogprobs]], - gen_prompt_logprobs: dict[str, Optional[PromptLogprobs]], - gen_cumulative_logprob: dict[str, float], - dtv: DummyOutputProcessorTestVectors, - request_id_list: list[str], - num_sample_logprobs: Optional[int], - num_prompt_logprobs: Optional[int], -) -> None: - for req_idx, req_id in enumerate(request_id_list): - new_tokens = gen_tokens[req_id] - logprobs = gen_logprobs[req_id] - prompt_logprobs = gen_prompt_logprobs[req_id] - cumulative_logprob = gen_cumulative_logprob[req_id] - prompt_token_ids = dtv.prompt_tokens[req_idx] - ref_logprobs = dtv.generation_logprobs[req_idx] - ref_prompt_logprobs = dtv.prompt_logprobs[req_idx] - if num_sample_logprobs is not None: - # Validate sample logprobs - assert logprobs is not None, (f"Request {req_id} requires sample" - " logprobs but sample logprobs are" - " None.") - # Require num sampled tokens to match num - # sampled logprobs - especially important - # to check since the detokenizer can cause - # a request to finish early due to a stop - # string being hit - num_new_tokens = len(new_tokens) - len_sample_logprobs = len(logprobs) - assert num_new_tokens == len_sample_logprobs, ( - f"Request {req_id} has {num_new_tokens}" - " completion tokens but has" - f" {len_sample_logprobs} sample logprobs.") - ref_cumulative_logprob = 0.0 - for idx, (sampled_token, - pos_logprob_dict) in enumerate(zip(new_tokens, - logprobs)): - # Break out the reference log probability value & - # logprob token id tensors associated with this - # position in the completion. Also break out the - # sampled token ranks - (ref_pos_logprob_toks, ref_pos_logprob_vals, - ref_sampled_token_rank) = ref_logprobs[idx] - # For each position in the completion sequence, - # ensure the actual sampled token is among the - # logprobs - assert sampled_token in pos_logprob_dict, ( - f"Sampled token {sampled_token} not" - f" present in logprob at index {idx}") - - # Validate number of sample logprobs - num_lp_toks = len(pos_logprob_dict) - assert (num_lp_toks == num_sample_logprobs - or num_lp_toks == num_sample_logprobs + - 1), ("Valid numbers of sample logprobs are" - f" {num_sample_logprobs} or" - f" {num_sample_logprobs+1} but" - f" {num_lp_toks} logprobs found at" - f" position {idx}. Logprobs dict:" - f" {pos_logprob_dict}") - - # Validate sampled token logprob rank - smp_lp = pos_logprob_dict[sampled_token] - smp_lp_rank = smp_lp.rank - assert (ref_sampled_token_rank == smp_lp_rank), ( - "Sampled token logprob rank" - f" {smp_lp_rank} does not match" - " correct value" - f" {ref_sampled_token_rank}" - f" in Logprob {smp_lp}") - - # Validate that the logprob processor yields - # the correct log probabilities and valid - # rankings - rank_one_appears = False - for jdx in range(1, len(ref_pos_logprob_toks)): - # Iterate over the (logprob val,logprob tok id) - # pairs expected by the test fixture at this - # position in the completion. 
- ref_lp_val = ref_pos_logprob_vals[jdx] - ref_tok_id = ref_pos_logprob_toks[jdx] - assert ref_tok_id in pos_logprob_dict, ( - f"Expected token {ref_tok_id} to be" - f" in logprob dict but it is not.") - - # Extract actually-generated logprob - # info - lp = pos_logprob_dict[ref_tok_id] - lp_val = lp.logprob - lp_rank = lp.rank - - # A "top" (rank 1) logprob must be - # present - rank_one_appears = (True - if lp_rank == 1 else rank_one_appears) - - # Rank must be >= 1 - assert lp_rank >= 1, (f"Logprob {lp} has invalid" - f" rank {lp_rank} < 1." - f" Logprob dict: {pos_logprob_dict}") - - # Validate log probability - assert math.isclose(lp_val, ref_lp_val), ( - f"Token id {ref_tok_id} appears in logprobs dict" - f" at position {idx} in completion with log" - f" probability {lp_val} but {ref_lp_val} was" - f" expected. Logprob: {lp}") - - assert rank_one_appears, (f"No Logprob has rank 1" - " in the following Logprob" - f" dict: {pos_logprob_dict}") - - # Validate logprobs detokenization - for lp_tok in pos_logprob_dict: - # Confirm that sample logprob decoded token matches - # the logprob token id at this sequence position - decoded_token = pos_logprob_dict[lp_tok].decoded_token - ref_decoded_token = _ref_convert_id_to_token( - dtv.tokenizer, lp_tok) - assert decoded_token == ref_decoded_token, ( - f"Sampled logprob token id {lp_tok} decodes to" - f" {ref_decoded_token} but Logprob decoded" - f" token is {decoded_token} instead" - f" (at position {idx})") - - ref_cumulative_logprob += pos_logprob_dict[ - sampled_token].logprob - # Assert that cumulative logprobs are correct - assert math.isclose(cumulative_logprob, ref_cumulative_logprob) - else: - # Sample logprobs disabled for this request - assert logprobs is None - assert cumulative_logprob is None - - if num_prompt_logprobs is not None: - # Validate prompt logprobs - assert prompt_logprobs is not None, ( - f"Request {req_id} requires prompt" - " logprobs but prompt logprobs are" - " None.") - # Require num prompt tokens to match num - # prompt logprobs - num_prompt_tokens = len(prompt_token_ids) - len_prompt_logprobs = len(prompt_logprobs) - assert num_prompt_tokens == len_prompt_logprobs, ( - f"Request {req_id} has {num_prompt_tokens}" - " prompt tokens but has" - f" {len_prompt_logprobs} prompt logprobs.") - # First prompt logprob is None - first_plp_dict = prompt_logprobs[0] - assert first_plp_dict is None, ( - f"Request {req_id} first prompt logprob" - f" should be None but has following value" - f" instead: {first_plp_dict}") - # Break out the reference prompt log prob value & - # logprob token id matrices for the whole prompt. - # Also break out the prompt token rank vector - (ref_prompt_logprob_toks, ref_prompt_logprob_vals, - ref_prompt_token_ranks) = ref_prompt_logprobs - for idx, (prompt_token, pos_logprob_dict) in enumerate( - zip(prompt_token_ids[1:], prompt_logprobs[1:])): - - # Break out the reference prompt log prob value - # vector, prompt logprob token id vector, and - # prompt token rank at the current position. 
- (ref_pos_prompt_logprob_toks, ref_pos_prompt_logprob_vals, - ref_pos_prompt_token_rank) = (ref_prompt_logprob_toks[idx, :], - ref_prompt_logprob_vals[idx, :], - ref_prompt_token_ranks[idx]) - - # For each position in the prompt sequence, - # ensure the actual prompt token is among the - # logprobs - assert prompt_token in pos_logprob_dict, ( - f"Prompt token {prompt_token} not" - f" present in logprob at index {idx}") - # Validate number of prompt logprobs - num_plp_toks = len(pos_logprob_dict) - assert (num_plp_toks == num_prompt_logprobs - or num_plp_toks == num_prompt_logprobs + - 1), ("Valid numbers of prompt logprobs are" - f" {num_prompt_logprobs} or" - f" {num_prompt_logprobs+1} but" - f" {num_plp_toks} logprobs found at" - f" position {idx}. Logprobs dict:" - f" {pos_logprob_dict}") - - # Validate prompt token logprob rank - prmpt_tok_lp = pos_logprob_dict[prompt_token] - prmpt_tok_lp_rank = prmpt_tok_lp.rank - ref_prmpt_tok_lp_rank = ref_pos_prompt_token_rank - assert (ref_prmpt_tok_lp_rank == prmpt_tok_lp_rank), ( - "Prompt token logprob rank" - f" {prmpt_tok_lp_rank} does not match" - " correct value" - f" {ref_prmpt_tok_lp_rank}" - f" in Logprob {prmpt_tok_lp}") - - # Validate that the logprob processor yields - # the correct prompt log probs and valid - # rankings - rank_one_appears = False - for jdx in range(1, len(ref_pos_prompt_logprob_toks)): - # Iterate over the (logprob val,logprob tok id) - # pairs expected by the test fixture at this - # position in the completion. - ref_plp_val = float(ref_pos_prompt_logprob_vals[jdx]) - ref_tok_id = int(ref_pos_prompt_logprob_toks[jdx]) - assert ref_tok_id in pos_logprob_dict, ( - f"Expected token {ref_tok_id} to be" - f" in logprob dict but it is not.") - - # Extract actually-generated logprob - # info - plp = pos_logprob_dict[ref_tok_id] - plp_val = plp.logprob - plp_rank = plp.rank - - # A "top" (rank 1) logprob must be - # present - rank_one_appears = (True - if plp_rank == 1 else rank_one_appears) - - # Rank must be >= 1 - assert plp_rank >= 1, ( - f"Logprob {plp} has invalid" - f" rank {plp_rank} < 1." - f" Logprob dict: {pos_logprob_dict}") - - # Validate log probability - assert math.isclose(plp_val, ref_plp_val), ( - f"Token id {ref_tok_id} appears in logprobs dict" - f" at position {idx} in completion with log" - f" probability {plp_val} but {ref_plp_val} was" - f" expected. 
Logprob: {plp}") - - assert rank_one_appears, (f"No Logprob has rank 1" - " in the following Logprob" - f" dict: {pos_logprob_dict}") - - # Validate prompt logprob detokenization - for plp_tok in pos_logprob_dict: - # Confirm that prompt logprob decoded token matches - # the logprob token id at this sequence position - decoded_token = pos_logprob_dict[plp_tok].decoded_token - ref_decoded_token = _ref_convert_id_to_token( - dtv.tokenizer, plp_tok) - assert decoded_token == ref_decoded_token, ( - f"Prompt logprob token id {plp_tok} decodes to" - f" {ref_decoded_token} but Logprob decoded" - f" token is {decoded_token} instead" - f" (at position {idx})") - else: - # Prompt logprobs disabled for this request - assert prompt_logprobs is None - - -@pytest.mark.parametrize( - "request_output_kind", - [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -@pytest.mark.parametrize("num_sample_logprobs", - [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST]) -@pytest.mark.parametrize("num_prompt_logprobs", - [None, NUM_PROMPT_LOGPROBS_UNDER_TEST]) -def test_logprobs_processor(request_output_kind: RequestOutputKind, - num_sample_logprobs: Optional[int], - num_prompt_logprobs: Optional[int], - dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, - log_stats=False) - engine_core = MockEngineCore( - tokens_list=dummy_test_vectors.generation_tokens, - generated_logprobs_raw=None if num_sample_logprobs is None else - dummy_test_vectors.generation_logprobs, - prompt_logprobs_raw=None - if num_prompt_logprobs is None else dummy_test_vectors.prompt_logprobs) - - # Make N requests. - request_id_list = [ - f"request-{idx}" - for idx in range(len(dummy_test_vectors.prompt_strings)) - ] - requests = [ - EngineCoreRequest(request_id=request_id_list[idx], - prompt_token_ids=prompt_tokens, - mm_features=None, - eos_token_id=None, - arrival_time=0, - lora_request=None, - cache_salt=None, - data_parallel_rank=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=request_output_kind, - stop=[], - include_stop_str_in_output=False, - logprobs=num_sample_logprobs, - prompt_logprobs=num_prompt_logprobs, - ), - pooling_params=None) - for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) - ] - - # Add requests to the detokenizer. - for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): - output_processor.add_request(request, prompt) - - gen_tokens = {} - gen_logprobs = {} - gen_prompt_logprobs = {} - gen_cumulative_logprobs = {} - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the logprobs processor. - processed_outputs = output_processor.process_outputs(outputs) - request_outputs = processed_outputs.request_outputs - requests_to_abort = processed_outputs.reqs_to_abort - assert len(requests_to_abort) == 0 - - # Update tracking. 
- for request_output in request_outputs: - request_id = request_output.request_id - new_tokens = request_output.outputs[0].token_ids - prompt_logprobs = request_output.prompt_logprobs - logprobs = request_output.outputs[0].logprobs - gen_cumulative_logprobs[request_id] = request_output.outputs[ - 0].cumulative_logprob - if request_id not in gen_logprobs: - # Start tracking sample and prompt logprobs for this request - gen_tokens[request_id] = new_tokens - gen_logprobs[request_id] = logprobs - gen_prompt_logprobs[request_id] = prompt_logprobs - else: - # Extend logprobs tracker - gen_tokens[request_id].extend(new_tokens) - lp = gen_logprobs[request_id] - plp = gen_prompt_logprobs[request_id] - if lp: - lp.extend(logprobs) - if plp: - plp.extend(prompt_logprobs) - - # Confirmed tracked logprobs match what we expect - _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs, - gen_cumulative_logprobs, dummy_test_vectors, - request_id_list, num_sample_logprobs, - num_prompt_logprobs) - - assert output_processor.get_num_unfinished_requests() == 0 - assert not output_processor.has_unfinished_requests() - - -@pytest.mark.parametrize( - "include_stop_str_in_output,stop_token_type,ignore_eos,num_sample_logprobs", - [(False, "stop_token_ids", False, None), - (True, "stop_token_ids", False, None), - (False, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST), - (True, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST), - (False, "eos_token_id", False, None), (True, "eos_token_id", False, None), - (False, "eos_token_id", True, None)]) -def test_stop_token(include_stop_str_in_output: bool, - num_sample_logprobs: Optional[int], stop_token_type: str, - ignore_eos: bool, dummy_test_vectors): - """Test output processor EOS/stop token handling. - - Send mock engine core request to mock engine core and pass core outputs - to output processor. Validate output processor tokens, text and - (if enabled) sample logprobs. Batch-size one. - - The test emulates a scenario where a model outputs text tokens followed - by two identical control tokens: - ... - - If EOS is under test, the control tokens are EOS; otherwise, they are - some other token id. - - Test behavior: - - * If EOS is under test and `ignore_eos=True`, the detokenized string - should be ... and the finish - reason should be "length" (i.e. no stop occurs) - - * else, if `include_stop_str_in_output==True`, the detokenized - string should be ... and the finish - reason should be "stop" (i.e. first control token causes stop - and is represented in output text) - - * else, the detokenized string should be - ... and the finish reason should be "stop" - (i.e. first control token causes stop but is not represented - in output text.) - - Note: some test details are tuned for meta-llama/Llama-3.2-1B, - another model should work only if the test is modified. 
- - Args: - include_stop_str_in_output: stop token str appears in output text - num_sample_logprobs: number of sample logprobs (`None` for no logprobs) - stop_token_type: "eos_token_id" for EOS, "stop_token_ids" for stop token - ignore_eos: if True, EOS stops are disabled - dummy_test_vectors: dummy engine core outputs and other data structures - """ - model_id = dummy_test_vectors.tokenizer.name_or_path - if model_id != 'meta-llama/Llama-3.2-1B': - raise AssertionError("Test requires meta-llama/Llama-3.2-1B but " - f"{model_id} is in use.") - do_logprobs = num_sample_logprobs is not None - # EOS under test; if False, stop_token_ids under test - is_eos_test = stop_token_type == "eos_token_id" - # EOS under test but ignore_eos enabled - is_eos_ignore_test = is_eos_test and ignore_eos - eos_token_id = ( - dummy_test_vectors.tokenizer.eos_token_id if is_eos_test else None - ) # '<|end_of_text|>' - stop_token_ids = [128009] if not is_eos_test else None # '<|eot_id|>' - - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, - log_stats=False) - # Dummy engine core outputs, with control tokens suffixed to test stops - suffix_token = ([eos_token_id] if is_eos_test else stop_token_ids) - assert suffix_token is not None and isinstance(suffix_token[0], int) - generation_string = dummy_test_vectors.generation_strings[0] - generation_tokens = (dummy_test_vectors.generation_tokens[0] + - 2 * suffix_token) - if do_logprobs: - generation_logprobs = ( - dummy_test_vectors.generation_logprobs[0] + - 2 * [dummy_test_vectors.generation_logprobs[0][-1]]) - prompt_string = dummy_test_vectors.prompt_strings[0] - prompt_tokens = dummy_test_vectors.prompt_tokens[0] - engine_core = MockEngineCore( - tokens_list=[generation_tokens], - generated_logprobs_raw=[generation_logprobs] if do_logprobs else None, - prompt_logprobs_raw=None, - eos_token_id=eos_token_id, - stop_token_ids=stop_token_ids, - ignore_eos=ignore_eos) - - # Make request. - request_id = "request-0" - request = EngineCoreRequest( - request_id=request_id, - prompt_token_ids=prompt_tokens, - mm_features=None, - eos_token_id=eos_token_id, - arrival_time=0, - lora_request=None, - cache_salt=None, - data_parallel_rank=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=[], - stop_token_ids=stop_token_ids, - include_stop_str_in_output=include_stop_str_in_output, - logprobs=num_sample_logprobs, - prompt_logprobs=None, - ignore_eos=ignore_eos, - ), - pooling_params=None) - - # Add request to the detokenizer. - output_processor.add_request(request, prompt_string) - - # Loop over engine core steps; run output processor - gen_string = "" - gen_tokens = [] - gen_logprobs = [] - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - processed_outputs = output_processor.process_outputs(outputs) - request_outputs = processed_outputs.request_outputs - assert len(request_outputs) == 1 - # Stop token does not rely on abort - assert not processed_outputs.reqs_to_abort - - # Update tracking. 
- request_output = request_outputs[0] - if request_output.finished: - finish_reason = ("length" if is_eos_ignore_test else "stop") - assert request_output.outputs[0].finish_reason == finish_reason - - gen_string += request_output.outputs[0].text - gen_tokens.extend(request_output.outputs[0].token_ids) - if do_logprobs: - gen_logprobs.extend(request_output.outputs[0].logprobs) - - # Validate generated text - control_token = '<|end_of_text|>' if is_eos_test else '<|eot_id|>' - if is_eos_ignore_test: - # Length-based stop; expect full string - ref_str = generation_string + 2 * control_token - elif include_stop_str_in_output: - # Stop token triggered; include in output - ref_str = generation_string + control_token - else: - # Stop token triggered but not in output - ref_str = generation_string - assert gen_string == ref_str, (f"{gen_string=}, {ref_str=}") - - if do_logprobs: - # Validate number of sample logprobs - num_tokens = len(gen_tokens) - num_logprobs = len(gen_logprobs) - assert num_tokens == num_logprobs, ( - f"Token count ({num_tokens}) != logprobs count ({num_logprobs})") - - # Check requests are finished - assert output_processor.get_num_unfinished_requests() == 0 - assert not output_processor.has_unfinished_requests() - - -@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -@pytest.mark.parametrize("num_sample_logprobs", - [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST]) -def test_stop_string(include_stop_str_in_output: bool, - num_sample_logprobs: Optional[int], dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, - log_stats=False) - engine_core = MockEngineCore( - tokens_list=dummy_test_vectors.generation_tokens, - generated_logprobs_raw=dummy_test_vectors.generation_logprobs - if num_sample_logprobs else None, - prompt_logprobs_raw=None) - - # Make N requests. - request_id_list = [ - f"request-{idx}" - for idx in range(len(dummy_test_vectors.prompt_strings)) - ] - requests = [ - EngineCoreRequest( - request_id=request_id_list[idx], - prompt_token_ids=prompt_tokens, - mm_features=None, - eos_token_id=None, - arrival_time=0, - lora_request=None, - cache_salt=None, - data_parallel_rank=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=STOP_STRINGS, - include_stop_str_in_output=include_stop_str_in_output, - logprobs=num_sample_logprobs, - prompt_logprobs=None, - ), - pooling_params=None) - for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) - ] - - # Add requests to the detokenizer. - for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): - output_processor.add_request(request, prompt) - - gen_strings = {} - gen_tokens = {} - gen_logprobs = {} - gen_prompt_logprobs = {} - gen_cumulative_logprobs = {} - aborted = [] - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - processed_outputs = output_processor.process_outputs(outputs) - request_outputs = processed_outputs.request_outputs - requests_to_abort = processed_outputs.reqs_to_abort - for request_output in request_outputs: - # If aborted, we should not get a request output. - assert request_output.request_id not in aborted - aborted.extend(requests_to_abort) - - # Update tracking. 
- for request_output in request_outputs: - if request_output.finished: - assert request_output.outputs[0].finish_reason == "stop" - - request_id = request_output.request_id - new_text = request_output.outputs[0].text - new_tokens = request_output.outputs[0].token_ids - prompt_logprobs = request_output.prompt_logprobs - logprobs = request_output.outputs[0].logprobs - gen_cumulative_logprobs[request_id] = request_output.outputs[ - 0].cumulative_logprob - if request_id not in gen_strings: - gen_strings[request_id] = new_text - gen_tokens[request_id] = new_tokens - gen_logprobs[request_id] = logprobs - gen_prompt_logprobs[request_id] = prompt_logprobs - else: - gen_strings[request_id] += new_text - gen_tokens[request_id].extend(new_tokens) - lp = gen_logprobs[request_id] - plp = gen_prompt_logprobs[request_id] - if lp: - lp.extend(logprobs) - if plp: - plp.extend(prompt_logprobs) - - # Confirmed tracked values matches what we expected. - for idx, (ref_gen_str, stop_str) in enumerate( - zip(dummy_test_vectors.generation_strings, STOP_STRINGS)): - - # Request should be aborted. - request_id = f"request-{idx}" - assert request_id in aborted - - # Collected values that were generated. - gen_str = gen_strings[request_id] - - # Construct reference strings. - stop_str_idx = ref_gen_str.find(stop_str) - ref_str_exc_stop = ref_gen_str[:stop_str_idx] - ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str - - if include_stop_str_in_output: - assert gen_str == ref_str_inc_stop, ( - f"{gen_str=}, {ref_str_inc_stop=}") - else: - assert gen_str == ref_str_exc_stop, ( - f"{gen_str=}, {ref_str_exc_stop=}") - - # Confirmed tracked logprobs match what we expect - _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs, - gen_cumulative_logprobs, dummy_test_vectors, - request_id_list, num_sample_logprobs, None) - - assert output_processor.get_num_unfinished_requests() == 0 - assert not output_processor.has_unfinished_requests() - - -def test_iteration_stats(dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, - log_stats=True) - engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) - engine_core_timestamp = time.monotonic() - - # Make N requests. - requests = [ - EngineCoreRequest( - request_id=f"request-{idx}", - prompt_token_ids=prompt_tokens, - mm_features=None, - eos_token_id=None, - arrival_time=0, - lora_request=None, - cache_salt=None, - data_parallel_rank=None, - sampling_params=SamplingParams(), - pooling_params=None, - ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) - ] - - # Add all requests except one to the OutputProcessor. - num_active = len(dummy_test_vectors.generation_tokens) - 1 - for request in requests[:num_active]: - output_processor.add_request(request, None) - inactive_request = requests[num_active] - - # First iteration has 2 prefills. - outputs = engine_core.get_outputs()[:num_active] - iteration_stats = IterationStats() - output_processor.process_outputs(outputs, engine_core_timestamp, - iteration_stats) - total_prompt_tokens = sum([ - len(prompt_tokens) - for prompt_tokens in dummy_test_vectors.prompt_tokens[:num_active] - ]) - - assert iteration_stats.num_prompt_tokens == total_prompt_tokens - assert iteration_stats.num_generation_tokens == num_active - - # Just decodes in this step. 
- outputs = engine_core.get_outputs()[:num_active] - iteration_stats = IterationStats() - output_processor.process_outputs(outputs, engine_core_timestamp, - iteration_stats) - - assert iteration_stats.num_prompt_tokens == 0 - assert iteration_stats.num_generation_tokens == num_active - - # Add a new request - prefill and 2 decodes in this step. - output_processor.add_request(inactive_request, None) - num_active += 1 - outputs = engine_core.get_outputs()[:num_active] - iteration_stats = IterationStats() - output_processor.process_outputs(outputs, engine_core_timestamp, - iteration_stats) - total_prompt_tokens = len(dummy_test_vectors.prompt_tokens[num_active - 1]) - - assert iteration_stats.num_prompt_tokens == total_prompt_tokens - assert iteration_stats.num_generation_tokens == num_active - - # Just decodes in this step. - outputs = engine_core.get_outputs()[:num_active] - iteration_stats = IterationStats() - output_processor.process_outputs(outputs, engine_core_timestamp, - iteration_stats) - - assert iteration_stats.num_prompt_tokens == 0 - assert iteration_stats.num_generation_tokens == num_active - - -@pytest.mark.asyncio -async def test_request_output_collector(): - NUM_REQS = 3 - TEXT = "a" - - def make_outputs() -> list[RequestOutput]: - return [ - RequestOutput( - request_id="my-request-id", - prompt=None, - prompt_token_ids=[1, 2, 3], - prompt_logprobs=None, - outputs=[ - CompletionOutput( - index=0, - text=TEXT, - token_ids=[idx], - cumulative_logprob=(idx + 1 * 1.0), - logprobs=[{ - "a": idx, - "b": idx - }], - finish_reason="length" if - (idx == NUM_REQS - 1) else None, - ) - ], - finished=(idx == NUM_REQS - 1), - ) for idx in range(NUM_REQS) - ] - - collector = RequestOutputCollector(RequestOutputKind.DELTA) - - # CASE 1: Put then get. - outputs = make_outputs() - collector.put(outputs[0]) - output = await collector.get() - assert not collector.ready.is_set() - assert collector.output is None - assert output.outputs[0].text == "a" - assert output.outputs[0].token_ids == [0] - - # CASE 2: 2 puts then get. - num_to_put = 2 - outputs = make_outputs() - for i in range(num_to_put): - collector.put(outputs[i]) - output = await collector.get() - assert not collector.ready.is_set() - assert collector.output is None - - assert not output.finished - # Text, token_ids, and logprobs should get merged. - assert output.outputs[0].text == TEXT * num_to_put - for tok_0, tok_1 in zip(output.outputs[0].token_ids, - list(range(num_to_put))): - assert tok_0 == tok_1 - assert len(output.outputs[0].logprobs) == num_to_put - - # Cumulative logprobs should be the last one. - cumulative_logprob_expected = 1.0 * num_to_put - assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected - - # CASE 3: Put all 3 (including a finished). - num_to_put = 3 - outputs = make_outputs() - for i in range(num_to_put): - collector.put(outputs[i]) - output = await collector.get() - assert not collector.ready.is_set() - assert collector.output is None - - assert output.finished - assert output.outputs[0].finish_reason == "length" - # Text, token_ids, and logprobs should get merged. - assert output.outputs[0].text == TEXT * num_to_put - for tok_0, tok_1 in zip(output.outputs[0].token_ids, - list(range(num_to_put))): - assert tok_0 == tok_1 - assert len(output.outputs[0].logprobs) == num_to_put - - # Cumulative logprobs should be the last one. 
- cumulative_logprob_expected = 1.0 * num_to_put - assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected - - -@pytest.mark.asyncio -async def test_cumulative_output_collector_n(): - """Test collector correctly handles multiple outputs by index.""" - collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE) - outputs = [ - RequestOutput( - request_id="my-request-id", - prompt=None, - prompt_token_ids=[1, 2, 3], - prompt_logprobs=None, - outputs=[ - CompletionOutput( - index=0, - text="a", - token_ids=[0], - cumulative_logprob=None, - logprobs=None, - finish_reason=None, - ), - CompletionOutput( - index=1, - text="b", - token_ids=[1], - cumulative_logprob=None, - logprobs=None, - finish_reason=None, - ), - ], - finished=False, - ), - RequestOutput( - request_id="my-request-id", - prompt=None, - prompt_token_ids=[1, 2, 3], - prompt_logprobs=None, - outputs=[ - CompletionOutput( - index=0, - text="ab", - token_ids=[0, 1], - cumulative_logprob=None, - logprobs=None, - finish_reason=None, - ), - CompletionOutput( - index=2, - text="c", - token_ids=[2], - cumulative_logprob=None, - logprobs=None, - finish_reason=None, - ), - ], - finished=False, - ), - ] - for output in outputs: - collector.put(output) - - # Get the output and check that the text and token_ids are correct. - result = await collector.get() - # We are expecting - # [{index: 0, text: "ab"}, {index: 1, text: "b"}, {index: 2, text: "c"}] - assert len(result.outputs) == 3 - # First is the one where index is 0 - first = [k for k in result.outputs if k.index == 0] - assert len(first) == 1 - assert first[0].text == "ab" - - # Second is the one where index is 1 - second = [k for k in result.outputs if k.index == 1] - assert len(second) == 1 - assert second[0].text == "b" - assert second[0].token_ids == [1] - - # Third is the one where index is 2 - third = [k for k in result.outputs if k.index == 2] - assert len(third) == 1 - assert third[0].text == "c" From 8c2eb56bff4ef0da36d695dcb4f0247a212c3f5e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 11:33:04 -0700 Subject: [PATCH 04/17] revert Signed-off-by: Woosuk Kwon --- tests/v1/engine/test_output_processor.py | 1000 ++++++++++++++++++++++ 1 file changed, 1000 insertions(+) create mode 100644 tests/v1/engine/test_output_processor.py diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py new file mode 100644 index 000000000000..a9632ce54eac --- /dev/null +++ b/tests/v1/engine/test_output_processor.py @@ -0,0 +1,1000 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +import time +from typing import Optional + +import pytest + +from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST, + NUM_SAMPLE_LOGPROBS_UNDER_TEST, + STOP_STRINGS, + DummyOutputProcessorTestVectors, + MockEngineCore) +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.sequence import PromptLogprobs, SampleLogprobs +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.output_processor import (OutputProcessor, + RequestOutputCollector) +from vllm.v1.metrics.stats import IterationStats + + +def _ref_convert_id_to_token( + tokenizer: AnyTokenizer, + token_id: int, +) -> str: + """Reference impl of logprobs detokenization. 
+ + Args: + tokenizer: tokenizer used by the model under test + token_id: convert this token id + + Returns: + String representation of input token id + """ + return tokenizer.decode([token_id]) or "" + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +def test_incremental_detokenization(request_output_kind: RequestOutputKind, + dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, + log_stats=False) + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens) + + # Make N requests. + requests = [ + EngineCoreRequest(request_id=f"request-{idx}", + prompt_token_ids=prompt_tokens, + mm_features=None, + eos_token_id=None, + arrival_time=0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False, + ), + pooling_params=None) + for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) + ] + + # Add requests to the detokenizer. + for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): + output_processor.add_request(request, prompt) + + gen_strings = {} + gen_tokens = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort + assert len(requests_to_abort) == 0 + + # Update tracking. + for request_output in request_outputs: + request_id = request_output.request_id + new_text = request_output.outputs[0].text + new_tokens = request_output.outputs[0].token_ids + if request_id not in gen_strings: + gen_strings[request_id] = new_text + gen_tokens[request_id] = new_tokens + else: + gen_strings[request_id] += new_text + gen_tokens[request_id].extend(new_tokens) + + # Confirmed tracked values matches what we expected. 
+ for idx, (ref_gen_str, ref_gen_toks) in enumerate( + zip(dummy_test_vectors.generation_strings, + dummy_test_vectors.generation_tokens)): + gen_str = gen_strings[f"request-{idx}"] + gen_toks = gen_tokens[f"request-{idx}"] + + assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" + assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" + + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + +def _validate_logprobs( + gen_tokens: dict[str, list[int]], + gen_logprobs: dict[str, Optional[SampleLogprobs]], + gen_prompt_logprobs: dict[str, Optional[PromptLogprobs]], + gen_cumulative_logprob: dict[str, float], + dtv: DummyOutputProcessorTestVectors, + request_id_list: list[str], + num_sample_logprobs: Optional[int], + num_prompt_logprobs: Optional[int], +) -> None: + for req_idx, req_id in enumerate(request_id_list): + new_tokens = gen_tokens[req_id] + logprobs = gen_logprobs[req_id] + prompt_logprobs = gen_prompt_logprobs[req_id] + cumulative_logprob = gen_cumulative_logprob[req_id] + prompt_token_ids = dtv.prompt_tokens[req_idx] + ref_logprobs = dtv.generation_logprobs[req_idx] + ref_prompt_logprobs = dtv.prompt_logprobs[req_idx] + if num_sample_logprobs is not None: + # Validate sample logprobs + assert logprobs is not None, (f"Request {req_id} requires sample" + " logprobs but sample logprobs are" + " None.") + # Require num sampled tokens to match num + # sampled logprobs - especially important + # to check since the detokenizer can cause + # a request to finish early due to a stop + # string being hit + num_new_tokens = len(new_tokens) + len_sample_logprobs = len(logprobs) + assert num_new_tokens == len_sample_logprobs, ( + f"Request {req_id} has {num_new_tokens}" + " completion tokens but has" + f" {len_sample_logprobs} sample logprobs.") + ref_cumulative_logprob = 0.0 + for idx, (sampled_token, + pos_logprob_dict) in enumerate(zip(new_tokens, + logprobs)): + # Break out the reference log probability value & + # logprob token id tensors associated with this + # position in the completion. Also break out the + # sampled token ranks + (ref_pos_logprob_toks, ref_pos_logprob_vals, + ref_sampled_token_rank) = ref_logprobs[idx] + # For each position in the completion sequence, + # ensure the actual sampled token is among the + # logprobs + assert sampled_token in pos_logprob_dict, ( + f"Sampled token {sampled_token} not" + f" present in logprob at index {idx}") + + # Validate number of sample logprobs + num_lp_toks = len(pos_logprob_dict) + assert (num_lp_toks == num_sample_logprobs + or num_lp_toks == num_sample_logprobs + + 1), ("Valid numbers of sample logprobs are" + f" {num_sample_logprobs} or" + f" {num_sample_logprobs+1} but" + f" {num_lp_toks} logprobs found at" + f" position {idx}. Logprobs dict:" + f" {pos_logprob_dict}") + + # Validate sampled token logprob rank + smp_lp = pos_logprob_dict[sampled_token] + smp_lp_rank = smp_lp.rank + assert (ref_sampled_token_rank == smp_lp_rank), ( + "Sampled token logprob rank" + f" {smp_lp_rank} does not match" + " correct value" + f" {ref_sampled_token_rank}" + f" in Logprob {smp_lp}") + + # Validate that the logprob processor yields + # the correct log probabilities and valid + # rankings + rank_one_appears = False + for jdx in range(1, len(ref_pos_logprob_toks)): + # Iterate over the (logprob val,logprob tok id) + # pairs expected by the test fixture at this + # position in the completion. 
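+                    # (The checks below confirm each expected token is present,
+                    # carries a rank >= 1, and matches the reference logprob.)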
+ ref_lp_val = ref_pos_logprob_vals[jdx] + ref_tok_id = ref_pos_logprob_toks[jdx] + assert ref_tok_id in pos_logprob_dict, ( + f"Expected token {ref_tok_id} to be" + f" in logprob dict but it is not.") + + # Extract actually-generated logprob + # info + lp = pos_logprob_dict[ref_tok_id] + lp_val = lp.logprob + lp_rank = lp.rank + + # A "top" (rank 1) logprob must be + # present + rank_one_appears = (True + if lp_rank == 1 else rank_one_appears) + + # Rank must be >= 1 + assert lp_rank >= 1, (f"Logprob {lp} has invalid" + f" rank {lp_rank} < 1." + f" Logprob dict: {pos_logprob_dict}") + + # Validate log probability + assert math.isclose(lp_val, ref_lp_val), ( + f"Token id {ref_tok_id} appears in logprobs dict" + f" at position {idx} in completion with log" + f" probability {lp_val} but {ref_lp_val} was" + f" expected. Logprob: {lp}") + + assert rank_one_appears, (f"No Logprob has rank 1" + " in the following Logprob" + f" dict: {pos_logprob_dict}") + + # Validate logprobs detokenization + for lp_tok in pos_logprob_dict: + # Confirm that sample logprob decoded token matches + # the logprob token id at this sequence position + decoded_token = pos_logprob_dict[lp_tok].decoded_token + ref_decoded_token = _ref_convert_id_to_token( + dtv.tokenizer, lp_tok) + assert decoded_token == ref_decoded_token, ( + f"Sampled logprob token id {lp_tok} decodes to" + f" {ref_decoded_token} but Logprob decoded" + f" token is {decoded_token} instead" + f" (at position {idx})") + + ref_cumulative_logprob += pos_logprob_dict[ + sampled_token].logprob + # Assert that cumulative logprobs are correct + assert math.isclose(cumulative_logprob, ref_cumulative_logprob) + else: + # Sample logprobs disabled for this request + assert logprobs is None + assert cumulative_logprob is None + + if num_prompt_logprobs is not None: + # Validate prompt logprobs + assert prompt_logprobs is not None, ( + f"Request {req_id} requires prompt" + " logprobs but prompt logprobs are" + " None.") + # Require num prompt tokens to match num + # prompt logprobs + num_prompt_tokens = len(prompt_token_ids) + len_prompt_logprobs = len(prompt_logprobs) + assert num_prompt_tokens == len_prompt_logprobs, ( + f"Request {req_id} has {num_prompt_tokens}" + " prompt tokens but has" + f" {len_prompt_logprobs} prompt logprobs.") + # First prompt logprob is None + first_plp_dict = prompt_logprobs[0] + assert first_plp_dict is None, ( + f"Request {req_id} first prompt logprob" + f" should be None but has following value" + f" instead: {first_plp_dict}") + # Break out the reference prompt log prob value & + # logprob token id matrices for the whole prompt. + # Also break out the prompt token rank vector + (ref_prompt_logprob_toks, ref_prompt_logprob_vals, + ref_prompt_token_ranks) = ref_prompt_logprobs + for idx, (prompt_token, pos_logprob_dict) in enumerate( + zip(prompt_token_ids[1:], prompt_logprobs[1:])): + + # Break out the reference prompt log prob value + # vector, prompt logprob token id vector, and + # prompt token rank at the current position. 
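+                # (The zip starts at position 1 because the first prompt token
+                # has no logprob, as asserted above.)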
+ (ref_pos_prompt_logprob_toks, ref_pos_prompt_logprob_vals, + ref_pos_prompt_token_rank) = (ref_prompt_logprob_toks[idx, :], + ref_prompt_logprob_vals[idx, :], + ref_prompt_token_ranks[idx]) + + # For each position in the prompt sequence, + # ensure the actual prompt token is among the + # logprobs + assert prompt_token in pos_logprob_dict, ( + f"Prompt token {prompt_token} not" + f" present in logprob at index {idx}") + # Validate number of prompt logprobs + num_plp_toks = len(pos_logprob_dict) + assert (num_plp_toks == num_prompt_logprobs + or num_plp_toks == num_prompt_logprobs + + 1), ("Valid numbers of prompt logprobs are" + f" {num_prompt_logprobs} or" + f" {num_prompt_logprobs+1} but" + f" {num_plp_toks} logprobs found at" + f" position {idx}. Logprobs dict:" + f" {pos_logprob_dict}") + + # Validate prompt token logprob rank + prmpt_tok_lp = pos_logprob_dict[prompt_token] + prmpt_tok_lp_rank = prmpt_tok_lp.rank + ref_prmpt_tok_lp_rank = ref_pos_prompt_token_rank + assert (ref_prmpt_tok_lp_rank == prmpt_tok_lp_rank), ( + "Prompt token logprob rank" + f" {prmpt_tok_lp_rank} does not match" + " correct value" + f" {ref_prmpt_tok_lp_rank}" + f" in Logprob {prmpt_tok_lp}") + + # Validate that the logprob processor yields + # the correct prompt log probs and valid + # rankings + rank_one_appears = False + for jdx in range(1, len(ref_pos_prompt_logprob_toks)): + # Iterate over the (logprob val,logprob tok id) + # pairs expected by the test fixture at this + # position in the completion. + ref_plp_val = float(ref_pos_prompt_logprob_vals[jdx]) + ref_tok_id = int(ref_pos_prompt_logprob_toks[jdx]) + assert ref_tok_id in pos_logprob_dict, ( + f"Expected token {ref_tok_id} to be" + f" in logprob dict but it is not.") + + # Extract actually-generated logprob + # info + plp = pos_logprob_dict[ref_tok_id] + plp_val = plp.logprob + plp_rank = plp.rank + + # A "top" (rank 1) logprob must be + # present + rank_one_appears = (True + if plp_rank == 1 else rank_one_appears) + + # Rank must be >= 1 + assert plp_rank >= 1, ( + f"Logprob {plp} has invalid" + f" rank {plp_rank} < 1." + f" Logprob dict: {pos_logprob_dict}") + + # Validate log probability + assert math.isclose(plp_val, ref_plp_val), ( + f"Token id {ref_tok_id} appears in logprobs dict" + f" at position {idx} in completion with log" + f" probability {plp_val} but {ref_plp_val} was" + f" expected. 
Logprob: {plp}") + + assert rank_one_appears, (f"No Logprob has rank 1" + " in the following Logprob" + f" dict: {pos_logprob_dict}") + + # Validate prompt logprob detokenization + for plp_tok in pos_logprob_dict: + # Confirm that prompt logprob decoded token matches + # the logprob token id at this sequence position + decoded_token = pos_logprob_dict[plp_tok].decoded_token + ref_decoded_token = _ref_convert_id_to_token( + dtv.tokenizer, plp_tok) + assert decoded_token == ref_decoded_token, ( + f"Prompt logprob token id {plp_tok} decodes to" + f" {ref_decoded_token} but Logprob decoded" + f" token is {decoded_token} instead" + f" (at position {idx})") + else: + # Prompt logprobs disabled for this request + assert prompt_logprobs is None + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +@pytest.mark.parametrize("num_sample_logprobs", + [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST]) +@pytest.mark.parametrize("num_prompt_logprobs", + [None, NUM_PROMPT_LOGPROBS_UNDER_TEST]) +def test_logprobs_processor(request_output_kind: RequestOutputKind, + num_sample_logprobs: Optional[int], + num_prompt_logprobs: Optional[int], + dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, + log_stats=False) + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + generated_logprobs_raw=None if num_sample_logprobs is None else + dummy_test_vectors.generation_logprobs, + prompt_logprobs_raw=None + if num_prompt_logprobs is None else dummy_test_vectors.prompt_logprobs) + + # Make N requests. + request_id_list = [ + f"request-{idx}" + for idx in range(len(dummy_test_vectors.prompt_strings)) + ] + requests = [ + EngineCoreRequest(request_id=request_id_list[idx], + prompt_token_ids=prompt_tokens, + mm_features=None, + eos_token_id=None, + arrival_time=0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False, + logprobs=num_sample_logprobs, + prompt_logprobs=num_prompt_logprobs, + ), + pooling_params=None) + for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) + ] + + # Add requests to the detokenizer. + for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): + output_processor.add_request(request, prompt) + + gen_tokens = {} + gen_logprobs = {} + gen_prompt_logprobs = {} + gen_cumulative_logprobs = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the logprobs processor. + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort + assert len(requests_to_abort) == 0 + + # Update tracking. 
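+        # (Per request: accumulate tokens, sample logprobs and prompt
+        # logprobs, and keep the latest cumulative logprob for validation.)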
+ for request_output in request_outputs: + request_id = request_output.request_id + new_tokens = request_output.outputs[0].token_ids + prompt_logprobs = request_output.prompt_logprobs + logprobs = request_output.outputs[0].logprobs + gen_cumulative_logprobs[request_id] = request_output.outputs[ + 0].cumulative_logprob + if request_id not in gen_logprobs: + # Start tracking sample and prompt logprobs for this request + gen_tokens[request_id] = new_tokens + gen_logprobs[request_id] = logprobs + gen_prompt_logprobs[request_id] = prompt_logprobs + else: + # Extend logprobs tracker + gen_tokens[request_id].extend(new_tokens) + lp = gen_logprobs[request_id] + plp = gen_prompt_logprobs[request_id] + if lp: + lp.extend(logprobs) + if plp: + plp.extend(prompt_logprobs) + + # Confirmed tracked logprobs match what we expect + _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs, + gen_cumulative_logprobs, dummy_test_vectors, + request_id_list, num_sample_logprobs, + num_prompt_logprobs) + + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + +@pytest.mark.parametrize( + "include_stop_str_in_output,stop_token_type,ignore_eos,num_sample_logprobs", + [(False, "stop_token_ids", False, None), + (True, "stop_token_ids", False, None), + (False, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST), + (True, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST), + (False, "eos_token_id", False, None), (True, "eos_token_id", False, None), + (False, "eos_token_id", True, None)]) +def test_stop_token(include_stop_str_in_output: bool, + num_sample_logprobs: Optional[int], stop_token_type: str, + ignore_eos: bool, dummy_test_vectors): + """Test output processor EOS/stop token handling. + + Send mock engine core request to mock engine core and pass core outputs + to output processor. Validate output processor tokens, text and + (if enabled) sample logprobs. Batch-size one. + + The test emulates a scenario where a model outputs text tokens followed + by two identical control tokens: + ... + + If EOS is under test, the control tokens are EOS; otherwise, they are + some other token id. + + Test behavior: + + * If EOS is under test and `ignore_eos=True`, the detokenized string + should be ... and the finish + reason should be "length" (i.e. no stop occurs) + + * else, if `include_stop_str_in_output==True`, the detokenized + string should be ... and the finish + reason should be "stop" (i.e. first control token causes stop + and is represented in output text) + + * else, the detokenized string should be + ... and the finish reason should be "stop" + (i.e. first control token causes stop but is not represented + in output text.) + + Note: some test details are tuned for meta-llama/Llama-3.2-1B, + another model should work only if the test is modified. 
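+
+    Schematically, writing T for a text token and C for the control token,
+    the engine emits T T ... T C C; the expected output text for the three
+    cases above is the text plus both control tokens, the text plus one
+    control token, or the text alone, respectively.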
+ + Args: + include_stop_str_in_output: stop token str appears in output text + num_sample_logprobs: number of sample logprobs (`None` for no logprobs) + stop_token_type: "eos_token_id" for EOS, "stop_token_ids" for stop token + ignore_eos: if True, EOS stops are disabled + dummy_test_vectors: dummy engine core outputs and other data structures + """ + model_id = dummy_test_vectors.tokenizer.name_or_path + if model_id != 'meta-llama/Llama-3.2-1B': + raise AssertionError("Test requires meta-llama/Llama-3.2-1B but " + f"{model_id} is in use.") + do_logprobs = num_sample_logprobs is not None + # EOS under test; if False, stop_token_ids under test + is_eos_test = stop_token_type == "eos_token_id" + # EOS under test but ignore_eos enabled + is_eos_ignore_test = is_eos_test and ignore_eos + eos_token_id = ( + dummy_test_vectors.tokenizer.eos_token_id if is_eos_test else None + ) # '<|end_of_text|>' + stop_token_ids = [128009] if not is_eos_test else None # '<|eot_id|>' + + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, + log_stats=False) + # Dummy engine core outputs, with control tokens suffixed to test stops + suffix_token = ([eos_token_id] if is_eos_test else stop_token_ids) + assert suffix_token is not None and isinstance(suffix_token[0], int) + generation_string = dummy_test_vectors.generation_strings[0] + generation_tokens = (dummy_test_vectors.generation_tokens[0] + + 2 * suffix_token) + if do_logprobs: + generation_logprobs = ( + dummy_test_vectors.generation_logprobs[0] + + 2 * [dummy_test_vectors.generation_logprobs[0][-1]]) + prompt_string = dummy_test_vectors.prompt_strings[0] + prompt_tokens = dummy_test_vectors.prompt_tokens[0] + engine_core = MockEngineCore( + tokens_list=[generation_tokens], + generated_logprobs_raw=[generation_logprobs] if do_logprobs else None, + prompt_logprobs_raw=None, + eos_token_id=eos_token_id, + stop_token_ids=stop_token_ids, + ignore_eos=ignore_eos) + + # Make request. + request_id = "request-0" + request = EngineCoreRequest( + request_id=request_id, + prompt_token_ids=prompt_tokens, + mm_features=None, + eos_token_id=eos_token_id, + arrival_time=0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=[], + stop_token_ids=stop_token_ids, + include_stop_str_in_output=include_stop_str_in_output, + logprobs=num_sample_logprobs, + prompt_logprobs=None, + ignore_eos=ignore_eos, + ), + pooling_params=None) + + # Add request to the detokenizer. + output_processor.add_request(request, prompt_string) + + # Loop over engine core steps; run output processor + gen_string = "" + gen_tokens = [] + gen_logprobs = [] + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + assert len(request_outputs) == 1 + # Stop token does not rely on abort + assert not processed_outputs.reqs_to_abort + + # Update tracking. 
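+        # (Batch size is one, so the single request output is appended to the
+        # running text, token, and logprob buffers on every step.)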
+ request_output = request_outputs[0] + if request_output.finished: + finish_reason = ("length" if is_eos_ignore_test else "stop") + assert request_output.outputs[0].finish_reason == finish_reason + + gen_string += request_output.outputs[0].text + gen_tokens.extend(request_output.outputs[0].token_ids) + if do_logprobs: + gen_logprobs.extend(request_output.outputs[0].logprobs) + + # Validate generated text + control_token = '<|end_of_text|>' if is_eos_test else '<|eot_id|>' + if is_eos_ignore_test: + # Length-based stop; expect full string + ref_str = generation_string + 2 * control_token + elif include_stop_str_in_output: + # Stop token triggered; include in output + ref_str = generation_string + control_token + else: + # Stop token triggered but not in output + ref_str = generation_string + assert gen_string == ref_str, (f"{gen_string=}, {ref_str=}") + + if do_logprobs: + # Validate number of sample logprobs + num_tokens = len(gen_tokens) + num_logprobs = len(gen_logprobs) + assert num_tokens == num_logprobs, ( + f"Token count ({num_tokens}) != logprobs count ({num_logprobs})") + + # Check requests are finished + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + +@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) +@pytest.mark.parametrize("num_sample_logprobs", + [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST]) +def test_stop_string(include_stop_str_in_output: bool, + num_sample_logprobs: Optional[int], dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, + log_stats=False) + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + generated_logprobs_raw=dummy_test_vectors.generation_logprobs + if num_sample_logprobs else None, + prompt_logprobs_raw=None) + + # Make N requests. + request_id_list = [ + f"request-{idx}" + for idx in range(len(dummy_test_vectors.prompt_strings)) + ] + requests = [ + EngineCoreRequest( + request_id=request_id_list[idx], + prompt_token_ids=prompt_tokens, + mm_features=None, + eos_token_id=None, + arrival_time=0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + logprobs=num_sample_logprobs, + prompt_logprobs=None, + ), + pooling_params=None) + for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) + ] + + # Add requests to the detokenizer. + for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): + output_processor.add_request(request, prompt) + + gen_strings = {} + gen_tokens = {} + gen_logprobs = {} + gen_prompt_logprobs = {} + gen_cumulative_logprobs = {} + aborted = [] + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort + for request_output in request_outputs: + # If aborted, we should not get a request output. + assert request_output.request_id not in aborted + aborted.extend(requests_to_abort) + + # Update tracking. 
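+        # (Requests that hit a stop string finish with reason "stop" and are
+        # collected in `aborted` for the checks below.)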
+ for request_output in request_outputs: + if request_output.finished: + assert request_output.outputs[0].finish_reason == "stop" + + request_id = request_output.request_id + new_text = request_output.outputs[0].text + new_tokens = request_output.outputs[0].token_ids + prompt_logprobs = request_output.prompt_logprobs + logprobs = request_output.outputs[0].logprobs + gen_cumulative_logprobs[request_id] = request_output.outputs[ + 0].cumulative_logprob + if request_id not in gen_strings: + gen_strings[request_id] = new_text + gen_tokens[request_id] = new_tokens + gen_logprobs[request_id] = logprobs + gen_prompt_logprobs[request_id] = prompt_logprobs + else: + gen_strings[request_id] += new_text + gen_tokens[request_id].extend(new_tokens) + lp = gen_logprobs[request_id] + plp = gen_prompt_logprobs[request_id] + if lp: + lp.extend(logprobs) + if plp: + plp.extend(prompt_logprobs) + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, stop_str) in enumerate( + zip(dummy_test_vectors.generation_strings, STOP_STRINGS)): + + # Request should be aborted. + request_id = f"request-{idx}" + assert request_id in aborted + + # Collected values that were generated. + gen_str = gen_strings[request_id] + + # Construct reference strings. + stop_str_idx = ref_gen_str.find(stop_str) + ref_str_exc_stop = ref_gen_str[:stop_str_idx] + ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str + + if include_stop_str_in_output: + assert gen_str == ref_str_inc_stop, ( + f"{gen_str=}, {ref_str_inc_stop=}") + else: + assert gen_str == ref_str_exc_stop, ( + f"{gen_str=}, {ref_str_exc_stop=}") + + # Confirmed tracked logprobs match what we expect + _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs, + gen_cumulative_logprobs, dummy_test_vectors, + request_id_list, num_sample_logprobs, None) + + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + +def test_iteration_stats(dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, + log_stats=True) + engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) + engine_core_timestamp = time.monotonic() + + # Make N requests. + requests = [ + EngineCoreRequest( + request_id=f"request-{idx}", + prompt_token_ids=prompt_tokens, + mm_features=None, + eos_token_id=None, + arrival_time=0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + sampling_params=SamplingParams(), + pooling_params=None, + ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) + ] + + # Add all requests except one to the OutputProcessor. + num_active = len(dummy_test_vectors.generation_tokens) - 1 + for request in requests[:num_active]: + output_processor.add_request(request, None) + inactive_request = requests[num_active] + + # First iteration has 2 prefills. + outputs = engine_core.get_outputs()[:num_active] + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) + total_prompt_tokens = sum([ + len(prompt_tokens) + for prompt_tokens in dummy_test_vectors.prompt_tokens[:num_active] + ]) + + assert iteration_stats.num_prompt_tokens == total_prompt_tokens + assert iteration_stats.num_generation_tokens == num_active + + # Just decodes in this step. 
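+    # (The same requests step again, so no prompt tokens are counted and each
+    # active request contributes exactly one generation token.)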
+ outputs = engine_core.get_outputs()[:num_active] + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) + + assert iteration_stats.num_prompt_tokens == 0 + assert iteration_stats.num_generation_tokens == num_active + + # Add a new request - prefill and 2 decodes in this step. + output_processor.add_request(inactive_request, None) + num_active += 1 + outputs = engine_core.get_outputs()[:num_active] + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) + total_prompt_tokens = len(dummy_test_vectors.prompt_tokens[num_active - 1]) + + assert iteration_stats.num_prompt_tokens == total_prompt_tokens + assert iteration_stats.num_generation_tokens == num_active + + # Just decodes in this step. + outputs = engine_core.get_outputs()[:num_active] + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) + + assert iteration_stats.num_prompt_tokens == 0 + assert iteration_stats.num_generation_tokens == num_active + + +@pytest.mark.asyncio +async def test_request_output_collector(): + NUM_REQS = 3 + TEXT = "a" + + def make_outputs() -> list[RequestOutput]: + return [ + RequestOutput( + request_id="my-request-id", + prompt=None, + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[ + CompletionOutput( + index=0, + text=TEXT, + token_ids=[idx], + cumulative_logprob=(idx + 1 * 1.0), + logprobs=[{ + "a": idx, + "b": idx + }], + finish_reason="length" if + (idx == NUM_REQS - 1) else None, + ) + ], + finished=(idx == NUM_REQS - 1), + ) for idx in range(NUM_REQS) + ] + + collector = RequestOutputCollector(RequestOutputKind.DELTA) + + # CASE 1: Put then get. + outputs = make_outputs() + collector.put(outputs[0]) + output = await collector.get() + assert not collector.ready.is_set() + assert collector.output is None + assert output.outputs[0].text == "a" + assert output.outputs[0].token_ids == [0] + + # CASE 2: 2 puts then get. + num_to_put = 2 + outputs = make_outputs() + for i in range(num_to_put): + collector.put(outputs[i]) + output = await collector.get() + assert not collector.ready.is_set() + assert collector.output is None + + assert not output.finished + # Text, token_ids, and logprobs should get merged. + assert output.outputs[0].text == TEXT * num_to_put + for tok_0, tok_1 in zip(output.outputs[0].token_ids, + list(range(num_to_put))): + assert tok_0 == tok_1 + assert len(output.outputs[0].logprobs) == num_to_put + + # Cumulative logprobs should be the last one. + cumulative_logprob_expected = 1.0 * num_to_put + assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected + + # CASE 3: Put all 3 (including a finished). + num_to_put = 3 + outputs = make_outputs() + for i in range(num_to_put): + collector.put(outputs[i]) + output = await collector.get() + assert not collector.ready.is_set() + assert collector.output is None + + assert output.finished + assert output.outputs[0].finish_reason == "length" + # Text, token_ids, and logprobs should get merged. + assert output.outputs[0].text == TEXT * num_to_put + for tok_0, tok_1 in zip(output.outputs[0].token_ids, + list(range(num_to_put))): + assert tok_0 == tok_1 + assert len(output.outputs[0].logprobs) == num_to_put + + # Cumulative logprobs should be the last one. 
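+    # (Merging keeps the most recent value: the third output was created with
+    # cumulative_logprob = idx + 1.0 = 3.0, which equals 1.0 * num_to_put.)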
+ cumulative_logprob_expected = 1.0 * num_to_put + assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected + + +@pytest.mark.asyncio +async def test_cumulative_output_collector_n(): + """Test collector correctly handles multiple outputs by index.""" + collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE) + outputs = [ + RequestOutput( + request_id="my-request-id", + prompt=None, + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[ + CompletionOutput( + index=0, + text="a", + token_ids=[0], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ), + CompletionOutput( + index=1, + text="b", + token_ids=[1], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ), + ], + finished=False, + ), + RequestOutput( + request_id="my-request-id", + prompt=None, + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[ + CompletionOutput( + index=0, + text="ab", + token_ids=[0, 1], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ), + CompletionOutput( + index=2, + text="c", + token_ids=[2], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ), + ], + finished=False, + ), + ] + for output in outputs: + collector.put(output) + + # Get the output and check that the text and token_ids are correct. + result = await collector.get() + # We are expecting + # [{index: 0, text: "ab"}, {index: 1, text: "b"}, {index: 2, text: "c"}] + assert len(result.outputs) == 3 + # First is the one where index is 0 + first = [k for k in result.outputs if k.index == 0] + assert len(first) == 1 + assert first[0].text == "ab" + + # Second is the one where index is 1 + second = [k for k in result.outputs if k.index == 1] + assert len(second) == 1 + assert second[0].text == "b" + assert second[0].token_ids == [1] + + # Third is the one where index is 2 + third = [k for k in result.outputs if k.index == 2] + assert len(third) == 1 + assert third[0].text == "c" From 7a92f1791bcf75063c6a7b2b77d2e08ec021c1e8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 12:56:03 -0700 Subject: [PATCH 05/17] fix test_chat Signed-off-by: Woosuk Kwon --- tests/entrypoints/openai/test_chat.py | 49 ++++++++++----------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d5924b7b3ae3..be62586f9741 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -28,11 +28,9 @@ def monkeypatch_module(): mpatch.undo() -@pytest.fixture(scope="module", params=[False, True]) -def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811 - - use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') +@pytest.fixture(scope="module") +def server(monkeypatch_module, zephyr_lora_files): #noqa: F811 + monkeypatch_module.setenv('VLLM_USE_V1', '1') args = [ # use half precision for speed and memory savings in CI environment @@ -57,13 +55,6 @@ def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811 yield remote_server -@pytest.fixture -def is_v1_server(server): - import os - assert os.environ['VLLM_USE_V1'] in ['0', '1'] - return os.environ['VLLM_USE_V1'] == '1' - - @pytest_asyncio.fixture async def client(server): async with server.get_async_client() as async_client: @@ -481,9 +472,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio async def test_guided_choice_chat(client: openai.AsyncOpenAI, - 
sample_guided_choice, is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + sample_guided_choice): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -519,10 +508,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") +async def test_guided_json_chat( + client: openai.AsyncOpenAI, + sample_json_schema, +): messages = [{ "role": "system", @@ -565,10 +554,10 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, @pytest.mark.asyncio -async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") +async def test_guided_regex_chat( + client: openai.AsyncOpenAI, + sample_regex, +): messages = [{ "role": "system", @@ -653,10 +642,10 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Tool use is only supported in v1 engine") +async def test_named_tool_use( + client: openai.AsyncOpenAI, + sample_json_schema, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -826,11 +815,7 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_response_format_json_schema(client: openai.AsyncOpenAI, - is_v1_server: bool): - if not is_v1_server: - pytest.skip( - "JSON schema response format is only supported in v1 engine") +async def test_response_format_json_schema(client: openai.AsyncOpenAI): prompt = 'what is 1+1? The format is "result": 2' # Check that this prompt cannot lead to a valid JSON without json_schema for _ in range(2): From d80a45534ff96c7e04994cf1cea3d68b69cb754e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 13:03:29 -0700 Subject: [PATCH 06/17] fix pp test Signed-off-by: Woosuk Kwon --- tests/distributed/test_pipeline_parallel.py | 75 ++++++--------------- tests/metrics/test_metrics.py | 9 --- 2 files changed, 19 insertions(+), 65 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 76b105e8a8ec..efc15a00f0c7 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -41,23 +41,10 @@ class PPTestOptions(NamedTuple): @dataclass class PPTestSettings: parallel_setups: list[ParallelSetup] - # NOTE: the length of distributed_backends and - # vllm_major_versions should be the same, and they - # are first zipped together to iterate over all - # test settings. 
distributed_backends: list[str] - # vllm major version: "0" for V0, "1" for V1 - vllm_major_versions: list[str] runner: RunnerOption test_options: PPTestOptions - def __post_init__(self): - if len(self.distributed_backends) != len(self.vllm_major_versions): - raise ValueError( - f"Length mismatch: distributed_backends " - f"({len(self.distributed_backends)}) != " - f"vllm_major_versions ({len(self.vllm_major_versions)})") - @staticmethod def detailed( *, @@ -90,8 +77,7 @@ def detailed( eager_mode=True, chunked_prefill=False), ], - distributed_backends=["mp", "mp", "ray", "ray"], - vllm_major_versions=["0", "1", "0", "1"], + distributed_backends=["mp", "ray"], runner=runner, test_options=PPTestOptions(multi_node_only=multi_node_only, load_format=load_format), @@ -106,7 +92,6 @@ def fast( multi_node_only: bool = False, load_format: Optional[str] = None, ): - vllm_major_versions = ["1"] if runner == "pooling" else ["0"] return PPTestSettings( parallel_setups=[ @@ -116,7 +101,6 @@ def fast( chunked_prefill=False), ], distributed_backends=["mp"], - vllm_major_versions=vllm_major_versions, runner=runner, test_options=PPTestOptions(multi_node_only=multi_node_only, load_format=load_format), @@ -126,10 +110,8 @@ def iter_params(self, model_id: str): opts = self.test_options for parallel_setup in self.parallel_setups: - for backend, vllm_major_version in zip(self.distributed_backends, - self.vllm_major_versions): - yield (model_id, parallel_setup, backend, vllm_major_version, - self.runner, opts) + for backend in self.distributed_backends: + yield (model_id, parallel_setup, backend, self.runner, opts) # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU @@ -257,7 +239,6 @@ def _compare_tp( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available: int, @@ -341,14 +322,11 @@ def _compare_tp( if max_num_seqs: common_args.extend(["--max-num-seqs", f"{max_num_seqs}"]) - specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill - testing_ray_compiled_graph = False - if distributed_backend == "ray" and (vllm_major_version == "1" - or specific_case): + if distributed_backend == "ray": # For V1, test Ray Compiled Graph for all the tests # For V0, test Ray Compiled Graph for a subset of the tests pp_env = { - "VLLM_USE_V1": vllm_major_version, + "VLLM_USE_V1": "1", "VLLM_USE_RAY_COMPILED_DAG": "1", "VLLM_USE_RAY_SPMD_WORKER": "1", "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1", @@ -356,17 +334,16 @@ def _compare_tp( # Temporary. Currently when zeromq + SPMD is used, it does not properly # terminate because of a Ray Compiled Graph issue. 
common_args.append("--disable-frontend-multiprocessing") - testing_ray_compiled_graph = True elif distributed_backend == "mp": # Both V0/V1 of multiprocessing executor support PP pp_env = { - "VLLM_USE_V1": vllm_major_version, + "VLLM_USE_V1": "1", } else: pp_env = None tp_env = { - "VLLM_USE_V1": vllm_major_version, + "VLLM_USE_V1": "1", } pp_args = [ @@ -392,25 +369,17 @@ def _compare_tp( "mp", ] - try: - compare_two_settings(model_id, - pp_args, - tp_args, - pp_env, - tp_env, - method=method) - except Exception: - if testing_ray_compiled_graph and vllm_major_version == "0": - # Ray Compiled Graph tests are flaky for V0, - # so we don't want to fail the test - logger.exception("Ray Compiled Graph tests failed") - else: - raise + compare_two_settings(model_id, + pp_args, + tp_args, + pp_env, + tp_env, + method=method) @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "runner", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "runner", + "test_options"), [ params for model_id, settings in TEXT_GENERATION_MODELS.items() for params in settings.iter_params(model_id) if model_id in TEST_MODELS @@ -421,7 +390,6 @@ def test_tp_language_generation( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available, @@ -429,7 +397,6 @@ def test_tp_language_generation( _compare_tp(model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, @@ -438,8 +405,8 @@ def test_tp_language_generation( @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "runner", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "runner", + "test_options"), [ params for model_id, settings in EMBEDDING_MODELS.items() for params in settings.iter_params(model_id) if model_id in TEST_MODELS @@ -450,7 +417,6 @@ def test_tp_language_embedding( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available, @@ -458,7 +424,6 @@ def test_tp_language_embedding( _compare_tp(model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, @@ -467,8 +432,8 @@ def test_tp_language_embedding( @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "runner", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "runner", + "test_options"), [ params for model_id, settings in MULTIMODAL_MODELS.items() for params in settings.iter_params(model_id) if model_id in TEST_MODELS @@ -479,7 +444,6 @@ def test_tp_multimodal_generation( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available, @@ -487,7 +451,6 @@ def test_tp_multimodal_generation( _compare_tp(model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index dbd9c518e020..3c0de9782fc9 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -13,15 +13,6 @@ from vllm.sampling_params import SamplingParams from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET - -@pytest.fixture(scope="function", autouse=True) -def 
use_v0_only(monkeypatch): - """ - This module tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - MODELS = [ "distilbert/distilgpt2", ] From 9bb81febb4f898ac7808d2fd9f7af50258f7cb2e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 13:03:58 -0700 Subject: [PATCH 07/17] fix Signed-off-by: Woosuk Kwon --- tests/distributed/test_pipeline_parallel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index efc15a00f0c7..aa4b4ac7fe52 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -324,7 +324,6 @@ def _compare_tp( if distributed_backend == "ray": # For V1, test Ray Compiled Graph for all the tests - # For V0, test Ray Compiled Graph for a subset of the tests pp_env = { "VLLM_USE_V1": "1", "VLLM_USE_RAY_COMPILED_DAG": "1", @@ -335,7 +334,6 @@ def _compare_tp( # terminate because of a Ray Compiled Graph issue. common_args.append("--disable-frontend-multiprocessing") elif distributed_backend == "mp": - # Both V0/V1 of multiprocessing executor support PP pp_env = { "VLLM_USE_V1": "1", } From c855f921be48196cbad92a2751ae83def5364ec8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 17:16:33 -0700 Subject: [PATCH 08/17] rm more tests Signed-off-by: Woosuk Kwon --- tests/basic_correctness/test_preemption.py | 189 ----- tests/entrypoints/openai/test_completion.py | 831 -------------------- tests/metrics/__init__.py | 0 tests/metrics/test_metrics.py | 259 ------ tests/tracing/__init__.py | 0 tests/tracing/test_tracing.py | 237 ------ 6 files changed, 1516 deletions(-) delete mode 100644 tests/basic_correctness/test_preemption.py delete mode 100644 tests/entrypoints/openai/test_completion.py delete mode 100644 tests/metrics/__init__.py delete mode 100644 tests/metrics/test_metrics.py delete mode 100644 tests/tracing/__init__.py delete mode 100644 tests/tracing/test_tracing.py diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py deleted file mode 100644 index db2fa2f6bef6..000000000000 --- a/tests/basic_correctness/test_preemption.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the short outputs of HF and vLLM when using greedy sampling. - -VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. - -Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 -pytest tests/basic_correctness/test_preemption.py`. -""" -import pytest -from prometheus_client import REGISTRY - -import vllm.envs as envs -from vllm import SamplingParams -from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, - ENABLE_ARTIFICIAL_PREEMPT) - -from ..models.utils import check_outputs_equal - -MODELS = [ - "distilbert/distilgpt2", -] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT, - so use VLLM_USE_V1=0 for all tests in the file. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.fixture(scope="module", autouse=True) -def check_settings(): - assert ENABLE_ARTIFICIAL_PREEMPT is True, ( - "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1." 
- "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 " - "pytest tests/basic_correctness/test_preemption.py`") - - -@pytest.fixture -def distributed_executor_backend() -> str: - # When SPMD worker is used, use distributed_executor_backend="ray" - # to test delta input optimization works with preemption. - return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [96]) -@pytest.mark.parametrize("chunked_prefill_token_size", [16]) -def test_chunked_prefill_recompute( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - chunked_prefill_token_size: int, - distributed_executor_backend: str, -) -> None: - """Ensure that chunked prefill works with preemption.""" - max_num_seqs = min(chunked_prefill_token_size, 256) - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - max_num_seqs=max_num_seqs, - distributed_executor_backend=distributed_executor_backend, - disable_log_stats=False, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - for i in range(len(example_prompts)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_outputs[i] - assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") - assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption( - caplog_vllm, - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """By default, recompute preemption is enabled""" - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - assert ("is preempted by PreemptionMode.RECOMPUTE mode because there " - "is not enough KV cache space." 
in caplog_vllm.text) - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - preemption_metrics = None - for m in REGISTRY.collect(): - if m.name == "vllm:num_preemptions": - preemption_metrics = m - assert preemption_metrics is not None - total_recorded_preemption = 0 - for sample in preemption_metrics.samples: - total_recorded_preemption += sample.value - assert total_preemption == total_recorded_preemption - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption_infeasible( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """Verify infeasible preemption request will be ignored.""" - BLOCK_SIZE = 16 - prefill_blocks = 2 - decode_blocks = max_tokens // BLOCK_SIZE - with vllm_runner( - model, - dtype=dtype, - block_size=BLOCK_SIZE, - # Not enough gpu blocks to complete a single sequence. - # preemption should happen, and the sequence should be - # ignored instead of hanging forever. - num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, - max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - sampling_params = SamplingParams(max_tokens=max_tokens, - ignore_eos=True) - req_outputs = vllm_model.llm.generate( - example_prompts, - sampling_params=sampling_params, - ) - - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - # Verify the request is ignored and not hang. - for req_output in req_outputs: - outputs = req_output.outputs - assert len(outputs) == 1 - assert outputs[0].finish_reason == "length" diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py deleted file mode 100644 index 3650b1579257..000000000000 --- a/tests/entrypoints/openai/test_completion.py +++ /dev/null @@ -1,831 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# imports for guided decoding tests -import json -import os -from typing import Optional - -import jsonschema -import openai # use the official client for correctness check -import pytest -import pytest_asyncio -import regex as re -import requests -# downloading lora to test lora requests -from openai import BadRequestError - -from vllm.transformers_utils.tokenizer import get_tokenizer - -from ...utils import RemoteOpenAIServer - -# any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically these adapters use a different base model, -# but we're not testing generation quality here - -GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"] - - -@pytest.fixture(scope="module") -def default_server_args(zephyr_lora_files): - return [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--max-num-seqs", - "128", - "--enforce-eager", - # lora config - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - ] - - -@pytest.fixture(scope="module", - params=["", "--disable-frontend-multiprocessing"]) -def server(default_server_args, request): - if request.param: - default_server_args.append(request.param) - - 
original_value = os.environ.get('VLLM_USE_V1') - os.environ['VLLM_USE_V1'] = '0' - try: - with RemoteOpenAIServer(MODEL_NAME, - default_server_args) as remote_server: - yield remote_server - finally: - # Restore original env value - if original_value is None: - os.environ.pop('VLLM_USE_V1', None) - else: - os.environ['VLLM_USE_V1'] = original_value - - -@pytest.fixture -def is_v1_server(server): - import os - - # For completion tests, we assume v0 since there's no explicit v1 setup - return os.environ.get('VLLM_USE_V1', '0') == '1' - - -@pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: - yield async_client - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 1 - assert completion.choices[0].prompt_logprobs is None - - -@pytest.mark.asyncio -async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): - # test using token IDs - with pytest.raises(openai.BadRequestError, match="out of vocabulary"): - # Added tokens should be rejected by the base model - await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 32000, 32001, 32002], - echo=True, - max_tokens=5, - temperature=0.0, - ) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=None, - ) - choice = completion.choices[0] - assert choice.logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=0, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert len(choice.logprobs.top_logprobs[0]) == 1 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=5, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - 
assert choice.logprobs.top_logprobs is not None - assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, - model_name: str): - - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=21, - ) - ... - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - stream = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=30, - stream=True, - ) - async for chunk in stream: - ... - - # the server should still work afterwards - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1), - (MODEL_NAME, 0), - (MODEL_NAME, 1), - (MODEL_NAME, None)]) -async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, - model_name: str, - prompt_logprobs: Optional[int]): - params: dict = { - "prompt": ["A robot may not injure another robot", "My name is"], - "model": model_name, - } - if prompt_logprobs is not None: - params["extra_body"] = {"prompt_logprobs": prompt_logprobs} - - if prompt_logprobs is not None and prompt_logprobs < 0: - with pytest.raises(BadRequestError): - await client.completions.create(**params) - else: - completion = await client.completions.create(**params) - if prompt_logprobs is not None: - assert completion.choices[0].prompt_logprobs is not None - assert len(completion.choices[0].prompt_logprobs) > 0 - - assert completion.choices[1].prompt_logprobs is not None - assert len(completion.choices[1].prompt_logprobs) > 0 - - else: - assert completion.choices[0].prompt_logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_streaming(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is an LLM?" - - single_completion = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - ) - single_output = single_completion.choices[0].text - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True) - chunks: list[str] = [] - finish_reason_count = 0 - async for chunk in stream: - chunks.append(chunk.choices[0].text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - assert chunk.choices[0].finish_reason == "length" - assert chunk.choices[0].text - assert "".join(chunks) == single_output - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): - """Streaming for parallel sampling. 
- The tokens from multiple samples, are flattened into a single stream, - with an index to indicate which sample the token belongs to. - """ - - prompt = "What is an LLM?" - n = 3 - max_tokens = 5 - - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=max_tokens, - n=n, - stream=True) - chunks: list[list[str]] = [[] for i in range(n)] - finish_reason_count = 0 - async for chunk in stream: - index = chunk.choices[0].index - text = chunk.choices[0].text - chunks[index].append(text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - assert finish_reason_count == n - for chunk in chunks: - assert len(chunk) == max_tokens - print("".join(chunk)) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_stream_options(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is the capital of France?" - - # Test stream=True, stream_options= - # {"include_usage": False, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - False, - }) - - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options= - # {"include_usage": False, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - True, - }) - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options= - # {"include_usage": True, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - False, - }) - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - else: - assert chunk.usage is None - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=True, stream_options= - # {"include_usage": True, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - True, - }) - async for chunk in stream: - assert chunk.usage is not None - assert chunk.usage.prompt_tokens > 0 - assert chunk.usage.completion_tokens > 0 - assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + - chunk.usage.completion_tokens) - if chunk.choices[0].finish_reason is not None: - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=False, stream_options= - # {"include_usage": None} - with pytest.raises(BadRequestError): - await 
client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options= - # {"include_usage": True} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - - # Test stream=False, stream_options= - # {"continuous_usage_stats": None} - with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"continuous_usage_stats": None}) - - # Test stream=False, stream_options= - # {"continuous_usage_stats": True} - with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"continuous_usage_stats": True}) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): - # test both text and token IDs - for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): - # test simple list - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=prompts, - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but - # not necessary for official client. - use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" - - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] - - -@pytest.mark.asyncio -async def test_logits_bias(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 5 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - token_id = 1000 - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token_id): 100}, - seed=42, - ) - assert len(completion.choices[0].text) >= 5 - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), - add_special_tokens=False)["input_ids"] - assert all([ - response == expected - for response, expected in zip(response_tokens, expected_tokens) - ]) - - # Test ban - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - ) - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - first_response = completion.choices[0].text - 
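# Ban step: the follow-up request applies a -100 logit bias to every token of
# the unbiased baseline captured above, so the greedy continuation is
# expected to differ from first_response.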
completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token): -100 - for token in response_tokens}, - ) - assert first_response != completion.choices[0].text - - -@pytest.mark.asyncio -async def test_allowed_token_ids(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 1 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - allowed_ids = [21555, 21557, 21558] - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - seed=42, - extra_body=dict(allowed_token_ids=allowed_ids), - logprobs=1, - ) - response_tokens = completion.choices[0].logprobs.tokens - assert len(response_tokens) == 1 - assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_json_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema, is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example JSON for an employee profile " - f"that fits this schema: {sample_json_schema}", - n=3, - temperature=1.0, - max_tokens=500, - extra_body=dict(guided_json=sample_json_schema, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - output_json = json.loads(completion.choices[i].text) - jsonschema.validate(instance=output_json, schema=sample_json_schema) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_regex_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_regex, is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example IPv4 address with this regex: {sample_regex}", - n=3, - temperature=1.0, - max_tokens=20, - extra_body=dict(guided_regex=sample_regex, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - assert re.fullmatch(sample_regex, - completion.choices[i].text) is not None - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_choice_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_guided_choice, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt="The best language for type-safe systems programming is ", - n=2, - temperature=1.0, - max_tokens=10, - extra_body=dict(guided_choice=sample_guided_choice, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 2 - for i in range(2): - assert completion.choices[i].text in sample_guided_choice - - -@pytest.mark.asyncio -async def test_guided_grammar(client: openai.AsyncOpenAI, - sample_sql_statements, is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided grammar is only supported in v1 engine") 
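# The grammar-constrained output below is validated two ways: Lark parses the
# text with the same SQL grammar, and the text is compared against the
# expected SELECT statement with spaces stripped (spaces were removed in the
# grammar itself).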
- - completion = await client.completions.create( - model=MODEL_NAME, - prompt=("Generate a sql state that select col_1 from " - "table_1 where it is equals to 1"), - temperature=1.0, - max_tokens=500, - extra_body=dict(guided_grammar=sample_sql_statements)) - - content = completion.choices[0].text - - # use Lark to parse the output, and make sure it's a valid parse tree - from lark import Lark - parser = Lark(sample_sql_statements) - parser.parse(content) - - # remove spaces for comparison b/c we removed them in the grammar - ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "") - - assert content.strip() == ground_truth - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -@pytest.mark.parametrize("logprobs_arg", [1, 0]) -async def test_echo_logprob_completion(client: openai.AsyncOpenAI, - model_name: str, logprobs_arg: int): - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - # test using text and token IDs - for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]): - completion = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - echo=True, - logprobs=logprobs_arg) - - prompt_text = tokenizer.decode(prompt) if isinstance(prompt, - list) else prompt - assert re.search(r"^" + prompt_text, completion.choices[0].text) - logprobs = completion.choices[0].logprobs - assert logprobs is not None - assert len(logprobs.text_offset) > 5 - assert (len(logprobs.token_logprobs) > 5 - and logprobs.token_logprobs[0] is None) - assert (len(logprobs.top_logprobs) > 5 - and logprobs.top_logprobs[0] is None) - for top_logprobs in logprobs.top_logprobs[1:]: - assert max(logprobs_arg, - 1) <= len(top_logprobs) <= logprobs_arg + 1 - assert len(logprobs.tokens) > 5 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema, sample_regex, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") - - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example JSON that fits this schema: 42", - extra_body=dict(guided_json=42, - guided_decoding_backend=guided_decoding_backend)) - - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example string that fits this regex", - extra_body=dict(guided_regex=sample_regex, - guided_json=sample_json_schema)) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name,stream,echo", - [ - (MODEL_NAME, False, False), - (MODEL_NAME, False, True), - (MODEL_NAME, True, False), - (MODEL_NAME, True, True) # should not raise BadRequestError error - ], -) -async def test_echo_stream_completion(client: openai.AsyncOpenAI, - model_name: str, stream: bool, - echo: bool): - saying: str = "Hello, my name is" - result = await client.completions.create(model=model_name, - prompt=saying, - max_tokens=10, - temperature=0.0, - echo=echo, - stream=stream) - - stop_reason = "length" - - if not stream: - completion = result - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == stop_reason - - if echo: - assert 
choice.text is not None and saying in choice.text - else: - assert choice.text is not None and saying not in choice.text - - else: - chunks: list[str] = [] - final_finish_reason = None - async for chunk in result: - if chunk.choices and chunk.choices[0].text: - chunks.append(chunk.choices[0].text) - if chunk.choices and chunk.choices[0].finish_reason: - final_finish_reason = chunk.choices[0].finish_reason - - assert final_finish_reason == stop_reason - content = "".join(chunks) - if echo: - assert content is not None and saying in content - else: - assert content is not None and saying not in content - - -@pytest.mark.asyncio -async def test_invocations(server: RemoteOpenAIServer, - client: openai.AsyncOpenAI): - request_args = { - "model": MODEL_NAME, - "prompt": "Hello, my name is", - "max_tokens": 5, - "temperature": 0.0, - "logprobs": None, - } - - completion = await client.completions.create(**request_args) - - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) - invocation_response.raise_for_status() - - completion_output = completion.model_dump() - invocation_output = invocation_response.json() - - assert completion_output.keys() == invocation_output.keys() - assert completion_output["choices"] == invocation_output["choices"] diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py deleted file mode 100644 index 3c0de9782fc9..000000000000 --- a/tests/metrics/test_metrics.py +++ /dev/null @@ -1,259 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import ray -from prometheus_client import REGISTRY - -import vllm.envs as envs -from vllm import EngineArgs, LLMEngine -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.metrics import RayPrometheusStatLogger -from vllm.sampling_params import SamplingParams -from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET - -MODELS = [ - "distilbert/distilgpt2", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_metric_counter_prompt_tokens( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) as vllm_model: - tokenizer = vllm_model.llm.get_tokenizer() - prompt_token_counts = [ - len(tokenizer.encode(p)) for p in example_prompts - ] - # This test needs at least 2 prompts in a batch of different lengths to - # verify their token count is correct despite padding. 
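# The expected value is the sum of the tokenized prompt lengths; it is then
# compared against the prompt-token counter read from the engine's
# 'prometheus' stat logger after a greedy generation pass.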
- assert len(example_prompts) > 1, "at least 2 prompts are required" - assert prompt_token_counts[0] != prompt_token_counts[1], ( - "prompts of different lengths are required") - vllm_prompt_token_count = sum(prompt_token_counts) - - _ = vllm_model.generate_greedy(example_prompts, max_tokens) - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_prompt_tokens.labels( - **stat_logger.labels)._value.get() - - assert vllm_prompt_token_count == metric_count, ( - f"prompt token count: {vllm_prompt_token_count!r}\n" - f"metric: {metric_count!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_metric_counter_generation_tokens( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.llm.get_tokenizer() - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() - vllm_generation_count = 0 - for i in range(len(example_prompts)): - vllm_output_ids, vllm_output_str = vllm_outputs[i] - prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. - # We're interested only in the count of the generation tokens. - vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) - - assert vllm_generation_count == metric_count, ( - f"generation token count: {vllm_generation_count!r}\n" - f"metric: {metric_count!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize( - "served_model_name", - [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]]) -def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, - served_model_name: list[str]) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.3, - served_model_name=served_model_name) as vllm_model: - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metrics_tag_content = stat_logger.labels["model_name"] - - if envs.VLLM_CI_USE_S3: - model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}" - if served_model_name is None or served_model_name == []: - assert metrics_tag_content == model, ( - f"Metrics tag model_name is wrong! expect: {model!r}\n" - f"actual: {metrics_tag_content!r}") - else: - assert metrics_tag_content == served_model_name[0], ( - f"Metrics tag model_name is wrong! 
expect: " - f"{served_model_name[0]!r}\n" - f"actual: {metrics_tag_content!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("disable_log_stats", [True, False]) -@pytest.mark.asyncio -async def test_async_engine_log_metrics_regression( - example_prompts, - model: str, - dtype: str, - max_tokens: int, - disable_log_stats: bool, -) -> None: - """ - Regression test ensuring async engine generates metrics - when disable_log_stats=False - (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678) - """ - engine_args = AsyncEngineArgs( - model=model, - dtype=dtype, - disable_log_stats=disable_log_stats, - ) - async_engine = AsyncLLMEngine.from_engine_args(engine_args) - for i, prompt in enumerate(example_prompts): - results = async_engine.generate( - prompt, - SamplingParams(max_tokens=max_tokens), - f"request-id-{i}", - ) - # Exhaust the async iterator to make the async engine work - async for _ in results: - pass - - assert_metrics(model, async_engine.engine, disable_log_stats, - len(example_prompts)) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("disable_log_stats", [True, False]) -def test_engine_log_metrics_regression( - example_prompts, - model: str, - dtype: str, - max_tokens: int, - disable_log_stats: bool, -) -> None: - engine_args = EngineArgs( - model=model, - dtype=dtype, - disable_log_stats=disable_log_stats, - ) - engine = LLMEngine.from_engine_args(engine_args) - for i, prompt in enumerate(example_prompts): - engine.add_request( - f"request-id-{i}", - prompt, - SamplingParams(max_tokens=max_tokens), - ) - while engine.has_unfinished_requests(): - engine.step() - - if envs.VLLM_CI_USE_S3: - model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}" - assert_metrics(model, engine, disable_log_stats, len(example_prompts)) - - -def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool, - num_requests: int) -> None: - if disable_log_stats: - with pytest.raises(AttributeError): - _ = engine.stat_loggers - else: - assert (engine.stat_loggers - is not None), "engine.stat_loggers should be set" - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - labels = {'model_name': model} - request_histogram_metrics = [ - "vllm:e2e_request_latency_seconds", - "vllm:request_prompt_tokens", - "vllm:request_generation_tokens", - "vllm:request_params_n", - "vllm:request_params_max_tokens", - ] - for metric_name in request_histogram_metrics: - metric_value = REGISTRY.get_sample_value(f"{metric_name}_count", - labels) - assert ( - metric_value == num_requests), "Metrics should be collected" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [16]) -def test_engine_log_metrics_ray( - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - # This test is quite weak - it only checks that we can use - # RayPrometheusStatLogger without exceptions. - # Checking whether the metrics are actually emitted is unfortunately - # non-trivial. 
- - # We have to run in a Ray task for Ray metrics to be emitted correctly - @ray.remote(num_gpus=1) - def _inner(): - - class _RayPrometheusStatLogger(RayPrometheusStatLogger): - - def __init__(self, *args, **kwargs): - self._i = 0 - super().__init__(*args, **kwargs) - - def log(self, *args, **kwargs): - self._i += 1 - return super().log(*args, **kwargs) - - engine_args = EngineArgs( - model=model, - dtype=dtype, - disable_log_stats=False, - ) - engine = LLMEngine.from_engine_args(engine_args) - logger = _RayPrometheusStatLogger( - local_interval=0.5, - labels=dict(model_name=engine.model_config.served_model_name), - vllm_config=engine.vllm_config) - engine.add_logger("ray", logger) - for i, prompt in enumerate(example_prompts): - engine.add_request( - f"request-id-{i}", - prompt, - SamplingParams(max_tokens=max_tokens), - ) - while engine.has_unfinished_requests(): - engine.step() - assert logger._i > 0, ".log must be called at least once" - - ray.get(_inner.remote()) diff --git a/tests/tracing/__init__.py b/tests/tracing/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py deleted file mode 100644 index 4dbae7c15de3..000000000000 --- a/tests/tracing/test_tracing.py +++ /dev/null @@ -1,237 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa -# type: ignore -from __future__ import annotations - -import threading -from collections.abc import Iterable -from concurrent import futures -from typing import Callable, Generator, Literal - -import grpc -import pytest -from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ( - ExportTraceServiceResponse) -from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import ( - TraceServiceServicer, add_TraceServiceServicer_to_server) -from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue -from opentelemetry.sdk.environment_variables import ( - OTEL_EXPORTER_OTLP_TRACES_INSECURE) - -from vllm import LLM, SamplingParams -from vllm.tracing import SpanAttributes - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch: pytest.MonkeyPatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - with monkeypatch.context() as m: - m.setenv('VLLM_USE_V1', '0') - yield - - -FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" - -FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value', - 'array_value'] - - -def decode_value(value: AnyValue): - field_decoders: dict[FieldName, Callable] = { - "bool_value": (lambda v: v.bool_value), - "string_value": (lambda v: v.string_value), - "int_value": (lambda v: v.int_value), - "double_value": (lambda v: v.double_value), - "array_value": - (lambda v: [decode_value(item) for item in v.array_value.values]), - } - for field, decoder in field_decoders.items(): - if value.HasField(field): - return decoder(value) - raise ValueError(f"Couldn't decode value: {value}") - - -def decode_attributes(attributes: Iterable[KeyValue]): - return {kv.key: decode_value(kv.value) for kv in attributes} - - -class FakeTraceService(TraceServiceServicer): - - def __init__(self): - self.request = None - self.evt = threading.Event() - - def Export(self, request, context): - self.request = request - self.evt.set() - return ExportTraceServiceResponse() - - -@pytest.fixture -def trace_service() -> Generator[FakeTraceService, None, None]: - """Fixture to set up a fake gRPC trace service""" - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) - service = FakeTraceService() - add_TraceServiceServicer_to_server(service, server) - server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS) - server.start() - - yield service - - server.stop(None) - - -def test_traces( - monkeypatch: pytest.MonkeyPatch, - trace_service: FakeTraceService, -): - with monkeypatch.context() as m: - m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - - sampling_params = SamplingParams( - temperature=0.01, - top_p=0.1, - max_tokens=256, - ) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) - - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") - - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS - ) == sampling_params.max_tokens - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - 
metrics = outputs[0].metrics - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE - ) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - # Model forward and model execute should be none, since detailed traces is - # not enabled. - assert metrics.model_forward_time is None - assert metrics.model_execute_time is None - - -def test_traces_with_detailed_steps( - monkeypatch: pytest.MonkeyPatch, - trace_service: FakeTraceService, -): - with monkeypatch.context() as m: - m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - - sampling_params = SamplingParams( - temperature=0.01, - top_p=0.1, - max_tokens=256, - ) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - collect_detailed_traces=["all"], - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) - - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") - - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS - ) == sampling_params.max_tokens - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - metrics = outputs[0].metrics - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE - ) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - assert metrics.model_forward_time > 0 - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD - ) == pytest.approx(metrics.model_forward_time / 1000) - assert 
metrics.model_execute_time > 0 - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE - ) == metrics.model_execute_time - assert metrics.model_forward_time < 1000 * metrics.model_execute_time From c12bc3e5b595ef55e9046f9a100a996bf5ea8e5f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 17:18:11 -0700 Subject: [PATCH 09/17] fix Signed-off-by: Woosuk Kwon --- tests/v1/tracing/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/v1/tracing/__init__.py diff --git a/tests/v1/tracing/__init__.py b/tests/v1/tracing/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 From 3d7c3612eb6db4d2718835044b6435bd910b17de Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 17:20:04 -0700 Subject: [PATCH 10/17] fix Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d7aae4e1c71a..8c5599591663 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -215,16 +215,14 @@ steps: num_gpus: 2 source_file_dependencies: - vllm/ - - tests/metrics - tests/v1/tracing commands: - - pytest -v -s metrics - "pip install \ 'opentelemetry-sdk>=1.26.0' \ 'opentelemetry-api>=1.26.0' \ 'opentelemetry-exporter-otlp>=1.26.0' \ 'opentelemetry-semantic-conventions-ai>=0.4.1'" - - pytest -v -s tracing + - pytest -v -s v1/tracing ##### fast check tests ##### ##### 1 GPU test ##### From c17fb8fccad29802da708fd75fd017d464c9db23 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 17:37:16 -0700 Subject: [PATCH 11/17] [V0 Deprecation] Remove more V0 tests Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 4 - tests/async_engine/__init__.py | 0 tests/async_engine/api_server_async_engine.py | 54 -- tests/async_engine/conftest.py | 12 - tests/async_engine/test_api_server.py | 139 ------ tests/async_engine/test_request_tracker.py | 71 --- tests/basic_correctness/test_preemption.py | 189 ------- tests/detokenizer/conftest.py | 11 - tests/detokenizer/test_stop_checker.py | 83 ---- .../openai/correctness/test_lmeval.py | 10 - tests/samplers/test_logprobs.py | 182 ------- tests/worker/__init__.py | 0 tests/worker/conftest.py | 11 - tests/worker/test_model_input.py | 113 ----- tests/worker/test_model_runner.py | 462 ------------------ tests/worker/test_profile.py | 68 --- tests/worker/test_swap.py | 87 ---- 17 files changed, 1496 deletions(-) delete mode 100644 tests/async_engine/__init__.py delete mode 100644 tests/async_engine/api_server_async_engine.py delete mode 100644 tests/async_engine/conftest.py delete mode 100644 tests/async_engine/test_api_server.py delete mode 100644 tests/async_engine/test_request_tracker.py delete mode 100644 tests/basic_correctness/test_preemption.py delete mode 100644 tests/detokenizer/conftest.py delete mode 100644 tests/detokenizer/test_stop_checker.py delete mode 100644 tests/samplers/test_logprobs.py delete mode 100644 tests/worker/__init__.py delete mode 100644 tests/worker/conftest.py delete mode 100644 tests/worker/test_model_input.py delete mode 100644 tests/worker/test_model_runner.py delete mode 100644 tests/worker/test_profile.py delete mode 100644 tests/worker/test_swap.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 08c10180fc22..b7e9746bb745 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -46,22 +46,18 @@ steps: mirror_hardwares: 
[amdexperimental] source_file_dependencies: - vllm/ - - tests/async_engine - tests/test_inputs.py - tests/test_outputs.py - tests/multimodal - tests/utils_ - - tests/worker - tests/standalone_tests/lazy_imports.py - tests/transformers_utils commands: - python3 standalone_tests/lazy_imports.py - - pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s multimodal - pytest -v -s utils_ # Utils - - pytest -v -s worker # Worker - pytest -v -s transformers_utils # transformers_utils - label: Python-only Installation Test # 10min diff --git a/tests/async_engine/__init__.py b/tests/async_engine/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py deleted file mode 100644 index ec6b20f5e04b..000000000000 --- a/tests/async_engine/api_server_async_engine.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""vllm.entrypoints.api_server with some extra logging for testing.""" -from collections.abc import Iterable -from typing import Any - -import uvicorn -from fastapi.responses import JSONResponse, Response - -import vllm.entrypoints.api_server -import vllm.envs as envs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.utils import FlexibleArgumentParser - -app = vllm.entrypoints.api_server.app - - -class AsyncLLMEngineWithStats(AsyncLLMEngine): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._num_aborts = 0 - - async def _engine_abort(self, request_ids: Iterable[str]): - ids = list(request_ids) - self._num_aborts += len(ids) - await super()._engine_abort(ids) - - def testing_stats(self) -> dict[str, Any]: - return {"num_aborted_requests": self._num_aborts} - - -@app.get("/stats") -def stats() -> Response: - """Get the statistics of the engine.""" - return JSONResponse(engine.testing_stats()) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser = AsyncEngineArgs.add_cli_args(parser) - args = parser.parse_args() - - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) - vllm.entrypoints.api_server.engine = engine - uvicorn.run(app, - host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE) diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py deleted file mode 100644 index 375b248ebeda..000000000000 --- a/tests/async_engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py deleted file mode 100644 index 07370a880329..000000000000 --- a/tests/async_engine/test_api_server.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import copyreg -import os -import subprocess -import sys -import time -from multiprocessing import Pool -from pathlib import Path - -import pytest -import requests -import urllib3.exceptions - - -def _pickle_new_connection_error(obj): - """Custom pickler for NewConnectionError to fix tblib compatibility.""" - # Extract the original message by removing the "conn: " prefix - full_message = obj.args[0] if obj.args else "" - if ': ' in full_message: - # Split off the connection part and keep the actual message - _, actual_message = full_message.split(': ', 1) - else: - actual_message = full_message - return _unpickle_new_connection_error, (actual_message, ) - - -def _unpickle_new_connection_error(message): - """Custom unpickler for NewConnectionError.""" - # Create with None as conn and the actual message - return urllib3.exceptions.NewConnectionError(None, message) - - -# Register the custom pickle/unpickle functions for tblib compatibility -copyreg.pickle(urllib3.exceptions.NewConnectionError, - _pickle_new_connection_error) - - -def _query_server(prompt: str, max_tokens: int = 5) -> dict: - response = requests.post("http://localhost:8000/generate", - json={ - "prompt": prompt, - "max_tokens": max_tokens, - "temperature": 0, - "ignore_eos": True - }) - response.raise_for_status() - return response.json() - - -def _query_server_long(prompt: str) -> dict: - return _query_server(prompt, max_tokens=500) - - -@pytest.fixture -def api_server(distributed_executor_backend: str): - script_path = Path(__file__).parent.joinpath( - "api_server_async_engine.py").absolute() - commands = [ - sys.executable, - "-u", - str(script_path), - "--model", - "facebook/opt-125m", - "--host", - "127.0.0.1", - "--distributed-executor-backend", - distributed_executor_backend, - ] - - # API Server Test Requires V0. - my_env = os.environ.copy() - my_env["VLLM_USE_V1"] = "0" - uvicorn_process = subprocess.Popen(commands, env=my_env) - yield - uvicorn_process.terminate() - - -@pytest.mark.timeout(300) -@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) -def test_api_server(api_server, distributed_executor_backend: str): - """ - Run the API server and test it. - - We run both the server and requests in separate processes. - - We test that the server can handle incoming requests, including - multiple requests at the same time, and that it can handle requests - being cancelled without crashing. 
- """ - with Pool(32) as pool: - # Wait until the server is ready - prompts = ["warm up"] * 1 - result = None - while not result: - try: - for r in pool.map(_query_server, prompts): - result = r - break - except requests.exceptions.ConnectionError: - time.sleep(1) - - # Actual tests start here - # Try with 1 prompt - for result in pool.map(_query_server, prompts): - assert result - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests == 0 - - # Try with 100 prompts - prompts = ["test prompt"] * 100 - for result in pool.map(_query_server, prompts): - assert result - - with Pool(32) as pool: - # Cancel requests - prompts = ["canceled requests"] * 100 - pool.map_async(_query_server_long, prompts) - time.sleep(0.01) - pool.terminate() - pool.join() - - # check cancellation stats - # give it some time to update the stats - time.sleep(1) - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests > 0 - - # check that server still runs after cancellations - with Pool(32) as pool: - # Try with 100 prompts - prompts = ["test prompt after canceled"] * 100 - for result in pool.map(_query_server, prompts): - assert result diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py deleted file mode 100644 index 1851eeeda790..000000000000 --- a/tests/async_engine/test_request_tracker.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.async_llm_engine import RequestTracker -from vllm.outputs import RequestOutput - - -@pytest.mark.asyncio -async def test_request_tracker(): - tracker = RequestTracker() - stream_1 = tracker.add_request("1") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 1 - assert new[0]["request_id"] == "1" - assert not aborted - assert not stream_1.finished - - stream_2 = tracker.add_request("2") - stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 2 - assert new[0]["request_id"] == "2" - assert new[1]["request_id"] == "3" - assert not aborted - assert not stream_2.finished - assert not stream_3.finished - - # request_ids must be unique - with pytest.raises(KeyError): - tracker.add_request("1") - assert not tracker.new_requests_event.is_set() - - tracker.abort_request("1") - new, aborted = tracker.get_new_and_aborted_requests() - assert len(aborted) == 1 - assert "1" in aborted - assert not new - assert stream_1.finished - - stream_4 = tracker.add_request("4") - tracker.abort_request("4") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - # aborted new requests will cancel each other out - - # there's no need for them to propagate into the - # engine - assert not aborted - assert not new - assert stream_4.finished - - stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.is_set() - tracker.process_request_output( - RequestOutput("2", "output", [], [], [], finished=True)) - await 
tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert not aborted - assert len(new) == 1 - assert new[0]["request_id"] == "5" - assert stream_2.finished - assert not stream_5.finished diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py deleted file mode 100644 index db2fa2f6bef6..000000000000 --- a/tests/basic_correctness/test_preemption.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the short outputs of HF and vLLM when using greedy sampling. - -VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. - -Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 -pytest tests/basic_correctness/test_preemption.py`. -""" -import pytest -from prometheus_client import REGISTRY - -import vllm.envs as envs -from vllm import SamplingParams -from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, - ENABLE_ARTIFICIAL_PREEMPT) - -from ..models.utils import check_outputs_equal - -MODELS = [ - "distilbert/distilgpt2", -] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT, - so use VLLM_USE_V1=0 for all tests in the file. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.fixture(scope="module", autouse=True) -def check_settings(): - assert ENABLE_ARTIFICIAL_PREEMPT is True, ( - "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1." - "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 " - "pytest tests/basic_correctness/test_preemption.py`") - - -@pytest.fixture -def distributed_executor_backend() -> str: - # When SPMD worker is used, use distributed_executor_backend="ray" - # to test delta input optimization works with preemption. 
- return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [96]) -@pytest.mark.parametrize("chunked_prefill_token_size", [16]) -def test_chunked_prefill_recompute( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - chunked_prefill_token_size: int, - distributed_executor_backend: str, -) -> None: - """Ensure that chunked prefill works with preemption.""" - max_num_seqs = min(chunked_prefill_token_size, 256) - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - max_num_seqs=max_num_seqs, - distributed_executor_backend=distributed_executor_backend, - disable_log_stats=False, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - for i in range(len(example_prompts)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_outputs[i] - assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") - assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption( - caplog_vllm, - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """By default, recompute preemption is enabled""" - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - assert ("is preempted by PreemptionMode.RECOMPUTE mode because there " - "is not enough KV cache space." 
in caplog_vllm.text) - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - preemption_metrics = None - for m in REGISTRY.collect(): - if m.name == "vllm:num_preemptions": - preemption_metrics = m - assert preemption_metrics is not None - total_recorded_preemption = 0 - for sample in preemption_metrics.samples: - total_recorded_preemption += sample.value - assert total_preemption == total_recorded_preemption - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption_infeasible( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """Verify infeasible preemption request will be ignored.""" - BLOCK_SIZE = 16 - prefill_blocks = 2 - decode_blocks = max_tokens // BLOCK_SIZE - with vllm_runner( - model, - dtype=dtype, - block_size=BLOCK_SIZE, - # Not enough gpu blocks to complete a single sequence. - # preemption should happen, and the sequence should be - # ignored instead of hanging forever. - num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, - max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - sampling_params = SamplingParams(max_tokens=max_tokens, - ignore_eos=True) - req_outputs = vllm_model.llm.generate( - example_prompts, - sampling_params=sampling_params, - ) - - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - # Verify the request is ignored and not hang. - for req_output in req_outputs: - outputs = req_output.outputs - assert len(outputs) == 1 - assert outputs[0].finish_reason == "length" diff --git a/tests/detokenizer/conftest.py b/tests/detokenizer/conftest.py deleted file mode 100644 index f2c125355c83..000000000000 --- a/tests/detokenizer/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass diff --git a/tests/detokenizer/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py deleted file mode 100644 index 2ca10c072b34..000000000000 --- a/tests/detokenizer/test_stop_checker.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.inputs import token_inputs -from vllm.sampling_params import SamplingParams -from vllm.sequence import Logprob, Sequence, SequenceStatus - - -def sequence_with_eos(text: str, eos_token: str, - eos_token_id: int) -> Sequence: - """ - Create a Sequence that ends with an EOS token. 
- """ - seq = Sequence( - seq_id=0, - inputs=token_inputs([]), - block_size=16, - eos_token_id=eos_token_id, - ) - seq.output_text = text + eos_token - - offset = eos_token_id + 1 - for i in range(offset, len(text) + offset): - seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)}) - seq.append_token_id(token_id=eos_token_id, - logprobs={eos_token_id: Logprob(0.0)}) - - seq.status = SequenceStatus.RUNNING - - return seq - - -@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [ - ("This text ends with EOS token", "", 2), -]) -@pytest.mark.parametrize("ignore_eos", [True, False]) -@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -@pytest.mark.skip_global_cleanup -def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int, - ignore_eos: bool, include_stop_str_in_output: bool): - """ - Test the behavior of the StopChecker's maybe_stop_sequence method - when an EOS token is encountered. - - This test covers: - - When the EOS token should stop the sequence and be removed from the output - - When the EOS token should stop the sequence and be included in the output - - When the EOS token should be ignored, and the sequence continues - """ - - stop_checker = StopChecker(max_model_len=1024) - - seq = sequence_with_eos( - text=text_wo_eos, - eos_token=eos_token, - eos_token_id=eos_token_id, - ) - new_char_count = len(eos_token) - - # Note that `stop` and `stop_token_ids` are not specified - sampling_params = SamplingParams( - min_tokens=1, - ignore_eos=ignore_eos, - include_stop_str_in_output=include_stop_str_in_output) - - stop_checker.maybe_stop_sequence( - seq=seq, - new_char_count=new_char_count, - sampling_params=sampling_params, - ) - - if ignore_eos: - assert seq.status == SequenceStatus.RUNNING - assert seq.output_text == text_wo_eos + eos_token - elif include_stop_str_in_output: - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.output_text == text_wo_eos + eos_token - else: - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.output_text == text_wo_eos diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index 684407cd6ee9..624acd5ffde7 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -81,13 +81,3 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): more_args = ["--max-num-seqs", "64"] run_test(more_args) - - -@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, - more_args): - """Run with the V0 Engine.""" - - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - run_test(more_args) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py deleted file mode 100644 index 87f40b100531..000000000000 --- a/tests/samplers/test_logprobs.py +++ /dev/null @@ -1,182 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm import SamplingParams - -from ..conftest import VllmRunner - -MODELS = ["distilbert/distilgpt2"] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module is V0 only since it uses dtype=float, so - set VLLM_USE_V1=0 for all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", - ["float"]) # needed for comparing logprobs with HF -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) -@pytest.mark.parametrize("num_top_logprobs", [0, 6]) # 32000 == vocab_size -@pytest.mark.parametrize("detokenize", [True, False]) -def test_get_prompt_logprobs( - hf_runner, - vllm_runner, - model, - dtype, - chunked_prefill_token_size: int, - num_top_logprobs: int, - detokenize: bool, - example_prompts, -): - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - - max_tokens = 5 - with hf_runner(model, dtype=dtype) as hf_model: - hf_logprobs = hf_model.generate_greedy_logprobs( - example_prompts, - max_tokens=max_tokens, - ) - - with vllm_runner( - model, - dtype=dtype, - max_logprobs=num_top_logprobs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) as vllm_model: - vllm_sampling_params = SamplingParams(max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_top_logprobs, - temperature=0.0, - detokenize=detokenize) - vllm_results = vllm_model.llm.generate( - example_prompts, sampling_params=vllm_sampling_params) - - # Test whether logprobs are included in the results. - for result in vllm_results: - assert result.prompt_logprobs is not None - assert result.outputs[0].logprobs is not None - assert len(result.outputs[0].logprobs) == max_tokens - for logprobs in result.outputs[0].logprobs: - # If the output token is not included in the top X - # logprob, it can return 1 more data - assert (len(logprobs) == num_top_logprobs - or len(logprobs) == num_top_logprobs + 1) - output_text = result.outputs[0].text - output_string_from_most_likely_tokens_lst: list[str] = [] - for top_logprobs in result.outputs[0].logprobs: - top_logprob = next(iter(top_logprobs.values())) - output_string_from_most_likely_tokens_lst.append( - top_logprob.decoded_token) - - if detokenize: - output_string_from_most_likely_tokens = "".join( - output_string_from_most_likely_tokens_lst) - assert output_text == output_string_from_most_likely_tokens, ( - "The output text from the top logprob for each token position " - "should be the same as the output text in the result.") - else: - assert output_text == '' - assert output_string_from_most_likely_tokens_lst == ([None] * - max_tokens) - - # The first prompt logprob is always None - assert result.prompt_logprobs[0] is None - for prompt_logprobs in result.prompt_logprobs[1:]: - # If the prompt token is not included in the top X - # logprob, it can return 1 more data - assert (len(prompt_logprobs) == num_top_logprobs - or len(prompt_logprobs) == num_top_logprobs + 1) - - # Test whether prompt logprobs are consistent with HF - for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs): - # Check prompt logprobs - # The first prompt logprob is always None, so we compare it from 1:. 
- vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] - for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): - for token_id, logprob in vllm_prompt_logprob_dict.items(): - torch.testing.assert_close(logprob.logprob, - hf_logprob[0][i][token_id].item(), - atol=1e-2, - rtol=1e-2) - vllm_sample_logprobs = vllm_result.outputs[0].logprobs - for i, top_logprobs in enumerate(vllm_sample_logprobs): - for token_id, sample_logprob in top_logprobs.items(): - logprob = sample_logprob.logprob - torch.testing.assert_close(logprob, - hf_logprob[i][-1][token_id].item(), - atol=1e-2, - rtol=1e-2) - if detokenize: - assert isinstance(sample_logprob.decoded_token, str), ( - "The token should be decoded by the time it is returned" - " to the user.") - - # Test if prompt logprobs are correctly set. - for vllm_result in vllm_results: - token_ids = vllm_result.prompt_token_ids - prompt_logprobs = vllm_result.prompt_logprobs - - # The first token doesn't have logprob. - assert prompt_logprobs[0] is None - - for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]): - assert token_id in logprob_dict - - -def test_max_logprobs(): - runner = VllmRunner("facebook/opt-125m", max_logprobs=1) - vllm_sampling_params = SamplingParams(logprobs=1) - # should pass - runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - - bad_sampling_params = SamplingParams(logprobs=2) - with pytest.raises(ValueError): - runner.generate(["Hello world"], sampling_params=bad_sampling_params) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) -@pytest.mark.parametrize("detokenize", [True, False]) -def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, - detokenize: bool, example_prompts): - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - max_tokens = 5 - - with vllm_runner( - model, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) as vllm_model: - sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, - logprobs=None, - temperature=0.0, - detokenize=detokenize) - results_logprobs_none = vllm_model.llm.generate( - example_prompts, sampling_params=sampling_params_logprobs_none) - - for i in range(len(results_logprobs_none)): - assert results_logprobs_none[i].outputs[0].logprobs is None - assert results_logprobs_none[i].outputs[0].cumulative_logprob is None diff --git a/tests/worker/__init__.py b/tests/worker/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/worker/conftest.py b/tests/worker/conftest.py deleted file mode 100644 index 3f202d4dbe94..000000000000 --- a/tests/worker/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module tests V0 internals, so set VLLM_USE_V1=0. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') \ No newline at end of file diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py deleted file mode 100644 index 0f28ef2ba857..000000000000 --- a/tests/worker/test_model_input.py +++ /dev/null @@ -1,113 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses - -import torch - -from vllm.attention import AttentionMetadata, AttentionMetadataBuilder -from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.backends.utils import CommonAttentionState -from vllm.model_executor import SamplingMetadata -from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata - - -class MockAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - raise NotImplementedError - - @staticmethod - def get_impl_cls(): - raise NotImplementedError - - @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: - return AttentionMetadata - - @staticmethod - def get_builder_cls() -> type["AttentionMetadataBuilder"]: - return AttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> tuple[int, ...]: - raise NotImplementedError - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - pass - - @staticmethod - def copy_blocks( - kv_caches: list[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - pass - - -def test_model_runner_input(): - sampling_metadata = SamplingMetadata( - ["seq_group"], - "selected_token_indices", - "categorized_sample_indices", - "num_prompts", - ) - attn_metadata = AttentionMetadata( - num_prefills=1, - num_prefill_tokens=2, - num_decode_tokens=3, - slot_mapping=torch.zeros(1), - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - ) - model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens=torch.ones(10), - input_positions=torch.ones(10), - sampling_metadata=sampling_metadata, - attn_metadata=attn_metadata) - - assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata) - - # Test round trip serialization. - tensor_dict = model_input.as_broadcastable_tensor_dict() - attn_backend = MockAttentionBackend() - received_model_input = ( - ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=attn_backend)) - # Check that received copy has correct values. 
- assert isinstance(received_model_input, - ModelInputForGPUWithSamplingMetadata) - assert received_model_input.input_tokens is not None - assert ( - received_model_input.input_tokens == model_input.input_tokens).all() - assert received_model_input.input_positions is not None - assert (received_model_input.input_positions == model_input.input_positions - ).all() - assert received_model_input.multi_modal_kwargs is None - assert (received_model_input.multi_modal_kwargs == - model_input.multi_modal_kwargs) - assert received_model_input.lora_requests is None - assert received_model_input.lora_requests == model_input.lora_requests - assert received_model_input.lora_mapping is None - assert received_model_input.lora_mapping == model_input.lora_mapping - for field in dataclasses.fields(AttentionMetadata): - assert getattr(received_model_input.attn_metadata, field.name, - None) == getattr(attn_metadata, field.name, None) - # For sampling metadata, only selected_token_indices is copied. - assert (received_model_input.sampling_metadata.selected_token_indices == - sampling_metadata.selected_token_indices) - assert received_model_input.sampling_metadata.seq_groups is None diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py deleted file mode 100644 index 0be25aa2fc35..000000000000 --- a/tests/worker/test_model_runner.py +++ /dev/null @@ -1,462 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import get_open_port -from vllm.worker.model_runner import ModelRunner - - -def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - model_runner = ModelRunner( - vllm_config=engine_config, - is_driver_worker=True, - ) - return model_runner - - -def test_deepseek_mla_attn_backend_module(): - model_runner = _create_model_runner( - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", - trust_remote_code=True, - enable_chunked_prefill=False, - ) - assert model_runner.attn_backend.__name__ == "TritonMLABackend" - - -@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) -@pytest.mark.parametrize("use_prompt_embeds", [True, False]) -def test_prepare_prompt(batch_size, use_prompt_embeds, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enable_prompt_embeds=True, - ) - - seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - expected_input_embeds_len = 0 - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * seq_len, - prompt_embeds=torch.rand(seq_len, 10), - ) - expected_input_embeds_len += seq_len - else: - seq_data = SequenceData.from_seqs(prompt_token_ids=range(seq_len)) - - 
seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - - expected_selected_token_indices = [] - selected_token_start_idx = 0 - for seq_len in seq_lens: - expected_selected_token_indices.append(selected_token_start_idx + - seq_len - 1) - selected_token_start_idx += seq_len - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - assert return_seq_lens == seq_lens - assert len(slot_mapping) == len(input_tokens) - - # Verify input metadata is correct for prompts. - device = model_runner.device - assert attn_metadata.num_prefills > 0 - assert attn_metadata.num_decode_tokens == 0 - torch.testing.assert_close( - attn_metadata.seq_lens_tensor, - torch.tensor(seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.max_prefill_seq_len == max(seq_lens) - assert attn_metadata.max_decode_seq_len == 0 - - # Test subquery start locs. - start_idx = 0 - start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - - # Test seq start locs. Note that for normal prefill it is - # equivalent to query_start_loc. - start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - - torch.testing.assert_close( - attn_metadata.seq_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - torch.testing.assert_close( - attn_metadata.context_lens_tensor, - torch.zeros(attn_metadata.context_lens_tensor.shape[0], - dtype=torch.int, - device=device)) - - expected = torch.tensor([[] for _ in range(len(seq_group_metadata_list))], - dtype=torch.int32, - device=model_runner.device) - torch.testing.assert_close(attn_metadata.block_tables, expected) - # Cuda graph should not be used for prerill. 
- assert attn_metadata.use_cuda_graph is False - - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - if expected_input_embeds_len == 0: - torch.testing.assert_close(input_tokens, input_positions) - assert input_embeds is None - else: - assert len(input_embeds) == expected_input_embeds_len - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=model_runner.device, - pin_memory=model_runner.pin_memory) - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - torch.allclose(input_tokens, input_positions) - - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - - -@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) -@pytest.mark.parametrize("use_prompt_embeds", [True, False]) -def test_prepare_decode_cuda_graph(batch_size, use_prompt_embeds, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=False, - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enable_prompt_embeds=True, - ) - - context_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - # Assume each seq group finishes prefill. - for i in range(batch_size): - # make sure all tokens fit into one block - context_len = i % (model_runner.block_size - 1) + 1 - context_lens.append(context_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * context_len, - prompt_embeds=torch.rand(context_len, 10), - ) - output_embed = torch.rand(10) - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(context_len)) - output_embed = None - seq_data.update_num_computed_tokens(context_len) - # Append one token ID since prefill is finished. - seq_data.append_token_id(1, 0, output_embed) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables={0: [1]}, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - slot_mapping = attn_metadata.slot_mapping - - assert len(slot_mapping) == len(input_tokens) - - expected_bs = model_runner.vllm_config.pad_for_cudagraph( - len(seq_group_metadata_list)) - # Verify input metadata is correct for prompts. 
- device = model_runner.device - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_prefill_tokens == 0 - seq_lens = [context_len + 1 for context_len in context_lens] - # seq_lens are padded to expected_bs - for _ in range(expected_bs - len(seq_lens)): - seq_lens.append(1) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.num_decode_tokens == len(seq_lens) - start_idx = 0 - start_loc = [start_idx] - for _ in context_lens: - # decode has only 1 token for query. - start_idx += 1 - start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - - start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.seq_start_loc, - torch.tensor(seq_start_loc, dtype=torch.int32, device=device)) - - torch.testing.assert_close( - attn_metadata.context_lens_tensor, - torch.tensor(context_lens, dtype=torch.int, device=device)) - assert attn_metadata.max_decode_seq_len == max(seq_lens) - torch.testing.assert_close( - attn_metadata.seq_lens_tensor[:len(seq_lens)], - torch.tensor(seq_lens, dtype=torch.int, device=device)) - - # block table's first index corresponds to each batch, meaning in - # decoding it is each token. - assert attn_metadata.block_tables.shape[0] == len(input_tokens) - # Block table's second dim corresponds to each token's block number. - # It is padded up to - assert attn_metadata.block_tables.shape[1] == ( - model_runner.get_max_block_per_batch()) - assert attn_metadata.use_cuda_graph is True - - assert len(input_tokens) == expected_bs - assert len(input_positions) == expected_bs - if use_prompt_embeds: - expected_input_embeds_length = start_loc[-1] - assert len(input_embeds) == expected_input_embeds_length - assert expected_input_embeds_length <= expected_bs - else: - assert input_embeds is None - - # Verify Sampling - expected_selected_token_indices = [] - for selected_token_start_idx, _ in enumerate(context_lens): - expected_selected_token_indices.append(selected_token_start_idx) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query lens is all 1 for decode. 
- query_lens=[1 for _ in range(len(context_lens))], - device=model_runner.device, - pin_memory=model_runner.pin_memory) - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - - -def test_empty_seq_group(): - """Verify prepare prompt and decode returns empty output.""" - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=False, - ) - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - - assert input_tokens is None - assert input_positions is None - assert attn_metadata is None - - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - - assert input_tokens is None - assert input_positions is None - assert input_embeds is None - assert attn_metadata is None - assert return_seq_lens is None - - -@pytest.fixture -def distributed_init(): - init_distributed_environment( - world_size=1, - rank=0, - distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}", - local_rank=0) - ensure_model_parallel_initialized(1, 1) - - -@pytest.mark.parametrize("batch_size", list(range(2, 128, 3))) -@pytest.mark.parametrize("enforce_eager", [True, False]) -@pytest.mark.parametrize('use_prompt_embeds', [True, False]) -def test_hybrid_batches(batch_size, enforce_eager, use_prompt_embeds, - distributed_init, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=enforce_eager, - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=True, - enable_prompt_embeds=True, - ) - - # Add prefill requests. 
- seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - prefill_metadata_list: list[SequenceGroupMetadata] = [] - decode_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - prefill_batch_size = batch_size // 2 - decode_batch_size = batch_size - prefill_batch_size - expected_input_embeds_len = 0 - for i in range(prefill_batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * seq_len, - prompt_embeds=torch.rand(seq_len, 10), - ) - expected_input_embeds_len += seq_len - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(seq_len), ) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - prefill_metadata_list.append(seq_group_metadata) - - # Add decode requests - for i in range(prefill_batch_size, batch_size): - # make sure all tokens fit into one block - context_len = i % (model_runner.block_size - 1) + 1 - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * context_len, - prompt_embeds=torch.rand(context_len, 10), - ) - output_embed = torch.rand(10) - # This also iterates the expected input_embeds, because the model - # needs both the input and output embeddings passed into together - expected_input_embeds_len += 1 - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(context_len), ) - output_embed = None - assert len(seq_data.prompt_token_ids) == context_len - seq_data.append_token_id(1, 0, output_embed) - seq_data.update_num_computed_tokens(context_len) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables={0: [1]}, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - decode_metadata_list.append(seq_group_metadata) - - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - - prefill_meta_actual = attn_metadata.prefill_metadata - decode_meta_actual = attn_metadata.decode_metadata - - assert len(attn_metadata.slot_mapping) == len(input_tokens) - assert len(input_positions) == len(input_tokens) - assert attn_metadata.num_prefills == prefill_batch_size - assert attn_metadata.num_decode_tokens == decode_batch_size - assert attn_metadata.num_prefill_tokens == sum(seq_lens) - if expected_input_embeds_len == 0: - assert input_embeds is None - else: - assert len(input_embeds) == expected_input_embeds_len - - # Verify attn metadata is consistent. We don't need to test individual - # values here because they are tested above. 
- attn_metadata = model_runner._prepare_model_input_tensors( - seq_group_metadata_list).attn_metadata - - for attr_expected, attr_actual in zip(vars(attn_metadata.prefill_metadata), - vars(prefill_meta_actual)): - assert attr_expected[1] == attr_actual[1] - for attr_expected, attr_actual in zip(vars(attn_metadata.decode_metadata), - vars(decode_meta_actual)): - assert attr_expected[1] == attr_actual[1] diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py deleted file mode 100644 index d8767f700b57..000000000000 --- a/tests/worker/test_profile.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.worker import Worker - - -def test_gpu_memory_profiling(): - # Tests the gpu profiling that happens in order to determine the number of - # KV cache blocks that we can allocate on the GPU. - # This test mocks the maximum available gpu memory so that it can run on - # any gpu setup. - - # Set up engine args to build a worker. - engine_args = EngineArgs(model="facebook/opt-125m", - dtype="half", - load_format="dummy") - engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 1000 - engine_config.cache_config.num_cpu_blocks = 1000 - - # Create the worker. - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - worker = Worker( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=True, - ) - - # Set 10GiB as the total gpu ram to be device-agnostic - def mock_mem_info(): - current_usage = torch.cuda.memory_stats( - )["allocated_bytes.all.current"] - mock_total_bytes = 10 * 1024**3 - free = mock_total_bytes - current_usage - - return (free, mock_total_bytes) - - from unittest.mock import patch - with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info): - # Load the model so we can profile it - worker.init_device() - worker.load_model() - gpu_blocks, _ = worker.determine_num_available_blocks() - - # Peak vram usage by torch should be 0.47 GiB - # Model weights take 0.25 GiB - # No memory should be allocated outside of torch - # 9.0 GiB should be the utilization target - # 8.28 GiB should be available for the KV cache - block_size = CacheEngine.get_cache_block_size( - engine_config.cache_config, engine_config.model_config, - engine_config.parallel_config) - - expected_blocks = (8.28 * 1024**3) // block_size - - # Check within a small tolerance for portability - # Hardware, kernel, or dependency changes could all affect memory - # utilization. - # A 100 block tolerance here should be about 60MB of wiggle room. - assert abs(gpu_blocks - expected_blocks) < 100 diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py deleted file mode 100644 index 6d9f404ac207..000000000000 --- a/tests/worker/test_swap.py +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.worker import Worker - - -def test_swap() -> None: - # Configure the engine. 
- engine_args = EngineArgs(model="distilbert/distilgpt2", - dtype="half", - load_format="dummy") - engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 1000 - engine_config.cache_config.num_cpu_blocks = 1000 - - # Create the worker. - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - worker = Worker( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=True, - ) - - # Initialize the worker. - worker.init_device() - worker.load_model() - worker.initialize_cache( - num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, - num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) - - # Randomly initialize the cache. - gpu_cache = worker.cache_engine[0].gpu_cache - cpu_cache = worker.cache_engine[0].cpu_cache - num_layers = len(gpu_cache) - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - gpu_key_cache.random_() - gpu_value_cache.random_() - cpu_key_cache, cpu_value_cache = cpu_cache[i] - cpu_key_cache.random_() - cpu_value_cache.random_() - - allclose = lambda a, b: torch.allclose( - a.cuda(), b.cuda(), rtol=0.0, atol=0.0) - - # Test swap out. - blocks_to_swap_out = [(3, 72), (56, 35), (84, 34)] - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=[], - blocks_to_swap_in=[], - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=[], - ) - worker.execute_model(execute_model_req=execute_model_req) - - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - cpu_key_cache, cpu_value_cache = cpu_cache[i] - for src, dst in blocks_to_swap_out: - assert allclose(gpu_key_cache[src], cpu_key_cache[dst]) - assert allclose(gpu_value_cache[src], cpu_value_cache[dst]) - - # Test swap in. 
- execute_model_req.blocks_to_swap_out = [] - execute_model_req.blocks_to_swap_in = [ - (19, 45), - (67, 23), - (12, 78), - (40, 99), - (1, 71), - ] - worker.execute_model(execute_model_req=execute_model_req) - - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - cpu_key_cache, cpu_value_cache = cpu_cache[i] - for src, dst in execute_model_req.blocks_to_swap_in: - assert allclose(gpu_key_cache[dst], cpu_key_cache[src]) - assert allclose(gpu_value_cache[dst], cpu_value_cache[src]) From 9011ad269ecab49b0a3f60533f84f57faff803fc Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 17:39:01 -0700 Subject: [PATCH 12/17] minor Signed-off-by: Woosuk Kwon --- .github/CODEOWNERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 771dd2e17258..b8d6db06548d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -41,7 +41,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Test ownership /.buildkite/lm-eval-harness @mgoin @simon-mo -/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao @@ -50,7 +49,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche -/tests/prefix_caching @comaniac @KuntaiDu /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm From 2d60e15342e207b6c3e56aaea150f73d61240630 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 19:39:42 -0700 Subject: [PATCH 13/17] fix Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b7e9746bb745..82edb9745544 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -78,14 +78,12 @@ steps: - vllm/ - tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_preemption - tests/basic_correctness/test_cumem.py commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py - - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - label: Entrypoints Unit Tests # 5min timeout_in_minutes: 10 From 4de8edaf45aca1ae9e18c4cba87134cc86084eec Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 19:49:59 -0700 Subject: [PATCH 14/17] update Signed-off-by: Woosuk Kwon --- .../openai/test_return_tokens_as_ids.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index 5f43fdc9588f..ef9d5234f231 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -10,8 +10,30 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer -from .test_completion import default_server_args # noqa: F401 -from .test_completion import MODEL_NAME + +MODEL_NAME = 
"HuggingFaceH4/zephyr-7b-beta" + + +@pytest.fixture(scope="module") +def default_server_args(zephyr_lora_files): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--max-num-seqs", + "128", + "--enforce-eager", + # lora config + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + ] @pytest.fixture(scope="module") From 4f6be9c758673ea5730cb0482234a6f7f71de46a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 21:35:44 -0700 Subject: [PATCH 15/17] fix Signed-off-by: Woosuk Kwon --- .../openai/test_completion_with_prompt_embeds.py | 3 +++ tests/entrypoints/openai/test_metrics.py | 2 +- tests/entrypoints/openai/test_skip_tokenizer.py | 8 -------- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index dbfb1b024f7c..b78589e96ba3 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -14,6 +14,9 @@ from ...utils import RemoteOpenAIServer +pytest.skip("Skipping prompt_embeds test until V1 supports it.", + allow_module_level=True) + # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 0c9e0f3a5142..8917aa5a5efb 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -22,7 +22,7 @@ PREV_MINOR_VERSION = version._prev_minor_version() -@pytest.fixture(scope="module", params=[True, False]) +@pytest.fixture(scope="module", params=[True]) def use_v1(request): # Module-scoped variant of run_with_both_engines # diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_skip_tokenizer.py index 840e0dac81c9..b469fc76fc7a 100644 --- a/tests/entrypoints/openai/test_skip_tokenizer.py +++ b/tests/entrypoints/openai/test_skip_tokenizer.py @@ -15,14 +15,6 @@ DTYPE = "float16" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def server(): args = [ From a27dabc3b7d6d05af29e4a19a92c2cbccf3591f7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 18 Sep 2025 08:34:17 -0700 Subject: [PATCH 16/17] fix Signed-off-by: Woosuk Kwon --- tests/entrypoints/openai/test_lora_adapters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 10c0cb5f4d15..6f2addd3649d 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -53,12 +53,13 @@ def monkeypatch_module(): mpatch.undo() -@pytest.fixture(scope="module", params=[False, True]) +@pytest.fixture(scope="module", params=[True]) def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files): use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + assert use_v1 + monkeypatch_module.setenv('VLLM_USE_V1', '1') # Define the json format LoRA module configurations lora_module_1 = { From 
f2d9c622e9e9a29ee95c31c05bc44680473f347a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 18 Sep 2025 08:37:00 -0700 Subject: [PATCH 17/17] rm is_v1_server Signed-off-by: Woosuk Kwon --- tests/entrypoints/openai/test_chat.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index cbd8fa38a27f..3bdfef7b4adb 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -472,10 +472,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio async def test_structured_outputs_choice_chat( - client: openai.AsyncOpenAI, sample_structured_outputs_choices, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") + client: openai.AsyncOpenAI, + sample_structured_outputs_choices, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -513,12 +512,10 @@ async def test_structured_outputs_choice_chat( @pytest.mark.asyncio -async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, - sample_json_schema, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") - +async def test_structured_outputs_json_chat( + client: openai.AsyncOpenAI, + sample_json_schema, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -560,10 +557,10 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI, - sample_regex, is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") +async def test_structured_outputs_regex_chat( + client: openai.AsyncOpenAI, + sample_regex, +): messages = [{ "role": "system",