Merged
114 commits
f8bc899
[Frontend] Use new Renderer for Completions and Tokenize API
DarkLight1337 Jan 22, 2026
a999855
Simplify
DarkLight1337 Jan 22, 2026
55f51e4
Shorten
DarkLight1337 Jan 22, 2026
9bc5ab0
Shorten name
DarkLight1337 Jan 22, 2026
a4bc111
Simplify
DarkLight1337 Jan 22, 2026
d94c320
Clean
DarkLight1337 Jan 22, 2026
c8b9e2e
Lazy import
DarkLight1337 Jan 22, 2026
673e724
Simplify
DarkLight1337 Jan 22, 2026
4279c5d
Fix
DarkLight1337 Jan 22, 2026
28be477
Simplify
DarkLight1337 Jan 22, 2026
b7d1666
Avoid cast
DarkLight1337 Jan 22, 2026
7026b15
Remove cast
DarkLight1337 Jan 22, 2026
a84eadc
Fix
DarkLight1337 Jan 22, 2026
33c79ac
Fix
DarkLight1337 Jan 22, 2026
d907c4f
Fix
DarkLight1337 Jan 22, 2026
a84f2cf
Fixes
DarkLight1337 Jan 22, 2026
d8f11a6
Reduce diff
DarkLight1337 Jan 22, 2026
8410354
Clean
DarkLight1337 Jan 22, 2026
fa5525b
Fix mypy
DarkLight1337 Jan 22, 2026
cfe6265
Don't overwrite with `None`
DarkLight1337 Jan 22, 2026
123c6b9
Fix
DarkLight1337 Jan 22, 2026
33771ef
Rename
DarkLight1337 Jan 22, 2026
88fe1fe
Optimize
DarkLight1337 Jan 22, 2026
a8766bc
Remove ignores
DarkLight1337 Jan 22, 2026
f811a16
Update
DarkLight1337 Jan 23, 2026
1b16051
Fix
DarkLight1337 Jan 23, 2026
199f073
Update
DarkLight1337 Jan 23, 2026
0b0d300
Update
DarkLight1337 Jan 23, 2026
66fc274
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 23, 2026
38addc5
Consolidate UUID check
DarkLight1337 Jan 23, 2026
b73d554
Move test
DarkLight1337 Jan 23, 2026
6152a3e
Migrate tests
DarkLight1337 Jan 23, 2026
2e1745d
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 23, 2026
25d863c
Update
DarkLight1337 Jan 23, 2026
25dcaa3
Clean
DarkLight1337 Jan 23, 2026
154f645
Simplify
DarkLight1337 Jan 23, 2026
4ea6c52
Migrate tests
DarkLight1337 Jan 23, 2026
6a96f2d
Fix mypy
DarkLight1337 Jan 23, 2026
f68ce60
Clean
DarkLight1337 Jan 23, 2026
491d927
Fix mypy
DarkLight1337 Jan 23, 2026
5ea5d4e
Be less strict
DarkLight1337 Jan 23, 2026
dbf38c2
Update
DarkLight1337 Jan 23, 2026
a135791
Simplify
DarkLight1337 Jan 23, 2026
983d709
Simplify
DarkLight1337 Jan 23, 2026
58da6d8
Fix types
DarkLight1337 Jan 23, 2026
47d65cb
Apply to disagg
DarkLight1337 Jan 23, 2026
1290e0b
Fix
DarkLight1337 Jan 23, 2026
30a3bf9
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 23, 2026
92246d7
Keep previous error
DarkLight1337 Jan 23, 2026
40255f9
Oops
DarkLight1337 Jan 23, 2026
2922361
Remove unused
DarkLight1337 Jan 23, 2026
98370df
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 26, 2026
bf95d8c
Fix
DarkLight1337 Jan 26, 2026
fb56a64
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 26, 2026
8b8c04b
Update
DarkLight1337 Jan 26, 2026
4e78896
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 27, 2026
343baca
Update tokenization params
DarkLight1337 Jan 27, 2026
2aaca9d
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 27, 2026
918f15d
Mypy
DarkLight1337 Jan 27, 2026
911da11
Avoid double BOS
DarkLight1337 Jan 27, 2026
fc58c8f
Fix prompt normalization
DarkLight1337 Jan 27, 2026
ffd948b
Update
DarkLight1337 Jan 27, 2026
318c61f
msg
DarkLight1337 Jan 27, 2026
a3c76c3
Update
DarkLight1337 Jan 27, 2026
ee0e9ab
Clean
DarkLight1337 Jan 27, 2026
d2c47dc
Handle `do_lower_case`
DarkLight1337 Jan 27, 2026
55c974b
Fix
DarkLight1337 Jan 27, 2026
22bc3da
Fix
DarkLight1337 Jan 27, 2026
1009199
Doc
DarkLight1337 Jan 27, 2026
a83c910
Simplify
DarkLight1337 Jan 27, 2026
241f68c
Update
DarkLight1337 Jan 27, 2026
437291f
Fix tests
DarkLight1337 Jan 27, 2026
b0f95a1
Deprecate
DarkLight1337 Jan 27, 2026
cbf2f7f
Let MM processor handle tokenization
DarkLight1337 Jan 27, 2026
c129b32
Fix `truncate_prompt_tokens == 0`
DarkLight1337 Jan 27, 2026
f5f97ef
Fix
DarkLight1337 Jan 27, 2026
add7cae
Fix
DarkLight1337 Jan 27, 2026
18097a0
Fix
DarkLight1337 Jan 27, 2026
a4a18d8
Don't allow zero
DarkLight1337 Jan 27, 2026
4d55f49
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 27, 2026
79e54b1
Fix mocks
DarkLight1337 Jan 27, 2026
ac90878
Fix tests
DarkLight1337 Jan 27, 2026
be60017
More fixes
DarkLight1337 Jan 27, 2026
2c3109b
Fix
DarkLight1337 Jan 27, 2026
8e949a5
Fix
DarkLight1337 Jan 27, 2026
47755a0
Fixes
DarkLight1337 Jan 27, 2026
1e1925d
Unused
DarkLight1337 Jan 27, 2026
e7bcbd9
Simplify
DarkLight1337 Jan 27, 2026
883298c
Fix
DarkLight1337 Jan 27, 2026
dfe2f15
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 27, 2026
e6efddc
Update
DarkLight1337 Jan 27, 2026
9e325b2
Update
DarkLight1337 Jan 27, 2026
cb523d0
Fix
DarkLight1337 Jan 27, 2026
f98127c
Allow `prompt=None`
DarkLight1337 Jan 27, 2026
b82d792
Fix ultravox
DarkLight1337 Jan 27, 2026
dcbf176
Fix
DarkLight1337 Jan 28, 2026
c24e821
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 28, 2026
4d60a6d
Revert params changes, will do in another PR
DarkLight1337 Jan 28, 2026
edef165
Fix
DarkLight1337 Jan 28, 2026
af83a1d
mypy
DarkLight1337 Jan 28, 2026
8ed3cf4
Fix double BOS
DarkLight1337 Jan 28, 2026
f69ac6d
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 28, 2026
8dc81f2
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 28, 2026
ae3c6f1
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 29, 2026
27f5610
Fix
DarkLight1337 Jan 29, 2026
c8977ec
Handle padding
DarkLight1337 Jan 29, 2026
c22c2dc
Don't use tokenizer attribute
DarkLight1337 Jan 29, 2026
e19cafd
Use bool
DarkLight1337 Jan 29, 2026
2666e7f
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 29, 2026
dc5f6a6
Fix
DarkLight1337 Jan 29, 2026
3fb0161
Fix
DarkLight1337 Jan 29, 2026
4663b8d
Typo
DarkLight1337 Jan 29, 2026
a827153
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 29, 2026
2e26322
Fix mock
DarkLight1337 Jan 29, 2026
@@ -60,9 +60,7 @@ def main():

completion = client.completions.create(
model=model_name,
# NOTE: The OpenAI client does not allow `None` as an input to
# `prompt`. Use an empty string if you have no text prompts.
prompt="",
prompt=None,
max_tokens=5,
temperature=0.0,
# NOTE: The OpenAI client allows passing in extra JSON body via the
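Note: the example change above relies on the new behavior that a text `prompt` is no longer required when `prompt_embeds` are supplied, so the empty-string placeholder can simply become `prompt=None`. A minimal client-side sketch of the new call pattern follows; the server URL, model name, embedding shape, and the base64/torch serialization of `encoded_embeds` are illustrative assumptions rather than part of this diff.

```python
# Sketch only: assumes a vLLM OpenAI-compatible server that accepts prompt_embeds.
import base64
import io

import torch
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Hypothetical prompt embeddings; dtype/shape must match the served model.
buffer = io.BytesIO()
torch.save(torch.randn(4, 768), buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

completion = client.completions.create(
    model="openai-community/gpt2",
    prompt=None,  # no text prompt needed once prompt_embeds are provided
    max_tokens=5,
    temperature=0.0,
    extra_body={"prompt_embeds": encoded_embeds},
)
print(completion.choices[0].text)
```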
6 changes: 5 additions & 1 deletion tests/engine/test_short_mm_context.py
@@ -22,7 +22,11 @@ def test_context_length_too_short(vllm_runner, image_assets, model):
with pytest.raises(ValueError, match="longer than the maximum model length"):
vllm_model = vllm_runner(
model,
max_model_len=128, # LLaVA has a feature size of 576
# LLaVA has a feature size of 576
# For the HF processor to execute successfully but still
# failing the overall context length check, we need the
# max_model_len to at least contain all image tokens
max_model_len=579,
enforce_eager=True,
load_format="dummy",
)
2 changes: 1 addition & 1 deletion tests/entrypoints/llm/test_chat.py
@@ -205,7 +205,7 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test):
valid_msg,
]
sampling_params = SamplingParams(temperature=0, max_tokens=10)
with pytest.raises(ValueError, match="longer than the maximum model length"):
with pytest.raises(ValueError, match="context length is only"):
llm.chat(batch_1, sampling_params=sampling_params)
outputs_2 = llm.chat(batch_2, sampling_params=sampling_params)
assert len(outputs_2) == len(batch_2)
29 changes: 13 additions & 16 deletions tests/entrypoints/openai/test_chat_error.py
@@ -15,7 +15,8 @@
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.tokenizers import get_tokenizer
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
from vllm.v1.engine.async_llm import AsyncLLM

MODEL_NAME = "openai-community/gpt2"
@@ -57,6 +58,15 @@ def get_diff_sampling_param(self):
return self.diff_sampling_param or {}


def _build_renderer(model_config: MockModelConfig):
_, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)

return HfRenderer(
model_config,
tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
)


def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
models = OpenAIServingModels(
engine_client=engine,
@@ -71,26 +81,13 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
chat_template_content_format="auto",
)

async def _fake_process_inputs(
request_id,
engine_prompt,
sampling_params,
*,
lora_request,
trace_headers,
priority,
data_parallel_rank,
):
return dict(engine_prompt), {}

async def _fake_preprocess_chat(*args, **kwargs):
# return conversation, engine_prompts
return (
[{"role": "user", "content": "Test"}],
[{"prompt_token_ids": [1, 2, 3]}],
)

serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
return serving_chat

@@ -99,11 +96,11 @@ async def _fake_preprocess_chat(*args, **kwargs):
async def test_chat_error_non_stream():
"""test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)

serving_chat = _build_serving_chat(mock_engine)

@@ -153,11 +150,11 @@ async def mock_generate(*args, **kwargs):
async def test_chat_error_stream():
"""test finish_reason='error' returns 500 InternalServerError (streaming)"""
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)

serving_chat = _build_serving_chat(mock_engine)

31 changes: 13 additions & 18 deletions tests/entrypoints/openai/test_completion_error.py
@@ -4,7 +4,7 @@
from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Any
from unittest.mock import AsyncMock, MagicMock
from unittest.mock import MagicMock

import pytest

@@ -15,7 +15,8 @@
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.tokenizers import get_tokenizer
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
from vllm.v1.engine.async_llm import AsyncLLM

MODEL_NAME = "openai-community/gpt2"
@@ -61,37 +62,31 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
engine_client=engine,
base_model_paths=BASE_MODEL_PATHS,
)
serving_completion = OpenAIServingCompletion(
return OpenAIServingCompletion(
engine,
models,
request_logger=None,
)

async def _fake_process_inputs(
request_id,
engine_prompt,
sampling_params,
*,
lora_request,
trace_headers,
priority,
data_parallel_rank,
):
return dict(engine_prompt), {}

serving_completion._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
return serving_completion
def _build_renderer(model_config: MockModelConfig):
_, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)

return HfRenderer(
model_config,
tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
)


@pytest.mark.asyncio
async def test_completion_error_non_stream():
"""test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)

serving_completion = _build_serving_completion(mock_engine)

@@ -141,11 +136,11 @@ async def mock_generate(*args, **kwargs):
async def test_completion_error_stream():
"""test finish_reason='error' returns 500 InternalServerError (streaming)"""
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)

serving_completion = _build_serving_completion(mock_engine)

20 changes: 10 additions & 10 deletions tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -110,7 +110,7 @@ async def test_completions_with_prompt_embeds(
# Test case: Single prompt embeds input
completion = await client_with_prompt_embeds.completions.create(
model=model_name,
prompt="", # Add empty prompt as required parameter
prompt=None,
max_tokens=5,
temperature=0.0,
extra_body={"prompt_embeds": encoded_embeds},
@@ -121,7 +121,7 @@
# Test case: batch completion with prompt_embeds
completion = await client_with_prompt_embeds.completions.create(
model=model_name,
prompt="", # Add empty prompt as required parameter
prompt=None,
max_tokens=5,
temperature=0.0,
extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
@@ -133,7 +133,7 @@
# Test case: streaming with prompt_embeds
single_completion = await client_with_prompt_embeds.completions.create(
model=model_name,
prompt="", # Add empty prompt as required parameter
prompt=None,
max_tokens=5,
temperature=0.0,
extra_body={"prompt_embeds": encoded_embeds},
@@ -142,7 +142,7 @@

stream = await client_with_prompt_embeds.completions.create(
model=model_name,
prompt="", # Add empty prompt as required parameter
prompt=None,
max_tokens=5,
temperature=0.0,
stream=True,
@@ -162,7 +162,7 @@
# Test case: batch streaming with prompt_embeds
stream = await client_with_prompt_embeds.completions.create(
model=model_name,
prompt="", # Add empty prompt as required parameter
prompt=None,
max_tokens=5,
temperature=0.0,
stream=True,
@@ -197,7 +197,7 @@
)
completion_embeds_only = await client_with_prompt_embeds.completions.create(
model=model_name,
prompt="",
prompt=None,
max_tokens=5,
temperature=0.0,
extra_body={"prompt_embeds": encoded_embeds},
@@ -215,7 +215,7 @@
# Test error case: invalid prompt_embeds
with pytest.raises(BadRequestError):
await client_with_prompt_embeds.completions.create(
prompt="",
prompt=None,
model=model_name,
max_tokens=5,
temperature=0.0,
@@ -237,7 +237,7 @@
# Test case: Logprobs using prompt_embeds
completion = await client_with_prompt_embeds.completions.create(
model=model_name,
prompt="", # Add empty prompt as required parameter
prompt=None,
max_tokens=5,
temperature=0.0,
echo=False,
@@ -257,7 +257,7 @@
# Test case: Log probs with batch completion and prompt_embeds
completion = await client_with_prompt_embeds.completions.create(
model=model_name,
prompt="", # Add empty prompt as required parameter
prompt=None,
max_tokens=5,
temperature=0.0,
echo=False,
@@ -287,7 +287,7 @@
with pytest.raises(BadRequestError, match="not compatible"):
await client_with_prompt_embeds.completions.create(
model=MODEL_NAME,
prompt="",
prompt=None,
max_tokens=5,
temperature=0.0,
extra_body={"prompt_embeds": encoded_embeds, "prompt_logprobs": True},
@@ -7,7 +7,7 @@
are rejected before they can cause crashes during model inference.

Validation is performed by the parser (MultiModalDataParser) and EmbeddingItems
classes, not by CompletionRenderer or MediaIO classes.
classes, not by MediaIO classes.
"""

import pytest
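The docstring change above records where this validation now lives: malformed embedding payloads are rejected by the parser layer (MultiModalDataParser and EmbeddingItems) rather than by the renderer or MediaIO classes. Below is a hedged sketch of the kind of request that should fail, mirroring the invalid-`prompt_embeds` error test earlier in this PR; the `client_with_prompt_embeds` fixture and `model_name` follow that test, and the exact malformed payload string is an assumption.

```python
import pytest
from openai import BadRequestError

@pytest.mark.asyncio
async def test_rejects_malformed_prompt_embeds(client_with_prompt_embeds, model_name):
    # A payload that cannot be decoded into a tensor should be rejected
    # by the parser before it ever reaches model inference.
    with pytest.raises(BadRequestError):
        await client_with_prompt_embeds.completions.create(
            model=model_name,
            prompt=None,
            max_tokens=5,
            temperature=0.0,
            extra_body={"prompt_embeds": "not-a-valid-tensor-payload"},
        )
```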
21 changes: 13 additions & 8 deletions tests/entrypoints/openai/test_lora_resolvers.py
@@ -16,7 +16,8 @@
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
from vllm.tokenizers import get_tokenizer
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
from vllm.v1.engine.async_llm import AsyncLLM

MODEL_NAME = "openai-community/gpt2"
@@ -35,6 +36,7 @@ class MockModelConfig:
"""Minimal mock ModelConfig for testing."""

model: str = MODEL_NAME
runner_type = "generate"
tokenizer: str = MODEL_NAME
trust_remote_code: bool = False
tokenizer_mode: str = "auto"
@@ -85,15 +87,21 @@ def register_mock_resolver():
del LoRAResolverRegistry.resolvers[MOCK_RESOLVER_NAME]


def _build_renderer(model_config: MockModelConfig):
_, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)

return HfRenderer(
model_config,
tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
)


@pytest.fixture
def mock_serving_setup():
"""Provides a mocked engine and serving completion instance."""
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.errored = False

tokenizer = get_tokenizer(MODEL_NAME)
mock_engine.get_tokenizer = AsyncMock(return_value=tokenizer)

async def mock_add_lora_side_effect(lora_request: LoRARequest):
"""Simulate engine behavior when adding LoRAs."""
if lora_request.lora_name == "test-lora":
@@ -118,6 +126,7 @@ async def mock_generate(*args, **kwargs):
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)

models = OpenAIServingModels(
engine_client=mock_engine,
@@ -128,10 +137,6 @@ async def mock_generate(*args, **kwargs):
mock_engine, models, request_logger=None
)

serving_completion._process_inputs = AsyncMock(
return_value=(MagicMock(name="engine_request"), {})
)

return mock_engine, serving_completion

