Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
114 commits
Select commit Hold shift + click to select a range
f8bc899
[Frontend] Use new Renderer for Completions and Tokenize API
DarkLight1337 Jan 22, 2026
a999855
Simplify
DarkLight1337 Jan 22, 2026
55f51e4
Shorten
DarkLight1337 Jan 22, 2026
9bc5ab0
Shorten name
DarkLight1337 Jan 22, 2026
a4bc111
Simplify
DarkLight1337 Jan 22, 2026
d94c320
Clean
DarkLight1337 Jan 22, 2026
c8b9e2e
Lazy import
DarkLight1337 Jan 22, 2026
673e724
Simplify
DarkLight1337 Jan 22, 2026
4279c5d
Fix
DarkLight1337 Jan 22, 2026
28be477
Simplify
DarkLight1337 Jan 22, 2026
b7d1666
Avoid cast
DarkLight1337 Jan 22, 2026
7026b15
Remove cast
DarkLight1337 Jan 22, 2026
a84eadc
Fix
DarkLight1337 Jan 22, 2026
33c79ac
Fix
DarkLight1337 Jan 22, 2026
d907c4f
Fix
DarkLight1337 Jan 22, 2026
a84f2cf
Fixes
DarkLight1337 Jan 22, 2026
d8f11a6
Reduce diff
DarkLight1337 Jan 22, 2026
8410354
Clean
DarkLight1337 Jan 22, 2026
fa5525b
Fix mypy
DarkLight1337 Jan 22, 2026
cfe6265
Don't overwrite with `None`
DarkLight1337 Jan 22, 2026
123c6b9
Fix
DarkLight1337 Jan 22, 2026
33771ef
Rename
DarkLight1337 Jan 22, 2026
88fe1fe
Optimize
DarkLight1337 Jan 22, 2026
a8766bc
Remove ignores
DarkLight1337 Jan 22, 2026
f811a16
Update
DarkLight1337 Jan 23, 2026
1b16051
Fix
DarkLight1337 Jan 23, 2026
199f073
Update
DarkLight1337 Jan 23, 2026
0b0d300
Update
DarkLight1337 Jan 23, 2026
66fc274
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 23, 2026
38addc5
Consolidate UUID check
DarkLight1337 Jan 23, 2026
b73d554
Move test
DarkLight1337 Jan 23, 2026
6152a3e
Migrate tests
DarkLight1337 Jan 23, 2026
2e1745d
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 23, 2026
25d863c
Update
DarkLight1337 Jan 23, 2026
25dcaa3
Clean
DarkLight1337 Jan 23, 2026
154f645
Simplify
DarkLight1337 Jan 23, 2026
4ea6c52
Migrate tests
DarkLight1337 Jan 23, 2026
6a96f2d
Fix mypy
DarkLight1337 Jan 23, 2026
f68ce60
Clean
DarkLight1337 Jan 23, 2026
491d927
Fix mypy
DarkLight1337 Jan 23, 2026
5ea5d4e
Be less strict
DarkLight1337 Jan 23, 2026
dbf38c2
Update
DarkLight1337 Jan 23, 2026
a135791
Simplify
DarkLight1337 Jan 23, 2026
983d709
Simplify
DarkLight1337 Jan 23, 2026
58da6d8
Fix types
DarkLight1337 Jan 23, 2026
47d65cb
Apply to disagg
DarkLight1337 Jan 23, 2026
1290e0b
Fix
DarkLight1337 Jan 23, 2026
30a3bf9
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 23, 2026
92246d7
Keep previous error
DarkLight1337 Jan 23, 2026
40255f9
Oops
DarkLight1337 Jan 23, 2026
2922361
Remove unused
DarkLight1337 Jan 23, 2026
98370df
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 26, 2026
bf95d8c
Fix
DarkLight1337 Jan 26, 2026
fb56a64
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 26, 2026
8b8c04b
Update
DarkLight1337 Jan 26, 2026
4e78896
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 27, 2026
343baca
Update tokenization params
DarkLight1337 Jan 27, 2026
2aaca9d
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 27, 2026
918f15d
Mypy
DarkLight1337 Jan 27, 2026
911da11
Avoid double BOS
DarkLight1337 Jan 27, 2026
fc58c8f
Fix prompt normalization
DarkLight1337 Jan 27, 2026
ffd948b
Update
DarkLight1337 Jan 27, 2026
318c61f
msg
DarkLight1337 Jan 27, 2026
a3c76c3
Update
DarkLight1337 Jan 27, 2026
ee0e9ab
Clean
DarkLight1337 Jan 27, 2026
d2c47dc
Handle `do_lower_case`
DarkLight1337 Jan 27, 2026
55c974b
Fix
DarkLight1337 Jan 27, 2026
22bc3da
Fix
DarkLight1337 Jan 27, 2026
1009199
Doc
DarkLight1337 Jan 27, 2026
a83c910
Simplify
DarkLight1337 Jan 27, 2026
241f68c
Update
DarkLight1337 Jan 27, 2026
437291f
Fix tests
DarkLight1337 Jan 27, 2026
b0f95a1
Deprecate
DarkLight1337 Jan 27, 2026
cbf2f7f
Let MM processor handle tokenization
DarkLight1337 Jan 27, 2026
c129b32
Fix `truncate_prompt_tokens == 0`
DarkLight1337 Jan 27, 2026
f5f97ef
Fix
DarkLight1337 Jan 27, 2026
add7cae
Fix
DarkLight1337 Jan 27, 2026
18097a0
Fix
DarkLight1337 Jan 27, 2026
a4a18d8
Don't allow zero
DarkLight1337 Jan 27, 2026
4d55f49
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 27, 2026
79e54b1
Fix mocks
DarkLight1337 Jan 27, 2026
ac90878
Fix tests
DarkLight1337 Jan 27, 2026
be60017
More fixes
DarkLight1337 Jan 27, 2026
2c3109b
Fix
DarkLight1337 Jan 27, 2026
8e949a5
Fix
DarkLight1337 Jan 27, 2026
47755a0
Fixes
DarkLight1337 Jan 27, 2026
1e1925d
Unused
DarkLight1337 Jan 27, 2026
e7bcbd9
Simplify
DarkLight1337 Jan 27, 2026
883298c
Fix
DarkLight1337 Jan 27, 2026
dfe2f15
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 27, 2026
e6efddc
Update
DarkLight1337 Jan 27, 2026
9e325b2
Update
DarkLight1337 Jan 27, 2026
cb523d0
Fix
DarkLight1337 Jan 27, 2026
f98127c
Allow `prompt=None`
DarkLight1337 Jan 27, 2026
b82d792
Fix ultravox
DarkLight1337 Jan 27, 2026
dcbf176
Fix
DarkLight1337 Jan 28, 2026
c24e821
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 28, 2026
4d60a6d
Revert params changes, will do in another PR
DarkLight1337 Jan 28, 2026
edef165
Fix
DarkLight1337 Jan 28, 2026
af83a1d
mypy
DarkLight1337 Jan 28, 2026
8ed3cf4
Fix double BOS
DarkLight1337 Jan 28, 2026
f69ac6d
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 28, 2026
8dc81f2
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 28, 2026
ae3c6f1
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 29, 2026
27f5610
Fix
DarkLight1337 Jan 29, 2026
c8977ec
Handle padding
DarkLight1337 Jan 29, 2026
c22c2dc
Don't use tokenizer attribute
DarkLight1337 Jan 29, 2026
e19cafd
Use bool
DarkLight1337 Jan 29, 2026
2666e7f
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 29, 2026
dc5f6a6
Fix
DarkLight1337 Jan 29, 2026
3fb0161
Fix
DarkLight1337 Jan 29, 2026
4663b8d
Typo
DarkLight1337 Jan 29, 2026
a827153
Merge branch 'main' into init-renderer-2
DarkLight1337 Jan 29, 2026
2e26322
Fix mock
DarkLight1337 Jan 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 43 additions & 5 deletions vllm/entrypoints/openai/chat_completion/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
ChatCompletionAudio as OpenAIChatCompletionAudio,
)
from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
from pydantic import (
Field,
model_validator,
)
from pydantic import Field, model_validator

from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (
ChatCompletionMessageParam,
ChatTemplateContentFormatOption,
)
from vllm.entrypoints.openai.engine.protocol import (
AnyResponseFormat,
DeltaMessage,
Expand All @@ -36,6 +37,7 @@
from vllm.exceptions import VLLMValidationError
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.renderers import ChatParserParams, TokenizationParams
from vllm.sampling_params import (
BeamSearchParams,
RequestOutputKind,
Expand Down Expand Up @@ -356,6 +358,42 @@ class ChatCompletionRequest(OpenAIBaseModel):

# --8<-- [end:chat-completion-extra-params]

def build_chat_params(
    self,
    default_template: str | None,
    default_template_content_format: ChatTemplateContentFormatOption,
) -> ChatParserParams:
    """Assemble the chat-parsing parameters for this request.

    Args:
        default_template: Template used when the request's own
            ``chat_template`` is falsy.
        default_template_content_format: Content format passed through
            unchanged as ``chat_template_content_format``.

    Returns:
        The ``ChatParserParams`` built from the request fields, after
        ``with_defaults`` has been applied with the request's
        ``chat_template_kwargs``.
    """
    # A template supplied on the request wins over the default one.
    template = self.chat_template or default_template

    # Template kwargs that come from first-class request fields.
    field_kwargs = {
        "add_generation_prompt": self.add_generation_prompt,
        "continue_final_message": self.continue_final_message,
        "documents": self.documents,
        "reasoning_effort": self.reasoning_effort,
    }

    params = ChatParserParams(
        chat_template=template,
        chat_template_content_format=default_template_content_format,
        chat_template_kwargs=field_kwargs,
    )

    # NOTE(review): `with_defaults` is defined outside this view; confirm how
    # it combines `self.chat_template_kwargs` with the kwargs set above.
    return params.with_defaults(self.chat_template_kwargs)
Comment thread
DarkLight1337 marked this conversation as resolved.
Outdated

def build_tok_params(self, model_config: ModelConfig) -> TokenizationParams:
    """Derive the tokenization parameters for this chat request.

    Args:
        model_config: Model configuration; its ``max_model_len`` bounds the
            requested completion length and the prompt length.

    Returns:
        ``TokenizationParams`` built via ``TokenizationParams.from_config``.

    Raises:
        VLLMValidationError: If ``max_completion_tokens`` is set and is
            greater than ``model_config.max_model_len``.
    """
    max_tokens = self.max_completion_tokens

    # Reject an impossible output budget before it is subtracted below.
    exceeds_context = (
        max_tokens is not None and max_tokens > model_config.max_model_len
    )
    if exceeds_context:
        raise VLLMValidationError(
            f"'max_tokens' ({max_tokens}) cannot be greater than the "
            f"model's maximum context length ({model_config.max_model_len}).",
            parameter="max_tokens",
            value=max_tokens,
        )

    # The prompt may use whatever the requested output does not reserve.
    prompt_budget = model_config.max_model_len - (max_tokens or 0)

    # Detokenization is requested only when echoing without token IDs.
    wants_text = bool(self.echo and not self.return_token_ids)

    return TokenizationParams.from_config(
        model_config,
        max_length=prompt_budget,
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        add_special_tokens=self.add_special_tokens,
        needs_detokenization=wants_text,
    )

# Default sampling parameters for chat completion requests
_DEFAULT_SAMPLING_PARAMS: dict = {
"repetition_penalty": 1.0,
Expand Down
55 changes: 26 additions & 29 deletions vllm/entrypoints/openai/chat_completion/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
)
from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.inputs.data import TokensPrompt
from vllm.inputs.data import EmbedsPrompt, TokensPrompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import CompletionOutput, RequestOutput
Expand Down Expand Up @@ -186,8 +186,6 @@ async def warmup(self) -> None:
start_time = time.perf_counter()

try:
renderer = self.engine_client.renderer

# Create a minimal dummy request
dummy_request = ChatCompletionRequest(
messages=[{"role": "user", "content": "warmup"}],
Expand All @@ -202,18 +200,10 @@ async def warmup(self) -> None:
# 3. Tokenizer initialization for chat
await self._preprocess_chat(
dummy_request,
renderer,
dummy_request.messages,
chat_template=self.chat_template,
chat_template_content_format=self.chat_template_content_format,
add_generation_prompt=True,
continue_final_message=False,
tool_dicts=None,
documents=None,
chat_template_kwargs=None,
default_chat_template_kwargs=self.default_chat_template_kwargs,
tool_parser=None,
add_special_tokens=False,
default_template=self.chat_template,
default_template_content_format=self.chat_template_content_format,
default_template_kwargs=self.default_chat_template_kwargs,
)

elapsed = (time.perf_counter() - start_time) * 1000
Expand All @@ -226,7 +216,13 @@ async def warmup(self) -> None:
async def render_chat_request(
self,
request: ChatCompletionRequest,
) -> tuple[list[ConversationMessage], list[Any]] | ErrorResponse:
) -> (
tuple[
list[ConversationMessage],
list[TokensPrompt | EmbedsPrompt],
]
| ErrorResponse
):
"""
render chat request by validating and preprocessing inputs.

Expand Down Expand Up @@ -303,23 +299,14 @@ async def render_chat_request(
if error_check_ret is not None:
return error_check_ret

chat_template_kwargs = request.chat_template_kwargs or {}
chat_template_kwargs.update(reasoning_effort=request.reasoning_effort)

conversation, engine_prompts = await self._preprocess_chat(
request,
renderer,
request.messages,
chat_template=request.chat_template or self.chat_template,
chat_template_content_format=self.chat_template_content_format,
add_generation_prompt=request.add_generation_prompt,
continue_final_message=request.continue_final_message,
default_template=self.chat_template,
default_template_content_format=self.chat_template_content_format,
default_template_kwargs=self.default_chat_template_kwargs,
tool_dicts=tool_dicts,
documents=request.documents,
chat_template_kwargs=chat_template_kwargs,
default_chat_template_kwargs=self.default_chat_template_kwargs,
tool_parser=tool_parser,
add_special_tokens=request.add_special_tokens,
)
else:
# For GPT-OSS.
Expand Down Expand Up @@ -376,7 +363,10 @@ async def create_chat_completion(
generators: list[AsyncGenerator[RequestOutput, None]] = []
try:
for i, engine_prompt in enumerate(engine_prompts):
prompt_text, _, _ = self._get_prompt_components(engine_prompt)
prompt_text, prompt_ids, prompt_embeds = self._get_prompt_components(
engine_prompt
)

# If we are creating sub requests for multiple prompts, ensure that they
# have unique request ids.
sub_request_id = (
Expand All @@ -386,10 +376,17 @@ async def create_chat_completion(
if self.default_sampling_params is None:
self.default_sampling_params = {}

if prompt_ids is not None:
input_length = len(prompt_ids)
elif prompt_embeds is not None:
input_length = len(prompt_embeds)
else:
raise AssertionError(engine_prompt.keys())
Comment thread
DarkLight1337 marked this conversation as resolved.
Outdated

max_tokens = get_max_tokens(
max_model_len=self.max_model_len,
request=request,
input_length=len(engine_prompt["prompt_token_ids"]),
input_length=input_length,
default_sampling_params=self.default_sampling_params,
)

Expand Down
27 changes: 23 additions & 4 deletions vllm/entrypoints/openai/completion/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,9 @@
from typing import Annotated, Any, Literal

import torch
from pydantic import (
Field,
model_validator,
)
from pydantic import Field, model_validator

from vllm.config import ModelConfig
from vllm.entrypoints.openai.engine.protocol import (
AnyResponseFormat,
LegacyStructuralTagResponseFormat,
Expand All @@ -27,6 +25,7 @@
from vllm.exceptions import VLLMValidationError
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.renderers import TokenizationParams
from vllm.sampling_params import (
BeamSearchParams,
RequestOutputKind,
Expand Down Expand Up @@ -178,6 +177,26 @@ class CompletionRequest(OpenAIBaseModel):

# --8<-- [end:completion-extra-params]

def build_tok_params(self, model_config: ModelConfig) -> TokenizationParams:
    """Derive the tokenization parameters for this completion request.

    Args:
        model_config: Model configuration; its ``max_model_len`` bounds the
            requested completion length and the prompt length.

    Returns:
        ``TokenizationParams`` built via ``TokenizationParams.from_config``.

    Raises:
        VLLMValidationError: If ``max_tokens`` is set and is greater than
            ``model_config.max_model_len``.
    """
    max_tokens = self.max_tokens

    # Reject an impossible output budget before it is subtracted below.
    exceeds_context = (
        max_tokens is not None and max_tokens > model_config.max_model_len
    )
    if exceeds_context:
        raise VLLMValidationError(
            f"'max_tokens' ({max_tokens}) cannot be greater than the "
            f"model's maximum context length ({model_config.max_model_len}).",
            parameter="max_tokens",
            value=max_tokens,
        )

    # The prompt may use whatever the requested output does not reserve.
    prompt_budget = model_config.max_model_len - (max_tokens or 0)

    # Detokenization is requested only when echoing without token IDs.
    wants_text = bool(self.echo and not self.return_token_ids)

    return TokenizationParams.from_config(
        model_config,
        max_length=prompt_budget,
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        add_special_tokens=self.add_special_tokens,
        needs_detokenization=wants_text,
    )

# Default sampling parameters for completion requests
_DEFAULT_SAMPLING_PARAMS: dict = {
"repetition_penalty": 1.0,
Expand Down
46 changes: 5 additions & 41 deletions vllm/entrypoints/openai/completion/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from collections.abc import Sequence as GenericSequence
from typing import cast

import jinja2
from fastapi import Request

from vllm.engine.protocol import EngineClient
Expand All @@ -32,7 +31,6 @@
clamp_prompt_logprobs,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.exceptions import VLLMValidationError
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
Expand Down Expand Up @@ -116,18 +114,11 @@ async def render_completion_request(
"prompt_logprobs is not compatible with prompt embeds."
)

try:
renderer = self._get_completion_renderer()
engine_prompts = await renderer.render_prompt_and_embeds(
prompt_or_prompts=request.prompt,
prompt_embeds=request.prompt_embeds,
config=self._build_render_config(request),
)
except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
logger.exception("Error in preprocessing prompt inputs")
return self.create_error_response(e)

return engine_prompts
return await self._preprocess_completion(
request,
prompt_input=request.prompt,
prompt_embeds=request.prompt_embeds,
)

async def create_completion(
self,
Expand Down Expand Up @@ -222,10 +213,6 @@ async def create_completion(
else await self._get_trace_headers(raw_request.headers)
)

# Mypy inconsistently requires this second cast in different
# environments. It shouldn't be necessary (redundant from above)
# but pre-commit in CI fails without it.
engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
if isinstance(sampling_params, BeamSearchParams):
generator = self.beam_search(
prompt=engine_prompt,
Expand Down Expand Up @@ -728,26 +715,3 @@ def _create_completion_logprobs(
tokens=out_tokens,
top_logprobs=out_top_logprobs,
)

def _build_render_config(
    self,
    request: CompletionRequest,
    max_input_length: int | None = None,
) -> RenderConfig:
    """Build the ``RenderConfig`` used to render a completion request.

    Args:
        request: The completion request whose fields populate the config.
        max_input_length: Accepted for interface compatibility; this body
            does not read it.

    Returns:
        A ``RenderConfig`` whose ``max_length`` is ``self.max_model_len``
        minus the requested ``max_tokens`` (treated as 0 when unset).

    Raises:
        VLLMValidationError: If ``request.max_tokens`` is set and is greater
            than ``self.max_model_len``.
    """
    # Reject an impossible output budget before it is subtracted below.
    exceeds_context = (
        request.max_tokens is not None and request.max_tokens > self.max_model_len
    )
    if exceeds_context:
        raise VLLMValidationError(
            f"'max_tokens' ({request.max_tokens}) cannot be greater than "
            f"the model's maximum context length ({self.max_model_len}).",
            parameter="max_tokens",
            value=request.max_tokens,
        )

    # The prompt may use whatever the requested output does not reserve.
    prompt_budget = self.max_model_len - (request.max_tokens or 0)

    # Detokenization is requested only when echoing without token IDs.
    wants_text = bool(request.echo and not request.return_token_ids)

    return RenderConfig(
        max_length=prompt_budget,
        truncate_prompt_tokens=request.truncate_prompt_tokens,
        add_special_tokens=request.add_special_tokens,
        cache_salt=request.cache_salt,
        needs_detokenization=wants_text,
    )
7 changes: 1 addition & 6 deletions vllm/entrypoints/openai/engine/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from typing import Any, ClassVar, Literal, TypeAlias

import regex as re
import torch
from pydantic import (
BaseModel,
ConfigDict,
Expand All @@ -17,16 +16,12 @@

from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.logger import init_logger
from vllm.sampling_params import (
SamplingParams,
)
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid
from vllm.utils.import_utils import resolve_obj_by_qualname

logger = init_logger(__name__)

_LONG_INFO = torch.iinfo(torch.long)


class OpenAIBaseModel(BaseModel):
# OpenAI API does allow extra fields
Expand Down
Loading