vllm/entrypoints/openai/cli_args.py (2 changes: 1 addition & 1 deletion)

@@ -265,7 +265,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         "--api-server-count",
         "-asc",
         type=int,
-        default=1,
+        default=4,
         help="How many API server processes to run.",
     )
     parser.add_argument(
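The only change here is the default for --api-server-count: deployments that do not pass the flag now launch four API server processes instead of one. A minimal, self-contained sketch of the flag using plain argparse (vLLM's actual parser is FlexibleArgumentParser; the standalone parser below is an assumption for illustration, but the flag's definition matches the diff):

# Sketch of the --api-server-count flag with its new default.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--api-server-count",
    "-asc",
    type=int,
    default=4,  # new default from this change; previously 1
    help="How many API server processes to run.",
)

# Callers who want the old single-process behavior must now opt in explicitly:
args = parser.parse_args(["--api-server-count", "1"])
assert args.api_server_count == 1

# With no flag given, the parser now yields 4:
assert parser.parse_args([]).api_server_count == 4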
vllm/entrypoints/renderer.py (27 changes: 14 additions & 13 deletions)

@@ -5,7 +5,7 @@
 import io
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Annotated
+from typing import Annotated, Any

 import pybase64
 import torch
@@ -318,8 +318,11 @@ async def _create_prompt_from_text(
         add_special_tokens: bool | None,
         cache_salt: str | None,
     ) -> EngineTokensPrompt:
-        """Tokenize text input asynchronously."""
-        async_tokenizer = self._get_async_tokenizer()
+        # Using a sync tokenizer is faster than using an async_tokenizer.
+        tokenizer = self.tokenizer
+
+        if tokenizer is None:
+            raise ValueError("No tokenizer available for text input processing")

         # Handle encoder-specific preprocessing
         if (
@@ -328,16 +331,14 @@
         ):
             text = text.lower()

-        # Tokenize texts
-        if truncate_prompt_tokens is None:
-            encoded = await async_tokenizer(text, add_special_tokens=add_special_tokens)
-        else:
-            encoded = await async_tokenizer(
-                text,
-                add_special_tokens=add_special_tokens,
-                truncation=True,
-                max_length=truncate_prompt_tokens,
-            )
+        tokenization_kwargs: dict[str, Any] = {}
+        if add_special_tokens is not None:
+            tokenization_kwargs["add_special_tokens"] = add_special_tokens
+        if truncate_prompt_tokens is not None:
+            tokenization_kwargs["truncation"] = True
+            tokenization_kwargs["max_length"] = truncate_prompt_tokens
+
+        encoded = tokenizer(text, **tokenization_kwargs)

         return self._create_tokens_prompt(
             encoded.input_ids, max_length, cache_salt, text
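The renderer change does two things: it calls the tokenizer synchronously (per the new comment, faster than routing through the async wrapper) and it builds tokenization_kwargs so that only options the caller actually set are forwarded. The kwargs dict also changes one edge case: the old code passed add_special_tokens=add_special_tokens even when it was None, whereas the new code omits the argument entirely so the tokenizer's own default (True for Hugging Face tokenizers) applies. A self-contained sketch of the same forwarding pattern applied to a plain Hugging Face tokenizer; "gpt2" and the encode helper are arbitrary choices for illustration, not vLLM code:

# Sketch of the kwargs-forwarding pattern from the diff.
from typing import Any

from transformers import AutoTokenizer

def encode(
    tokenizer,
    text: str,
    add_special_tokens: bool | None = None,
    truncate_prompt_tokens: int | None = None,
):
    # Forward only the options the caller actually set, so the tokenizer's
    # own defaults (e.g. add_special_tokens=True) apply otherwise.
    tokenization_kwargs: dict[str, Any] = {}
    if add_special_tokens is not None:
        tokenization_kwargs["add_special_tokens"] = add_special_tokens
    if truncate_prompt_tokens is not None:
        tokenization_kwargs["truncation"] = True
        tokenization_kwargs["max_length"] = truncate_prompt_tokens
    return tokenizer(text, **tokenization_kwargs)

tok = AutoTokenizer.from_pretrained("gpt2")
# Truncate to at most 4 tokens; add_special_tokens is left to the default.
encoded = encode(tok, "one two three four five six", truncate_prompt_tokens=4)
print(encoded.input_ids)  # at most 4 token ids

This also collapses the old two-branch call into a single call site, so adding another optional tokenizer option later means one new if-block rather than another duplicated branch.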