From ccc1edd164bd141ecee65efe471fb7b86cba2950 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 29 Oct 2025 18:14:51 +0800 Subject: [PATCH 1/5] Using a sync tokenizer is faster than using an async_tokenizer. Signed-off-by: wang.yuqi --- vllm/entrypoints/renderer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 3c5a396a99f9..c579c9faaf4c 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -318,8 +318,8 @@ async def _create_prompt_from_text( add_special_tokens: bool | None, cache_salt: str | None, ) -> EngineTokensPrompt: - """Tokenize text input asynchronously.""" - async_tokenizer = self._get_async_tokenizer() + # Using a sync tokenizer is faster than using an async_tokenizer. + tokenizer = self.tokenizer # Handle encoder-specific preprocessing if ( @@ -330,9 +330,9 @@ async def _create_prompt_from_text( # Tokenize texts if truncate_prompt_tokens is None: - encoded = await async_tokenizer(text, add_special_tokens=add_special_tokens) + encoded = tokenizer(text, add_special_tokens=add_special_tokens) else: - encoded = await async_tokenizer( + encoded = tokenizer( text, add_special_tokens=add_special_tokens, truncation=True, From 5efcf5518d531b7a67d2b478b279f8de8ccca308 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 29 Oct 2025 18:25:01 +0800 Subject: [PATCH 2/5] mypy Signed-off-by: wang.yuqi --- vllm/entrypoints/renderer.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index c579c9faaf4c..191c57a3ad26 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -5,7 +5,7 @@ import io from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Annotated +from typing import Annotated, Any import pybase64 import torch @@ -321,6 +321,9 @@ async def _create_prompt_from_text( # Using a sync tokenizer is faster than using an async_tokenizer. tokenizer = self.tokenizer + if self.tokenizer is None: + raise ValueError("No tokenizer available for text input processing") + # Handle encoder-specific preprocessing if ( self.model_config.encoder_config is not None @@ -328,15 +331,19 @@ async def _create_prompt_from_text( ): text = text.lower() + tokenization_kwargs: dict[str, Any] = {} + if add_special_tokens is not None: + tokenization_kwargs["add_special_tokens"] = add_special_tokens + # Tokenize texts if truncate_prompt_tokens is None: - encoded = tokenizer(text, add_special_tokens=add_special_tokens) + encoded = tokenizer(text, **tokenization_kwargs) else: encoded = tokenizer( text, - add_special_tokens=add_special_tokens, truncation=True, max_length=truncate_prompt_tokens, + **tokenization_kwargs, ) return self._create_tokens_prompt( From a365ea1054c2202156cb0b906c74a934e49598b2 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 29 Oct 2025 18:26:48 +0800 Subject: [PATCH 3/5] update Signed-off-by: wang.yuqi --- vllm/entrypoints/renderer.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 191c57a3ad26..3051069ab1c8 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -334,17 +334,11 @@ async def _create_prompt_from_text( tokenization_kwargs: dict[str, Any] = {} if add_special_tokens is not None: tokenization_kwargs["add_special_tokens"] = add_special_tokens + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens - # Tokenize texts - if truncate_prompt_tokens is None: - encoded = tokenizer(text, **tokenization_kwargs) - else: - encoded = tokenizer( - text, - truncation=True, - max_length=truncate_prompt_tokens, - **tokenization_kwargs, - ) + encoded = tokenizer(text, **tokenization_kwargs) return self._create_tokens_prompt( encoded.input_ids, max_length, cache_salt, text From e1b90cc99cedc1339593a5220acdd9f46e7ab89f Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 29 Oct 2025 18:31:34 +0800 Subject: [PATCH 4/5] mypy Signed-off-by: wang.yuqi --- vllm/entrypoints/renderer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 3051069ab1c8..c93d6f40721a 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -321,7 +321,7 @@ async def _create_prompt_from_text( # Using a sync tokenizer is faster than using an async_tokenizer. tokenizer = self.tokenizer - if self.tokenizer is None: + if tokenizer is None: raise ValueError("No tokenizer available for text input processing") # Handle encoder-specific preprocessing From d7a7de65aaa0541d75b77313d5e22b3e448b7cbd Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 17 Nov 2025 15:22:23 +0800 Subject: [PATCH 5/5] set --api-server-count=4 as default Signed-off-by: wang.yuqi --- vllm/entrypoints/openai/cli_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 946362ce2ef0..df42b26a8e72 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -265,7 +265,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--api-server-count", "-asc", type=int, - default=1, + default=4, help="How many API server processes to run.", ) parser.add_argument(