diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 946362ce2ef0..df42b26a8e72 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -265,7 +265,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--api-server-count", "-asc", type=int, - default=1, + default=4, help="How many API server processes to run.", ) parser.add_argument( diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 3c5a396a99f9..c93d6f40721a 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -5,7 +5,7 @@ import io from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Annotated +from typing import Annotated, Any import pybase64 import torch @@ -318,8 +318,11 @@ async def _create_prompt_from_text( add_special_tokens: bool | None, cache_salt: str | None, ) -> EngineTokensPrompt: - """Tokenize text input asynchronously.""" - async_tokenizer = self._get_async_tokenizer() + # Using a sync tokenizer is faster than using an async_tokenizer. + tokenizer = self.tokenizer + + if tokenizer is None: + raise ValueError("No tokenizer available for text input processing") # Handle encoder-specific preprocessing if ( @@ -328,16 +331,14 @@ async def _create_prompt_from_text( ): text = text.lower() - # Tokenize texts - if truncate_prompt_tokens is None: - encoded = await async_tokenizer(text, add_special_tokens=add_special_tokens) - else: - encoded = await async_tokenizer( - text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=truncate_prompt_tokens, - ) + tokenization_kwargs: dict[str, Any] = {} + if add_special_tokens is not None: + tokenization_kwargs["add_special_tokens"] = add_special_tokens + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + encoded = tokenizer(text, **tokenization_kwargs) return self._create_tokens_prompt( encoded.input_ids, max_length, cache_salt, text