Merged
1 change: 1 addition & 0 deletions pyproject.toml
@@ -107,6 +107,7 @@ markers = [
"distributed: run this test only in distributed GPU tests",
"skip_v1: do not run this test with v1",
"optional: optional tests that are automatically skipped, include --optional to run them",
"extra_server_args: extra arguments to pass to the server fixture",
]

[tool.ty.src]
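The new extra_server_args marker is only registered in this hunk; the sketch below is a hypothetical illustration of how a server fixture could consume it through pytest's marker API (the fixture name and server wiring are assumptions, not part of this change).

# Hypothetical sketch only: a fixture that reads the extra_server_args marker.
import pytest

from ...utils import RemoteOpenAIServer


@pytest.fixture
def server(request):
    # Collect any extra CLI flags declared on the test via the marker.
    extra_args = []
    marker = request.node.get_closest_marker("extra_server_args")
    if marker is not None:
        extra_args = list(marker.args)
    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", ["--enforce-eager", *extra_args]) as s:
        yield s


# A test could then opt into extra server flags declaratively:
# @pytest.mark.extra_server_args("--enable-force-include-usage")
# def test_with_forced_usage(server): ...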
126 changes: 126 additions & 0 deletions tests/entrypoints/openai/test_enable_force_include_usage.py
@@ -0,0 +1,126 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai
import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer


@pytest.fixture(scope="module")
def chat_server_with_force_include_usage(request): # noqa: F811
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"128",
"--enforce-eager",
"--max-num-seqs",
"1",
"--enable-force-include-usage",
"--port",
"55857",
"--gpu-memory-utilization",
"0.2",
]

with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args, auto_port=False) as remote_server:
yield remote_server


@pytest_asyncio.fixture
async def chat_client_with_force_include_usage(chat_server_with_force_include_usage):
async with chat_server_with_force_include_usage.get_async_client() as async_client:
yield async_client


@pytest.mark.asyncio
async def test_chat_with_enable_force_include_usage(
chat_client_with_force_include_usage: openai.AsyncOpenAI,
):
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is the capital of France?"},
]

stream = await chat_client_with_force_include_usage.chat.completions.create(
model="Qwen/Qwen3-0.6B",
messages=messages,
max_completion_tokens=10,
extra_body=dict(min_tokens=10),
temperature=0.0,
stream=True,
)
last_completion_tokens = 0
async for chunk in stream:
if not len(chunk.choices):
assert chunk.usage.prompt_tokens >= 0
assert (
last_completion_tokens == 0
or chunk.usage.completion_tokens > last_completion_tokens
or (
not chunk.choices
and chunk.usage.completion_tokens == last_completion_tokens
)
)
assert chunk.usage.total_tokens == (
chunk.usage.prompt_tokens + chunk.usage.completion_tokens
)
else:
assert chunk.usage is None


@pytest.fixture(scope="module")
def transcription_server_with_force_include_usage():
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-num-seqs",
"1",
"--enforce-eager",
"--enable-force-include-usage",
"--gpu-memory-utilization",
"0.2",
]

with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
yield remote_server


@pytest_asyncio.fixture
async def transcription_client_with_force_include_usage(
transcription_server_with_force_include_usage,
):
async with (
transcription_server_with_force_include_usage.get_async_client() as async_client
):
yield async_client


@pytest.mark.asyncio
async def test_transcription_with_enable_force_include_usage(
transcription_client_with_force_include_usage, winning_call
):
res = (
await transcription_client_with_force_include_usage.audio.transcriptions.create(
model="openai/whisper-large-v3-turbo",
file=winning_call,
language="en",
temperature=0.0,
stream=True,
timeout=30,
)
)

async for chunk in res:
if not len(chunk.choices):
# final usage sent
usage = chunk.usage
assert isinstance(usage, dict)
assert usage["prompt_tokens"] > 0
assert usage["completion_tokens"] > 0
assert usage["total_tokens"] > 0
else:
assert not hasattr(chunk, "usage")
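For reference, the client-visible effect of --enable-force-include-usage is that streaming responses carry a final usage chunk even when the request does not set stream_options. A minimal sketch, assuming a local server launched with something like vllm serve Qwen/Qwen3-0.6B --enable-force-include-usage:

# Illustrative client-side sketch; server address and launch flags are assumptions.
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="Qwen/Qwen3-0.6B",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    max_completion_tokens=10,
    stream=True,  # note: no stream_options are passed
)
for chunk in stream:
    if not chunk.choices:
        # Final chunk: usage is populated even though include_usage was never requested.
        print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens, chunk.usage.total_tokens)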
2 changes: 2 additions & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -1807,6 +1807,7 @@ async def init_app_state(
state.openai_serving_models,
request_logger=request_logger,
log_error_stack=args.log_error_stack,
enable_force_include_usage=args.enable_force_include_usage,
)
if "transcription" in supported_tasks
else None
@@ -1817,6 +1818,7 @@
state.openai_serving_models,
request_logger=request_logger,
log_error_stack=args.log_error_stack,
enable_force_include_usage=args.enable_force_include_usage,
)
if "transcription" in supported_tasks
else None
8 changes: 8 additions & 0 deletions vllm/entrypoints/openai/run_batch.py
@@ -104,6 +104,13 @@ def make_arg_parser(parser: FlexibleArgumentParser):
default=False,
help="If set to True, enable prompt_tokens_details in usage.",
)
parser.add_argument(
"--enable-force-include-usage",
action="store_true",
default=False,
help="If set to True, include usage on every request "
"(even when stream_options is not specified)",
)

return parser

@@ -361,6 +368,7 @@ async def run_batch(
chat_template=None,
chat_template_content_format="auto",
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
)
if "generate" in supported_tasks
else None
15 changes: 4 additions & 11 deletions vllm/entrypoints/openai/serving_chat.py
@@ -58,7 +58,7 @@
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
from vllm.entrypoints.utils import get_max_tokens
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob
@@ -101,7 +101,6 @@ def __init__(
models=models,
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids,
enable_force_include_usage=enable_force_include_usage,
log_error_stack=log_error_stack,
)

@@ -352,7 +351,6 @@ async def create_chat_completion(
conversation,
tokenizer,
request_metadata,
enable_force_include_usage=self.enable_force_include_usage,
)

try:
@@ -518,7 +516,6 @@ async def chat_completion_stream_generator(
conversation: list[ConversationMessage],
tokenizer: AnyTokenizer,
request_metadata: RequestResponseMetadata,
enable_force_include_usage: bool,
) -> AsyncGenerator[str, None]:
created_time = int(time.time())
chunk_object_type: Final = "chat.completion.chunk"
@@ -596,13 +593,9 @@
return

stream_options = request.stream_options
if stream_options:
include_usage = stream_options.include_usage or enable_force_include_usage
include_continuous_usage = (
include_usage and stream_options.continuous_usage_stats
)
else:
include_usage, include_continuous_usage = False, False
include_usage, include_continuous_usage = should_include_usage(
stream_options, self.enable_force_include_usage
)

try:
async for res in result_generator:
16 changes: 5 additions & 11 deletions vllm/entrypoints/openai/serving_completion.py
@@ -27,7 +27,7 @@
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.entrypoints.utils import get_max_tokens
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob
@@ -56,11 +56,11 @@ def __init__(
models=models,
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids,
enable_force_include_usage=enable_force_include_usage,
log_error_stack=log_error_stack,
)
self.enable_prompt_tokens_details = enable_prompt_tokens_details
self.default_sampling_params = self.model_config.get_diff_sampling_param()
self.enable_force_include_usage = enable_force_include_usage
if self.default_sampling_params:
source = self.model_config.generation_config
source = "model" if source == "auto" else source
@@ -256,7 +256,6 @@ async def create_completion(
num_prompts=num_prompts,
tokenizer=tokenizer,
request_metadata=request_metadata,
enable_force_include_usage=self.enable_force_include_usage,
)

# Non-streaming response
@@ -320,7 +319,6 @@ async def completion_stream_generator(
num_prompts: int,
tokenizer: AnyTokenizer,
request_metadata: RequestResponseMetadata,
enable_force_include_usage: bool,
) -> AsyncGenerator[str, None]:
num_choices = 1 if request.n is None else request.n
previous_text_lens = [0] * num_choices * num_prompts
@@ -331,13 +329,9 @@
first_iteration = True

stream_options = request.stream_options
if stream_options:
include_usage = stream_options.include_usage or enable_force_include_usage
include_continuous_usage = (
include_usage and stream_options.continuous_usage_stats
)
else:
include_usage, include_continuous_usage = False, False
include_usage, include_continuous_usage = should_include_usage(
stream_options, self.enable_force_include_usage
)

try:
async for prompt_idx, res in result_generator:
3 changes: 0 additions & 3 deletions vllm/entrypoints/openai/serving_engine.py
@@ -249,7 +249,6 @@ def __init__(
*,
request_logger: RequestLogger | None,
return_tokens_as_token_ids: bool = False,
enable_force_include_usage: bool = False,
log_error_stack: bool = False,
):
super().__init__()
@@ -260,8 +259,6 @@

self.request_logger = request_logger
self.return_tokens_as_token_ids = return_tokens_as_token_ids
self.enable_force_include_usage = enable_force_include_usage

self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)
self._apply_mistral_chat_template_async = make_async(
apply_mistral_chat_template, executor=self._tokenizer_executor
1 change: 0 additions & 1 deletion vllm/entrypoints/openai/serving_responses.py
@@ -127,7 +127,6 @@ def __init__(
models=models,
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids,
enable_force_include_usage=enable_force_include_usage,
log_error_stack=log_error_stack,
)

4 changes: 4 additions & 0 deletions vllm/entrypoints/openai/serving_transcription.py
@@ -37,6 +37,7 @@ def __init__(
request_logger: RequestLogger | None,
return_tokens_as_token_ids: bool = False,
log_error_stack: bool = False,
enable_force_include_usage: bool = False,
):
super().__init__(
engine_client=engine_client,
@@ -45,6 +46,7 @@
return_tokens_as_token_ids=return_tokens_as_token_ids,
task_type="transcribe",
log_error_stack=log_error_stack,
enable_force_include_usage=enable_force_include_usage,
)

async def create_transcription(
@@ -96,6 +98,7 @@ def __init__(
request_logger: RequestLogger | None,
return_tokens_as_token_ids: bool = False,
log_error_stack: bool = False,
enable_force_include_usage: bool = False,
):
super().__init__(
engine_client=engine_client,
Expand All @@ -104,6 +107,7 @@ def __init__(
return_tokens_as_token_ids=return_tokens_as_token_ids,
task_type="translate",
log_error_stack=log_error_stack,
enable_force_include_usage=enable_force_include_usage,
)

async def create_translation(
7 changes: 4 additions & 3 deletions vllm/entrypoints/openai/speech_to_text.py
@@ -58,6 +58,7 @@ def __init__(
return_tokens_as_token_ids: bool = False,
task_type: Literal["transcribe", "translate"] = "transcribe",
log_error_stack: bool = False,
enable_force_include_usage: bool = False,
):
super().__init__(
engine_client=engine_client,
@@ -74,6 +75,8 @@
self.model_config, task_type
)

self.enable_force_include_usage = enable_force_include_usage

self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB

if self.default_sampling_params:
@@ -261,9 +264,7 @@ async def _speech_to_text_stream_generator(
completion_tokens = 0
num_prompt_tokens = 0

include_usage = (
request.stream_include_usage if request.stream_include_usage else False
)
include_usage = self.enable_force_include_usage or request.stream_include_usage
include_continuous_usage = (
request.stream_continuous_usage_stats
if include_usage and request.stream_continuous_usage_stats
19 changes: 18 additions & 1 deletion vllm/entrypoints/utils.py
@@ -14,7 +14,11 @@

from vllm.engine.arg_utils import EngineArgs
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
CompletionRequest,
StreamOptions,
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
@@ -237,3 +241,16 @@ def log_non_default_args(args: Namespace | EngineArgs):
)

logger.info("non-default args: %s", non_default_args)


def should_include_usage(
stream_options: StreamOptions | None, enable_force_include_usage: bool
) -> tuple[bool, bool]:
if stream_options:
include_usage = stream_options.include_usage or enable_force_include_usage
include_continuous_usage = include_usage and bool(
stream_options.continuous_usage_stats
)
else:
include_usage, include_continuous_usage = enable_force_include_usage, False
return include_usage, include_continuous_usage
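A quick sketch of how the new helper resolves the two flags (the expected return values follow directly from the definition above; the StreamOptions fields are the ones referenced in the serving code):

# Illustrative only: expected return values of should_include_usage.
from vllm.entrypoints.openai.protocol import StreamOptions
from vllm.entrypoints.utils import should_include_usage

# No stream_options on the request: usage is emitted only if the server forces it,
# and continuous usage stats are never enabled.
assert should_include_usage(None, enable_force_include_usage=True) == (True, False)
assert should_include_usage(None, enable_force_include_usage=False) == (False, False)

# Explicit stream_options: the client's include_usage and the server flag are OR-ed;
# continuous usage additionally requires continuous_usage_stats.
opts = StreamOptions(include_usage=False, continuous_usage_stats=True)
assert should_include_usage(opts, enable_force_include_usage=True) == (True, True)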