From 38479fec10128b4a55c0570a4b8918d0318bfe34 Mon Sep 17 00:00:00 2001 From: Stuart Swerdloff Date: Mon, 30 Mar 2026 18:43:54 +1300 Subject: [PATCH 1/2] fix: report prompt_tokens correctly for LLM models in SimpleEngine LLM.stream_generate() never set prompt_tokens on StreamingOutput, so the API always reported 0 prompt tokens for text-only models (including MiniMax-M2.5). The MLLM+MTP path worked because it tokenizes the prompt for KV caching, but the standard LLM path never counted. Changes: - Add prompt_tokens field to StreamingOutput dataclass - Count prompt tokens in LLM.stream_generate() via tokenizer.encode() - Add fallback in SimpleEngine.stream_generate() for normal finish path - Count prompt tokens in SimpleEngine.chat() non-streaming LLM path Co-Authored-By: clement-7074f29f --- vllm_mlx/engine/simple.py | 24 +++++++++++++++++++++++- vllm_mlx/models/llm.py | 5 +++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py index e96317ef..4580adb6 100644 --- a/vllm_mlx/engine/simple.py +++ b/vllm_mlx/engine/simple.py @@ -369,7 +369,7 @@ async def stream_generate( ): prompt_tokens = ( chunk.prompt_tokens - if hasattr(chunk, "prompt_tokens") + if hasattr(chunk, "prompt_tokens") and chunk.prompt_tokens else prompt_tokens ) completion_tokens += 1 @@ -382,6 +382,9 @@ async def stream_generate( finish_reason = None if finished: finish_reason = getattr(chunk, "finish_reason", "stop") + # Ensure prompt_tokens is populated before final yield + if prompt_tokens == 0: + prompt_tokens = len(self._model.tokenizer.encode(prompt)) yield GenerationOutput( text=accumulated_text, @@ -472,9 +475,28 @@ async def chat( **kwargs, ) text = clean_output_text(output.text) + # Count prompt tokens from the full templated prompt + prompt_token_count = 0 + if hasattr(self._model, "tokenizer"): + tokenizer = self._model.tokenizer + if hasattr(tokenizer, "apply_chat_template"): + try: + template_kwargs = { + "tokenize": True, + "add_generation_prompt": True, + } + if template_tools: + template_kwargs["tools"] = template_tools + prompt_ids = tokenizer.apply_chat_template( + messages, **template_kwargs + ) + prompt_token_count = len(prompt_ids) + except (TypeError, Exception): + pass return GenerationOutput( text=text, tokens=output.tokens, + prompt_tokens=prompt_token_count, completion_tokens=len(output.tokens), finish_reason=output.finish_reason, ) diff --git a/vllm_mlx/models/llm.py b/vllm_mlx/models/llm.py index 72182037..75bbab85 100644 --- a/vllm_mlx/models/llm.py +++ b/vllm_mlx/models/llm.py @@ -30,6 +30,7 @@ class StreamingOutput: token: int finished: bool = False finish_reason: str | None = None + prompt_tokens: int = 0 class MLXLanguageModel: @@ -203,6 +204,9 @@ def stream_generate( # Create sampler with parameters sampler = self._create_sampler(temperature, top_p) + # Count prompt tokens once upfront + num_prompt_tokens = len(self.tokenizer.encode(prompt)) + token_count = 0 accumulated_text = "" @@ -241,6 +245,7 @@ def stream_generate( token=response.token if hasattr(response, "token") else 0, finished=finished, finish_reason=finish_reason, + prompt_tokens=num_prompt_tokens, ) if finished: From 54b4d65519f19252c4b819872ed692a4640919dd Mon Sep 17 00:00:00 2001 From: Wayner Barrios Date: Tue, 31 Mar 2026 11:35:25 -0500 Subject: [PATCH 2/2] cleanup: remove redundant fallback tokenization and defensive hasattr checks --- vllm_mlx/engine/simple.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py index 4580adb6..da3ccfc1 100644 --- a/vllm_mlx/engine/simple.py +++ b/vllm_mlx/engine/simple.py @@ -382,9 +382,6 @@ async def stream_generate( finish_reason = None if finished: finish_reason = getattr(chunk, "finish_reason", "stop") - # Ensure prompt_tokens is populated before final yield - if prompt_tokens == 0: - prompt_tokens = len(self._model.tokenizer.encode(prompt)) yield GenerationOutput( text=accumulated_text, @@ -476,23 +473,15 @@ async def chat( ) text = clean_output_text(output.text) # Count prompt tokens from the full templated prompt - prompt_token_count = 0 - if hasattr(self._model, "tokenizer"): - tokenizer = self._model.tokenizer - if hasattr(tokenizer, "apply_chat_template"): - try: - template_kwargs = { - "tokenize": True, - "add_generation_prompt": True, - } - if template_tools: - template_kwargs["tools"] = template_tools - prompt_ids = tokenizer.apply_chat_template( - messages, **template_kwargs - ) - prompt_token_count = len(prompt_ids) - except (TypeError, Exception): - pass + tokenizer = self._model.tokenizer + template_kwargs = { + "tokenize": True, + "add_generation_prompt": True, + } + if template_tools: + template_kwargs["tools"] = template_tools + prompt_ids = tokenizer.apply_chat_template(messages, **template_kwargs) + prompt_token_count = len(prompt_ids) return GenerationOutput( text=text, tokens=output.tokens,