From 38479fec10128b4a55c0570a4b8918d0318bfe34 Mon Sep 17 00:00:00 2001
From: Stuart Swerdloff <sjswerdloff@gmail.com>
Date: Mon, 30 Mar 2026 18:43:54 +1300
Subject: [PATCH 1/2] fix: report prompt_tokens correctly for LLM models in
 SimpleEngine

LLM.stream_generate() never set prompt_tokens on StreamingOutput, so
the API always reported 0 prompt tokens for text-only models (including
MiniMax-M2.5). The MLLM+MTP path worked because it tokenizes the prompt
for KV caching, but the standard LLM path never counted prompt tokens.

Changes:
- Add prompt_tokens field to StreamingOutput dataclass
- Count prompt tokens in LLM.stream_generate() via tokenizer.encode()
- Add fallback in SimpleEngine.stream_generate() for normal finish path
- Count prompt tokens in SimpleEngine.chat() non-streaming LLM path

Co-Authored-By: clement-7074f29f <clement-7074f29f@sjstargetedsolutions.co.nz>
---
 vllm_mlx/engine/simple.py | 24 +++++++++++++++++++++++-
 vllm_mlx/models/llm.py    |  5 +++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py
index e96317ef..4580adb6 100644
--- a/vllm_mlx/engine/simple.py
+++ b/vllm_mlx/engine/simple.py
@@ -369,7 +369,7 @@ async def stream_generate(
             ):
                 prompt_tokens = (
                     chunk.prompt_tokens
-                    if hasattr(chunk, "prompt_tokens")
+                    if hasattr(chunk, "prompt_tokens") and chunk.prompt_tokens
                     else prompt_tokens
                 )
                 completion_tokens += 1
@@ -382,6 +382,9 @@ async def stream_generate(
                 finish_reason = None
                 if finished:
                     finish_reason = getattr(chunk, "finish_reason", "stop")
+                    # Ensure prompt_tokens is populated before final yield
+                    if prompt_tokens == 0:
+                        prompt_tokens = len(self._model.tokenizer.encode(prompt))
 
                 yield GenerationOutput(
                     text=accumulated_text,
@@ -472,9 +475,28 @@ async def chat(
                     **kwargs,
                 )
                 text = clean_output_text(output.text)
+                # Count prompt tokens from the full templated prompt
+                prompt_token_count = 0
+                if hasattr(self._model, "tokenizer"):
+                    tokenizer = self._model.tokenizer
+                    if hasattr(tokenizer, "apply_chat_template"):
+                        try:
+                            template_kwargs = {
+                                "tokenize": True,
+                                "add_generation_prompt": True,
+                            }
+                            if template_tools:
+                                template_kwargs["tools"] = template_tools
+                            prompt_ids = tokenizer.apply_chat_template(
+                                messages, **template_kwargs
+                            )
+                            prompt_token_count = len(prompt_ids)
+                        except (TypeError, Exception):
+                            pass
                 return GenerationOutput(
                     text=text,
                     tokens=output.tokens,
+                    prompt_tokens=prompt_token_count,
                     completion_tokens=len(output.tokens),
                     finish_reason=output.finish_reason,
                 )
diff --git a/vllm_mlx/models/llm.py b/vllm_mlx/models/llm.py
index 72182037..75bbab85 100644
--- a/vllm_mlx/models/llm.py
+++ b/vllm_mlx/models/llm.py
@@ -30,6 +30,7 @@ class StreamingOutput:
     token: int
     finished: bool = False
     finish_reason: str | None = None
+    prompt_tokens: int = 0
 
 
 class MLXLanguageModel:
@@ -203,6 +204,9 @@ def stream_generate(
         # Create sampler with parameters
         sampler = self._create_sampler(temperature, top_p)
 
+        # Count prompt tokens once upfront
+        num_prompt_tokens = len(self.tokenizer.encode(prompt))
+
         token_count = 0
         accumulated_text = ""
 
@@ -241,6 +245,7 @@ def stream_generate(
                 token=response.token if hasattr(response, "token") else 0,
                 finished=finished,
                 finish_reason=finish_reason,
+                prompt_tokens=num_prompt_tokens,
             )
 
             if finished:

From 54b4d65519f19252c4b819872ed692a4640919dd Mon Sep 17 00:00:00 2001
From: Wayner Barrios <waybarrios@gmail.com>
Date: Tue, 31 Mar 2026 11:35:25 -0500
Subject: [PATCH 2/2] cleanup: remove redundant fallback tokenization and
 defensive hasattr checks

---
 vllm_mlx/engine/simple.py | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py
index 4580adb6..da3ccfc1 100644
--- a/vllm_mlx/engine/simple.py
+++ b/vllm_mlx/engine/simple.py
@@ -382,9 +382,6 @@ async def stream_generate(
                 finish_reason = None
                 if finished:
                     finish_reason = getattr(chunk, "finish_reason", "stop")
-                    # Ensure prompt_tokens is populated before final yield
-                    if prompt_tokens == 0:
-                        prompt_tokens = len(self._model.tokenizer.encode(prompt))
 
                 yield GenerationOutput(
                     text=accumulated_text,
@@ -476,23 +473,15 @@ async def chat(
                 )
                 text = clean_output_text(output.text)
                 # Count prompt tokens from the full templated prompt
-                prompt_token_count = 0
-                if hasattr(self._model, "tokenizer"):
-                    tokenizer = self._model.tokenizer
-                    if hasattr(tokenizer, "apply_chat_template"):
-                        try:
-                            template_kwargs = {
-                                "tokenize": True,
-                                "add_generation_prompt": True,
-                            }
-                            if template_tools:
-                                template_kwargs["tools"] = template_tools
-                            prompt_ids = tokenizer.apply_chat_template(
-                                messages, **template_kwargs
-                            )
-                            prompt_token_count = len(prompt_ids)
-                        except (TypeError, Exception):
-                            pass
+                tokenizer = self._model.tokenizer
+                template_kwargs = {
+                    "tokenize": True,
+                    "add_generation_prompt": True,
+                }
+                if template_tools:
+                    template_kwargs["tools"] = template_tools
+                prompt_ids = tokenizer.apply_chat_template(messages, **template_kwargs)
+                prompt_token_count = len(prompt_ids)
                 return GenerationOutput(
                     text=text,
                     tokens=output.tokens,