From f16f854282002485998db31ad039aefe96a26615 Mon Sep 17 00:00:00 2001
From: Christopher Albert
Date: Tue, 24 Mar 2026 19:21:09 +0100
Subject: [PATCH 1/5] simple-engine: unify tool-enabled chat on streaming path (#10)

* fix: unify tool-enabled simple chat on streaming path

* fix: preserve simple chat contracts on streaming path

* fix: keep tool chat on the streaming execution path

* fix: preserve streamed completion token counts
---
 tests/test_simple_engine.py | 57 +++++++++++++++++++++++++++++++++++++
 vllm_mlx/engine/simple.py   | 30 +++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/tests/test_simple_engine.py b/tests/test_simple_engine.py
index 7202f625f..7c0956693 100644
--- a/tests/test_simple_engine.py
+++ b/tests/test_simple_engine.py
@@ -12,6 +12,10 @@ class TestSimpleEngineConcurrency:
 
     """Test SimpleEngine lock behavior with concurrent requests."""
 
+    @pytest.fixture
+    def anyio_backend(self):
+        return "asyncio"
+
     @pytest.fixture
     def mock_model(self):
         """Create a mock model that tracks concurrent calls."""
@@ -117,6 +121,59 @@ async def test_lock_prevents_concurrent_chat(self, mock_llm_model):
             "The lock is not working correctly."
         )
 
+    async def test_chat_with_tools_aggregates_streaming_path(self, mock_llm_model):
+        """Tool-enabled non-stream chat should use the streaming path."""
+        from vllm_mlx.engine.simple import SimpleEngine
+
+        async def fake_stream_chat(*args, **kwargs):
+            yield MagicMock(
+                text="partial",
+                tokens=[],
+                prompt_tokens=11,
+                completion_tokens=1,
+                finish_reason=None,
+                finished=False,
+            )
+            yield MagicMock(
+                text="<|im_end|><tool_call>{\"name\":\"bash\",\"arguments\":{\"command\":\"pwd\"}}</tool_call>",
+                tokens=[],
+                prompt_tokens=11,
+                completion_tokens=4,
+                finish_reason="stop",
+                finished=True,
+            )
+
+        with patch("vllm_mlx.engine.simple.is_mllm_model", return_value=False):
+            engine = SimpleEngine("test-model")
+            engine._model = mock_llm_model
+            engine._loaded = True
+            engine._model.tokenizer.encode = MagicMock(return_value=[7, 8, 9])
+            engine.stream_chat = fake_stream_chat  # type: ignore[method-assign]
+
+            output = await engine.chat(
+                messages=[{"role": "user", "content": "run pwd"}],
+                max_tokens=16,
+                tools=[
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "bash",
+                            "parameters": {"type": "object", "properties": {}},
+                        },
+                    }
+                ],
+            )
+
+        assert output.text.startswith("<tool_call>")
+        assert output.tokens == [7, 8, 9]
+        assert output.prompt_tokens == 11
+        assert output.completion_tokens == 4
+        assert output.finish_reason == "stop"
+        mock_llm_model.chat.assert_not_called()
+        engine._model.tokenizer.encode.assert_called_once_with(
+            output.text, add_special_tokens=False
+        )
+
     @pytest.mark.anyio
     async def test_lock_serializes_stream_generate(self, mock_model):
         """Test that stream_generate uses the same lock as other methods."""
diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py
index 39cfa849d..d376b101e 100644
--- a/vllm_mlx/engine/simple.py
+++ b/vllm_mlx/engine/simple.py
@@ -453,6 +453,36 @@ async def chat(
         if not self._loaded:
             await self.start()
 
+        # mlx-lm non-streaming chat with tools can stall indefinitely on some
+        # local models, while the streaming path completes normally. Reuse the
+        # streaming implementation and aggregate its final state so both chat
+        # APIs share the same tool-capable execution path.
+        if tools and not self._is_mllm:
+            final_output = GenerationOutput(text="")
+            async for output in self.stream_chat(
+                messages=messages,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                tools=tools,
+                images=images,
+                videos=videos,
+                **kwargs,
+            ):
+                final_output = output
+            text = clean_output_text(final_output.text)
+            try:
+                tokens = self._model.tokenizer.encode(text, add_special_tokens=False)
+            except TypeError:
+                tokens = self._model.tokenizer.encode(text)
+            return GenerationOutput(
+                text=text,
+                tokens=tokens,
+                prompt_tokens=final_output.prompt_tokens,
+                completion_tokens=final_output.completion_tokens,
+                finish_reason=final_output.finish_reason,
+            )
+
         # Convert tools for template if provided
         template_tools = convert_tools_for_template(tools) if tools else None

From 51b4f6912089ec7ef6626bae5c8498399b326916 Mon Sep 17 00:00:00 2001
From: Christopher Albert
Date: Tue, 24 Mar 2026 20:04:47 +0100
Subject: [PATCH 2/5] fix: preserve streamed tool-chat token ids

---
 tests/test_simple_engine.py | 6 +-----
 vllm_mlx/engine/simple.py   | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/test_simple_engine.py b/tests/test_simple_engine.py
index 7c0956693..3db3cc92c 100644
--- a/tests/test_simple_engine.py
+++ b/tests/test_simple_engine.py
@@ -147,7 +147,6 @@ async def fake_stream_chat(*args, **kwargs):
             engine = SimpleEngine("test-model")
             engine._model = mock_llm_model
             engine._loaded = True
-            engine._model.tokenizer.encode = MagicMock(return_value=[7, 8, 9])
             engine.stream_chat = fake_stream_chat  # type: ignore[method-assign]
 
             output = await engine.chat(
@@ -165,14 +164,11 @@ async def fake_stream_chat(*args, **kwargs):
         )
 
         assert output.text.startswith("<tool_call>")
-        assert output.tokens == [7, 8, 9]
+        assert output.tokens == []
        assert output.prompt_tokens == 11
         assert output.completion_tokens == 4
         assert output.finish_reason == "stop"
         mock_llm_model.chat.assert_not_called()
-        engine._model.tokenizer.encode.assert_called_once_with(
-            output.text, add_special_tokens=False
-        )
 
     @pytest.mark.anyio
     async def test_lock_serializes_stream_generate(self, mock_model):
         """Test that stream_generate uses the same lock as other methods."""
diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py
index d376b101e..3dfbf7b09 100644
--- a/vllm_mlx/engine/simple.py
+++ b/vllm_mlx/engine/simple.py
@@ -477,7 +477,7 @@ async def chat(
                 tokens = self._model.tokenizer.encode(text)
             return GenerationOutput(
                 text=text,
-                tokens=tokens,
+                tokens=list(final_output.tokens),
                 prompt_tokens=final_output.prompt_tokens,
                 completion_tokens=final_output.completion_tokens,
                 finish_reason=final_output.finish_reason,

From 014edebf7dfe2d5dd3bd14da8fd7a11f0a087288 Mon Sep 17 00:00:00 2001
From: Christopher Albert
Date: Thu, 26 Mar 2026 01:06:21 +0100
Subject: [PATCH 3/5] remove dead token-encode block in tool-chat fallback

The try/except block computing `tokens` via tokenizer.encode() was
unused -- the return statement already reads from final_output.tokens.
---
 vllm_mlx/engine/simple.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py
index 3dfbf7b09..b93c20c0a 100644
--- a/vllm_mlx/engine/simple.py
+++ b/vllm_mlx/engine/simple.py
@@ -471,10 +471,6 @@ async def chat(
             ):
                 final_output = output
             text = clean_output_text(final_output.text)
-            try:
-                tokens = self._model.tokenizer.encode(text, add_special_tokens=False)
-            except TypeError:
-                tokens = self._model.tokenizer.encode(text)
             return GenerationOutput(
                 text=text,
                 tokens=list(final_output.tokens),

From 040a724118d194d77639067fc8a5dc49701b64a2 Mon Sep 17 00:00:00 2001
From: Christopher Albert
Date: Thu, 9 Apr 2026 09:31:12 +0200
Subject: [PATCH 4/5] style: format simple engine tool-chat test

---
 tests/test_simple_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_simple_engine.py b/tests/test_simple_engine.py
index 3db3cc92c..c507b7a90 100644
--- a/tests/test_simple_engine.py
+++ b/tests/test_simple_engine.py
@@ -135,7 +135,7 @@ async def fake_stream_chat(*args, **kwargs):
                 finished=False,
             )
             yield MagicMock(
-                text="<|im_end|><tool_call>{\"name\":\"bash\",\"arguments\":{\"command\":\"pwd\"}}</tool_call>",
+                text='<|im_end|><tool_call>{"name":"bash","arguments":{"command":"pwd"}}</tool_call>',
                 tokens=[],
                 prompt_tokens=11,
                 completion_tokens=4,

From 2990f7b93c7b711d84699da9757aa8668b872eae Mon Sep 17 00:00:00 2001
From: Thump604
Date: Sat, 11 Apr 2026 11:30:40 -0500
Subject: [PATCH 5/5] test: align tool-chat aggregation regression

---
 tests/test_simple_engine.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_simple_engine.py b/tests/test_simple_engine.py
index c507b7a90..b06b48971 100644
--- a/tests/test_simple_engine.py
+++ b/tests/test_simple_engine.py
@@ -128,7 +128,7 @@ async def test_chat_with_tools_aggregates_streaming_path(self, mock_llm_model):
         async def fake_stream_chat(*args, **kwargs):
             yield MagicMock(
                 text="partial",
-                tokens=[],
+                tokens=[1],
                 prompt_tokens=11,
                 completion_tokens=1,
                 finish_reason=None,
@@ -136,7 +136,7 @@ async def fake_stream_chat(*args, **kwargs):
             )
             yield MagicMock(
                 text='<|im_end|><tool_call>{"name":"bash","arguments":{"command":"pwd"}}</tool_call>',
-                tokens=[],
+                tokens=[7, 8, 9],
                 prompt_tokens=11,
                 completion_tokens=4,
                 finish_reason="stop",
@@ -163,8 +163,8 @@ async def fake_stream_chat(*args, **kwargs):
             ],
         )
 
-        assert output.text.startswith("<tool_call>")
-        assert output.tokens == []
+        assert output.text == '<tool_call>{"name":"bash","arguments":{"command":"pwd"}}</tool_call>'
+        assert output.tokens == [7, 8, 9]
         assert output.prompt_tokens == 11
         assert output.completion_tokens == 4
         assert output.finish_reason == "stop"
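
For reference, a minimal usage sketch of the path this series lands: with tools supplied, SimpleEngine.chat() now drives stream_chat() internally and returns the aggregated final state. The engine API (SimpleEngine, chat, and the GenerationOutput fields text/tokens/prompt_tokens/completion_tokens/finish_reason) is taken from the diffs above; the model id and the asyncio entry point are illustrative assumptions, not part of the patches.

import asyncio

from vllm_mlx.engine.simple import SimpleEngine


async def main() -> None:
    # Placeholder model id -- substitute any local MLX model.
    engine = SimpleEngine("test-model")

    # Same tool schema shape the regression test uses.
    tools = [
        {
            "type": "function",
            "function": {
                "name": "bash",
                "parameters": {"type": "object", "properties": {}},
            },
        }
    ]

    # Tool-enabled, non-streaming chat: after this series it runs on the
    # streaming execution path and returns the aggregated final output,
    # so it no longer stalls where mlx-lm's non-streaming tool chat did.
    output = await engine.chat(
        messages=[{"role": "user", "content": "run pwd"}],
        max_tokens=16,
        tools=tools,
    )
    print(output.text, output.completion_tokens, output.finish_reason)


asyncio.run(main())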