From f16f854282002485998db31ad039aefe96a26615 Mon Sep 17 00:00:00 2001
From: Christopher Albert
Date: Tue, 24 Mar 2026 19:21:09 +0100
Subject: [PATCH 1/5] simple-engine: unify tool-enabled chat on streaming path (#10)

* fix: unify tool-enabled simple chat on streaming path

* fix: preserve simple chat contracts on streaming path

* fix: keep tool chat on the streaming execution path

* fix: preserve streamed completion token counts
---
 tests/test_simple_engine.py | 57 +++++++++++++++++++++++++++++++++++++
 vllm_mlx/engine/simple.py   | 30 +++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/tests/test_simple_engine.py b/tests/test_simple_engine.py
index 7202f625f..7c0956693 100644
--- a/tests/test_simple_engine.py
+++ b/tests/test_simple_engine.py
@@ -12,6 +12,10 @@ class TestSimpleEngineConcurrency:
 
     """Test SimpleEngine lock behavior with concurrent requests."""
 
+    @pytest.fixture
+    def anyio_backend(self):
+        return "asyncio"
+
     @pytest.fixture
     def mock_model(self):
         """Create a mock model that tracks concurrent calls."""
@@ -117,6 +121,59 @@ async def test_lock_prevents_concurrent_chat(self, mock_llm_model):
             "The lock is not working correctly."
         )
 
+    async def test_chat_with_tools_aggregates_streaming_path(self, mock_llm_model):
+        """Tool-enabled non-stream chat should use the streaming path."""
+        from vllm_mlx.engine.simple import SimpleEngine
+
+        async def fake_stream_chat(*args, **kwargs):
+            yield MagicMock(
+                text="partial",
+                tokens=[],
+                prompt_tokens=11,
+                completion_tokens=1,
+                finish_reason=None,
+                finished=False,
+            )
+            yield MagicMock(
+                text="<|im_end|><tool_call>{\"name\":\"bash\",\"arguments\":{\"command\":\"pwd\"}}</tool_call>",
+                tokens=[],
+                prompt_tokens=11,
+                completion_tokens=4,
+                finish_reason="stop",
+                finished=True,
+            )
+
+        with patch("vllm_mlx.engine.simple.is_mllm_model", return_value=False):
+            engine = SimpleEngine("test-model")
+            engine._model = mock_llm_model
+            engine._loaded = True
+            engine._model.tokenizer.encode = MagicMock(return_value=[7, 8, 9])
+            engine.stream_chat = fake_stream_chat  # type: ignore[method-assign]
+
+            output = await engine.chat(
+                messages=[{"role": "user", "content": "run pwd"}],
+                max_tokens=16,
+                tools=[
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "bash",
+                            "parameters": {"type": "object", "properties": {}},
+                        },
+                    }
+                ],
+            )
+
+        assert output.text.startswith("<tool_call>")
+        assert output.tokens == [7, 8, 9]
+        assert output.prompt_tokens == 11
+        assert output.completion_tokens == 4
+        assert output.finish_reason == "stop"
+        mock_llm_model.chat.assert_not_called()
+        engine._model.tokenizer.encode.assert_called_once_with(
+            output.text, add_special_tokens=False
+        )
+
     @pytest.mark.anyio
     async def test_lock_serializes_stream_generate(self, mock_model):
         """Test that stream_generate uses the same lock as other methods."""
diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py
index 39cfa849d..d376b101e 100644
--- a/vllm_mlx/engine/simple.py
+++ b/vllm_mlx/engine/simple.py
@@ -453,6 +453,36 @@ async def chat(
         if not self._loaded:
             await self.start()
 
+        # mlx-lm non-streaming chat with tools can stall indefinitely on some
+        # local models, while the streaming path completes normally. Reuse the
+        # streaming implementation and aggregate its final state so both chat
+        # APIs share the same tool-capable execution path.
+        if tools and not self._is_mllm:
+            final_output = GenerationOutput(text="")
+            async for output in self.stream_chat(
+                messages=messages,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                tools=tools,
+                images=images,
+                videos=videos,
+                **kwargs,
+            ):
+                final_output = output
+            text = clean_output_text(final_output.text)
+            try:
+                tokens = self._model.tokenizer.encode(text, add_special_tokens=False)
+            except TypeError:
+                tokens = self._model.tokenizer.encode(text)
+            return GenerationOutput(
+                text=text,
+                tokens=tokens,
+                prompt_tokens=final_output.prompt_tokens,
+                completion_tokens=final_output.completion_tokens,
+                finish_reason=final_output.finish_reason,
+            )
+
         # Convert tools for template if provided
         template_tools = convert_tools_for_template(tools) if tools else None

From 51b4f6912089ec7ef6626bae5c8498399b326916 Mon Sep 17 00:00:00 2001
From: Christopher Albert
Date: Tue, 24 Mar 2026 20:04:47 +0100
Subject: [PATCH 2/5] fix: preserve streamed tool-chat token ids

---
 tests/test_simple_engine.py | 6 +-----
 vllm_mlx/engine/simple.py   | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/test_simple_engine.py b/tests/test_simple_engine.py
index 7c0956693..3db3cc92c 100644
--- a/tests/test_simple_engine.py
+++ b/tests/test_simple_engine.py
@@ -147,7 +147,6 @@ async def fake_stream_chat(*args, **kwargs):
             engine = SimpleEngine("test-model")
             engine._model = mock_llm_model
             engine._loaded = True
-            engine._model.tokenizer.encode = MagicMock(return_value=[7, 8, 9])
             engine.stream_chat = fake_stream_chat  # type: ignore[method-assign]
 
             output = await engine.chat(
@@ -165,14 +164,11 @@ async def fake_stream_chat(*args, **kwargs):
         )
 
         assert output.text.startswith("<tool_call>")
-        assert output.tokens == [7, 8, 9]
+        assert output.tokens == []
        assert output.prompt_tokens == 11
         assert output.completion_tokens == 4
         assert output.finish_reason == "stop"
         mock_llm_model.chat.assert_not_called()
-        engine._model.tokenizer.encode.assert_called_once_with(
-            output.text, add_special_tokens=False
-        )
 
     @pytest.mark.anyio
     async def test_lock_serializes_stream_generate(self, mock_model):
         """Test that stream_generate uses the same lock as other methods."""
diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py
index d376b101e..3dfbf7b09 100644
--- a/vllm_mlx/engine/simple.py
+++ b/vllm_mlx/engine/simple.py
@@ -477,7 +477,7 @@ async def chat(
                 tokens = self._model.tokenizer.encode(text)
             return GenerationOutput(
                 text=text,
-                tokens=tokens,
+                tokens=list(final_output.tokens),
                 prompt_tokens=final_output.prompt_tokens,
                 completion_tokens=final_output.completion_tokens,
                 finish_reason=final_output.finish_reason,

From 014edebf7dfe2d5dd3bd14da8fd7a11f0a087288 Mon Sep 17 00:00:00 2001
From: Christopher Albert
Date: Thu, 26 Mar 2026 01:06:21 +0100
Subject: [PATCH 3/5] remove dead token-encode block in tool-chat fallback

The try/except block computing `tokens` via tokenizer.encode() was
unused -- the return statement already reads from final_output.tokens.
---
 vllm_mlx/engine/simple.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py
index 3dfbf7b09..b93c20c0a 100644
--- a/vllm_mlx/engine/simple.py
+++ b/vllm_mlx/engine/simple.py
@@ -471,10 +471,6 @@ async def chat(
             ):
                 final_output = output
             text = clean_output_text(final_output.text)
-            try:
-                tokens = self._model.tokenizer.encode(text, add_special_tokens=False)
-            except TypeError:
-                tokens = self._model.tokenizer.encode(text)
             return GenerationOutput(
                 text=text,
                 tokens=list(final_output.tokens),

From 040a724118d194d77639067fc8a5dc49701b64a2 Mon Sep 17 00:00:00 2001
From: Christopher Albert
Date: Thu, 9 Apr 2026 09:31:12 +0200
Subject: [PATCH 4/5] style: format simple engine tool-chat test

---
 tests/test_simple_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_simple_engine.py b/tests/test_simple_engine.py
index 3db3cc92c..c507b7a90 100644
--- a/tests/test_simple_engine.py
+++ b/tests/test_simple_engine.py
@@ -135,7 +135,7 @@ async def fake_stream_chat(*args, **kwargs):
                 finished=False,
             )
             yield MagicMock(
-                text="<|im_end|><tool_call>{\"name\":\"bash\",\"arguments\":{\"command\":\"pwd\"}}</tool_call>",
+                text='<|im_end|><tool_call>{"name":"bash","arguments":{"command":"pwd"}}</tool_call>',
                 tokens=[],
                 prompt_tokens=11,
                 completion_tokens=4,

From 2990f7b93c7b711d84699da9757aa8668b872eae Mon Sep 17 00:00:00 2001
From: Thump604
Date: Sat, 11 Apr 2026 11:30:40 -0500
Subject: [PATCH 5/5] test: align tool-chat aggregation regression

---
 tests/test_simple_engine.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_simple_engine.py b/tests/test_simple_engine.py
index c507b7a90..b06b48971 100644
--- a/tests/test_simple_engine.py
+++ b/tests/test_simple_engine.py
@@ -128,7 +128,7 @@ async def test_chat_with_tools_aggregates_streaming_path(self, mock_llm_model):
         async def fake_stream_chat(*args, **kwargs):
             yield MagicMock(
                 text="partial",
-                tokens=[],
+                tokens=[1],
                 prompt_tokens=11,
                 completion_tokens=1,
                 finish_reason=None,
@@ -136,7 +136,7 @@ async def fake_stream_chat(*args, **kwargs):
             )
             yield MagicMock(
                 text='<|im_end|><tool_call>{"name":"bash","arguments":{"command":"pwd"}}</tool_call>',
-                tokens=[],
+                tokens=[7, 8, 9],
                 prompt_tokens=11,
                 completion_tokens=4,
                 finish_reason="stop",
@@ -163,8 +163,8 @@ async def fake_stream_chat(*args, **kwargs):
             ],
         )
 
-        assert output.text.startswith("<tool_call>")
-        assert output.tokens == []
+        assert output.text == '<tool_call>{"name":"bash","arguments":{"command":"pwd"}}</tool_call>'
+        assert output.tokens == [7, 8, 9]
         assert output.prompt_tokens == 11
         assert output.completion_tokens == 4
         assert output.finish_reason == "stop"
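
For reference, a minimal usage sketch of the path this series lands: with tools supplied, SimpleEngine.chat() now drives stream_chat() internally and returns the aggregated final state. The engine API (SimpleEngine, chat, and the GenerationOutput fields text/tokens/prompt_tokens/completion_tokens/finish_reason) is taken from the diffs above; the model id and the asyncio entry point are illustrative assumptions, not part of the patches.

import asyncio

from vllm_mlx.engine.simple import SimpleEngine


async def main() -> None:
    # Placeholder model id -- substitute any local MLX model.
    engine = SimpleEngine("test-model")

    # Same tool schema shape the regression test uses.
    tools = [
        {
            "type": "function",
            "function": {
                "name": "bash",
                "parameters": {"type": "object", "properties": {}},
            },
        }
    ]

    # Tool-enabled, non-streaming chat: after this series it runs on the
    # streaming execution path and returns the aggregated final output,
    # so it no longer stalls where mlx-lm's non-streaming tool chat did.
    output = await engine.chat(
        messages=[{"role": "user", "content": "run pwd"}],
        max_tokens=16,
        tools=tools,
    )
    print(output.text, output.completion_tokens, output.finish_reason)


asyncio.run(main())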