From 68fc8b704ac3cb152e3a31676da3dc5d095bc5ce Mon Sep 17 00:00:00 2001 From: Keegan Mullaney Date: Sat, 4 Apr 2026 15:33:55 -0700 Subject: [PATCH 1/2] feat: add Gemma 4 tool call parser Gemma 4 uses a unique tool calling format with special tokens for delimiters instead of JSON quotes. This adds a state-machine parser that converts the custom argument format to valid JSON. - New gemma4_tool_parser.py with streaming support - Auto-detection in auto_tool_parser.py (both non-streaming and streaming) - 21 tests covering arg conversion, nested objects, arrays, hyphenated names, streaming, think tags, and auto-detection - CLI --tool-call-parser gemma4 option Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_gemma4_parser.py | 237 ++++++++++++++++++++ vllm_mlx/cli.py | 3 +- vllm_mlx/tool_parsers/__init__.py | 3 + vllm_mlx/tool_parsers/auto_tool_parser.py | 64 ++++-- vllm_mlx/tool_parsers/gemma4_tool_parser.py | 219 ++++++++++++++++++ 5 files changed, 512 insertions(+), 14 deletions(-) create mode 100644 tests/test_gemma4_parser.py create mode 100644 vllm_mlx/tool_parsers/gemma4_tool_parser.py diff --git a/tests/test_gemma4_parser.py b/tests/test_gemma4_parser.py new file mode 100644 index 000000000..acf499be2 --- /dev/null +++ b/tests/test_gemma4_parser.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests for the Gemma 4 tool call parser.""" + +import json + +from vllm_mlx.tool_parsers.gemma4_tool_parser import ( + Gemma4ToolParser, + _gemma_args_to_json, +) + +# The Gemma 4 escape token for quotes +Q = '<|"|>' + + +def test_args_to_json_simple_strings(): + raw = f"location:{Q}San Francisco{Q},unit:{Q}celsius{Q}" + result = "{" + _gemma_args_to_json(raw) + "}" + parsed = json.loads(result) + assert parsed == {"location": "San Francisco", "unit": "celsius"} + + +def test_args_to_json_numbers(): + raw = f"temperature:15,humidity:0.75" + result = "{" + _gemma_args_to_json(raw) + "}" + parsed = json.loads(result) + assert parsed == {"temperature": 
15, "humidity": 0.75} + + +def test_args_to_json_booleans(): + raw = f"enabled:true,verbose:false" + result = "{" + _gemma_args_to_json(raw) + "}" + parsed = json.loads(result) + assert parsed == {"enabled": True, "verbose": False} + + +def test_args_to_json_mixed(): + raw = f"name:{Q}test{Q},count:42,active:true" + result = "{" + _gemma_args_to_json(raw) + "}" + parsed = json.loads(result) + assert parsed == {"name": "test", "count": 42, "active": True} + + +def test_args_to_json_nested_object(): + raw = f"query:{Q}weather{Q},options:{{format:{Q}json{Q},verbose:true}}" + result = "{" + _gemma_args_to_json(raw) + "}" + parsed = json.loads(result) + assert parsed == {"query": "weather", "options": {"format": "json", "verbose": True}} + + +def test_args_to_json_array(): + raw = f"tags:[{Q}a{Q},{Q}b{Q},{Q}c{Q}]" + result = "{" + _gemma_args_to_json(raw) + "}" + parsed = json.loads(result) + assert parsed == {"tags": ["a", "b", "c"]} + + +def test_args_to_json_string_with_special_chars(): + raw = f'message:{Q}Hello, how are you? I\'m fine!{Q}' + result = "{" + _gemma_args_to_json(raw) + "}" + parsed = json.loads(result) + assert parsed == {"message": "Hello, how are you? 
I'm fine!"} + + +def test_args_to_json_string_with_embedded_quotes(): + raw = f'message:{Q}She said "hello"{Q}' + result = "{" + _gemma_args_to_json(raw) + "}" + parsed = json.loads(result) + assert parsed == {"message": 'She said "hello"'} + + +def test_args_to_json_null(): + raw = f"name:{Q}test{Q},value:null" + result = "{" + _gemma_args_to_json(raw) + "}" + parsed = json.loads(result) + assert parsed == {"name": "test", "value": None} + + +def test_extract_single_tool_call(): + parser = Gemma4ToolParser() + output = f"<|tool_call>call:get_weather{{location:{Q}San Francisco{Q},unit:{Q}celsius{Q}}}" + result = parser.extract_tool_calls(output) + + assert result.tools_called is True + assert len(result.tool_calls) == 1 + tc = result.tool_calls[0] + assert tc["name"] == "get_weather" + args = json.loads(tc["arguments"]) + assert args == {"location": "San Francisco", "unit": "celsius"} + assert tc["id"].startswith("call_") + + +def test_extract_multiple_tool_calls(): + parser = Gemma4ToolParser() + output = ( + f"<|tool_call>call:get_weather{{location:{Q}SF{Q}}}" + f"<|tool_call>call:get_time{{timezone:{Q}PST{Q}}}" + ) + result = parser.extract_tool_calls(output) + + assert result.tools_called is True + assert len(result.tool_calls) == 2 + assert result.tool_calls[0]["name"] == "get_weather" + assert result.tool_calls[1]["name"] == "get_time" + + +def test_extract_tool_call_with_surrounding_text(): + parser = Gemma4ToolParser() + output = f"Let me check the weather. <|tool_call>call:get_weather{{location:{Q}SF{Q}}} Done." + result = parser.extract_tool_calls(output) + + assert result.tools_called is True + assert len(result.tool_calls) == 1 + assert result.content == "Let me check the weather. Done." + + +def test_no_tool_calls(): + parser = Gemma4ToolParser() + output = "Just a regular response with no tool calls." 
+ result = parser.extract_tool_calls(output) + + assert result.tools_called is False + assert len(result.tool_calls) == 0 + assert result.content == output + + +def test_extract_with_think_tags(): + parser = Gemma4ToolParser() + output = f"I should check the weather<|tool_call>call:get_weather{{location:{Q}SF{Q}}}" + result = parser.extract_tool_calls(output) + + assert result.tools_called is True + assert len(result.tool_calls) == 1 + assert result.tool_calls[0]["name"] == "get_weather" + + +def test_extract_numeric_args(): + parser = Gemma4ToolParser() + output = f"<|tool_call>call:set_temp{{value:72,unit:{Q}fahrenheit{Q}}}" + result = parser.extract_tool_calls(output) + + assert result.tools_called is True + args = json.loads(result.tool_calls[0]["arguments"]) + assert args == {"value": 72, "unit": "fahrenheit"} + + +def test_streaming_no_tool_call(): + parser = Gemma4ToolParser() + result = parser.extract_tool_calls_streaming( + previous_text="", + current_text="Hello", + delta_text="Hello", + ) + assert result == {"content": "Hello"} + + +def test_streaming_tool_call_buffering(): + parser = Gemma4ToolParser() + # Mid tool call — should buffer + result = parser.extract_tool_calls_streaming( + previous_text="", + current_text=f"<|tool_call>call:get_weather{{location:{Q}SF", + delta_text=f"{Q}SF", + ) + assert result is None # buffering + + +def test_streaming_tool_call_complete(): + parser = Gemma4ToolParser() + full = f"<|tool_call>call:get_weather{{location:{Q}SF{Q}}}" + result = parser.extract_tool_calls_streaming( + previous_text=f"<|tool_call>call:get_weather{{location:{Q}SF{Q}}}", + current_text=full, + delta_text="", + ) + assert result is not None + assert "tool_calls" in result + assert len(result["tool_calls"]) == 1 + assert result["tool_calls"][0]["function"]["name"] == "get_weather" + + +def test_extract_hyphenated_tool_name(): + parser = Gemma4ToolParser() + output = f"<|tool_call>call:web-search{{query:{Q}hello{Q}}}" + result = 
parser.extract_tool_calls(output) + + assert result.tools_called is True + assert result.tool_calls[0]["name"] == "web-search" + args = json.loads(result.tool_calls[0]["arguments"]) + assert args == {"query": "hello"} + + +def test_auto_parser_detects_gemma4(): + from vllm_mlx.tool_parsers.auto_tool_parser import AutoToolParser + + parser = AutoToolParser() + output = f"<|tool_call>call:search{{query:{Q}hello world{Q}}}" + result = parser.extract_tool_calls(output) + + assert result.tools_called is True + assert result.tool_calls[0]["name"] == "search" + + +def test_auto_parser_streaming_gemma4(): + from vllm_mlx.tool_parsers.auto_tool_parser import AutoToolParser + + parser = AutoToolParser() + full = f"<|tool_call>call:get_weather{{location:{Q}SF{Q}}}" + + # No marker yet — pass through + r1 = parser.extract_tool_calls_streaming("", "Hello", "Hello") + assert r1 == {"content": "Hello"} + + # Mid tool call — buffer + r2 = parser.extract_tool_calls_streaming( + "", f"<|tool_call>call:get_weather{{location:{Q}SF", f"{Q}SF" + ) + assert r2 is None + + # End marker arrives — parse + r3 = parser.extract_tool_calls_streaming( + f"<|tool_call>call:get_weather{{location:{Q}SF{Q}}}", + full, + "", + ) + assert r3 is not None + assert "tool_calls" in r3 + assert r3["tool_calls"][0]["function"]["name"] == "get_weather" + + +if __name__ == "__main__": + tests = [v for k, v in globals().items() if k.startswith("test_")] + for test in tests: + try: + test() + print(f" PASS: {test.__name__}") + except Exception as e: + print(f" FAIL: {test.__name__}: {e}") diff --git a/vllm_mlx/cli.py b/vllm_mlx/cli.py index 8a90bc9be..ca45f10f5 100644 --- a/vllm_mlx/cli.py +++ b/vllm_mlx/cli.py @@ -838,12 +838,13 @@ def main(): "nemotron", "xlam", "functionary", + "gemma4", "glm47", ], help=( "Select the tool call parser for the model. Options: " "auto (auto-detect), mistral, qwen, qwen3_coder, llama, hermes, " - "deepseek, kimi, granite, nemotron, xlam, functionary, glm47. 
" + "deepseek, gemma4, kimi, granite, nemotron, xlam, functionary, glm47. " "Required for --enable-auto-tool-choice." ), ) diff --git a/vllm_mlx/tool_parsers/__init__.py b/vllm_mlx/tool_parsers/__init__.py index 16f744080..b0232d30c 100644 --- a/vllm_mlx/tool_parsers/__init__.py +++ b/vllm_mlx/tool_parsers/__init__.py @@ -10,6 +10,7 @@ - mistral: Mistral models ([TOOL_CALLS] format) - qwen/qwen3: Qwen models ( and [Calling tool:] formats) - llama/llama3/llama4: Llama models ( format) +- gemma4/gemma_4: Google Gemma 4 models (<|tool_call>call:name{} format) - hermes/nous: Hermes/NousResearch models - deepseek/deepseek_v3/deepseek_r1: DeepSeek models (unicode tokens) - kimi/kimi_k2/moonshot: Kimi/Moonshot models @@ -48,6 +49,7 @@ from .deepseek_tool_parser import DeepSeekToolParser from .functionary_tool_parser import FunctionaryToolParser from .granite_tool_parser import GraniteToolParser +from .gemma4_tool_parser import Gemma4ToolParser from .hermes_tool_parser import HermesToolParser from .kimi_tool_parser import KimiToolParser from .llama_tool_parser import LlamaToolParser @@ -65,6 +67,7 @@ "ExtractedToolCallInformation", # Specific parsers "AutoToolParser", + "Gemma4ToolParser", "MistralToolParser", "QwenToolParser", "LlamaToolParser", diff --git a/vllm_mlx/tool_parsers/auto_tool_parser.py b/vllm_mlx/tool_parsers/auto_tool_parser.py index fc02d8fc6..dd1512326 100644 --- a/vllm_mlx/tool_parsers/auto_tool_parser.py +++ b/vllm_mlx/tool_parsers/auto_tool_parser.py @@ -16,6 +16,8 @@ ToolParser, ToolParserManager, ) +from .gemma4_tool_parser import TOOL_CALL_PATTERN as GEMMA4_PATTERN +from .gemma4_tool_parser import _gemma_args_to_json def generate_tool_id() -> str: @@ -29,12 +31,13 @@ class AutoToolParser(ToolParser): Auto-detecting tool call parser. Tries multiple formats in order: - 1. Mistral: [TOOL_CALLS] ... - 2. Qwen bracket: [Calling tool: func_name({...})] - 3. Qwen/Hermes XML: {"name": "...", "arguments": {...}} - 4. Llama: {"arg": "value"} - 5. 
Nemotron: ... - 6. Raw JSON: {"name": "...", "arguments": {...}} + 1. Gemma 4: <|tool_call>call:name{...} + 2. Mistral: [TOOL_CALLS] ... + 3. Qwen bracket: [Calling tool: func_name({...})] + 4. Qwen/Hermes XML: {"name": "...", "arguments": {...}} + 5. Llama: {"arg": "value"} + 6. Nemotron: ... + 7. Raw JSON: {"name": "...", "arguments": {...}} This is the default parser when no specific parser is selected. """ @@ -63,7 +66,41 @@ def extract_tool_calls( tool_calls: list[dict[str, Any]] = [] cleaned_text = model_output - # 1. Try Mistral format + # 1. Try Gemma 4 format (most distinctive marker) + if "<|tool_call>" in model_output: + gemma_matches = GEMMA4_PATTERN.findall(model_output) + for name, raw_args in gemma_matches: + try: + json_str = "{" + _gemma_args_to_json(raw_args) + "}" + arguments = json.loads(json_str) + tool_calls.append( + { + "id": generate_tool_id(), + "name": name.strip(), + "arguments": ( + json.dumps(arguments, ensure_ascii=False) + if isinstance(arguments, dict) + else str(arguments) + ), + } + ) + except (json.JSONDecodeError, ValueError): + tool_calls.append( + { + "id": generate_tool_id(), + "name": name.strip(), + "arguments": raw_args, + } + ) + if gemma_matches: + cleaned_text = GEMMA4_PATTERN.sub("", cleaned_text).strip() + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=cleaned_text if cleaned_text else None, + ) + + # 2. Try Mistral format if self.MISTRAL_TOKEN in model_output: parts = model_output.split(self.MISTRAL_TOKEN) content = parts[0].strip() @@ -113,7 +150,7 @@ def extract_tool_calls( content=content if content else None, ) - # 2. Try Qwen bracket pattern + # 3. Try Qwen bracket pattern bracket_matches = self.QWEN_BRACKET_PATTERN.findall(model_output) for name, args_str in bracket_matches: try: @@ -141,7 +178,7 @@ def extract_tool_calls( if bracket_matches: cleaned_text = self.QWEN_BRACKET_PATTERN.sub("", cleaned_text).strip() - # 3. 
Try Nemotron pattern (before Qwen XML as it's more specific) + # 4. Try Nemotron pattern (before Qwen XML as it's more specific) nemotron_matches = self.NEMOTRON_PATTERN.findall(cleaned_text) for name, params_block in nemotron_matches: params = self.NEMOTRON_PARAM_PATTERN.findall(params_block) @@ -157,7 +194,7 @@ def extract_tool_calls( if nemotron_matches: cleaned_text = self.NEMOTRON_PATTERN.sub("", cleaned_text).strip() - # 4. Try Qwen/Hermes XML pattern + # 5. Try Qwen/Hermes XML pattern xml_matches = self.QWEN_XML_PATTERN.findall(cleaned_text) for match in xml_matches: try: @@ -182,7 +219,7 @@ def extract_tool_calls( if xml_matches: cleaned_text = self.QWEN_XML_PATTERN.sub("", cleaned_text).strip() - # 5. Try Llama pattern + # 6. Try Llama pattern llama_matches = self.LLAMA_PATTERN.findall(cleaned_text) for name, args_str in llama_matches: try: @@ -210,7 +247,7 @@ def extract_tool_calls( if llama_matches: cleaned_text = self.LLAMA_PATTERN.sub("", cleaned_text).strip() - # 6. Fallback: Try raw JSON + # 7. Fallback: Try raw JSON if not tool_calls: raw_calls = self._parse_raw_json_tool_calls(cleaned_text) if raw_calls: @@ -327,6 +364,7 @@ def extract_tool_calls_streaming( """ # Check for any tool call markers markers = [ + "<|tool_call>", self.MISTRAL_TOKEN, "[Calling tool:", "", @@ -339,7 +377,7 @@ def extract_tool_calls_streaming( return {"content": delta_text} # Check for completion markers - end_markers = ["", "", ")]"] + end_markers = ["", "", "", ")]"] if any(m in delta_text for m in end_markers): result = self.extract_tool_calls(current_text) if result.tools_called: diff --git a/vllm_mlx/tool_parsers/gemma4_tool_parser.py b/vllm_mlx/tool_parsers/gemma4_tool_parser.py new file mode 100644 index 000000000..a3324ebd3 --- /dev/null +++ b/vllm_mlx/tool_parsers/gemma4_tool_parser.py @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Gemma 4 tool call parser for vllm-mlx. 
import json
import re
import uuid


def generate_tool_id() -> str:
    """Return a fresh tool-call id: ``call_`` followed by 8 hex characters."""
    return f"call_{uuid.uuid4().hex[:8]}"


# Match: <|tool_call>call:NAME{ARGS}
# Capture group 1: function name (word characters and hyphens)
# Capture group 2: raw arguments (Gemma's custom format)
#
# The argument group is non-greedy, so a match ends at the FIRST "}" after the
# opening brace: nested objects and "}" inside string values are cut short.
# The pattern is left unchanged because the auto-detecting parser imports it
# and relies on its two capture groups.
# NOTE(review): the flattened patch lost other angle-bracket literals (the
# end-marker strings are empty) - confirm whether this pattern originally
# ended with a closing marker.
TOOL_CALL_PATTERN = re.compile(
    r"<\|tool_call>call:([\w-]+)\{(.*?)\}", re.DOTALL
)

# Gemma 4 delimits string values with this token instead of a JSON quote.
_GEMMA_QUOTE = '<|"|>'
_WHITESPACE = frozenset(" \t\n\r")
# Characters copied to the output verbatim: JSON structure plus whitespace.
_PASSTHROUGH = frozenset("{}[],:") | _WHITESPACE


def _is_json_scalar(token: str) -> bool:
    """Return True when *token* parses as a JSON value on its own."""
    try:
        json.loads(token)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return False
    return True


def _gemma_args_to_json(raw: str) -> str:
    """
    Convert Gemma 4's custom argument format to JSON object members.

    Gemma format:  key:<|"|>value<|"|>,other_key:42,flag:true
    Returned text: "key":"value","other_key":42,"flag":true

    The result is the *inside* of a JSON object; callers wrap it in ``{}``
    before handing it to ``json.loads``.

    Rules applied by the scanner:

    - Text between two ``<|"|>`` tokens becomes a JSON string. Escaping is
      delegated to ``json.dumps`` so every control character (not only
      newline and tab) yields valid JSON. A string that is never closed runs
      to the end of the input.
    - ``{ } [ ] , :`` and whitespace are copied through unchanged.
    - A bare token followed (after optional whitespace) by ``:`` is a key and
      is emitted as a JSON string.
    - Any other bare token is kept as-is when it is already valid JSON
      (numbers, ``true``, ``false``, ``null``); otherwise it is emitted as a
      JSON string so an undelimited value such as ``unit:celsius`` still
      produces parseable output.

    Args:
        raw: Argument text found between the braces of a tool call.

    Returns:
        The converted member list (empty string for empty input).
    """
    pieces: list[str] = []
    pos = 0
    end = len(raw)
    quote_len = len(_GEMMA_QUOTE)

    while pos < end:
        # Delimited string: scan for the token itself rather than swapping in
        # a placeholder character, so no input byte can be mistaken for it.
        if raw.startswith(_GEMMA_QUOTE, pos):
            body_start = pos + quote_len
            body_end = raw.find(_GEMMA_QUOTE, body_start)
            if body_end == -1:
                # Unterminated string: take the rest of the input.
                body_end = end
                pos = end
            else:
                pos = body_end + quote_len
            pieces.append(json.dumps(raw[body_start:body_end], ensure_ascii=False))
            continue

        ch = raw[pos]
        if ch in _PASSTHROUGH:
            pieces.append(ch)
            pos += 1
            continue

        # Bare token: runs until structure, whitespace, or a string delimiter.
        token_start = pos
        while (
            pos < end
            and raw[pos] not in _PASSTHROUGH
            and not raw.startswith(_GEMMA_QUOTE, pos)
        ):
            pos += 1
        token = raw[token_start:pos]

        # Look past whitespace: a following ":" marks the token as a key.
        lookahead = pos
        while lookahead < end and raw[lookahead] in _WHITESPACE:
            lookahead += 1
        is_key = lookahead < end and raw[lookahead] == ":"

        if is_key or not _is_json_scalar(token):
            pieces.append(json.dumps(token, ensure_ascii=False))
        else:
            pieces.append(token)

    return "".join(pieces)
import json
import re
from collections.abc import Sequence
from typing import Any

from .abstract_tool_parser import (
    ExtractedToolCallInformation,
    ToolParser,
    ToolParserManager,
)


@ToolParserManager.register_module(["gemma4", "gemma_4", "gemma4_27b"])
class Gemma4ToolParser(ToolParser):
    """
    Tool call parser for Google Gemma 4 models.

    Supported call format::

        <|tool_call>call:name{key:<|"|>value<|"|>,num:42}

    Arguments are not JSON: keys are bare, strings are delimited by the
    ``<|"|>`` token, and objects/arrays nest with ``{}`` / ``[]``. Calls are
    located with a brace-matching scan that skips over delimited strings, so
    nested objects and ``}`` characters inside string values are handled.

    Used when --enable-auto-tool-choice --tool-call-parser gemma4 are set.
    """

    SUPPORTS_NATIVE_TOOL_FORMAT = True

    TOOL_CALL_START = "<|tool_call>"
    # NOTE(review): the closing-marker literal is empty in the flattened patch
    # this class was recovered from. "<tool_call|>" mirrors the start token and
    # is what Gemma 4's template is understood to emit - confirm against the
    # tokenizer. Parsing does not depend on it: it is only consumed when it
    # directly follows a call's closing brace.
    TOOL_CALL_END = "<tool_call|>"

    _STRING_DELIM = '<|"|>'
    # Start marker, literal "call:", function name, opening brace.
    _CALL_HEADER = re.compile(re.escape(TOOL_CALL_START) + r"call:([\w-]+)\{")
    # The only tokens that matter when matching braces.
    _ARG_TOKEN = re.compile(re.escape(_STRING_DELIM) + r"|[{}]")

    @classmethod
    def _closing_brace(cls, text: str, start: int) -> int:
        """Return the index of the ``}`` closing the brace opened just before
        *start*, or -1 if the text ends first (including inside a string)."""
        depth = 1
        in_string = False
        for match in cls._ARG_TOKEN.finditer(text, start):
            token = match.group()
            if token == cls._STRING_DELIM:
                in_string = not in_string
            elif in_string:
                continue  # braces inside a delimited string are data
            elif token == "{":
                depth += 1
            else:
                depth -= 1
                if depth == 0:
                    return match.start()
        return -1

    @classmethod
    def _find_calls(cls, text: str) -> list[tuple[int, int, str, str, bool]]:
        """Locate every call in *text* whose argument braces are balanced.

        Returns:
            ``(start, end, name, raw_args, terminated)`` tuples in order of
            appearance. ``text[start:end]`` is the full markup of the call;
            ``terminated`` is True when ``TOOL_CALL_END`` directly followed
            the closing brace (it is then included in the span).
        """
        found: list[tuple[int, int, str, str, bool]] = []
        cursor = 0
        while (header := cls._CALL_HEADER.search(text, cursor)) is not None:
            close = cls._closing_brace(text, header.end())
            if close < 0:
                # Unbalanced (truncated or still streaming): skip this header
                # but keep looking for later well-formed calls.
                cursor = header.end()
                continue
            end = close + 1
            terminated = text.startswith(cls.TOOL_CALL_END, end)
            if terminated:
                end += len(cls.TOOL_CALL_END)
            found.append(
                (header.start(), end, header.group(1), text[header.end():close], terminated)
            )
            cursor = end
        return found

    @staticmethod
    def _to_tool_call(name: str, raw_args: str) -> dict[str, Any]:
        """Build one tool-call dict; arguments fall back to the raw text when
        they cannot be converted to JSON."""
        try:
            parsed = json.loads("{" + _gemma_args_to_json(raw_args) + "}")
            arguments = json.dumps(parsed, ensure_ascii=False)
        except ValueError:  # includes json.JSONDecodeError
            arguments = raw_args
        return {"id": generate_tool_id(), "name": name, "arguments": arguments}

    def _settled_calls(self, text: str) -> list[tuple[int, int, str, str, bool]]:
        """Return the calls in *text* that can no longer change while streaming.

        A call whose closing brace is the very last character is held back:
        its end marker may still be on the way. It counts as settled once the
        end marker, or any other text, follows the brace.
        """
        # Same think-tag stripping as the non-streaming path.
        stripped = self.strip_think_tags(text)
        return [
            call
            for call in self._find_calls(stripped)
            if call[4] or call[1] < len(stripped)
        ]

    def extract_tool_calls(
        self, model_output: str, request: dict[str, Any] | None = None
    ) -> ExtractedToolCallInformation:
        """Extract tool calls from a complete Gemma 4 model response.

        Args:
            model_output: Full generated text.
            request: Unused; accepted for interface compatibility.

        Returns:
            With at least one call: ``tools_called=True``, the parsed calls,
            and the remaining text (think tags and call markup removed,
            stripped; ``None`` if nothing is left). Otherwise
            ``tools_called=False`` with ``model_output`` returned unchanged
            as content.
        """
        text = self.strip_think_tags(model_output)
        calls = self._find_calls(text)
        if not calls:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        tool_calls = [self._to_tool_call(name, raw) for _, _, name, raw, _ in calls]

        # Stitch together everything that is not call markup.
        kept: list[str] = []
        cursor = 0
        for start, end, _, _, _ in calls:
            kept.append(text[cursor:start])
            cursor = end
        kept.append(text[cursor:])
        cleaned_text = "".join(kept).strip()

        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=tool_calls,
            content=cleaned_text if cleaned_text else None,
        )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int] | None = None,
        current_token_ids: Sequence[int] | None = None,
        delta_token_ids: Sequence[int] | None = None,
        request: dict[str, Any] | None = None,
    ) -> dict[str, Any] | None:
        """Extract tool calls from streaming Gemma 4 model output.

        Each call is emitted exactly once, in the delta that settles it (see
        ``_settled_calls``), with ``index`` equal to its position among all
        calls of the response. Completion is derived by comparing
        ``previous_text`` with ``current_text``, so an end marker split across
        deltas is still detected and earlier calls are never re-sent.

        Returns:
            ``{"content": ...}`` for ordinary text, ``{"tool_calls": [...]}``
            when one or more calls settle in this delta, or ``None`` while
            call markup is being buffered.
        """
        # No tool call started yet - pass through as content
        if self.TOOL_CALL_START not in current_text:
            return {"content": delta_text}

        already_sent = len(self._settled_calls(previous_text))
        fresh = self._settled_calls(current_text)[already_sent:]
        if fresh:
            return {
                "tool_calls": [
                    {
                        "index": already_sent + offset,
                        "type": "function",
                        **self._as_delta(self._to_tool_call(name, raw)),
                    }
                    for offset, (_, _, name, raw, _) in enumerate(fresh)
                ]
            }

        # The start marker arrived in this delta behind ordinary text: that
        # text is still content and must not be swallowed by the buffer.
        lead = delta_text.find(self.TOOL_CALL_START)
        if lead > 0 and self.TOOL_CALL_START not in previous_text:
            return {"content": delta_text[:lead]}

        # Inside a tool call - buffer (return None to suppress partial output)
        return None

    @staticmethod
    def _as_delta(tool_call: dict[str, Any]) -> dict[str, Any]:
        """Reshape a parsed call into the ``id`` / ``function`` part of a
        streaming tool-call entry."""
        return {
            "id": tool_call["id"],
            "function": {
                "name": tool_call["name"],
                "arguments": tool_call["arguments"],
            },
        }