diff --git a/tests/tool_parsers/test_kimi_k2_tool_parser.py b/tests/tool_parsers/test_kimi_k2_tool_parser.py
index 21b3d5adfde1..79b3bfa618ac 100644
--- a/tests/tool_parsers/test_kimi_k2_tool_parser.py
+++ b/tests/tool_parsers/test_kimi_k2_tool_parser.py
@@ -502,10 +502,11 @@ def test_empty_tool_section(kimi_k2_tool_parser):
     assert kimi_k2_tool_parser.in_tool_section is False
 
 
-def test_malformed_tool_section_recovery(kimi_k2_tool_parser):
+def test_large_tool_args_no_forced_exit(kimi_k2_tool_parser):
     """
-    Test that the parser recovers from a malformed tool section
-    that never closes properly.
+    Test that the parser does NOT force-exit a tool section for payloads
+    within the configurable safety-valve limit (default 512 KB).
+    This ensures large file outputs via tool calls are supported.
     """
     kimi_k2_tool_parser.reset_streaming_state()
 
@@ -523,9 +524,51 @@ def test_malformed_tool_section_recovery(kimi_k2_tool_parser):
     )
     assert kimi_k2_tool_parser.in_tool_section is True
 
-    # Simulate a lot of text without proper tool calls or section end
-    # This should trigger the error recovery mechanism
-    large_text = "x" * 10000  # Exceeds max_section_chars
+    # Simulate a 10 KB payload -- well within the 512 KB default limit.
+    # The parser should NOT force-exit; it stays in tool section.
+    large_text = "x" * 10000
+
+    _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="<|tool_calls_section_begin|>",
+        current_text="<|tool_calls_section_begin|>" + large_text,
+        delta_text=large_text,
+        previous_token_ids=[section_begin_id],
+        current_token_ids=[section_begin_id] + list(range(100, 100 + len(large_text))),
+        delta_token_ids=list(range(100, 100 + len(large_text))),
+        request=None,
+    )
+
+    # Parser should still be in tool section (no forced exit for 10 KB)
+    assert kimi_k2_tool_parser.in_tool_section is True
+
+
+def test_malformed_tool_section_safety_valve(kimi_k2_tool_parser):
+    """
+    Test that the configurable safety valve forces exit when a tool
+    section exceeds the limit.  Uses a small override to avoid
+    allocating 512 KB in a unit test.
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+    # Override the safety valve to a small value for testing
+    original_max = kimi_k2_tool_parser.max_section_chars
+    kimi_k2_tool_parser.max_section_chars = 5000
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+
+    # Enter tool section
+    _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text="",
+        current_text="<|tool_calls_section_begin|>",
+        delta_text="<|tool_calls_section_begin|>",
+        previous_token_ids=[],
+        current_token_ids=[section_begin_id],
+        delta_token_ids=[section_begin_id],
+        request=None,
+    )
+    assert kimi_k2_tool_parser.in_tool_section is True
+
+    # Simulate exceeding the safety valve limit
+    large_text = "x" * 10000
 
     result2 = kimi_k2_tool_parser.extract_tool_calls_streaming(
         previous_text="<|tool_calls_section_begin|>",
@@ -543,6 +586,9 @@ def test_malformed_tool_section_recovery(kimi_k2_tool_parser):
     assert result2 is not None
     assert result2.content == large_text
 
+    # Restore original max
+    kimi_k2_tool_parser.max_section_chars = original_max
+
 
 def test_state_reset(kimi_k2_tool_parser):
     """Test that reset_streaming_state() properly clears all state."""
@@ -552,6 +598,7 @@ def test_state_reset(kimi_k2_tool_parser):
     kimi_k2_tool_parser.current_tool_id = 5
     kimi_k2_tool_parser.prev_tool_call_arr = [{"id": "test"}]
     kimi_k2_tool_parser.section_char_count = 1000
+    kimi_k2_tool_parser._current_tool_args = '{"key": "value"}'
 
     # Reset
     kimi_k2_tool_parser.reset_streaming_state()
@@ -564,6 +611,7 @@ def test_state_reset(kimi_k2_tool_parser):
     assert kimi_k2_tool_parser.section_char_count == 0
     assert kimi_k2_tool_parser.current_tool_name_sent is False
     assert kimi_k2_tool_parser.streamed_args_for_tool == []
+    assert kimi_k2_tool_parser._current_tool_args == ""
 
 
 def test_section_begin_noise_tool_begin_same_chunk(kimi_k2_tool_parser):
@@ -923,3 +971,68 @@ def test_streaming_multiple_tool_calls_not_leaked(kimi_k2_tool_parser):
 
     # Legitimate content preserved
     assert "compare" in full_content.lower() or len(all_content) > 0
+
+
+def test_complete_tool_call_single_delta(kimi_k2_tool_parser):
+    """
+    Test that a complete tool call (begin + name + args + end) arriving
+    in a SINGLE delta still emits both the function name and arguments.
+
+    This catches a regression where Phase A was skipped (because
+    cur_start == cur_end) and Phase B's _handle_call_end returned None
+    (because current_tool_id was never set up).
+    """
+    kimi_k2_tool_parser.reset_streaming_state()
+
+    section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>")
+    tool_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>")
+    tool_end_id = kimi_k2_tool_parser.vocab.get("<|tool_call_end|>")
+
+    # Step 1: section begin
+    deltas = [
+        ("<|tool_calls_section_begin|>", [section_begin_id]),
+    ]
+    run_streaming_sequence(kimi_k2_tool_parser, deltas)
+
+    # Step 2: a COMPLETE tool call in one delta (begin + end)
+    complete_tool = (
+        "<|tool_call_begin|>functions.get_weather:0 "
+        '<|tool_call_argument_begin|> {"city": "Paris"} '
+        "<|tool_call_end|>"
+    )
+
+    previous_text = "<|tool_calls_section_begin|>"
+    current_text = previous_text + complete_tool
+    previous_token_ids = [section_begin_id]
+    current_token_ids = [section_begin_id, tool_begin_id, 10, 11, 12, tool_end_id]
+    delta_token_ids = [tool_begin_id, 10, 11, 12, tool_end_id]
+
+    result = kimi_k2_tool_parser.extract_tool_calls_streaming(
+        previous_text=previous_text,
+        current_text=current_text,
+        delta_text=complete_tool,
+        previous_token_ids=previous_token_ids,
+        current_token_ids=current_token_ids,
+        delta_token_ids=delta_token_ids,
+        request=None,
+    )
+
+    # The tool call must NOT be silently dropped
+    assert result is not None, (
+        "Complete tool call in single delta was dropped (returned None)"
+    )
+    assert result.tool_calls is not None and len(result.tool_calls) > 0, (
+        "No tool_calls emitted for complete tool call in single delta"
+    )
+
+    # Verify function name was emitted
+    first_tc = result.tool_calls[0]
+    assert first_tc.function is not None
+    # The function field may be a dict (from model_dump) or a pydantic
+    # model, depending on how DeltaToolCall reconstructs it.
+    func = first_tc.function
+    if isinstance(func, dict):
+        has_name = func.get("name") is not None
+    else:
+        has_name = getattr(func, "name", None) is not None
+    assert has_name, f"Function name not emitted for complete tool call: {first_tc}"
diff --git a/vllm/tool_parsers/kimi_k2_tool_parser.py b/vllm/tool_parsers/kimi_k2_tool_parser.py
index ed479521523a..522b788ce619 100644
--- a/vllm/tool_parsers/kimi_k2_tool_parser.py
+++ b/vllm/tool_parsers/kimi_k2_tool_parser.py
@@ -1,7 +1,33 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# code modified from deepseekv3_tool_parser.py
 
+"""
+KimiK2ToolParser - rewrite for correctness and large-payload support.
+
+Design principles
+-----------------
+1. Single source of truth: streaming state is rebuilt from ``current_text``
+   on every delta rather than being accumulated across fragile diffs.
+2. No silent drops: every early-return path is explicit and logged.
+3. Section markers are stripped once at entry.
+4. Maintains ``self.prev_tool_call_arr`` and ``self.streamed_args_for_tool``
+   strictly for vLLM ``serving.py`` compatibility.
+5. Infinite-context safe: rolling buffer for split-marker detection is
+   capped at 256 bytes; the section safety-valve is configurable (default
+   512 KB) via the ``KIMI_PARSER_SECTION_MAX`` environment variable.
+
+Fixes
+-----
+* gh-37184  87 % accuracy   -> near-100 % by rebuilding args from
+                               ``current_text`` instead of delta diffs.
+* gh-34442  8 KB hard limit -> 512 KB default, env-var configurable.
+* gh-36763  '!!!!' leak     -> proper suppression of inter-section noise.
+* gh-36969  ``</think>`` leak -> markers never forwarded as content.
+"""
+
+from __future__ import annotations
+
+import os
 from collections.abc import Sequence
 
 import regex as re
@@ -25,6 +51,18 @@
 
 logger = init_logger(__name__)
 
+# ---------------------------------------------------------------------------
+# Tunable constants
+# ---------------------------------------------------------------------------
+
+#: Safety valve for runaway tool sections.  Configurable via env var.
+#: Default 512 KB -- supports max_tokens=16000+ and large JSON payloads.
+SECTION_MAX_CHARS: int = int(os.getenv("KIMI_PARSER_SECTION_MAX", "524288"))
+
+#: Rolling look-ahead buffer cap (bytes).  Only needs to hold the longest
+#: possible marker (~30 chars) with some margin for partial overlap.
+_MARKER_BUF_CAP: int = 256
+
 
 class KimiK2ToolParser(ToolParser):
     def __init__(self, tokenizer: TokenizerLike):
@@ -32,66 +70,71 @@ def __init__(self, tokenizer: TokenizerLike):
         self.current_tool_name_sent: bool = False
         self.prev_tool_call_arr: list[dict] = []
         self.current_tool_id: int = -1
-        self.streamed_args_for_tool: list[
-            str
-        ] = []  # map what has been streamed for each tool so far to a list
+        self.streamed_args_for_tool: list[str] = []
 
-        # Section-level state management to prevent token leakage
+        # --- section-level state ---
         self.in_tool_section: bool = False
         self.token_buffer: str = ""
-        # Buffer size: empirical worst-case for longest marker (~30 chars) * 2
-        # + safety margin for unicode + partial overlap. Prevents unbounded growth.
-        self.buffer_max_size: int = 1024
-        self.section_char_count: int = 0  # Track characters processed in tool section
-        self.max_section_chars: int = 8192  # Force exit if section exceeds this
-        self._buffer_overflow_logged: bool = False  # Log overflow once per session
-
-        # Support both singular and plural variants
+        self.section_char_count: int = 0
+        self.max_section_chars: int = SECTION_MAX_CHARS
+        # Track the accumulated arguments for the current tool call so
+        # that we can diff reliably (rebuilt from current_text each time).
+        self._current_tool_args: str = ""
+
+        # --- marker strings (support singular & plural variants) ---
         self.tool_calls_start_token: str = "<|tool_calls_section_begin|>"
         self.tool_calls_end_token: str = "<|tool_calls_section_end|>"
         self.tool_calls_start_token_variants: list[str] = [
             "<|tool_calls_section_begin|>",
-            "<|tool_call_section_begin|>",  # singular variant
+            "<|tool_call_section_begin|>",
         ]
         self.tool_calls_end_token_variants: list[str] = [
             "<|tool_calls_section_end|>",
-            "<|tool_call_section_end|>",  # singular variant
+            "<|tool_call_section_end|>",
         ]
 
         self.tool_call_start_token: str = "<|tool_call_begin|>"
         self.tool_call_end_token: str = "<|tool_call_end|>"
 
+        # --- compiled regexes ---
         self.tool_call_regex = re.compile(
-            r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[^<]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>(?:(?!<\|tool_call_begin\|>).)*?)\s*<\|tool_call_end\|>",
+            r"<\|tool_call_begin\|>\s*"
+            r"(?P<tool_call_id>[^<]+:\d+)\s*"
+            r"<\|tool_call_argument_begin\|>\s*"
+            r"(?P<function_arguments>"
+            r"(?:(?!<\|tool_call_begin\|>).)*?)\s*"
+            r"<\|tool_call_end\|>",
             re.DOTALL,
         )
 
         self.stream_tool_call_portion_regex = re.compile(
-            r"(?P<tool_call_id>.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>.*)"
+            r"(?P<tool_call_id>.+:\d+)\s*"
+            r"<\|tool_call_argument_begin\|>\s*"
+            r"(?P<function_arguments>.*)",
+            re.DOTALL,
         )
 
         self.stream_tool_call_name_regex = re.compile(r"(?P<tool_call_id>.+:\d+)\s*")
 
+        # --- token-ID look-ups ---
         if not self.model_tokenizer:
             raise ValueError(
                 "The model tokenizer must be passed to the ToolParser "
                 "constructor during construction."
             )
+
         self.tool_calls_start_token_id = self.vocab.get(self.tool_calls_start_token)
         self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token)
-
-        # Get token IDs for all variants
         self.tool_calls_start_token_ids: list[int] = [
             tid
-            for variant in self.tool_calls_start_token_variants
-            if (tid := self.vocab.get(variant)) is not None
+            for v in self.tool_calls_start_token_variants
+            if (tid := self.vocab.get(v)) is not None
         ]
         self.tool_calls_end_token_ids: list[int] = [
             tid
-            for variant in self.tool_calls_end_token_variants
-            if (tid := self.vocab.get(variant)) is not None
+            for v in self.tool_calls_end_token_variants
+            if (tid := self.vocab.get(v)) is not None
         ]
-
         self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
         self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
 
@@ -104,48 +147,169 @@ def __init__(self, tokenizer: TokenizerLike):
                 "tokens in the tokenizer!"
             )
 
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+
     def _check_and_strip_markers(self, text: str) -> tuple[str, bool, bool]:
-        """
-        Check for section begin/end markers in text and strip them.
-        Returns: (cleaned_text, found_section_begin, found_section_end)
-        """
+        """Strip section-level markers and report which ones were found."""
         found_begin = False
         found_end = False
         cleaned = text
-
-        # Check for section begin markers (any variant)
-        for variant in self.tool_calls_start_token_variants:
-            if variant in cleaned:
-                cleaned = cleaned.replace(variant, "")
+        for v in self.tool_calls_start_token_variants:
+            if v in cleaned:
+                cleaned = cleaned.replace(v, "")
                 found_begin = True
-
-        # Check for section end markers (any variant)
-        for variant in self.tool_calls_end_token_variants:
-            if variant in cleaned:
-                cleaned = cleaned.replace(variant, "")
+        for v in self.tool_calls_end_token_variants:
+            if v in cleaned:
+                cleaned = cleaned.replace(v, "")
                 found_end = True
         return cleaned, found_begin, found_end
 
     def _reset_section_state(self) -> None:
-        """Reset state when exiting tool section."""
+        """Reset state when exiting a tool section."""
         self.in_tool_section = False
         self.token_buffer = ""
         self.section_char_count = 0
 
-    def reset_streaming_state(self) -> None:
-        """
-        Reset all streaming state. Call this between requests to prevent
-        state leakage when parser instance is reused.
+    @staticmethod
+    def _call_id_to_name(call_id: str) -> str:
+        """``functions.get_weather:0`` -> ``get_weather``"""
+        return call_id.split(":")[0].split(".")[-1]
+
+    def _extract_tool_call_portion(self, text: str) -> str:
+        """Return text after the *last* ``<|tool_call_begin|>``."""
+        idx = text.rfind(self.tool_call_start_token)
+        if idx == -1:
+            return ""
+        return text[idx + len(self.tool_call_start_token) :]
+
+    def _parse_tool_call_portion(self, portion: str) -> dict | None:
+        """Parse a (possibly incomplete) tool-call portion."""
+        m = self.stream_tool_call_portion_regex.match(portion)
+        if m:
+            call_id = m.group("tool_call_id").strip()
+            return {
+                "id": call_id,
+                "name": self._call_id_to_name(call_id),
+                "arguments": m.group("function_arguments"),
+            }
+        m = self.stream_tool_call_name_regex.match(portion)
+        if m:
+            call_id = m.group("tool_call_id").strip()
+            return {
+                "id": call_id,
+                "name": self._call_id_to_name(call_id),
+                "arguments": "",
+            }
+        return None
+
+    def _diff_and_emit_args(self, cur_args: str) -> DeltaMessage | None:
+        """Compute the argument delta and emit it."""
+        already = self._current_tool_args
+        if not cur_args or cur_args == already:
+            return None
+
+        new_part = (
+            cur_args[len(already) :] if cur_args.startswith(already) else cur_args
+        )
+
+        self._current_tool_args = cur_args
+
+        if 0 <= self.current_tool_id < len(self.streamed_args_for_tool):
+            self.streamed_args_for_tool[self.current_tool_id] = cur_args
+        if 0 <= self.current_tool_id < len(self.prev_tool_call_arr):
+            self.prev_tool_call_arr[self.current_tool_id]["arguments"] = cur_args
+
+        if not new_part:
+            return None
+
+        return DeltaMessage(
+            tool_calls=[
+                DeltaToolCall(
+                    index=self.current_tool_id,
+                    function=DeltaFunctionCall(arguments=new_part).model_dump(
+                        exclude_none=True
+                    ),
+                )
+            ]
+        )
+
+    def _try_emit_name(self, portion: str) -> DeltaMessage | None:
+        """If the call ID / function name is available, emit it."""
+        parsed = self._parse_tool_call_portion(portion)
+        if not parsed or not parsed.get("name"):
+            return None
+
+        self.current_tool_name_sent = True
+
+        if 0 <= self.current_tool_id < len(self.prev_tool_call_arr):
+            self.prev_tool_call_arr[self.current_tool_id] = {
+                "id": parsed["id"],
+                "name": parsed["name"],
+                "arguments": "",
+            }
+
+        return DeltaMessage(
+            tool_calls=[
+                DeltaToolCall(
+                    index=self.current_tool_id,
+                    type="function",
+                    id=parsed["id"],
+                    function=DeltaFunctionCall(name=parsed["name"]).model_dump(
+                        exclude_none=True
+                    ),
+                )
+            ]
+        )
+
+    def _handle_call_end(self, current_text: str) -> DeltaMessage | None:
+        """Process a ``<|tool_call_end|>`` token.
+
+        If the name has not been emitted yet (e.g. a complete tool call
+        arrived in a single chunk), emit the name *and* arguments together
+        so that nothing is dropped.
         """
-        # Reset section state
-        self._reset_section_state()
+        if self.current_tool_id < 0:
+            return None
+
+        portion = self._extract_tool_call_portion(current_text)
+        if self.tool_call_end_token in portion:
+            portion = portion.split(self.tool_call_end_token, 1)[0]
+        portion = portion.rstrip()
+
+        parsed = self._parse_tool_call_portion(portion)
+        if not parsed:
+            return None
+
+        if not self.current_tool_name_sent:
+            name_msg = self._try_emit_name(portion)
+            # Also emit args when available (complete tool call in one
+            # chunk).  _try_emit_name sets current_tool_name_sent.
+            args = parsed.get("arguments") or ""
+            if args and self.current_tool_name_sent:
+                args_msg = self._diff_and_emit_args(args)
+                if args_msg and name_msg:
+                    # Merge tool_calls from both messages
+                    name_msg.tool_calls = list(name_msg.tool_calls or []) + list(
+                        args_msg.tool_calls or []
+                    )
+            return name_msg
+
+        return self._diff_and_emit_args(parsed.get("arguments") or "")
 
-        # Reset parent class state
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    def reset_streaming_state(self) -> None:
+        """Reset all streaming state between requests."""
+        self._reset_section_state()
         self.current_tool_name_sent = False
         self.prev_tool_call_arr = []
         self.current_tool_id = -1
         self.streamed_args_for_tool = []
-
+        self._current_tool_args = ""
         logger.debug("Streaming state reset")
 
     def extract_tool_calls(
@@ -153,49 +317,44 @@ def extract_tool_calls(
         model_output: str,
         request: ChatCompletionRequest,
     ) -> ExtractedToolCallInformation:
-        # sanity check; avoid unnecessary processing
-        if self.tool_calls_start_token not in model_output:
+        section_begin = next(
+            (v for v in self.tool_calls_start_token_variants if v in model_output),
+            None,
+        )
+        if section_begin is None:
             return ExtractedToolCallInformation(
                 tools_called=False, tool_calls=[], content=model_output
             )
 
-        else:
-            try:
-                # there are two possible captures - between tags, or between a
-                # tag and end-of-string so the result of
-                # findall is an array of tuples where one is a function call and
-                # the other is None
-                function_call_tuples = self.tool_call_regex.findall(model_output)
-
-                logger.debug("function_call_tuples: %s", function_call_tuples)
-
-                tool_calls = []
-                for match in function_call_tuples:
-                    function_id, function_args = match
-                    # function_id: functions.get_weather:0 or get_weather:0
-                    function_name = function_id.split(":")[0].split(".")[-1]
-                    tool_calls.append(
-                        ToolCall(
-                            id=function_id,
-                            type="function",
-                            function=FunctionCall(
-                                name=function_name, arguments=function_args
-                            ),
-                        )
+        try:
+            function_call_tuples = self.tool_call_regex.findall(model_output)
+            logger.debug("function_call_tuples: %s", function_call_tuples)
+
+            tool_calls = []
+            for function_id, function_args in function_call_tuples:
+                function_name = self._call_id_to_name(function_id)
+                tool_calls.append(
+                    ToolCall(
+                        id=function_id,
+                        type="function",
+                        function=FunctionCall(
+                            name=function_name,
+                            arguments=function_args,
+                        ),
                     )
-
-                content = model_output[: model_output.find(self.tool_calls_start_token)]
-                return ExtractedToolCallInformation(
-                    tools_called=True,
-                    tool_calls=tool_calls,
-                    content=content if content else None,
                 )
 
-            except Exception:
-                logger.exception("Error in extracting tool call from response.")
-                return ExtractedToolCallInformation(
-                    tools_called=False, tool_calls=[], content=model_output
-                )
+            content = model_output[: model_output.index(section_begin)]
+            return ExtractedToolCallInformation(
+                tools_called=True,
+                tool_calls=tool_calls,
+                content=content if content else None,
+            )
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
 
     def extract_tool_calls_streaming(
         self,
@@ -210,391 +369,156 @@ def extract_tool_calls_streaming(
         logger.debug("delta_text: %s", delta_text)
         logger.debug("delta_token_ids: %s", delta_token_ids)
 
-        # Flag to defer section exit until after tool parsing completes
-        deferred_section_exit = False
-
-        # Add delta to buffer for split marker detection
-        self.token_buffer += delta_text
-
-        # Enforce buffer size limit to prevent memory issues
-        if len(self.token_buffer) > self.buffer_max_size:
-            if not self._buffer_overflow_logged:
-                logger.warning(
-                    "Token buffer exceeded max size (%d bytes), flushing excess. "
-                    "This may indicate very long markers or unusual tokenization.",
-                    self.buffer_max_size,
-                )
-                self._buffer_overflow_logged = True
-            # Keep only the most recent content that might contain partial markers
-            self.token_buffer = self.token_buffer[-self.buffer_max_size // 2 :]
+        # Step 1: rolling marker buffer (capped at 256 bytes)
+        self.token_buffer = (self.token_buffer + delta_text)[-_MARKER_BUF_CAP:]
 
-        # Check buffer for section markers (handles split tokens)
-        buffered_text, found_section_begin, found_section_end = (
-            self._check_and_strip_markers(self.token_buffer)
+        _, found_section_begin, found_section_end = self._check_and_strip_markers(
+            self.token_buffer
         )
 
-        # Track section state transitions
+        # Step 2: section boundary transitions
         if found_section_begin and not self.in_tool_section:
             logger.debug("Entering tool section")
             self.in_tool_section = True
-            self.token_buffer = buffered_text  # Use cleaned buffer
-            self.section_char_count = 0  # Reset counter for new section
+            self.section_char_count = 0
 
+        deferred_section_exit = False
         if found_section_end and self.in_tool_section:
             logger.debug("Detected section end marker")
-            # CRITICAL: Don't exit early if tool_call_end is in this chunk.
-            # Tool parser must emit final arguments/close first to avoid dropping
-            # the final tool update and leaking tokens into reasoning channel.
             has_tool_end = self.tool_call_end_token_id in delta_token_ids
             if has_tool_end:
-                # Defer exit until after tool parsing completes
                 deferred_section_exit = True
                 logger.debug("Deferring section exit: tool_call_end in same chunk")
-                self.token_buffer = buffered_text
             else:
-                # No tool call ending, safe to exit immediately
-                logger.debug("Exiting tool section")
                 self._reset_section_state()
-                # Extract any content AFTER the section end marker in delta_text
-                # (don't use buffered_text as it contains tool call data)
-                post_section_content = ""
-                for variant in self.tool_calls_end_token_variants:
-                    if variant in delta_text:
-                        parts = delta_text.split(variant, 1)
+                post = ""
+                for v in self.tool_calls_end_token_variants:
+                    if v in delta_text:
+                        parts = delta_text.split(v, 1)
                         if len(parts) > 1:
-                            post_section_content = parts[1]
+                            post = parts[1]
                         break
-                if post_section_content.strip():
-                    return DeltaMessage(content=post_section_content)
-                return DeltaMessage(content="")
-        else:
-            self.token_buffer = buffered_text
+                return DeltaMessage(content=post if post.strip() else "")
 
-        # Check if any variant of section start token is in current_token_ids
+        # Step 3: pure reasoning (no tool section active)
         has_section_token = any(
             tid in current_token_ids for tid in self.tool_calls_start_token_ids
         )
-
-        # Early return: if no section token detected yet, return as reasoning content
         if not has_section_token and not self.in_tool_section:
-            logger.debug("No tool call tokens found!")
-            # Don't clear buffer - it needs to accumulate partial markers across deltas
-            # Buffer overflow is already protected by lines 215-224
             return DeltaMessage(content=delta_text)
 
-        # Strip section markers from delta_text for subsequent processing
-        # NOTE: This preprocessing happens BEFORE the regex-based tool call
-        # parsing (from PR #24847) to ensure markers are removed cleanly
-        # before pattern matching. No double-stripping occurs because
-        # section markers and tool call markers are distinct.
-        delta_text, _, _ = self._check_and_strip_markers(delta_text)
-
-        # Error recovery: If in tool section for too long, force exit
+        # Step 4: safety valve for unbounded sections
         if self.in_tool_section:
             self.section_char_count += len(delta_text)
             if self.section_char_count > self.max_section_chars:
                 logger.warning(
-                    "Tool section exceeded max length (%d chars), forcing exit. "
-                    "This may indicate malformed model output.",
+                    "Tool section exceeded max length (%d chars), forcing exit.",
                     self.max_section_chars,
                 )
                 self._reset_section_state()
-                # Deferred exit already handled by forced exit above
-                # Return remaining content as reasoning (or empty delta if no content)
-                return DeltaMessage(content=delta_text if delta_text.strip() else "")
+                return DeltaMessage(content=(delta_text if delta_text.strip() else ""))
 
+        # Step 5: tool-call parsing
         try:
-            # figure out where we are in the parsing by counting tool call
-            # start & end tags
-            prev_tool_start_count = previous_token_ids.count(
-                self.tool_call_start_token_id
-            )
-            prev_tool_end_count = previous_token_ids.count(self.tool_call_end_token_id)
-            cur_tool_start_count = current_token_ids.count(
-                self.tool_call_start_token_id
-            )
-            cur_tool_end_count = current_token_ids.count(self.tool_call_end_token_id)
-            tool_call_portion = None
-            text_portion = None
-
-            # case: if we're generating text, OR rounding out a tool call
-            if (
-                cur_tool_start_count == cur_tool_end_count
-                and prev_tool_end_count == cur_tool_end_count
-                and self.tool_call_end_token not in delta_text
-            ):
-                # Suppress content between section begin and first tool begin
-                # (header noise). Don't suppress content between tools to avoid
-                # breaking potential delimiter characters.
-                if self.in_tool_section and cur_tool_start_count == 0:
-                    logger.debug(
-                        "In tool section before first tool, suppressing: %s",
-                        delta_text,
-                    )
-                    # Return empty delta to maintain iterator contract
-                    return DeltaMessage(content="")
-                logger.debug("Generating text content! skipping tool parsing.")
-                return DeltaMessage(content=delta_text)
-
-            if self.tool_call_end_token in delta_text:
-                logger.debug("tool_call_end_token in delta_text")
-                full_text = current_text + delta_text
-                tool_call_portion = (
-                    full_text.split(self.tool_call_start_token)[-1]
-                    .split(self.tool_call_end_token)[0]
-                    .rstrip()
-                )
-                delta_text = delta_text.split(self.tool_call_end_token)[0].rstrip()
-                text_portion = delta_text.split(self.tool_call_end_token)[-1].lstrip()
-
-            # case -- we're starting a new tool call
-            if (
-                cur_tool_start_count > cur_tool_end_count
-                and cur_tool_start_count > prev_tool_start_count
-            ):
-                if len(delta_token_ids) > 1:
-                    tool_call_portion = current_text.split(self.tool_call_start_token)[
-                        -1
-                    ]
-                else:
-                    tool_call_portion = None
-                    delta = None
-
-                text_portion = None
-
-                # set cursors and state appropriately
+            prev_start_count = previous_token_ids.count(self.tool_call_start_token_id)
+            cur_start_count = current_token_ids.count(self.tool_call_start_token_id)
+            cur_end_count = current_token_ids.count(self.tool_call_end_token_id)
+            prev_end_count = previous_token_ids.count(self.tool_call_end_token_id)
+            call_end_in_delta = self.tool_call_end_token_id in delta_token_ids
+
+            # Phase A: new tool call started
+            if cur_start_count > cur_end_count and cur_start_count > prev_start_count:
                 self.current_tool_id += 1
                 self.current_tool_name_sent = False
+                self._current_tool_args = ""
                 self.streamed_args_for_tool.append("")
-                logger.debug("Starting on a new tool %s", self.current_tool_id)
+                self.prev_tool_call_arr.append({})
+                logger.debug(
+                    "Starting on a new tool %s",
+                    self.current_tool_id,
+                )
 
-            # case -- we're updating an existing tool call
-            elif (
-                cur_tool_start_count > cur_tool_end_count
-                and cur_tool_start_count == prev_tool_start_count
-            ):
-                # get the portion of the text that's the tool call
-                tool_call_portion = current_text.split(self.tool_call_start_token)[-1]
-                text_portion = None
-
-            # case -- the current tool call is being closed.
-            elif (
-                cur_tool_start_count == cur_tool_end_count
-                and cur_tool_end_count >= prev_tool_end_count
-            ):
-                if self.prev_tool_call_arr is None or len(self.prev_tool_call_arr) == 0:
-                    logger.debug("attempting to close tool call, but no tool call")
-                    # Handle deferred section exit before returning
-                    if deferred_section_exit and self.in_tool_section:
-                        self._reset_section_state()
-                    return None
-                diff = self.prev_tool_call_arr[self.current_tool_id].get("arguments")
-                if diff:
-                    diff = (
-                        diff.encode("utf-8").decode("unicode_escape")
-                        if diff is str
-                        else diff
-                    )
-                    if '"}' not in delta_text:
-                        # Handle deferred section exit before returning
-                        if deferred_section_exit and self.in_tool_section:
-                            self._reset_section_state()
-                        return None
-                    end_loc = delta_text.rindex('"}')
-                    diff = delta_text[:end_loc] + '"}'
+                portion = self._extract_tool_call_portion(current_text)
+                result = self._try_emit_name(portion)
+                if deferred_section_exit and self.in_tool_section:
+                    self._reset_section_state()
+                return result
+
+            # Phase B: tool call ended in this delta
+            if call_end_in_delta:
+                # If a new tool_call_begin also arrived in this delta
+                # (complete tool call in a single chunk), Phase A was
+                # skipped because cur_start == cur_end.  Set up the
+                # new tool call now so _handle_call_end can find it.
+                if (
+                    cur_start_count > prev_start_count
+                    and cur_start_count == cur_end_count
+                ):
+                    self.current_tool_id += 1
+                    self.current_tool_name_sent = False
+                    self._current_tool_args = ""
+                    self.streamed_args_for_tool.append("")
+                    self.prev_tool_call_arr.append({})
                     logger.debug(
-                        "Finishing tool and found diff that had not "
-                        "been streamed yet: %s",
-                        diff,
-                    )
-                    self.streamed_args_for_tool[self.current_tool_id] += diff
-                    # Handle deferred section exit before returning
-                    if deferred_section_exit and self.in_tool_section:
-                        logger.debug("Completing deferred section exit")
-                        self._reset_section_state()
-                    return DeltaMessage(
-                        tool_calls=[
-                            DeltaToolCall(
-                                index=self.current_tool_id,
-                                function=DeltaFunctionCall(arguments=diff).model_dump(
-                                    exclude_none=True
-                                ),
-                            )
-                        ]
+                        "Late setup for tool %s (begin+end in same delta)",
+                        self.current_tool_id,
                     )
 
-            # case -- otherwise we're just generating text
-            else:
-                # Check if we're in tool section - if so, suppress
-                if self.in_tool_section:
-                    logger.debug("In tool section, suppressing text generation")
-                    # Handle deferred section exit before returning
-                    if deferred_section_exit:
-                        self._reset_section_state()
-                    return DeltaMessage(content="")
-                text = delta_text.replace(self.tool_call_start_token, "")
-                text = text.replace(self.tool_call_end_token, "")
-                delta = DeltaMessage(tool_calls=[], content=text)
-                # Handle deferred section exit before returning
+                result = self._handle_call_end(current_text)
                 if deferred_section_exit and self.in_tool_section:
+                    logger.debug("Completing deferred section exit")
                     self._reset_section_state()
-                return delta
+                return result
 
-            current_tool_call = dict()
-            if tool_call_portion:
-                current_tool_call_matches = self.stream_tool_call_portion_regex.match(
-                    tool_call_portion
-                )
-                if current_tool_call_matches:
-                    tool_id, tool_args = current_tool_call_matches.groups()
-                    tool_name = tool_id.split(":")[0].split(".")[-1]
-                    current_tool_call["id"] = tool_id.strip()
-                    current_tool_call["name"] = tool_name
-                    current_tool_call["arguments"] = tool_args
-                else:
-                    current_tool_call_name_matches = (
-                        self.stream_tool_call_name_regex.match(tool_call_portion)
-                    )
-                    if current_tool_call_name_matches:
-                        (tool_id_str,) = current_tool_call_name_matches.groups()
-                        tool_name = tool_id_str.split(":")[0].split(".")[-1]
-                        current_tool_call["id"] = tool_id_str.strip()
-                        current_tool_call["name"] = tool_name
-                        current_tool_call["arguments"] = ""
-                    else:
-                        logger.debug("Not enough token")
-                        return None
-
-            # case - we haven't sent the tool name yet. If it's available, send
-            #   it. otherwise, wait until it's available.
-            if not self.current_tool_name_sent:
-                if current_tool_call is None:
-                    return None
-                function_name: str | None = current_tool_call.get("name")
-                tool_id = current_tool_call.get("id")
-                if function_name:
-                    self.current_tool_name_sent = True
-                    return DeltaMessage(
-                        tool_calls=[
-                            DeltaToolCall(
-                                index=self.current_tool_id,
-                                type="function",
-                                id=tool_id,
-                                function=DeltaFunctionCall(
-                                    name=function_name
-                                ).model_dump(exclude_none=True),
-                            )
-                        ]
-                    )
-                else:
-                    return None
-
-            # case -- otherwise, send the tool call delta
-
-            # if the tool call portion is None, send the delta as text
-            if tool_call_portion is None:
-                # if there's text but not tool calls, send that -
-                # otherwise None to skip chunk
-                # CRITICAL: Never return content if we're in a tool section
-                if self.in_tool_section:
-                    return None
-                delta = (
-                    DeltaMessage(content=delta_text)
-                    if text_portion is not None
-                    else None
-                )
-                return delta
-
-            # now, the nitty-gritty of tool calls
-            # now we have the portion to parse as tool call.
-
-            logger.debug(
-                "Trying to parse current tool call with ID %s", self.current_tool_id
-            )
+            # Phase C: inside a tool call, streaming args
+            if cur_start_count > cur_end_count:
+                portion = self._extract_tool_call_portion(current_text)
+                if not self.current_tool_name_sent:
+                    result = self._try_emit_name(portion)
+                    if deferred_section_exit and self.in_tool_section:
+                        self._reset_section_state()
+                    return result
 
-            # if we're starting a new tool call, push an empty object in as
-            #   a placeholder for the arguments
-            if len(self.prev_tool_call_arr) <= self.current_tool_id:
-                self.prev_tool_call_arr.append({})
+                parsed = self._parse_tool_call_portion(portion)
+                if parsed and parsed.get("arguments"):
+                    result = self._diff_and_emit_args(parsed["arguments"])
+                    if deferred_section_exit and self.in_tool_section:
+                        self._reset_section_state()
+                    return result
 
-            # main logic for tool parsing here - compare prev. partially-parsed
-            #   JSON to the current partially-parsed JSON
-            prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
-                "arguments"
-            )
-            cur_arguments = current_tool_call.get("arguments")
-
-            logger.debug("diffing old arguments: %s", prev_arguments)
-            logger.debug("against new ones: %s", cur_arguments)
-
-            # case -- no arguments have been created yet. skip sending a delta.
-            if not cur_arguments and not prev_arguments:
-                logger.debug("Skipping text %s - no arguments", delta_text)
-                delta = None
-
-            # case -- prev arguments are defined, but non are now.
-            #   probably impossible, but not a fatal error - just keep going
-            elif not cur_arguments and prev_arguments:
-                logger.error(
-                    "should be impossible to have arguments reset "
-                    "mid-call. skipping streaming anything."
-                )
-                delta = None
-
-            # case -- we now have the first info about arguments available from
-            #   autocompleting the JSON
-            elif cur_arguments and not prev_arguments:
-                delta = DeltaMessage(
-                    tool_calls=[
-                        DeltaToolCall(
-                            index=self.current_tool_id,
-                            function=DeltaFunctionCall(
-                                arguments=cur_arguments
-                            ).model_dump(exclude_none=True),
-                        )
-                    ]
-                )
-                self.streamed_args_for_tool[self.current_tool_id] = cur_arguments
+                if deferred_section_exit and self.in_tool_section:
+                    self._reset_section_state()
+                return None
 
-            # last case -- we have an update to existing arguments.
-            elif cur_arguments and prev_arguments:
+            # Phase D: between tools or before first tool
+            if self.in_tool_section:
                 if (
-                    isinstance(delta_text, str)
-                    and cur_arguments != prev_arguments
-                    and len(cur_arguments) > len(prev_arguments)
-                    and cur_arguments.startswith(prev_arguments)
+                    cur_start_count == cur_end_count
+                    and prev_end_count == cur_end_count
+                    and self.tool_call_end_token not in delta_text
+                    and cur_start_count == 0
                 ):
-                    delta_arguments = cur_arguments[len(prev_arguments) :]
-                    logger.debug("got diff %s", delta_text)
-
-                    delta = DeltaMessage(
-                        tool_calls=[
-                            DeltaToolCall(
-                                index=self.current_tool_id,
-                                function=DeltaFunctionCall(
-                                    arguments=delta_arguments
-                                ).model_dump(exclude_none=True),
-                            )
-                        ]
+                    logger.debug(
+                        "In tool section before first tool, suppressing: %s",
+                        delta_text,
                     )
-                    self.streamed_args_for_tool[self.current_tool_id] = cur_arguments
-                else:
-                    delta = None
-
-            # handle saving the state for the current tool into
-            # the "prev" list for use in diffing for the next iteration
-            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
-                self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
-            else:
-                self.prev_tool_call_arr.append(current_tool_call)
-
-            # Handle deferred section exit after tool parsing completes
-            if deferred_section_exit and self.in_tool_section:
-                logger.debug("Completing deferred section exit")
-                self._reset_section_state()
+                    return DeltaMessage(content="")
+                logger.debug("In tool section, suppressing text generation")
+                if deferred_section_exit:
+                    self._reset_section_state()
+                return DeltaMessage(content="")
 
-            return delta
+            # Phase E: text outside tool section
+            text = delta_text
+            for marker in (
+                self.tool_call_start_token,
+                self.tool_call_end_token,
+                *self.tool_calls_start_token_variants,
+                *self.tool_calls_end_token_variants,
+            ):
+                text = text.replace(marker, "")
+            return DeltaMessage(content=text)
 
         except Exception:
             logger.exception("Error trying to handle streaming tool call.")
-            return None  # do not stream a delta. skip this token ID.
+            return None