unslothai · danielhanchen · May 18, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/studio/backend/core/inference/anthropic_compat.py b/studio/backend/core/inference/anthropic_compat.py
@@ -253,9 +253,28 @@ def feed(self, event: dict) -> list[str]:
         elif etype == "metadata":
             self._usage = event.get("usage", {})
             return []
-        # status events — no Anthropic equivalent
+        elif etype == "status" and event.get("boundary"):
+            # Iteration-boundary marker (auto-continue reprompt). Close
+            # the open text block + reset _prev_text so the next content
+            # event diffs against zero. Non-boundary status events (UI
+            # badge clears) don't reach this branch.
+            return self._handle_boundary()
+        # Other status events have no Anthropic equivalent.
         return []
 
+    def _handle_boundary(self) -> list[str]:
+        # Close the current text block + reset the cumulative cursor.
+        # Do NOT pre-open a new text block here -- if the next event is
+        # a tool_start (not text), the eager-open would emit a zero-length
+        # text content block between intent text and the tool_use.
+        # _handle_content lazy-opens when real text arrives.
+        events = []
+        if self._text_block_open:
+            events.append(self._close_block())
+            self.block_index += 1
+        self._prev_text = ""
+        return events
+
     def finish(self, stop_reason: str = "end_turn") -> list[str]:
         """Close any open block and emit message_delta + message_stop."""
         events = []

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
@@ -10,6 +10,7 @@
 
 import atexit
 import contextlib
+import itertools as _itertools
 import json
 import os
 import re
@@ -56,6 +57,51 @@
 )
 _MAX_REPROMPTS = 3
 
+# Mid-plan EOS detectors. Three shapes: trailing intent, list under a
+# "Let me ...:" header, bare trailing colon. "let me know" is a closer,
+# not a plan signal -- excluded via negative lookahead.
+_TRAILING_PLAN_INTENT = re.compile(
+    r"(?i)("
+    r"(?:now\s+)?let me(?!\s+know\b)|i['’]ll now|next,?\s*i['’]ll|"
+    r"i['’]m going to|i will now|let['’]s now"
+    r")[^.!?\n]*[.!?]?\s*$"
+)
+_TRAILING_PLAN_LIST = re.compile(
+    # `\Z` (not `$`) so the regex only fires when the list is the last
+    # thing in the buffer, not when a closing paragraph follows.
+    # `[ \t]*` before each marker keeps it line-anchored so a mid-prose
+    # "42." cannot pose as a phantom list item.
+    r"(?i)"
+    r"(?:let me|i['’]ll|i will|i['’]m going to|i am going to|"
+    r"here['’]?s (?:my |the |a )?(?:plan|approach|steps?)|"
+    r"as follows|the (?:plan|steps?) (?:is|are))"
+    r"[^:\n]{0,160}:\s*\n"
+    r"(?:[ \t]*(?:[-*•]|\d+\.)[ \t]+[^\n]+(?:\n|\Z))+"
+    r"\s*\Z"
+)
+_TRAILING_PLAN_COLON = re.compile(
+    r"(?i)(?:let me|i['’]ll|i will|i['’]m going to|i am going to|"
+    r"now i['’]ll|now i will)"
+    r"[^\n:]{0,200}:\s*$"
+)
+_TRAILING_PLAN_WINDOW = 600
+_MAX_CONTINUES = 3
+
+
+def _trailing_plan_hit(stripped: str) -> bool:
+    """True if the last `_TRAILING_PLAN_WINDOW` chars look mid-plan."""
+    if not stripped:
+        return False
+    tail = stripped[-_TRAILING_PLAN_WINDOW:]
+    if _TRAILING_PLAN_INTENT.search(tail) is not None:
+        return True
+    if _TRAILING_PLAN_LIST.search(tail) is not None:
+        return True
+    if _TRAILING_PLAN_COLON.search(tail) is not None:
+        return True
+    return False
-    if _TRAILING_PLAN_INTENT.search(tail) is not None:
-        return True
-    if _TRAILING_PLAN_LIST.search(tail) is not None:
-        return True
-    if _TRAILING_PLAN_COLON.search(tail) is not None:
-        return True
-    return False
+    return any(
+        regex.search(tail) is not None
+        for regex in (_TRAILING_PLAN_INTENT, _TRAILING_PLAN_LIST, _TRAILING_PLAN_COLON)
+    )
-    if _TRAILING_PLAN_INTENT.search(tail) is not None:
-        return True
-    if _TRAILING_PLAN_LIST.search(tail) is not None:
-        return True
-    if _TRAILING_PLAN_COLON.search(tail) is not None:
-        return True
-    return False
+    return any(
+        regex.search(tail) is not None
+        for regex in (_TRAILING_PLAN_INTENT, _TRAILING_PLAN_LIST, _TRAILING_PLAN_COLON)
+    )
+
+
 # Without max_tokens, llama-server defaults to n_predict = n_ctx (up to
 # 262144 for Qwen3.5), producing many-minute zombie decodes when cancel
 # fails. t_max_predict_ms is a wall-clock backstop applied unconditionally,
@@ -4015,12 +4061,16 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
         # direct answer like "4" or "Hello!" will not match.
         # Pattern is compiled once at module level (_INTENT_SIGNAL).
         _reprompt_count = 0
-
-        # Reserve extra iterations for re-prompts so they don't
-        # consume the caller's tool-call budget.  Only add the
-        # extra slot when tool iterations are actually allowed.
-        _extra = _MAX_REPROMPTS if max_tool_iterations > 0 else 0
-        for iteration in range(max_tool_iterations + _extra):
+        # Separate counter so auto-continue doesn't steal reprompt budget.
+        _continue_count = 0
+
+        # Dynamic cap: caller's max_tool_iterations is honored exactly
+        # until a reprompt/continue actually fires; each consumed event
+        # earns its own slot back. itertools.count preserves loop-body
+        # `continue` semantics.
+        for iteration in _itertools.count():
+            if iteration >= (max_tool_iterations + _reprompt_count + _continue_count):
+                break
             if cancel_event is not None and cancel_event.is_set():
                 return
 
@@ -4353,18 +4403,41 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                         _stripped = content_accum.strip()
                         if not _stripped:
                             _stripped = reasoning_accum.strip()
-                        if (
+
+                        # Tool-coercive reprompt: intent text without a tool call.
+                        _tool_intent_hit = (
                             tools
                             and _reprompt_count < _MAX_REPROMPTS
                             and 0 < len(_stripped) < _REPROMPT_MAX_CHARS
-                            and _INTENT_SIGNAL.search(_stripped)
-                        ):
-                            _reprompt_count += 1
-                            logger.info(
-                                f"Re-prompt {_reprompt_count}/{_MAX_REPROMPTS}: "
-                                f"model responded without calling tools "
-                                f"({len(_stripped)} chars)"
-                            )
+                            and _INTENT_SIGNAL.search(_stripped) is not None
+                        )
+                        # Neutral auto-continue on mid-plan EOS. Works without tools.
+                        _trailing_hit = (
+                            _continue_count < _MAX_CONTINUES
+                            and _trailing_plan_hit(_stripped)
+                        )
+
+                        if _tool_intent_hit or _trailing_hit:
+                            if _tool_intent_hit:
+                                _reprompt_count += 1
+                                logger.info(
+                                    f"Re-prompt {_reprompt_count}/{_MAX_REPROMPTS}: "
+                                    f"model responded without calling tools "
+                                    f"({len(_stripped)} chars)"
+                                )
+                                _nudge = (
+                                    "STOP. Do NOT write code or explain. "
+                                    "You MUST call a tool NOW. "
+                                    "Call web_search or python immediately."
+                                )
+                            else:
+                                _continue_count += 1
+                                logger.info(
+                                    f"Auto-continue {_continue_count}/{_MAX_CONTINUES}: "
+                                    f"model ended turn mid-plan "
+                                    f"({len(_stripped)} chars)"
+                                )
+                                _nudge = "Continue."
                             conversation.append(
                                 {
                                     "role": "assistant",
@@ -4374,11 +4447,7 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                             conversation.append(
                                 {
                                     "role": "user",
-                                    "content": (
-                                        "STOP. Do NOT write code or explain. "
-                                        "You MUST call a tool NOW. "
-                                        "Call web_search or python immediately."
-                                    ),
+                                    "content": _nudge,
                                 }
                             )
                             # Accumulate tokens and timing from this iteration
@@ -4389,7 +4458,9 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                             _it_r = _iter_timings or {}
                             _accumulated_predicted_ms += _it_r.get("predicted_ms", 0)
                             _accumulated_predicted_n += _it_r.get("predicted_n", 0)
-                            yield {"type": "status", "text": ""}
+                            # boundary=True: next iter starts a fresh
+                            # turn, so adapters must reset their cursor.
+                            yield {"type": "status", "text": "", "boundary": True}
                             continue
 
                         # Content was already streamed.  Yield metadata.
@@ -4657,9 +4728,10 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                         tool_msg["tool_call_id"] = tool_call_id
                     conversation.append(tool_msg)
 
-                # Clear tool status badge before next generation iteration
+                # UI badge clear. NOT a boundary: tool_end already
+                # reset adapter cursors (would emit a spurious empty
+                # block if we flagged it).
                 yield {"type": "status", "text": ""}
-                # Continue the loop to let model respond with context
                 continue
 
             except httpx.ConnectError:

diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
@@ -2459,11 +2459,10 @@ async def gguf_tool_stream():
                             break
 
                         if event["type"] == "status":
-                            # Empty status marks an iteration boundary
-                            # in the GGUF tool loop (e.g. after a
-                            # re-prompt).  Reset the cumulative cursor
-                            # so the next assistant turn streams cleanly.
-                            if not event["text"]:
+                            # boundary=True: auto-continue reprompt.
+                            # Reset cursor only then; plain empty-status
+                            # events (badge clears) keep the cursor.
+                            if event.get("boundary"):
                                 prev_text = ""
                             # Emit tool status as a custom SSE event
                             # (including empty ones to clear UI badges)
@@ -2477,8 +2476,10 @@ async def gguf_tool_stream():
                             continue
 
                         if event["type"] in ("tool_start", "tool_end"):
-                            if event["type"] == "tool_start":
-                                prev_text = ""
+                            # Both edges of a tool call restart cumulative
+                            # text: tool_start opens a new stream, tool_end
+                            # is the cursor reset for the post-tool turn.
+                            prev_text = ""
                             yield f"data: {json.dumps(event)}\n\n"
                             continue
 
@@ -4370,6 +4371,9 @@ async def _anthropic_tool_non_streaming(run_gen, message_id, model_name):
             )
         elif etype == "tool_end":
             prev_text = ""
+        elif etype == "status" and event.get("boundary"):
+            # Iteration-boundary marker: reset like tool_end does.
+            prev_text = ""
         elif etype == "metadata":
             usage = event.get("usage", {})
 

@@ -629,6 +629,90 @@ def test_text_after_tool_resets_prev_text(self):
         parsed = json.loads(events[0].split("data: ")[1])
         assert parsed["delta"]["text"] == "After tool"
 
+    def test_boundary_flag_closes_block_and_resets_cursor(self):
+        """An iteration-boundary status (boundary=True) must close the
+        open text block, open a fresh one, and reset _prev_text so the
+        next content delta starts from zero."""
+
+        e = AnthropicStreamEmitter()
+        e.start("msg_1", "m")
+        e.feed({"type": "content", "text": "first turn"})
+        boundary = e.feed({"type": "status", "text": "", "boundary": True})
+        # Boundary must produce content_block_stop + content_block_start
+        # so the next text lives in a new block.
+        joined = "\n".join(boundary)
+        assert "content_block_stop" in joined
+        assert "content_block_start" in joined
+        # Next content delta must include the full "second turn", not a
+        # diff against the previous turn's length.
+        nxt = e.feed({"type": "content", "text": "second turn"})
+        parsed = json.loads(nxt[0].split("data: ")[1])
+        assert parsed["delta"]["text"] == "second turn"
+
+    def test_post_tool_empty_status_does_not_double_close(self):
+        """After tool_end already opens a fresh text block, the post-tool
+        empty-status event emitted by llama_cpp.py (line 4766) must NOT
+        close that fresh block. Otherwise every tool call produces a
+        spurious empty content_block_stop + content_block_start pair
+        before the model's post-tool text arrives. Regression test for
+        PR 5549 codex 02:42Z."""
+
+        e = AnthropicStreamEmitter()
+        e.start("msg_1", "m")
+        e.feed({"type": "content", "text": "pre"})
+        e.feed(
+            {
+                "type": "tool_start",
+                "tool_name": "t",
+                "tool_call_id": "tc_1",
+                "arguments": {},
+            }
+        )
+        e.feed(
+            {
+                "type": "tool_end",
+                "tool_name": "t",
+                "tool_call_id": "tc_1",
+                "result": "ok",
+            }
+        )
+        block_after_tool = e.block_index
+        # Post-tool empty status (no boundary flag): should produce zero
+        # SSE events and leave block_index unchanged. The previous
+        # behaviour was to close+reopen, which produced a duplicate
+        # empty content block.
+        out = e.feed({"type": "status", "text": ""})
+        assert out == []
+        assert e.block_index == block_after_tool
+        # Next content delta lands in the same fresh text block that
+        # tool_end opened.
+        nxt = e.feed({"type": "content", "text": "post"})
+        parsed = json.loads(nxt[0].split("data: ")[1])
+        assert parsed["delta"]["text"] == "post"
+
+    def test_empty_status_without_boundary_does_not_close_block(self):
+        """A non-boundary empty-status event (UI badge clear at normal
+        stream end, draining fallbacks, final status yields in
+        llama_cpp.py at lines 4501, 4584, 4794) must NOT close the
+        current text block or reset _prev_text - otherwise every normal
+        Anthropic response gets extra content_block_start/stop pairs
+        around its final text. Regression test for PR 5549 codex P2."""
+
+        e = AnthropicStreamEmitter()
+        e.start("msg_1", "m")
+        block_before = e.block_index
+        e.feed({"type": "content", "text": "hello "})
+        # Plain empty status (no boundary flag) -> no extra SSE events.
+        out = e.feed({"type": "status", "text": ""})
+        assert out == []
+        # block_index must not have advanced (no close+reopen happened).
+        assert e.block_index == block_before
+        # Next content delta is diffed against "hello ", so we only emit
+        # " world" (the new suffix).
+        nxt = e.feed({"type": "content", "text": "hello world"})
+        parsed = json.loads(nxt[0].split("data: ")[1])
+        assert parsed["delta"]["text"] == "world"
+
 
 # =====================================================================
 # Pass-through emitter tests (client-side tool execution path)