From 3a118e5df0061e7b8ec2c69d130d005f0b6eabbd Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Mon, 18 May 2026 11:00:43 +0000
Subject: [PATCH 01/11] studio: auto-continue when model stops mid-plan

The existing intent-signal re-prompt fires only when tools are armed
and the response is short. Models often stop mid-plan in other shapes
too: a trailing "Let me clone the repo.", a "Let me ...:" header
followed by a numbered list, or a bare trailing colon. When this
happens the turn ends with the structured workload only partially
delivered.

Add a neutral "Continue." nudge that runs alongside the tool-coercive
re-prompt:

- _TRAILING_PLAN_INTENT, _TRAILING_PLAN_LIST, _TRAILING_PLAN_COLON
  cover the three observed shapes, scanned over the last 600 chars.
- _trailing_plan_hit() returns True if any of them match.
- _MAX_CONTINUES (3) is independent of _MAX_REPROMPTS so the two
  paths cannot starve each other.
- _continue_count threads through the agentic loop; auto-continue
  fires with "Continue." regardless of tool armament.

Regex tested against the patterns above plus negative controls
(complete sentences, benign "let me" earlier in the buffer) before
landing.
---
 studio/backend/core/inference/llama_cpp.py | 117 +++++++++++++++++----
 1 file changed, 98 insertions(+), 19 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 21f2fe71b5..560d50e92e 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -56,6 +56,54 @@
 )
 _MAX_REPROMPTS = 3
 
+# Mid-plan EOS detectors. Auto-continue runs alongside the _INTENT_SIGNAL
+# re-prompt path but stays neutral ("Continue.") instead of tool-coercive,
+# so it also handles tool-less turns. Three observed shapes:
+#   1. Trailing intent at end of buffer: "Let me clone the repo."
+#   2. Numbered or bulleted list under a "Let me ...:" header.
+#   3. Bare trailing colon: "Let me check the repo:"
+_TRAILING_PLAN_INTENT = re.compile(
+    r"(?i)("
+    r"let me|now let me|i['’]ll now|next,?\s*i['’]ll|"
+    r"i['’]m going to|i will now|let['’]s now"
+    r")[^.!?\n]*[.!?]?\s*$"
+)
+_TRAILING_PLAN_LIST = re.compile(
+    r"(?ims)"
+    r"(?:let me|i['’]ll|i will|i['’]m going to|i am going to|"
+    r"here['’]?s (?:my |the |a )?(?:plan|approach|steps?)|"
+    r"as follows|the (?:plan|steps?) (?:is|are))"
+    r"[^:\n]{0,160}:\s*\n"
+    r"(?:\s*(?:[-*•]|\d+\.)\s+[^\n]+\n?)+"
+    r"\s*$"
+)
+_TRAILING_PLAN_COLON = re.compile(
+    r"(?i)(?:let me|i['’]ll|i will|i['’]m going to|i am going to|"
+    r"now i['’]ll|now i will)"
+    r"[^\n:]{0,200}:\s*$"
+)
+_TRAILING_PLAN_WINDOW = 600
+_MAX_CONTINUES = 3
+
+
+def _trailing_plan_hit(stripped: str) -> bool:
+    """True if the last `_TRAILING_PLAN_WINDOW` chars look mid-plan.
+
+    The window covers both single-line trailing intent and list endings
+    where the intent cue is several lines above the last item.
+    """
+    if not stripped:
+        return False
+    tail = stripped[-_TRAILING_PLAN_WINDOW:]
+    if _TRAILING_PLAN_INTENT.search(tail) is not None:
+        return True
+    if _TRAILING_PLAN_LIST.search(tail) is not None:
+        return True
+    if _TRAILING_PLAN_COLON.search(tail) is not None:
+        return True
+    return False
+
+
 # Without max_tokens, llama-server defaults to n_predict = n_ctx (up to
 # 262144 for Qwen3.5), producing many-minute zombie decodes when cancel
 # fails. t_max_predict_ms is a wall-clock backstop applied unconditionally,
@@ -4015,11 +4063,18 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
         # direct answer like "4" or "Hello!" will not match.
         # Pattern is compiled once at module level (_INTENT_SIGNAL).
         _reprompt_count = 0
-
-        # Reserve extra iterations for re-prompts so they don't
-        # consume the caller's tool-call budget.  Only add the
-        # extra slot when tool iterations are actually allowed.
-        _extra = _MAX_REPROMPTS if max_tool_iterations > 0 else 0
+        # Auto-continue (mid-plan EOS) uses its own counter so it does
+        # not steal the tool-coercive re-prompt budget.
+        _continue_count = 0
+
+        # Reserve extra iterations for re-prompts and continues so they
+        # don't consume the caller's tool-call budget. Only add the
+        # extra slots when tool iterations are actually allowed.
+        _extra = (
+            _MAX_REPROMPTS + _MAX_CONTINUES
+            if max_tool_iterations > 0
+            else 0
+        )
         for iteration in range(max_tool_iterations + _extra):
             if cancel_event is not None and cancel_event.is_set():
                 return
@@ -4353,18 +4408,46 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                         _stripped = content_accum.strip()
                         if not _stripped:
                             _stripped = reasoning_accum.strip()
-                        if (
+
+                        # Tool-coercive re-prompt fires when there are
+                        # tools and the model wrote intent text without
+                        # invoking one.
+                        _tool_intent_hit = (
                             tools
                             and _reprompt_count < _MAX_REPROMPTS
                             and 0 < len(_stripped) < _REPROMPT_MAX_CHARS
-                            and _INTENT_SIGNAL.search(_stripped)
-                        ):
-                            _reprompt_count += 1
-                            logger.info(
-                                f"Re-prompt {_reprompt_count}/{_MAX_REPROMPTS}: "
-                                f"model responded without calling tools "
-                                f"({len(_stripped)} chars)"
-                            )
+                            and _INTENT_SIGNAL.search(_stripped) is not None
+                        )
+                        # Neutral auto-continue fires when the model
+                        # stops mid-plan with a trailing intent cue,
+                        # numbered/bulleted list, or bare colon. Works
+                        # with or without tools, on any response length.
+                        _trailing_hit = (
+                            _continue_count < _MAX_CONTINUES
+                            and _trailing_plan_hit(_stripped)
+                        )
+
+                        if _tool_intent_hit or _trailing_hit:
+                            if _tool_intent_hit:
+                                _reprompt_count += 1
+                                logger.info(
+                                    f"Re-prompt {_reprompt_count}/{_MAX_REPROMPTS}: "
+                                    f"model responded without calling tools "
+                                    f"({len(_stripped)} chars)"
+                                )
+                                _nudge = (
+                                    "STOP. Do NOT write code or explain. "
+                                    "You MUST call a tool NOW. "
+                                    "Call web_search or python immediately."
+                                )
+                            else:
+                                _continue_count += 1
+                                logger.info(
+                                    f"Auto-continue {_continue_count}/{_MAX_CONTINUES}: "
+                                    f"model ended turn mid-plan "
+                                    f"({len(_stripped)} chars)"
+                                )
+                                _nudge = "Continue."
                             conversation.append(
                                 {
                                     "role": "assistant",
@@ -4374,11 +4457,7 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                             conversation.append(
                                 {
                                     "role": "user",
-                                    "content": (
-                                        "STOP. Do NOT write code or explain. "
-                                        "You MUST call a tool NOW. "
-                                        "Call web_search or python immediately."
-                                    ),
+                                    "content": _nudge,
                                 }
                             )
                             # Accumulate tokens and timing from this iteration

From af31c54d0212f3cd540d96c549ab6beea333c9b1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 18 May 2026 11:01:29 +0000
Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/llama_cpp.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 560d50e92e..3839e97228 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -4070,11 +4070,7 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
         # Reserve extra iterations for re-prompts and continues so they
         # don't consume the caller's tool-call budget. Only add the
         # extra slots when tool iterations are actually allowed.
-        _extra = (
-            _MAX_REPROMPTS + _MAX_CONTINUES
-            if max_tool_iterations > 0
-            else 0
-        )
+        _extra = _MAX_REPROMPTS + _MAX_CONTINUES if max_tool_iterations > 0 else 0
         for iteration in range(max_tool_iterations + _extra):
             if cancel_event is not None and cancel_event.is_set():
                 return

From 8503fb33ae650d14d995b961edaef64a3395c10c Mon Sep 17 00:00:00 2001
From: danielhanchen <michaelhan2050@gmail.com>
Date: Mon, 18 May 2026 19:39:25 +0000
Subject: [PATCH 03/11] studio: exclude "let me know" from trailing-plan
 auto-continue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex review on #5549 flagged that endings like
"If you need anything else, let me know." match the trailing-plan
intent pattern (the regex matches "let me <anything>." at end of
buffer). On a finished, user-facing closing this fires the auto-
continue branch up to three times, costing latency / tokens and
appending unrelated text after the response.

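Add a negative lookahead so a bare "let me" only counts as a mid-plan
signal when it is NOT immediately followed by "know". The other intent
phrases ("now let me", "i'll now", "i'm going to", "i will now",
"let's now") are left unchanged. Note that the lookahead is attached to
the bare "let me" alternative only, and none of the other alternatives
requires a particular verb, so a closer phrased as "now let me know"
would still match.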

Verified against `scripts/r6_trailing_plan_regex_test.py`: closing
"let me know" variants no longer match; "let me clone/check/run …"
still does.
---
 studio/backend/core/inference/llama_cpp.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 3839e97228..1356232a38 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -62,9 +62,12 @@
 #   1. Trailing intent at end of buffer: "Let me clone the repo."
 #   2. Numbered or bulleted list under a "Let me ...:" header.
 #   3. Bare trailing colon: "Let me check the repo:"
+# "let me know" is a closing phrase, not a mid-plan signal; exclude it via
+# negative lookahead so a final "If you need anything else, let me know."
+# does not trigger spurious auto-continues.
 _TRAILING_PLAN_INTENT = re.compile(
     r"(?i)("
-    r"let me|now let me|i['’]ll now|next,?\s*i['’]ll|"
+    r"let me(?!\s+know\b)|now let me|i['’]ll now|next,?\s*i['’]ll|"
     r"i['’]m going to|i will now|let['’]s now"
     r")[^.!?\n]*[.!?]?\s*$"
 )

From 25acc2062e766e794338d4cd1ffb2b0106452a17 Mon Sep 17 00:00:00 2001
From: danielhanchen <michaelhan2050@gmail.com>
Date: Mon, 18 May 2026 21:42:31 +0000
Subject: [PATCH 04/11] studio: reset Anthropic adapter cursor on auto-continue
 boundary

Codex P2 on #5549 flagged that the auto-continue branch yields only
a `{"type":"status","text":""}` event between turns; the Anthropic
streaming emitter (`AnthropicStreamEmitter`) and the non-streaming
tool path (`_anthropic_tool_non_streaming`) both ignore `status`
events, so their cumulative-text cursor still holds the previous
turn's full length when the continuation starts streaming. Shorter
continuations get dropped entirely and longer ones lose their prefix.

Treat the empty-text status as an auto-continue boundary in both
paths:

  - `AnthropicStreamEmitter`: close any open text block, open a fresh
    one (matches the `tool_end` reset pattern), and clear `_prev_text`.
  - `_anthropic_tool_non_streaming`: clear `prev_text` so the next
    `content` event's diff baseline is empty.

Non-empty status events (tool progress text) keep their existing
no-op semantics.
---
 .../core/inference/anthropic_compat.py        | 21 ++++++++++++++++++-
 studio/backend/routes/inference.py            |  7 +++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/inference/anthropic_compat.py b/studio/backend/core/inference/anthropic_compat.py
index 263718c540..8e1c939e55 100644
--- a/studio/backend/core/inference/anthropic_compat.py
+++ b/studio/backend/core/inference/anthropic_compat.py
@@ -253,9 +253,28 @@ def feed(self, event: dict) -> list[str]:
         elif etype == "metadata":
             self._usage = event.get("usage", {})
             return []
-        # status events — no Anthropic equivalent
+        elif etype == "status" and not event.get("text"):
+            # Auto-continue boundary marker emitted by
+            # generate_chat_completion_with_tools — the next "content"
+            # event resets to a fresh cumulative baseline, so close any
+            # open text block and clear the diff cursor. Without this
+            # the next continuation gets diffed against the previous
+            # turn's length (shorter continuations are dropped, longer
+            # ones lose their prefix).
+            return self._handle_boundary()
+        # Other status events (tool progress text) have no Anthropic
+        # equivalent.
         return []
 
+    def _handle_boundary(self) -> list[str]:
+        events = []
+        if self._text_block_open:
+            events.append(self._close_block())
+            self.block_index += 1
+            events.extend(self._open_text_block())
+        self._prev_text = ""
+        return events
+
     def finish(self, stop_reason: str = "end_turn") -> list[str]:
         """Close any open block and emit message_delta + message_stop."""
         events = []
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 607245467c..6b4c2f0268 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -4370,6 +4370,13 @@ async def _anthropic_tool_non_streaming(run_gen, message_id, model_name):
             )
         elif etype == "tool_end":
             prev_text = ""
+        elif etype == "status" and not event.get("text"):
+            # Auto-continue boundary marker: the next content event
+            # restarts the cumulative diff baseline, so reset prev_text
+            # the same way tool_end does. Without this a shorter
+            # continuation gets dropped entirely and a longer one
+            # loses its prefix.
+            prev_text = ""
         elif etype == "metadata":
             usage = event.get("usage", {})
 

From fd53dab93146f8b6e75c5ce17948da2deb0932a0 Mon Sep 17 00:00:00 2001
From: danielhanchen <michaelhan2050@gmail.com>
Date: Mon, 18 May 2026 23:38:47 +0000
Subject: [PATCH 05/11] studio: tighten trailing-plan list anchor + grow
 tool-iter cap on demand

Codex P2 review on #5549 surfaced two related risks in the auto-continue
plumbing:

1. `_TRAILING_PLAN_LIST` was compiled with `(?ims)`. The `m` flag makes
   the terminal `\s*$` match end-of-line, so a complete answer like
   "Here's my plan:\n- a\n- b\n\nDone, that should work." still matched
   the list-block sub-pattern and tripped a spurious `Continue.` retry.
   Drop the `m` (and the unused `s`) flag and re-anchor with `\Z` so the
   list pattern only fires when the list is genuinely the last thing in
   the buffer.

2. The agent loop pre-reserved `_MAX_REPROMPTS + _MAX_CONTINUES` (= 6)
   extra iterations on top of the caller's `max_tool_iterations`
   unconditionally. That weakens the caller-provided budget: a turn
   that never trips the reprompt or continue path could still run up
   to N+6 full iterations and execute their tool calls.

   Switch the bound to a dynamic cap that grows only as reprompts /
   continues are actually consumed: `iteration < max_tool_iterations +
   _reprompt_count + _continue_count`. With both counters at zero the
   loop honors the caller cap exactly; once a continue or reprompt
   fires it earns its own slot back.

   Implemented with `itertools.count()` so the existing `continue`
   statements in the loop body keep their semantics.

Regex behaviour pinned by `scripts/r6_trailing_plan_regex_test.py`
(updated separately for the new list-tail case).
---
 studio/backend/core/inference/llama_cpp.py | 26 ++++++++++++++++------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 1356232a38..e6cd877896 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -10,6 +10,7 @@
 
 import atexit
 import contextlib
+import itertools as _itertools
 import json
 import os
 import re
@@ -72,13 +73,16 @@
     r")[^.!?\n]*[.!?]?\s*$"
 )
 _TRAILING_PLAN_LIST = re.compile(
-    r"(?ims)"
+    # No `m` flag: terminal `\s*$` must match end-of-string, not end-of-line.
+    # With `m` an answer like "1. one\n2. two\n\nDone." would still match on
+    # the list block and trigger a spurious auto-continue.
+    r"(?i)"
     r"(?:let me|i['’]ll|i will|i['’]m going to|i am going to|"
     r"here['’]?s (?:my |the |a )?(?:plan|approach|steps?)|"
     r"as follows|the (?:plan|steps?) (?:is|are))"
     r"[^:\n]{0,160}:\s*\n"
     r"(?:\s*(?:[-*•]|\d+\.)\s+[^\n]+\n?)+"
-    r"\s*$"
+    r"\s*\Z"
 )
 _TRAILING_PLAN_COLON = re.compile(
     r"(?i)(?:let me|i['’]ll|i will|i['’]m going to|i am going to|"
@@ -4070,11 +4074,19 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
         # not steal the tool-coercive re-prompt budget.
         _continue_count = 0
 
-        # Reserve extra iterations for re-prompts and continues so they
-        # don't consume the caller's tool-call budget. Only add the
-        # extra slots when tool iterations are actually allowed.
-        _extra = _MAX_REPROMPTS + _MAX_CONTINUES if max_tool_iterations > 0 else 0
-        for iteration in range(max_tool_iterations + _extra):
+        # Grant headroom for re-prompts and continues only as they're
+        # actually consumed, so the caller's tool-iteration cap is
+        # respected end-to-end. Each consumed re-prompt or continue
+        # raises the effective cap by exactly one slot, so the total
+        # never exceeds max_tool_iterations + reprompts + continues
+        # (bounded above by max_tool_iterations + _MAX_REPROMPTS +
+        # _MAX_CONTINUES). itertools.count keeps `continue` semantics
+        # intact in the loop body below.
+        for iteration in _itertools.count():
+            if iteration >= (
+                max_tool_iterations + _reprompt_count + _continue_count
+            ):
+                break
             if cancel_event is not None and cancel_event.is_set():
                 return
 

From 66d59e01d0022f76d3e8e468c5a0a983a24f504e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 18 May 2026 23:39:03 +0000
Subject: [PATCH 06/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/llama_cpp.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index e6cd877896..37c8d61f55 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -4083,9 +4083,7 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
         # _MAX_CONTINUES). itertools.count keeps `continue` semantics
         # intact in the loop body below.
         for iteration in _itertools.count():
-            if iteration >= (
-                max_tool_iterations + _reprompt_count + _continue_count
-            ):
+            if iteration >= (max_tool_iterations + _reprompt_count + _continue_count):
                 break
             if cancel_event is not None and cancel_event.is_set():
                 return

From 7dbe18ab51cd7262e7588bd172a4976bc73a9084 Mon Sep 17 00:00:00 2001
From: danielhanchen <michaelhan2050@gmail.com>
Date: Tue, 19 May 2026 01:43:39 +0000
Subject: [PATCH 07/11] studio: line-anchor trailing-plan list items and ship a
 backend pytest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While drafting backend/tests/test_trailing_plan.py for the changes
landed in b4e0985, the new tests surfaced a deeper false-positive the
earlier regex tightening missed.

For input "Let me explain:\n1. The function returns 42.\n\nThat's the
answer." the previous `(?:\s*(?:[-*•]|\d+\.)\s+[^\n]+\n?)+` allowed the
regex engine to backtrack and treat the in-prose "42." substring as a
second list-item marker: iter 1 consumed "1. The function returns 4"
and iter 2 consumed "2.\n\nThat's the answer." (with `\s+` greedily
crossing the empty-line break). The pattern then satisfied `\s*\Z` and
the buffer fired a spurious `Continue.` retry on what was already a
fully-formed answer.

Tighten the per-item boundary:

  - `[ \t]*` before the marker (no newlines): forces the marker to sit
    at the start of its own line. A mid-prose "42." cannot satisfy this
    because the engine cannot rewind past the preceding `\n` without
    invalidating the previous iteration's `[^\n]+\n` close.
  - `[ \t]+` between the marker and content: blocks an `\s+`-driven
    cross-newline reach into a closing paragraph.
  - `(?:\n|\Z)` at end of item: a real line break OR end of buffer.
    Preserves the "list at EOB with no trailing newline" case while
    eliminating the backtrack route.

Land backend/tests/test_trailing_plan.py at the same time, covering:

  - `_TRAILING_PLAN_INTENT` "let me know" closer exclusion (cycle-3 fix).
  - `_TRAILING_PLAN_LIST` correctly fires on genuine trailing lists
    (dash, asterisk, unicode bullet, numeric).
  - `_TRAILING_PLAN_LIST` does NOT fire on list + closing paragraph,
    list + closing sentence, list embedded mid-text, or the "42."
    in-prose digit case.
  - `_TRAILING_PLAN_COLON` fires on bare trailing intent-colons only.
  - `_trailing_plan_hit` composite cases.
  - The 600-char window slicing.

31 cases, all pass. Pins the regex behaviour against future regressions
inside the repo (the prior pin script lived only in the probing
workspace, not the studio tree).
---
 studio/backend/core/inference/llama_cpp.py |  16 ++-
 studio/backend/tests/test_trailing_plan.py | 150 +++++++++++++++++++++
 2 files changed, 162 insertions(+), 4 deletions(-)
 create mode 100644 studio/backend/tests/test_trailing_plan.py

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 37c8d61f55..a1b7734301 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -73,15 +73,23 @@
     r")[^.!?\n]*[.!?]?\s*$"
 )
 _TRAILING_PLAN_LIST = re.compile(
-    # No `m` flag: terminal `\s*$` must match end-of-string, not end-of-line.
-    # With `m` an answer like "1. one\n2. two\n\nDone." would still match on
-    # the list block and trigger a spurious auto-continue.
+    # No `m` flag: the trailing anchor must match end-of-string, not
+    # end-of-line. With `m` an answer like "1. one\n2. two\n\nDone." would
+    # still match on the list block and trigger a spurious auto-continue.
+    #
+    # Anchors below also defend against a subtler backtrack: each list
+    # item starts with `[ \t]*` (horizontal whitespace only, NOT `\s*`)
+    # so the marker must sit on its own line, not be picked up mid-prose
+    # via a substring like "42." inside a sentence such as
+    # "Let me explain:\n1. The function returns 42.\n\nThat's the answer."
+    # The item content ends with `(?:\n|\Z)` so the iteration boundary
+    # is a real line break or end-of-buffer.
     r"(?i)"
     r"(?:let me|i['’]ll|i will|i['’]m going to|i am going to|"
     r"here['’]?s (?:my |the |a )?(?:plan|approach|steps?)|"
     r"as follows|the (?:plan|steps?) (?:is|are))"
     r"[^:\n]{0,160}:\s*\n"
-    r"(?:\s*(?:[-*•]|\d+\.)\s+[^\n]+\n?)+"
+    r"(?:[ \t]*(?:[-*•]|\d+\.)[ \t]+[^\n]+(?:\n|\Z))+"
     r"\s*\Z"
 )
 _TRAILING_PLAN_COLON = re.compile(
diff --git a/studio/backend/tests/test_trailing_plan.py b/studio/backend/tests/test_trailing_plan.py
new file mode 100644
index 0000000000..d4c5cc7e07
--- /dev/null
+++ b/studio/backend/tests/test_trailing_plan.py
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""Tests for the mid-plan auto-continue regexes and ``_trailing_plan_hit``.
+
+The trailing-plan detector lives in ``core.inference.llama_cpp`` and decides
+whether the model just stopped mid-plan (and therefore deserves a neutral
+``Continue.`` re-prompt). False positives cost real tool calls and latency,
+so the patterns get explicit coverage here.
+
+Bug history pinned by these tests:
+
+* ``"If you need anything else, let me know."`` matched ``_TRAILING_PLAN_INTENT``
+  before the negative lookahead landed.
+* ``"Here's my plan:\\n- a\\n- b\\n\\nDone, that should work."`` matched
+  ``_TRAILING_PLAN_LIST`` before the regex was switched off the ``m`` flag
+  and re-anchored with ``\\Z``.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from core.inference.llama_cpp import (
+    _TRAILING_PLAN_COLON,
+    _TRAILING_PLAN_INTENT,
+    _TRAILING_PLAN_LIST,
+    _trailing_plan_hit,
+)
+
+
+# ----- _TRAILING_PLAN_INTENT -------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "text,expected",
+    [
+        # Closing phrases must NOT match (regression: "let me know")
+        ("If you need anything else, let me know.", False),
+        ("Let me know if I can help further.", False),
+        ("let me know!", False),
+        # Genuine mid-plan intent SHOULD match
+        ("Let me clone the repo.", True),
+        ("Let me check the file.", True),
+        ("Now let me run the tests.", True),
+        ("I'll now run the analyzer.", True),
+        ("I’ll now run the analyzer.", True),  # curly apostrophe
+        ("I will now begin.", True),
+        # Unrelated trailing text must NOT match
+        ("Hello world.", False),
+        ("The answer is 42.", False),
+    ],
+)
+def test_trailing_plan_intent(text: str, expected: bool) -> None:
+    assert bool(_TRAILING_PLAN_INTENT.search(text)) is expected
+
+
+# ----- _TRAILING_PLAN_LIST ---------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "text,expected",
+    [
+        # List block at end of buffer SHOULD match
+        ("Let me do this:\n- step one\n- step two\n", True),
+        ("Here's my plan:\n1. one\n2. two\n", True),
+        ("Here's my plan:\n1. one\n2. two\n   \n", True),  # trailing whitespace
+        # Unicode bullet at end of buffer SHOULD match
+        ("Let me try:\n• first\n• second\n", True),
+        # List followed by a closing sentence MUST NOT match (regression:
+        # the `m` flag in `(?ims)` previously let `\s*$` match end-of-line)
+        (
+            "Here's my plan:\n- step one\n- step two\n\nDone, hope that helps.",
+            False,
+        ),
+        (
+            "Let me walk through it:\n1. first\n2. second\n\nThat's everything.",
+            False,
+        ),
+        # Single-item numbered list followed by closing prose MUST NOT match
+        (
+            "Let me explain:\n1. The function returns 42.\n\nThat's the answer.",
+            False,
+        ),
+        # List embedded mid-text (not trailing) MUST NOT match
+        (
+            "Here's my plan:\n- step one\n- step two\nNow the conclusion follows.",
+            False,
+        ),
+    ],
+)
+def test_trailing_plan_list(text: str, expected: bool) -> None:
+    assert bool(_TRAILING_PLAN_LIST.search(text)) is expected
+
+
+# ----- _TRAILING_PLAN_COLON --------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "text,expected",
+    [
+        # Bare trailing colon SHOULD match
+        ("Let me check the repo:", True),
+        ("I'll now look at this:", True),
+        # Colon mid-sentence MUST NOT match
+        ("Let me check this: it should work fine.", False),
+        # Colon not in an intent-cue clause MUST NOT match
+        ("The result is:", False),
+    ],
+)
+def test_trailing_plan_colon(text: str, expected: bool) -> None:
+    assert bool(_TRAILING_PLAN_COLON.search(text)) is expected
+
+
+# ----- _trailing_plan_hit composite -----------------------------------------
+
+
+@pytest.mark.parametrize(
+    "text,expected",
+    [
+        # Any of the three sub-patterns triggers a hit
+        ("Now let me run the tests.", True),
+        ("Let me do this:\n- step one\n- step two\n", True),
+        ("Let me check the repo:", True),
+        # Negative cases that previously misfired
+        ("If you need anything else, let me know.", False),
+        (
+            "Here's my plan:\n- step one\n- step two\n\nDone, hope that helps.",
+            False,
+        ),
+        # Empty and whitespace-only strings never count as a hit
+        ("", False),
+        ("   ", False),
+    ],
+)
+def test_trailing_plan_hit(text: str, expected: bool) -> None:
+    assert _trailing_plan_hit(text) is expected
+
+
+# ----- window slicing --------------------------------------------------------
+
+
+def test_trailing_plan_hit_respects_window() -> None:
+    """An intent cue further back than ``_TRAILING_PLAN_WINDOW`` must NOT
+    trigger a hit; only the tail of the response is inspected."""
+
+    # Intent cue, ~960 chars of unrelated filler, then a finalising sentence.
+    prefix = "lorem ipsum " * 80  # ~960 chars
+    text = f"Let me check the repo. {prefix}The result is 42."
+    assert _trailing_plan_hit(text) is False

From 0cc69de2109a884a5ffa067dcc8f4e74e04c8745 Mon Sep 17 00:00:00 2001
From: danielhanchen <michaelhan2050@gmail.com>
Date: Tue, 19 May 2026 02:39:35 +0000
Subject: [PATCH 08/11] studio: scope agentic-loop boundary marker so normal
 completions stream cleanly

Codex P2 on #5549 flagged that the cycle-5 cursor-reset (efa43c4) keyed
on every empty-status event, but generate_chat_completion_with_tools
emits empty status events in five places, only two of which are real
iteration boundaries:

  - Line 4497: emitted right before `continue` after a re-prompt /
    auto-continue. The next iteration starts a fresh assistant turn.
    BOUNDARY: reset cursor.
  - Line 4766: emitted right before `continue` after a tool call. The
    next iteration regenerates with tool results in history. BOUNDARY:
    reset cursor.
  - Line 4501: emitted at metadata-yield after normal streaming. Stream
    is about to end, no new iteration follows. NOT a boundary.
  - Line 4584: emitted in DRAINING-no-tool-call fallback path. Stream
    is about to end with buffered content_accum. NOT a boundary.
  - Line 4794: emitted at the final exit of the generator. NOT a
    boundary.

Treating all five as boundaries gave every Anthropic-streaming response
an extra content_block_stop + content_block_start pair around its final
text and around every tool call.

Fix by tagging the two real boundary sites with `"boundary": True` and
tightening both the Anthropic emitter (`anthropic_compat.py`) and the
OpenAI-compat tool path + Anthropic non-streaming path
(`routes/inference.py`) to reset the cumulative-text cursor only when
that flag is set. Plain empty-status events keep their existing badge-
clear semantics on the frontend (`tool_status` SSE with content "").

Add two regression tests in
`backend/tests/test_anthropic_messages.py::TestAnthropicStreamEmitter`:

  - test_boundary_flag_closes_block_and_resets_cursor: a boundary=True
    status closes the open text block and the next content delta
    streams from zero.
  - test_empty_status_without_boundary_does_not_close_block: a plain
    empty status leaves block_index unchanged and the next content
    delta is diffed against the previous text length.

107 tests pass across the four anthropic + trailing-plan test files.
---
 .../core/inference/anthropic_compat.py        | 26 ++++++-----
 studio/backend/core/inference/llama_cpp.py    | 11 +++--
 studio/backend/routes/inference.py            | 21 +++++----
 .../backend/tests/test_anthropic_messages.py  | 43 +++++++++++++++++++
 4 files changed, 80 insertions(+), 21 deletions(-)

diff --git a/studio/backend/core/inference/anthropic_compat.py b/studio/backend/core/inference/anthropic_compat.py
index 8e1c939e55..6f83fd54e3 100644
--- a/studio/backend/core/inference/anthropic_compat.py
+++ b/studio/backend/core/inference/anthropic_compat.py
@@ -253,17 +253,23 @@ def feed(self, event: dict) -> list[str]:
         elif etype == "metadata":
             self._usage = event.get("usage", {})
             return []
-        elif etype == "status" and not event.get("text"):
-            # Auto-continue boundary marker emitted by
-            # generate_chat_completion_with_tools — the next "content"
-            # event resets to a fresh cumulative baseline, so close any
-            # open text block and clear the diff cursor. Without this
-            # the next continuation gets diffed against the previous
-            # turn's length (shorter continuations are dropped, longer
-            # ones lose their prefix).
+        elif etype == "status" and event.get("boundary"):
+            # Iteration-boundary marker emitted by
+            # generate_chat_completion_with_tools when a fresh model
+            # turn is about to begin (after an auto-continue re-prompt
+            # or after a tool result). The next "content" event resets
+            # to a fresh cumulative baseline, so we close any open text
+            # block and clear the diff cursor. Without this the next
+            # continuation gets diffed against the previous turn's
+            # length (shorter continuations are dropped, longer ones
+            # lose their prefix). Non-boundary empty status events
+            # (UI badge clears at normal completion, draining-no-tool
+            # fallbacks, final stream-end yields) do NOT reach this
+            # branch and so do NOT produce spurious extra
+            # content_block_start/stop pairs.
             return self._handle_boundary()
-        # Other status events (tool progress text) have no Anthropic
-        # equivalent.
+        # Other status events (tool progress text, non-boundary badge
+        # clears) have no Anthropic equivalent.
         return []
 
     def _handle_boundary(self) -> list[str]:
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index a1b7734301..afb3c4a428 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -4485,7 +4485,10 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                             _it_r = _iter_timings or {}
                             _accumulated_predicted_ms += _it_r.get("predicted_ms", 0)
                             _accumulated_predicted_n += _it_r.get("predicted_n", 0)
-                            yield {"type": "status", "text": ""}
+                            # boundary=True: the next agentic iteration
+                            # starts a fresh assistant turn, so adapters
+                            # must reset their cumulative cursor here.
+                            yield {"type": "status", "text": "", "boundary": True}
                             continue
 
                         # Content was already streamed.  Yield metadata.
@@ -4753,8 +4756,10 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                         tool_msg["tool_call_id"] = tool_call_id
                     conversation.append(tool_msg)
 
-                # Clear tool status badge before next generation iteration
-                yield {"type": "status", "text": ""}
+                # Clear tool status badge before next generation iteration.
+                # boundary=True: the model is about to start a fresh turn
+                # so cumulative-text adapters must reset their cursor.
+                yield {"type": "status", "text": "", "boundary": True}
                 # Continue the loop to let model respond with context
                 continue
 
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 6b4c2f0268..24b08f031b 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -2459,11 +2459,15 @@ async def gguf_tool_stream():
                             break
 
                         if event["type"] == "status":
-                            # Empty status marks an iteration boundary
-                            # in the GGUF tool loop (e.g. after a
-                            # re-prompt).  Reset the cumulative cursor
-                            # so the next assistant turn streams cleanly.
-                            if not event["text"]:
+                            # boundary=True flags a true iteration
+                            # boundary (auto-continue re-prompt or
+                            # post-tool resume). Reset the cumulative
+                            # cursor only then; non-boundary empty
+                            # status events (UI badge clears at normal
+                            # stream end) keep the existing cursor so
+                            # we do not spuriously re-emit a duplicate
+                            # prefix on the next "content" yield.
+                            if event.get("boundary"):
                                 prev_text = ""
                             # Emit tool status as a custom SSE event
                             # (including empty ones to clear UI badges)
@@ -4370,12 +4374,13 @@ async def _anthropic_tool_non_streaming(run_gen, message_id, model_name):
             )
         elif etype == "tool_end":
             prev_text = ""
-        elif etype == "status" and not event.get("text"):
-            # Auto-continue boundary marker: the next content event
+        elif etype == "status" and event.get("boundary"):
+            # Iteration-boundary marker: the next content event
             # restarts the cumulative diff baseline, so reset prev_text
             # the same way tool_end does. Without this a shorter
             # continuation gets dropped entirely and a longer one
-            # loses its prefix.
+            # loses its prefix. Plain empty-status events (UI badge
+            # clears at normal stream end) do not match this branch.
             prev_text = ""
         elif etype == "metadata":
             usage = event.get("usage", {})
diff --git a/studio/backend/tests/test_anthropic_messages.py b/studio/backend/tests/test_anthropic_messages.py
index 0825ef9337..e3a5bc6358 100644
--- a/studio/backend/tests/test_anthropic_messages.py
+++ b/studio/backend/tests/test_anthropic_messages.py
@@ -629,6 +629,49 @@ def test_text_after_tool_resets_prev_text(self):
         parsed = json.loads(events[0].split("data: ")[1])
         assert parsed["delta"]["text"] == "After tool"
 
+    def test_boundary_flag_closes_block_and_resets_cursor(self):
+        """An iteration-boundary status (boundary=True) must close the
+        open text block, open a fresh one, and reset _prev_text so the
+        next content delta starts from zero."""
+
+        e = AnthropicStreamEmitter()
+        e.start("msg_1", "m")
+        e.feed({"type": "content", "text": "first turn"})
+        boundary = e.feed({"type": "status", "text": "", "boundary": True})
+        # Boundary must produce content_block_stop + content_block_start
+        # so the next text lives in a new block.
+        joined = "\n".join(boundary)
+        assert "content_block_stop" in joined
+        assert "content_block_start" in joined
+        # Next content delta must include the full "second turn", not a
+        # diff against the previous turn's length.
+        nxt = e.feed({"type": "content", "text": "second turn"})
+        parsed = json.loads(nxt[0].split("data: ")[1])
+        assert parsed["delta"]["text"] == "second turn"
+
+    def test_empty_status_without_boundary_does_not_close_block(self):
+        """A non-boundary empty-status event (UI badge clear at normal
+        stream end, draining fallbacks, final status yields in
+        llama_cpp.py at lines 4501, 4584, 4794) must NOT close the
+        current text block or reset _prev_text - otherwise every normal
+        Anthropic response gets extra content_block_start/stop pairs
+        around its final text. Regression test for PR 5549 codex P2."""
+
+        e = AnthropicStreamEmitter()
+        e.start("msg_1", "m")
+        block_before = e.block_index
+        e.feed({"type": "content", "text": "hello "})
+        # Plain empty status (no boundary flag) -> no extra SSE events.
+        out = e.feed({"type": "status", "text": ""})
+        assert out == []
+        # block_index must not have advanced (no close+reopen happened).
+        assert e.block_index == block_before
+        # Next content delta is diffed against "hello ", so we only emit
+        # "world" (the new suffix).
+        nxt = e.feed({"type": "content", "text": "hello world"})
+        parsed = json.loads(nxt[0].split("data: ")[1])
+        assert parsed["delta"]["text"] == "world"
+
 
 # =====================================================================
 # Pass-through emitter tests (client-side tool execution path)

From bb33346642bddcfcb94c9ff13a724269fb734c62 Mon Sep 17 00:00:00 2001
From: danielhanchen <michaelhan2050@gmail.com>
Date: Tue, 19 May 2026 02:57:56 +0000
Subject: [PATCH 09/11] studio: drop redundant boundary flag on post-tool
 status, do reset at tool_end

Codex 02:42Z on #5549 caught that flagging the post-tool empty-status
event with boundary=True (cycle-12 commit 610c387) double-handles the
cursor reset in the Anthropic streaming path.

AnthropicStreamEmitter._handle_tool_end already:
  - closes the open tool_use block,
  - emits tool_result,
  - increments block_index,
  - opens a fresh text block, and
  - resets _prev_text = "".

When llama_cpp.py then yielded boundary=True on the very next event,
_handle_boundary fired _close_block + _open_text_block on that freshly
opened (still-empty) text block. Result: every tool call produced a
spurious content_block_stop + content_block_start pair before the
post-tool model text streamed.

Fix:

  - llama_cpp.py: drop boundary=True from the post-tool status emit
    (line ~4768). Keep boundary=True only at the auto-continue site
    (line ~4500), which has no preceding tool_end to do the cursor
    work.
  - routes/inference.py OpenAI-compat tool stream: mirror the
    Anthropic semantics by resetting prev_text on BOTH tool_start AND
    tool_end, so the post-tool empty-status no longer needs to do it.

Add backend/tests/test_anthropic_messages.py::TestAnthropicStreamEmitter::
test_post_tool_empty_status_does_not_double_close as a regression
test: content -> tool_start -> tool_end -> empty status -> content
must not bump block_index past tool_end's increment, and the post-tool
content must land in the text block tool_end opened.

95 tests pass across the anthropic + trailing-plan suites.
---
 studio/backend/core/inference/llama_cpp.py    | 12 ++++--
 studio/backend/routes/inference.py            | 14 ++++++-
 .../backend/tests/test_anthropic_messages.py  | 41 +++++++++++++++++++
 3 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index afb3c4a428..44e12c2b32 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -4757,9 +4757,15 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                     conversation.append(tool_msg)
 
                 # Clear tool status badge before next generation iteration.
-                # boundary=True: the model is about to start a fresh turn
-                # so cumulative-text adapters must reset their cursor.
-                yield {"type": "status", "text": "", "boundary": True}
+                # We do NOT mark this as a boundary: the preceding
+                # tool_end event already opened a fresh text block and
+                # reset the cumulative cursor in Anthropic streaming
+                # (see AnthropicStreamEmitter._handle_tool_end), and the
+                # OpenAI-compat path resets prev_text on tool_start. A
+                # second boundary here would close the freshly opened
+                # text block (empty) and reopen it, producing a spurious
+                # content_block_stop/start pair on every tool call.
+                yield {"type": "status", "text": ""}
                 # Continue the loop to let model respond with context
                 continue
 
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 24b08f031b..cb86b3bd7d 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -2481,8 +2481,18 @@ async def gguf_tool_stream():
                             continue
 
                         if event["type"] in ("tool_start", "tool_end"):
-                            if event["type"] == "tool_start":
-                                prev_text = ""
+                            # Both endpoints of a tool call begin a fresh
+                            # cumulative-text window: tool_start because
+                            # the model's next visible content restarts
+                            # cumulative-from-zero in the post-tool turn,
+                            # and tool_end because after we emit the
+                            # tool result, the model's next iteration
+                            # produces its own fresh cumulative stream
+                            # (the post-tool empty-status event is just
+                            # a UI badge clear; it does NOT carry a
+                            # boundary flag, so this is the place to
+                            # reset prev_text on tool_end).
+                            prev_text = ""
                             yield f"data: {json.dumps(event)}\n\n"
                             continue
 
diff --git a/studio/backend/tests/test_anthropic_messages.py b/studio/backend/tests/test_anthropic_messages.py
index e3a5bc6358..31077b996b 100644
--- a/studio/backend/tests/test_anthropic_messages.py
+++ b/studio/backend/tests/test_anthropic_messages.py
@@ -649,6 +649,47 @@ def test_boundary_flag_closes_block_and_resets_cursor(self):
         parsed = json.loads(nxt[0].split("data: ")[1])
         assert parsed["delta"]["text"] == "second turn"
 
+    def test_post_tool_empty_status_does_not_double_close(self):
+        """After tool_end already opens a fresh text block, the post-tool
+        empty-status event emitted by llama_cpp.py (line 4766) must NOT
+        close that fresh block. Otherwise every tool call produces a
+        spurious empty content_block_stop + content_block_start pair
+        before the model's post-tool text arrives. Regression test for
+        PR 5549 codex 02:42Z."""
+
+        e = AnthropicStreamEmitter()
+        e.start("msg_1", "m")
+        e.feed({"type": "content", "text": "pre"})
+        e.feed(
+            {
+                "type": "tool_start",
+                "tool_name": "t",
+                "tool_call_id": "tc_1",
+                "arguments": {},
+            }
+        )
+        e.feed(
+            {
+                "type": "tool_end",
+                "tool_name": "t",
+                "tool_call_id": "tc_1",
+                "result": "ok",
+            }
+        )
+        block_after_tool = e.block_index
+        # Post-tool empty status (no boundary flag): should produce zero
+        # SSE events and leave block_index unchanged. The previous
+        # behaviour was to close+reopen, which produced a duplicate
+        # empty content block.
+        out = e.feed({"type": "status", "text": ""})
+        assert out == []
+        assert e.block_index == block_after_tool
+        # Next content delta lands in the same fresh text block that
+        # tool_end opened.
+        nxt = e.feed({"type": "content", "text": "post"})
+        parsed = json.loads(nxt[0].split("data: ")[1])
+        assert parsed["delta"]["text"] == "post"
+
     def test_empty_status_without_boundary_does_not_close_block(self):
         """A non-boundary empty-status event (UI badge clear at normal
         stream end, draining fallbacks, final status yields in

From a1d79b761e3c198d6a4c7ca307b664298892c811 Mon Sep 17 00:00:00 2001
From: danielhanchen <michaelhan2050@gmail.com>
Date: Tue, 19 May 2026 03:58:18 +0000
Subject: [PATCH 10/11] studio: trim verbose comments in auto-continue +
 boundary plumbing

---
 .../core/inference/anthropic_compat.py        | 20 ++---
 studio/backend/core/inference/llama_cpp.py    | 75 +++++--------------
 studio/backend/routes/inference.py            | 32 ++------
 3 files changed, 32 insertions(+), 95 deletions(-)

diff --git a/studio/backend/core/inference/anthropic_compat.py b/studio/backend/core/inference/anthropic_compat.py
index 6f83fd54e3..c60ff1ed16 100644
--- a/studio/backend/core/inference/anthropic_compat.py
+++ b/studio/backend/core/inference/anthropic_compat.py
@@ -254,22 +254,12 @@ def feed(self, event: dict) -> list[str]:
             self._usage = event.get("usage", {})
             return []
         elif etype == "status" and event.get("boundary"):
-            # Iteration-boundary marker emitted by
-            # generate_chat_completion_with_tools when a fresh model
-            # turn is about to begin (after an auto-continue re-prompt
-            # or after a tool result). The next "content" event resets
-            # to a fresh cumulative baseline, so we close any open text
-            # block and clear the diff cursor. Without this the next
-            # continuation gets diffed against the previous turn's
-            # length (shorter continuations are dropped, longer ones
-            # lose their prefix). Non-boundary empty status events
-            # (UI badge clears at normal completion, draining-no-tool
-            # fallbacks, final stream-end yields) do NOT reach this
-            # branch and so do NOT produce spurious extra
-            # content_block_start/stop pairs.
+            # Iteration-boundary marker (auto-continue reprompt). Close
+            # the open text block + reset _prev_text so the next content
+            # event diffs against zero. Non-boundary status events (UI
+            # badge clears) don't reach this branch.
             return self._handle_boundary()
-        # Other status events (tool progress text, non-boundary badge
-        # clears) have no Anthropic equivalent.
+        # Other status events have no Anthropic equivalent.
         return []
 
     def _handle_boundary(self) -> list[str]:
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 44e12c2b32..f6facf02d3 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -57,15 +57,9 @@
 )
 _MAX_REPROMPTS = 3
 
-# Mid-plan EOS detectors. Auto-continue runs alongside the _INTENT_SIGNAL
-# re-prompt path but stays neutral ("Continue.") instead of tool-coercive,
-# so it also handles tool-less turns. Three observed shapes:
-#   1. Trailing intent at end of buffer: "Let me clone the repo."
-#   2. Numbered or bulleted list under a "Let me ...:" header.
-#   3. Bare trailing colon: "Let me check the repo:"
-# "let me know" is a closing phrase, not a mid-plan signal; exclude it via
-# negative lookahead so a final "If you need anything else, let me know."
-# does not trigger spurious auto-continues.
+# Mid-plan EOS detectors. Three shapes: trailing intent, list under a
+# "Let me ...:" header, bare trailing colon. "let me know" is a closer,
+# not a plan signal -- excluded via negative lookahead.
 _TRAILING_PLAN_INTENT = re.compile(
     r"(?i)("
     r"let me(?!\s+know\b)|now let me|i['’]ll now|next,?\s*i['’]ll|"
@@ -73,17 +67,10 @@
     r")[^.!?\n]*[.!?]?\s*$"
 )
 _TRAILING_PLAN_LIST = re.compile(
-    # No `m` flag: the trailing anchor must match end-of-string, not
-    # end-of-line. With `m` an answer like "1. one\n2. two\n\nDone." would
-    # still match on the list block and trigger a spurious auto-continue.
-    #
-    # Anchors below also defend against a subtler backtrack: each list
-    # item starts with `[ \t]*` (horizontal whitespace only, NOT `\s*`)
-    # so the marker must sit on its own line, not be picked up mid-prose
-    # via a substring like "42." inside a sentence such as
-    # "Let me explain:\n1. The function returns 42.\n\nThat's the answer."
-    # The item content ends with `(?:\n|\Z)` so the iteration boundary
-    # is a real line break or end-of-buffer.
+    # `\Z` (not `$`) so the regex only fires when the list is the last
+    # thing in the buffer, not when a closing paragraph follows.
+    # `[ \t]*` before each marker keeps it line-anchored so a mid-prose
+    # "42." cannot pose as a phantom list item.
     r"(?i)"
     r"(?:let me|i['’]ll|i will|i['’]m going to|i am going to|"
     r"here['’]?s (?:my |the |a )?(?:plan|approach|steps?)|"
@@ -102,11 +89,7 @@
 
 
 def _trailing_plan_hit(stripped: str) -> bool:
-    """True if the last `_TRAILING_PLAN_WINDOW` chars look mid-plan.
-
-    The window covers both single-line trailing intent and list endings
-    where the intent cue is several lines above the last item.
-    """
+    """True if the last `_TRAILING_PLAN_WINDOW` chars look mid-plan."""
     if not stripped:
         return False
     tail = stripped[-_TRAILING_PLAN_WINDOW:]
@@ -4078,18 +4061,13 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
         # direct answer like "4" or "Hello!" will not match.
         # Pattern is compiled once at module level (_INTENT_SIGNAL).
         _reprompt_count = 0
-        # Auto-continue (mid-plan EOS) uses its own counter so it does
-        # not steal the tool-coercive re-prompt budget.
+        # Separate counter so auto-continue doesn't steal reprompt budget.
         _continue_count = 0
 
-        # Grant headroom for re-prompts and continues only as they're
-        # actually consumed, so the caller's tool-iteration cap is
-        # respected end-to-end. Each consumed re-prompt or continue
-        # raises the effective cap by exactly one slot, so the total
-        # never exceeds max_tool_iterations + reprompts + continues
-        # (bounded above by max_tool_iterations + _MAX_REPROMPTS +
-        # _MAX_CONTINUES). itertools.count keeps `continue` semantics
-        # intact in the loop body below.
+        # Dynamic cap: caller's max_tool_iterations is honored exactly
+        # until a reprompt/continue actually fires; each consumed event
+        # earns its own slot back. itertools.count preserves loop-body
+        # `continue` semantics.
         for iteration in _itertools.count():
             if iteration >= (max_tool_iterations + _reprompt_count + _continue_count):
                 break
@@ -4426,19 +4404,14 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                         if not _stripped:
                             _stripped = reasoning_accum.strip()
 
-                        # Tool-coercive re-prompt fires when there are
-                        # tools and the model wrote intent text without
-                        # invoking one.
+                        # Tool-coercive reprompt: intent text without a tool call.
                         _tool_intent_hit = (
                             tools
                             and _reprompt_count < _MAX_REPROMPTS
                             and 0 < len(_stripped) < _REPROMPT_MAX_CHARS
                             and _INTENT_SIGNAL.search(_stripped) is not None
                         )
-                        # Neutral auto-continue fires when the model
-                        # stops mid-plan with a trailing intent cue,
-                        # numbered/bulleted list, or bare colon. Works
-                        # with or without tools, on any response length.
+                        # Neutral auto-continue on mid-plan EOS. Works without tools.
                         _trailing_hit = (
                             _continue_count < _MAX_CONTINUES
                             and _trailing_plan_hit(_stripped)
@@ -4485,9 +4458,8 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                             _it_r = _iter_timings or {}
                             _accumulated_predicted_ms += _it_r.get("predicted_ms", 0)
                             _accumulated_predicted_n += _it_r.get("predicted_n", 0)
-                            # boundary=True: the next agentic iteration
-                            # starts a fresh assistant turn, so adapters
-                            # must reset their cumulative cursor here.
+                            # boundary=True: next iter starts a fresh
+                            # turn, so adapters must reset their cursor.
                             yield {"type": "status", "text": "", "boundary": True}
                             continue
 
@@ -4756,17 +4728,10 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                         tool_msg["tool_call_id"] = tool_call_id
                     conversation.append(tool_msg)
 
-                # Clear tool status badge before next generation iteration.
-                # We do NOT mark this as a boundary: the preceding
-                # tool_end event already opened a fresh text block and
-                # reset the cumulative cursor in Anthropic streaming
-                # (see AnthropicStreamEmitter._handle_tool_end), and the
-                # OpenAI-compat path resets prev_text on tool_start. A
-                # second boundary here would close the freshly opened
-                # text block (empty) and reopen it, producing a spurious
-                # content_block_stop/start pair on every tool call.
+                # UI badge clear. NOT a boundary: tool_end already
+                # reset adapter cursors (would emit a spurious empty
+                # block if we flagged it).
                 yield {"type": "status", "text": ""}
-                # Continue the loop to let model respond with context
                 continue
 
             except httpx.ConnectError:
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index cb86b3bd7d..d170c3eda9 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -2459,14 +2459,9 @@ async def gguf_tool_stream():
                             break
 
                         if event["type"] == "status":
-                            # boundary=True flags a true iteration
-                            # boundary (auto-continue re-prompt or
-                            # post-tool resume). Reset the cumulative
-                            # cursor only then; non-boundary empty
-                            # status events (UI badge clears at normal
-                            # stream end) keep the existing cursor so
-                            # we do not spuriously re-emit a duplicate
-                            # prefix on the next "content" yield.
+                            # boundary=True: auto-continue reprompt.
+                            # Reset cursor only then; plain empty-status
+                            # events (badge clears) keep the cursor.
                             if event.get("boundary"):
                                 prev_text = ""
                             # Emit tool status as a custom SSE event
@@ -2481,17 +2476,9 @@ async def gguf_tool_stream():
                             continue
 
                         if event["type"] in ("tool_start", "tool_end"):
-                            # Both endpoints of a tool call begin a fresh
-                            # cumulative-text window: tool_start because
-                            # the model's next visible content restarts
-                            # cumulative-from-zero in the post-tool turn,
-                            # and tool_end because after we emit the
-                            # tool result, the model's next iteration
-                            # produces its own fresh cumulative stream
-                            # (the post-tool empty-status event is just
-                            # a UI badge clear; it does NOT carry a
-                            # boundary flag, so this is the place to
-                            # reset prev_text on tool_end).
+                            # Both edges of a tool call restart cumulative
+                            # text: tool_start opens a new stream, tool_end
+                            # is the cursor reset for the post-tool turn.
                             prev_text = ""
                             yield f"data: {json.dumps(event)}\n\n"
                             continue
@@ -4385,12 +4372,7 @@ async def _anthropic_tool_non_streaming(run_gen, message_id, model_name):
         elif etype == "tool_end":
             prev_text = ""
         elif etype == "status" and event.get("boundary"):
-            # Iteration-boundary marker: the next content event
-            # restarts the cumulative diff baseline, so reset prev_text
-            # the same way tool_end does. Without this a shorter
-            # continuation gets dropped entirely and a longer one
-            # loses its prefix. Plain empty-status events (UI badge
-            # clears at normal stream end) do not match this branch.
+            # Iteration-boundary marker: reset like tool_end does.
             prev_text = ""
         elif etype == "metadata":
             usage = event.get("usage", {})

From 5afd6c184167eaa9eabf12363547147b751c2414 Mon Sep 17 00:00:00 2001
From: danielhanchen <michaelhan2050@gmail.com>
Date: Tue, 19 May 2026 05:36:27 +0000
Subject: [PATCH 11/11] studio: drop "now let me know" false positive + lazy
 text-block on boundary

PR 5549 cycle-15 codex review surfaced two P2 issues:

1. Trailing-plan regex matched "Now let me know if you want another
   example" (the closer "let me know" was negative-lookahead-guarded
   only on the bare alternative, not on "now let me"). Refactor to a
   single `(?:now\s+)?let me(?!\s+know\b)` alternative so both
   prefixes share the same guard.

2. Anthropic adapter opened a new text block immediately on the
   iteration boundary marker. If the reprompted iteration started
   with a tool_call rather than text, that just-opened text block was
   immediately closed, emitting a zero-length text content block
   between intent text and tool_use. Defer the open: close the
   current block + bump block_index + reset cursor at the boundary,
   and let `_handle_content` lazy-open when real text arrives.
---
 studio/backend/core/inference/anthropic_compat.py | 6 +++++-
 studio/backend/core/inference/llama_cpp.py        | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/studio/backend/core/inference/anthropic_compat.py b/studio/backend/core/inference/anthropic_compat.py
index c60ff1ed16..c411db0ab9 100644
--- a/studio/backend/core/inference/anthropic_compat.py
+++ b/studio/backend/core/inference/anthropic_compat.py
@@ -263,11 +263,15 @@ def feed(self, event: dict) -> list[str]:
         return []
 
     def _handle_boundary(self) -> list[str]:
+        # Close the current text block + reset the cumulative cursor.
+        # Do NOT pre-open a new text block here -- if the next event is
+        # a tool_start (not text), the eager-open would emit a zero-length
+        # text content block between intent text and the tool_use.
+        # _handle_content lazy-opens when real text arrives.
         events = []
         if self._text_block_open:
             events.append(self._close_block())
             self.block_index += 1
-            events.extend(self._open_text_block())
         self._prev_text = ""
         return events
 
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index f6facf02d3..40c042fa42 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -62,7 +62,7 @@
 # not a plan signal -- excluded via negative lookahead.
 _TRAILING_PLAN_INTENT = re.compile(
     r"(?i)("
-    r"let me(?!\s+know\b)|now let me|i['’]ll now|next,?\s*i['’]ll|"
+    r"(?:now\s+)?let me(?!\s+know\b)|i['’]ll now|next,?\s*i['’]ll|"
     r"i['’]m going to|i will now|let['’]s now"
     r")[^.!?\n]*[.!?]?\s*$"
 )