Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion studio/backend/core/inference/anthropic_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,9 +253,28 @@ def feed(self, event: dict) -> list[str]:
elif etype == "metadata":
self._usage = event.get("usage", {})
return []
# status events — no Anthropic equivalent
elif etype == "status" and event.get("boundary"):
# Iteration-boundary marker (auto-continue reprompt). Close
# the open text block + reset _prev_text so the next content
# event diffs against zero. Non-boundary status events (UI
# badge clears) don't reach this branch.
return self._handle_boundary()
# Other status events have no Anthropic equivalent.
return []

def _handle_boundary(self) -> list[str]:
# Close the current text block + reset the cumulative cursor.
# Do NOT pre-open a new text block here -- if the next event is
# a tool_start (not text), the eager-open would emit a zero-length
# text content block between intent text and the tool_use.
# _handle_content lazy-opens when real text arrives.
events = []
if self._text_block_open:
events.append(self._close_block())
self.block_index += 1
self._prev_text = ""
return events

def finish(self, stop_reason: str = "end_turn") -> list[str]:
"""Close any open block and emit message_delta + message_stop."""
events = []
Expand Down
118 changes: 95 additions & 23 deletions studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import atexit
import contextlib
import itertools as _itertools
import json
import os
import re
Expand Down Expand Up @@ -56,6 +57,51 @@
)
_MAX_REPROMPTS = 3

# Mid-plan EOS detectors. Three shapes: trailing intent, list under a
# "Let me ...:" header, bare trailing colon. "let me know" is a closer,
# not a plan signal -- excluded via negative lookahead.
_TRAILING_PLAN_INTENT = re.compile(
r"(?i)("
r"(?:now\s+)?let me(?!\s+know\b)|i['’]ll now|next,?\s*i['’]ll|"
r"i['’]m going to|i will now|let['’]s now"
r")[^.!?\n]*[.!?]?\s*$"
)
_TRAILING_PLAN_LIST = re.compile(
# `\Z` (not `$`) so the regex only fires when the list is the last
# thing in the buffer, not when a closing paragraph follows.
# `[ \t]*` before each marker keeps it line-anchored so a mid-prose
# "42." cannot pose as a phantom list item.
r"(?i)"
r"(?:let me|i['’]ll|i will|i['’]m going to|i am going to|"
r"here['’]?s (?:my |the |a )?(?:plan|approach|steps?)|"
r"as follows|the (?:plan|steps?) (?:is|are))"
r"[^:\n]{0,160}:\s*\n"
r"(?:[ \t]*(?:[-*•]|\d+\.)[ \t]+[^\n]+(?:\n|\Z))+"
r"\s*\Z"
)
_TRAILING_PLAN_COLON = re.compile(
r"(?i)(?:let me|i['’]ll|i will|i['’]m going to|i am going to|"
r"now i['’]ll|now i will)"
r"[^\n:]{0,200}:\s*$"
)
_TRAILING_PLAN_WINDOW = 600
_MAX_CONTINUES = 3


def _trailing_plan_hit(stripped: str) -> bool:
"""True if the last `_TRAILING_PLAN_WINDOW` chars look mid-plan."""
if not stripped:
return False
tail = stripped[-_TRAILING_PLAN_WINDOW:]
if _TRAILING_PLAN_INTENT.search(tail) is not None:
return True
if _TRAILING_PLAN_LIST.search(tail) is not None:
return True
if _TRAILING_PLAN_COLON.search(tail) is not None:
return True
return False
Comment on lines +96 to +102

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For improved conciseness and readability, you can use any() with a generator expression to check against the multiple regex patterns. This avoids the series of if statements and combines checks into a single iteration, aligning with repository efficiency guidelines regarding redundant data iterations.

Suggested change
if _TRAILING_PLAN_INTENT.search(tail) is not None:
return True
if _TRAILING_PLAN_LIST.search(tail) is not None:
return True
if _TRAILING_PLAN_COLON.search(tail) is not None:
return True
return False
return any(
regex.search(tail) is not None
for regex in (_TRAILING_PLAN_INTENT, _TRAILING_PLAN_LIST, _TRAILING_PLAN_COLON)
)
References
  1. To improve efficiency, avoid redundant data iterations. Combine checks and transformations into a single loop and return computed values for callers to reuse.



# Without max_tokens, llama-server defaults to n_predict = n_ctx (up to
# 262144 for Qwen3.5), producing many-minute zombie decodes when cancel
# fails. t_max_predict_ms is a wall-clock backstop applied unconditionally,
Expand Down Expand Up @@ -4015,12 +4061,16 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
# direct answer like "4" or "Hello!" will not match.
# Pattern is compiled once at module level (_INTENT_SIGNAL).
_reprompt_count = 0

# Reserve extra iterations for re-prompts so they don't
# consume the caller's tool-call budget. Only add the
# extra slot when tool iterations are actually allowed.
_extra = _MAX_REPROMPTS if max_tool_iterations > 0 else 0
for iteration in range(max_tool_iterations + _extra):
# Separate counter so auto-continue doesn't steal reprompt budget.
_continue_count = 0

# Dynamic cap: caller's max_tool_iterations is honored exactly
# until a reprompt/continue actually fires; each consumed event
# earns its own slot back. itertools.count preserves loop-body
# `continue` semantics.
for iteration in _itertools.count():
if iteration >= (max_tool_iterations + _reprompt_count + _continue_count):
break
if cancel_event is not None and cancel_event.is_set():
return

Expand Down Expand Up @@ -4353,18 +4403,41 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
_stripped = content_accum.strip()
if not _stripped:
_stripped = reasoning_accum.strip()
if (

# Tool-coercive reprompt: intent text without a tool call.
_tool_intent_hit = (
tools
and _reprompt_count < _MAX_REPROMPTS
and 0 < len(_stripped) < _REPROMPT_MAX_CHARS
and _INTENT_SIGNAL.search(_stripped)
):
_reprompt_count += 1
logger.info(
f"Re-prompt {_reprompt_count}/{_MAX_REPROMPTS}: "
f"model responded without calling tools "
f"({len(_stripped)} chars)"
)
and _INTENT_SIGNAL.search(_stripped) is not None
)
# Neutral auto-continue on mid-plan EOS. Works without tools.
_trailing_hit = (
_continue_count < _MAX_CONTINUES
and _trailing_plan_hit(_stripped)
)

if _tool_intent_hit or _trailing_hit:
if _tool_intent_hit:
_reprompt_count += 1
logger.info(
f"Re-prompt {_reprompt_count}/{_MAX_REPROMPTS}: "
f"model responded without calling tools "
f"({len(_stripped)} chars)"
)
_nudge = (
"STOP. Do NOT write code or explain. "
"You MUST call a tool NOW. "
"Call web_search or python immediately."
)
else:
_continue_count += 1
logger.info(
f"Auto-continue {_continue_count}/{_MAX_CONTINUES}: "
f"model ended turn mid-plan "
f"({len(_stripped)} chars)"
)
_nudge = "Continue."

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Reset Anthropic cursors before auto-continued text

For Anthropic Messages requests with tools enabled, this new Continue. branch starts another assistant generation after yielding only a status event; however AnthropicStreamEmitter and _anthropic_tool_non_streaming ignore status and reset their cumulative-text cursor only on tool_end. When auto-continue fires without a tool call, the continuation is diffed against the previous assistant turn length, so shorter/equal continuations are dropped and longer ones lose their prefix. Either emit a boundary the Anthropic adapters handle or reset their cursor on this empty status.

Useful? React with 👍 / 👎.

conversation.append(
{
"role": "assistant",
Expand All @@ -4374,11 +4447,7 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
conversation.append(
{
"role": "user",
"content": (
"STOP. Do NOT write code or explain. "
"You MUST call a tool NOW. "
"Call web_search or python immediately."
),
"content": _nudge,
}
)
# Accumulate tokens and timing from this iteration
Expand All @@ -4389,7 +4458,9 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
_it_r = _iter_timings or {}
_accumulated_predicted_ms += _it_r.get("predicted_ms", 0)
_accumulated_predicted_n += _it_r.get("predicted_n", 0)
yield {"type": "status", "text": ""}
# boundary=True: next iter starts a fresh
# turn, so adapters must reset their cursor.
yield {"type": "status", "text": "", "boundary": True}
continue

# Content was already streamed. Yield metadata.
Expand Down Expand Up @@ -4657,9 +4728,10 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
tool_msg["tool_call_id"] = tool_call_id
conversation.append(tool_msg)

# Clear tool status badge before next generation iteration
# UI badge clear. NOT a boundary: tool_end already
# reset adapter cursors (would emit a spurious empty
# block if we flagged it).
yield {"type": "status", "text": ""}
# Continue the loop to let model respond with context
continue

except httpx.ConnectError:
Expand Down
18 changes: 11 additions & 7 deletions studio/backend/routes/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -2459,11 +2459,10 @@ async def gguf_tool_stream():
break

if event["type"] == "status":
# Empty status marks an iteration boundary
# in the GGUF tool loop (e.g. after a
# re-prompt). Reset the cumulative cursor
# so the next assistant turn streams cleanly.
if not event["text"]:
# boundary=True: auto-continue reprompt.
# Reset cursor only then; plain empty-status
# events (badge clears) keep the cursor.
if event.get("boundary"):
prev_text = ""
# Emit tool status as a custom SSE event
# (including empty ones to clear UI badges)
Expand All @@ -2477,8 +2476,10 @@ async def gguf_tool_stream():
continue

if event["type"] in ("tool_start", "tool_end"):
if event["type"] == "tool_start":
prev_text = ""
# Both edges of a tool call restart cumulative
# text: tool_start opens a new stream, tool_end
# is the cursor reset for the post-tool turn.
prev_text = ""
yield f"data: {json.dumps(event)}\n\n"
continue

Expand Down Expand Up @@ -4370,6 +4371,9 @@ async def _anthropic_tool_non_streaming(run_gen, message_id, model_name):
)
elif etype == "tool_end":
prev_text = ""
elif etype == "status" and event.get("boundary"):
# Iteration-boundary marker: reset like tool_end does.
prev_text = ""
elif etype == "metadata":
usage = event.get("usage", {})

Expand Down
84 changes: 84 additions & 0 deletions studio/backend/tests/test_anthropic_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,90 @@ def test_text_after_tool_resets_prev_text(self):
parsed = json.loads(events[0].split("data: ")[1])
assert parsed["delta"]["text"] == "After tool"

def test_boundary_flag_closes_block_and_resets_cursor(self):
"""An iteration-boundary status (boundary=True) must close the
open text block, open a fresh one, and reset _prev_text so the
next content delta starts from zero."""

e = AnthropicStreamEmitter()
e.start("msg_1", "m")
e.feed({"type": "content", "text": "first turn"})
boundary = e.feed({"type": "status", "text": "", "boundary": True})
# Boundary must produce content_block_stop + content_block_start
# so the next text lives in a new block.
joined = "\n".join(boundary)
assert "content_block_stop" in joined
assert "content_block_start" in joined
# Next content delta must include the full "second turn", not a
# diff against the previous turn's length.
nxt = e.feed({"type": "content", "text": "second turn"})
parsed = json.loads(nxt[0].split("data: ")[1])
assert parsed["delta"]["text"] == "second turn"

def test_post_tool_empty_status_does_not_double_close(self):
"""After tool_end already opens a fresh text block, the post-tool
empty-status event emitted by llama_cpp.py (line 4766) must NOT
close that fresh block. Otherwise every tool call produces a
spurious empty content_block_stop + content_block_start pair
before the model's post-tool text arrives. Regression test for
PR 5549 codex 02:42Z."""

e = AnthropicStreamEmitter()
e.start("msg_1", "m")
e.feed({"type": "content", "text": "pre"})
e.feed(
{
"type": "tool_start",
"tool_name": "t",
"tool_call_id": "tc_1",
"arguments": {},
}
)
e.feed(
{
"type": "tool_end",
"tool_name": "t",
"tool_call_id": "tc_1",
"result": "ok",
}
)
block_after_tool = e.block_index
# Post-tool empty status (no boundary flag): should produce zero
# SSE events and leave block_index unchanged. The previous
# behaviour was to close+reopen, which produced a duplicate
# empty content block.
out = e.feed({"type": "status", "text": ""})
assert out == []
assert e.block_index == block_after_tool
# Next content delta lands in the same fresh text block that
# tool_end opened.
nxt = e.feed({"type": "content", "text": "post"})
parsed = json.loads(nxt[0].split("data: ")[1])
assert parsed["delta"]["text"] == "post"

def test_empty_status_without_boundary_does_not_close_block(self):
"""A non-boundary empty-status event (UI badge clear at normal
stream end, draining fallbacks, final status yields in
llama_cpp.py at lines 4501, 4584, 4794) must NOT close the
current text block or reset _prev_text - otherwise every normal
Anthropic response gets extra content_block_start/stop pairs
around its final text. Regression test for PR 5549 codex P2."""

e = AnthropicStreamEmitter()
e.start("msg_1", "m")
block_before = e.block_index
e.feed({"type": "content", "text": "hello "})
# Plain empty status (no boundary flag) -> no extra SSE events.
out = e.feed({"type": "status", "text": ""})
assert out == []
# block_index must not have advanced (no close+reopen happened).
assert e.block_index == block_before
# Next content delta is diffed against "hello ", so we only emit
# " world" (the new suffix).
nxt = e.feed({"type": "content", "text": "hello world"})
parsed = json.loads(nxt[0].split("data: ")[1])
assert parsed["delta"]["text"] == "world"


# =====================================================================
# Pass-through emitter tests (client-side tool execution path)
Expand Down
Loading
Loading