diff --git a/studio/backend/core/inference/anthropic_compat.py b/studio/backend/core/inference/anthropic_compat.py index 263718c540..c411db0ab9 100644 --- a/studio/backend/core/inference/anthropic_compat.py +++ b/studio/backend/core/inference/anthropic_compat.py @@ -253,9 +253,28 @@ def feed(self, event: dict) -> list[str]: elif etype == "metadata": self._usage = event.get("usage", {}) return [] - # status events — no Anthropic equivalent + elif etype == "status" and event.get("boundary"): + # Iteration-boundary marker (auto-continue reprompt). Close + # the open text block + reset _prev_text so the next content + # event diffs against zero. Non-boundary status events (UI + # badge clears) don't reach this branch. + return self._handle_boundary() + # Other status events have no Anthropic equivalent. return [] + def _handle_boundary(self) -> list[str]: + # Close the current text block + reset the cumulative cursor. + # Do NOT pre-open a new text block here -- if the next event is + # a tool_start (not text), the eager-open would emit a zero-length + # text content block between intent text and the tool_use. + # _handle_content lazy-opens when real text arrives. + events = [] + if self._text_block_open: + events.append(self._close_block()) + self.block_index += 1 + self._prev_text = "" + return events + def finish(self, stop_reason: str = "end_turn") -> list[str]: """Close any open block and emit message_delta + message_stop.""" events = [] diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 21f2fe71b5..40c042fa42 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -10,6 +10,7 @@ import atexit import contextlib +import itertools as _itertools import json import os import re @@ -56,6 +57,51 @@ ) _MAX_REPROMPTS = 3 +# Mid-plan EOS detectors. Three shapes: trailing intent, list under a +# "Let me ...:" header, bare trailing colon. "let me know" is a closer, +# not a plan signal -- excluded via negative lookahead. +_TRAILING_PLAN_INTENT = re.compile( + r"(?i)(" + r"(?:now\s+)?let me(?!\s+know\b)|i['’]ll now|next,?\s*i['’]ll|" + r"i['’]m going to|i will now|let['’]s now" + r")[^.!?\n]*[.!?]?\s*$" +) +_TRAILING_PLAN_LIST = re.compile( + # `\Z` (not `$`) so the regex only fires when the list is the last + # thing in the buffer, not when a closing paragraph follows. + # `[ \t]*` before each marker keeps it line-anchored so a mid-prose + # "42." cannot pose as a phantom list item. + r"(?i)" + r"(?:let me|i['’]ll|i will|i['’]m going to|i am going to|" + r"here['’]?s (?:my |the |a )?(?:plan|approach|steps?)|" + r"as follows|the (?:plan|steps?) (?:is|are))" + r"[^:\n]{0,160}:\s*\n" + r"(?:[ \t]*(?:[-*•]|\d+\.)[ \t]+[^\n]+(?:\n|\Z))+" + r"\s*\Z" +) +_TRAILING_PLAN_COLON = re.compile( + r"(?i)(?:let me|i['’]ll|i will|i['’]m going to|i am going to|" + r"now i['’]ll|now i will)" + r"[^\n:]{0,200}:\s*$" +) +_TRAILING_PLAN_WINDOW = 600 +_MAX_CONTINUES = 3 + + +def _trailing_plan_hit(stripped: str) -> bool: + """True if the last `_TRAILING_PLAN_WINDOW` chars look mid-plan.""" + if not stripped: + return False + tail = stripped[-_TRAILING_PLAN_WINDOW:] + if _TRAILING_PLAN_INTENT.search(tail) is not None: + return True + if _TRAILING_PLAN_LIST.search(tail) is not None: + return True + if _TRAILING_PLAN_COLON.search(tail) is not None: + return True + return False + + # Without max_tokens, llama-server defaults to n_predict = n_ctx (up to # 262144 for Qwen3.5), producing many-minute zombie decodes when cancel # fails. t_max_predict_ms is a wall-clock backstop applied unconditionally, @@ -4015,12 +4061,16 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str: # direct answer like "4" or "Hello!" will not match. # Pattern is compiled once at module level (_INTENT_SIGNAL). _reprompt_count = 0 - - # Reserve extra iterations for re-prompts so they don't - # consume the caller's tool-call budget. Only add the - # extra slot when tool iterations are actually allowed. - _extra = _MAX_REPROMPTS if max_tool_iterations > 0 else 0 - for iteration in range(max_tool_iterations + _extra): + # Separate counter so auto-continue doesn't steal reprompt budget. + _continue_count = 0 + + # Dynamic cap: caller's max_tool_iterations is honored exactly + # until a reprompt/continue actually fires; each consumed event + # earns its own slot back. itertools.count preserves loop-body + # `continue` semantics. + for iteration in _itertools.count(): + if iteration >= (max_tool_iterations + _reprompt_count + _continue_count): + break if cancel_event is not None and cancel_event.is_set(): return @@ -4353,18 +4403,41 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str: _stripped = content_accum.strip() if not _stripped: _stripped = reasoning_accum.strip() - if ( + + # Tool-coercive reprompt: intent text without a tool call. + _tool_intent_hit = ( tools and _reprompt_count < _MAX_REPROMPTS and 0 < len(_stripped) < _REPROMPT_MAX_CHARS - and _INTENT_SIGNAL.search(_stripped) - ): - _reprompt_count += 1 - logger.info( - f"Re-prompt {_reprompt_count}/{_MAX_REPROMPTS}: " - f"model responded without calling tools " - f"({len(_stripped)} chars)" - ) + and _INTENT_SIGNAL.search(_stripped) is not None + ) + # Neutral auto-continue on mid-plan EOS. Works without tools. + _trailing_hit = ( + _continue_count < _MAX_CONTINUES + and _trailing_plan_hit(_stripped) + ) + + if _tool_intent_hit or _trailing_hit: + if _tool_intent_hit: + _reprompt_count += 1 + logger.info( + f"Re-prompt {_reprompt_count}/{_MAX_REPROMPTS}: " + f"model responded without calling tools " + f"({len(_stripped)} chars)" + ) + _nudge = ( + "STOP. Do NOT write code or explain. " + "You MUST call a tool NOW. " + "Call web_search or python immediately." + ) + else: + _continue_count += 1 + logger.info( + f"Auto-continue {_continue_count}/{_MAX_CONTINUES}: " + f"model ended turn mid-plan " + f"({len(_stripped)} chars)" + ) + _nudge = "Continue." conversation.append( { "role": "assistant", @@ -4374,11 +4447,7 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str: conversation.append( { "role": "user", - "content": ( - "STOP. Do NOT write code or explain. " - "You MUST call a tool NOW. " - "Call web_search or python immediately." - ), + "content": _nudge, } ) # Accumulate tokens and timing from this iteration @@ -4389,7 +4458,9 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str: _it_r = _iter_timings or {} _accumulated_predicted_ms += _it_r.get("predicted_ms", 0) _accumulated_predicted_n += _it_r.get("predicted_n", 0) - yield {"type": "status", "text": ""} + # boundary=True: next iter starts a fresh + # turn, so adapters must reset their cursor. + yield {"type": "status", "text": "", "boundary": True} continue # Content was already streamed. Yield metadata. @@ -4657,9 +4728,10 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str: tool_msg["tool_call_id"] = tool_call_id conversation.append(tool_msg) - # Clear tool status badge before next generation iteration + # UI badge clear. NOT a boundary: tool_end already + # reset adapter cursors (would emit a spurious empty + # block if we flagged it). yield {"type": "status", "text": ""} - # Continue the loop to let model respond with context continue except httpx.ConnectError: diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py index 607245467c..d170c3eda9 100644 --- a/studio/backend/routes/inference.py +++ b/studio/backend/routes/inference.py @@ -2459,11 +2459,10 @@ async def gguf_tool_stream(): break if event["type"] == "status": - # Empty status marks an iteration boundary - # in the GGUF tool loop (e.g. after a - # re-prompt). Reset the cumulative cursor - # so the next assistant turn streams cleanly. - if not event["text"]: + # boundary=True: auto-continue reprompt. + # Reset cursor only then; plain empty-status + # events (badge clears) keep the cursor. + if event.get("boundary"): prev_text = "" # Emit tool status as a custom SSE event # (including empty ones to clear UI badges) @@ -2477,8 +2476,10 @@ async def gguf_tool_stream(): continue if event["type"] in ("tool_start", "tool_end"): - if event["type"] == "tool_start": - prev_text = "" + # Both edges of a tool call restart cumulative + # text: tool_start opens a new stream, tool_end + # is the cursor reset for the post-tool turn. + prev_text = "" yield f"data: {json.dumps(event)}\n\n" continue @@ -4370,6 +4371,9 @@ async def _anthropic_tool_non_streaming(run_gen, message_id, model_name): ) elif etype == "tool_end": prev_text = "" + elif etype == "status" and event.get("boundary"): + # Iteration-boundary marker: reset like tool_end does. + prev_text = "" elif etype == "metadata": usage = event.get("usage", {}) diff --git a/studio/backend/tests/test_anthropic_messages.py b/studio/backend/tests/test_anthropic_messages.py index 0825ef9337..31077b996b 100644 --- a/studio/backend/tests/test_anthropic_messages.py +++ b/studio/backend/tests/test_anthropic_messages.py @@ -629,6 +629,90 @@ def test_text_after_tool_resets_prev_text(self): parsed = json.loads(events[0].split("data: ")[1]) assert parsed["delta"]["text"] == "After tool" + def test_boundary_flag_closes_block_and_resets_cursor(self): + """An iteration-boundary status (boundary=True) must close the + open text block, open a fresh one, and reset _prev_text so the + next content delta starts from zero.""" + + e = AnthropicStreamEmitter() + e.start("msg_1", "m") + e.feed({"type": "content", "text": "first turn"}) + boundary = e.feed({"type": "status", "text": "", "boundary": True}) + # Boundary must produce content_block_stop + content_block_start + # so the next text lives in a new block. + joined = "\n".join(boundary) + assert "content_block_stop" in joined + assert "content_block_start" in joined + # Next content delta must include the full "second turn", not a + # diff against the previous turn's length. + nxt = e.feed({"type": "content", "text": "second turn"}) + parsed = json.loads(nxt[0].split("data: ")[1]) + assert parsed["delta"]["text"] == "second turn" + + def test_post_tool_empty_status_does_not_double_close(self): + """After tool_end already opens a fresh text block, the post-tool + empty-status event emitted by llama_cpp.py (line 4766) must NOT + close that fresh block. Otherwise every tool call produces a + spurious empty content_block_stop + content_block_start pair + before the model's post-tool text arrives. Regression test for + PR 5549 codex 02:42Z.""" + + e = AnthropicStreamEmitter() + e.start("msg_1", "m") + e.feed({"type": "content", "text": "pre"}) + e.feed( + { + "type": "tool_start", + "tool_name": "t", + "tool_call_id": "tc_1", + "arguments": {}, + } + ) + e.feed( + { + "type": "tool_end", + "tool_name": "t", + "tool_call_id": "tc_1", + "result": "ok", + } + ) + block_after_tool = e.block_index + # Post-tool empty status (no boundary flag): should produce zero + # SSE events and leave block_index unchanged. The previous + # behaviour was to close+reopen, which produced a duplicate + # empty content block. + out = e.feed({"type": "status", "text": ""}) + assert out == [] + assert e.block_index == block_after_tool + # Next content delta lands in the same fresh text block that + # tool_end opened. + nxt = e.feed({"type": "content", "text": "post"}) + parsed = json.loads(nxt[0].split("data: ")[1]) + assert parsed["delta"]["text"] == "post" + + def test_empty_status_without_boundary_does_not_close_block(self): + """A non-boundary empty-status event (UI badge clear at normal + stream end, draining fallbacks, final status yields in + llama_cpp.py at lines 4501, 4584, 4794) must NOT close the + current text block or reset _prev_text - otherwise every normal + Anthropic response gets extra content_block_start/stop pairs + around its final text. Regression test for PR 5549 codex P2.""" + + e = AnthropicStreamEmitter() + e.start("msg_1", "m") + block_before = e.block_index + e.feed({"type": "content", "text": "hello "}) + # Plain empty status (no boundary flag) -> no extra SSE events. + out = e.feed({"type": "status", "text": ""}) + assert out == [] + # block_index must not have advanced (no close+reopen happened). + assert e.block_index == block_before + # Next content delta is diffed against "hello ", so we only emit + # " world" (the new suffix). + nxt = e.feed({"type": "content", "text": "hello world"}) + parsed = json.loads(nxt[0].split("data: ")[1]) + assert parsed["delta"]["text"] == "world" + # ===================================================================== # Pass-through emitter tests (client-side tool execution path) diff --git a/studio/backend/tests/test_trailing_plan.py b/studio/backend/tests/test_trailing_plan.py new file mode 100644 index 0000000000..d4c5cc7e07 --- /dev/null +++ b/studio/backend/tests/test_trailing_plan.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: AGPL-3.0-only +# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0 + +"""Tests for the mid-plan auto-continue regexes and ``_trailing_plan_hit``. + +The trailing-plan detector lives in ``core.inference.llama_cpp`` and decides +whether the model just stopped mid-plan (and therefore deserves a neutral +``Continue.`` re-prompt). False positives cost real tool calls and latency, +so the patterns get explicit coverage here. + +Bug history pinned by these tests: + +* ``"If you need anything else, let me know."`` matched ``_TRAILING_PLAN_INTENT`` + before the negative lookahead landed. +* ``"Here's my plan:\\n- a\\n- b\\n\\nDone, that should work."`` matched + ``_TRAILING_PLAN_LIST`` before the regex was switched off the ``m`` flag + and re-anchored with ``\\Z``. +""" + +from __future__ import annotations + +import pytest + +from core.inference.llama_cpp import ( + _TRAILING_PLAN_COLON, + _TRAILING_PLAN_INTENT, + _TRAILING_PLAN_LIST, + _trailing_plan_hit, +) + + +# ----- _TRAILING_PLAN_INTENT ------------------------------------------------- + + +@pytest.mark.parametrize( + "text,expected", + [ + # Closing phrases must NOT match (regression: "let me know") + ("If you need anything else, let me know.", False), + ("Let me know if I can help further.", False), + ("let me know!", False), + # Genuine mid-plan intent SHOULD match + ("Let me clone the repo.", True), + ("Let me check the file.", True), + ("Now let me run the tests.", True), + ("I'll now run the analyzer.", True), + ("I’ll now run the analyzer.", True), # curly apostrophe + ("I will now begin.", True), + # Unrelated trailing text must NOT match + ("Hello world.", False), + ("The answer is 42.", False), + ], +) +def test_trailing_plan_intent(text: str, expected: bool) -> None: + assert bool(_TRAILING_PLAN_INTENT.search(text)) is expected + + +# ----- _TRAILING_PLAN_LIST --------------------------------------------------- + + +@pytest.mark.parametrize( + "text,expected", + [ + # List block at end of buffer SHOULD match + ("Let me do this:\n- step one\n- step two\n", True), + ("Here's my plan:\n1. one\n2. two\n", True), + ("Here's my plan:\n1. one\n2. two\n \n", True), # trailing whitespace + # Unicode bullet at end of buffer SHOULD match + ("Let me try:\n• first\n• second\n", True), + # List followed by a closing sentence MUST NOT match (regression: + # the `m` flag in `(?ims)` previously let `\s*$` match end-of-line) + ( + "Here's my plan:\n- step one\n- step two\n\nDone, hope that helps.", + False, + ), + ( + "Let me walk through it:\n1. first\n2. second\n\nThat's everything.", + False, + ), + # Single-item numbered list followed by closing prose MUST NOT match + ( + "Let me explain:\n1. The function returns 42.\n\nThat's the answer.", + False, + ), + # List embedded mid-text (not trailing) MUST NOT match + ( + "Here's my plan:\n- step one\n- step two\nNow the conclusion follows.", + False, + ), + ], +) +def test_trailing_plan_list(text: str, expected: bool) -> None: + assert bool(_TRAILING_PLAN_LIST.search(text)) is expected + + +# ----- _TRAILING_PLAN_COLON -------------------------------------------------- + + +@pytest.mark.parametrize( + "text,expected", + [ + # Bare trailing colon SHOULD match + ("Let me check the repo:", True), + ("I'll now look at this:", True), + # Colon mid-sentence MUST NOT match + ("Let me check this: it should work fine.", False), + # Colon not in an intent-cue clause MUST NOT match + ("The result is:", False), + ], +) +def test_trailing_plan_colon(text: str, expected: bool) -> None: + assert bool(_TRAILING_PLAN_COLON.search(text)) is expected + + +# ----- _trailing_plan_hit composite ----------------------------------------- + + +@pytest.mark.parametrize( + "text,expected", + [ + # Any of the three sub-patterns triggers a hit + ("Now let me run the tests.", True), + ("Let me do this:\n- step one\n- step two\n", True), + ("Let me check the repo:", True), + # Negative cases that previously misfired + ("If you need anything else, let me know.", False), + ( + "Here's my plan:\n- step one\n- step two\n\nDone, hope that helps.", + False, + ), + # Short empty string is a no-op + ("", False), + (" ", False), + ], +) +def test_trailing_plan_hit(text: str, expected: bool) -> None: + assert _trailing_plan_hit(text) is expected + + +# ----- window slicing -------------------------------------------------------- + + +def test_trailing_plan_hit_respects_window() -> None: + """An intent cue further back than ``_TRAILING_PLAN_WINDOW`` must NOT + trigger a hit; only the tail of the response is inspected.""" + + # 800-char prefix of unrelated text, then a finalising sentence. + prefix = "lorem ipsum " * 80 # ~960 chars + text = f"Let me check the repo. {prefix}The result is 42." + assert _trailing_plan_hit(text) is False