diff --git a/assistant/src/__tests__/provider-error-scenarios.test.ts b/assistant/src/__tests__/provider-error-scenarios.test.ts index db8da7478c3..b712d59ec22 100644 --- a/assistant/src/__tests__/provider-error-scenarios.test.ts +++ b/assistant/src/__tests__/provider-error-scenarios.test.ts @@ -753,6 +753,117 @@ describe("RetryProvider — streaming response handling", () => { expect(callCount).toBe(2); }); + test("retries transport-aborted stream (Anthropic 'Request was aborted' with no abortReason)", async () => { + let callCount = 0; + const inner: Provider = { + name: "retry-transport-abort", + async sendMessage() { + callCount++; + if (callCount <= 1) { + // Mirrors the ProviderError shape produced by the catch-site in + // providers/anthropic/client.ts when the SDK reports + // ``Anthropic.APIError(status === undefined, message: "Request + // was aborted.")`` and the daemon's AbortController was NOT the + // cause (i.e. abortReason is undefined). Empirically the #1 + // daemon error by a factor of 5x — 1,344 events in 4d on the + // SSE chat path, all of which used to surface as a 45s blank + // screen on the web client via LUM-1431. + throw new ProviderError( + "Anthropic API error: Request was aborted.", + "anthropic", + undefined, + ); + } + return successResponse(); + }, + }; + const provider = new RetryProvider(inner); + await provider.sendMessage(MESSAGES); + expect(callCount).toBe(2); + }); + + test("does NOT retry caller-aborted stream (abortReason set short-circuits retry)", async () => { + let callCount = 0; + const abortError = new ProviderError( + "Anthropic API error: Request was aborted.", + "anthropic", + undefined, + // Tagging abortReason exactly matches what the catch-site does when + // signal.aborted was true at the moment of failure — i.e. the + // daemon (or the user) cancelled the request, not the transport. + // The retry layer must respect this and surface the error + // immediately without consuming retry budget. + { abortReason: "user-cancelled" }, + ); + const inner: Provider = { + name: "caller-abort", + async sendMessage() { + callCount++; + throw abortError; + }, + }; + const provider = new RetryProvider(inner); + await expect(provider.sendMessage(MESSAGES)).rejects.toBe(abortError); + expect(callCount).toBe(1); + }); + + test("does NOT retry inner-timeout stream (deterministic 30min deadline failure)", async () => { + let callCount = 0; + // When the inner streamTimeoutMs fires, the catch-site rewrites the + // message to "Anthropic stream timed out after Xs (inner + // streamTimeoutMs)" instead of "Request was aborted." That rewrite is + // what allows this branch to bypass the transport-abort retry — + // retrying a 30min-deadline failure would almost certainly hit the + // same deadline on the next attempt and waste retry budget. + const innerTimeoutError = new ProviderError( + "Anthropic API error: Anthropic stream timed out after 1800s (inner streamTimeoutMs)", + "anthropic", + undefined, + ); + const inner: Provider = { + name: "inner-timeout", + async sendMessage() { + callCount++; + throw innerTimeoutError; + }, + }; + const provider = new RetryProvider(inner); + await expect(provider.sendMessage(MESSAGES)).rejects.toBe( + innerTimeoutError, + ); + expect(callCount).toBe(1); + }); + + test("does NOT retry OpenAI/Gemini-shaped 'Request was aborted' (no inner-timeout rewrite at those catch-sites)", async () => { + // The OpenAI chat-completions, OpenAI responses, and Gemini catch-sites + // format their errors as `" API error (undefined): Request + // was aborted."` (note the `(undefined)` parenthetical that the + // Anthropic catch-site intentionally omits) and — unlike the Anthropic + // catch-site — they do NOT rewrite their inner-streamTimeoutMs + // deadline failures. A provider-agnostic transport-abort predicate + // would burn three retries on what is by construction a deterministic + // 30-minute deadline failure that will fire again on every attempt. + // Scoping the predicate to the Anthropic message prefix avoids that + // wasted retry budget for non-Anthropic providers until their + // catch-sites grow the same `innerTimeoutFired` distinction. + const openaiAbortError = new ProviderError( + "OpenAI API error (undefined): Request was aborted.", + "openai", + undefined, + ); + let callCount = 0; + const inner: Provider = { + name: "openai-aborted-stream", + async sendMessage() { + callCount++; + throw openaiAbortError; + }, + }; + const provider = new RetryProvider(inner); + await expect(provider.sendMessage(MESSAGES)).rejects.toBe(openaiAbortError); + expect(callCount).toBe(1); + }); + test("events accumulate across retries (each attempt delivers events independently)", async () => { let callCount = 0; const inner: Provider = { diff --git a/assistant/src/providers/retry.ts b/assistant/src/providers/retry.ts index 0234de10fca..df35aa71df2 100644 --- a/assistant/src/providers/retry.ts +++ b/assistant/src/providers/retry.ts @@ -72,6 +72,40 @@ const RETRYABLE_STREAM_PATTERNS = [ */ const RETRYABLE_PROVIDER_MESSAGE_PATTERNS = [/overloaded/i]; +/** + * Patterns that indicate the Anthropic provider SDK reported a transport-level + * abort (TCP close mid-stream, edge LB idle cutoff, Bun fetch deadline) rather + * than a caller-initiated cancellation or inner-timeout deadline. The SDK + * surfaces all three cases as ``Request was aborted`` with ``error.status === + * undefined``; the catch-site in ``providers/anthropic/client.ts`` separates + * them by: + * - tagging caller cancellations with ``abortReason`` (short-circuits in + * {@link isRetryableError} before reaching this predicate) + * - rewriting the inner-timeout message to ``"Anthropic stream timed out + * after Xs (inner streamTimeoutMs)"`` (doesn't start with ``Anthropic API + * error:`` so it falls through to network-error classification) + * - leaving the transport-abort message verbatim as + * ``"Anthropic API error: Request was aborted."`` + * + * Pattern is intentionally anchored to the Anthropic-specific message prefix. + * The OpenAI / Gemini / OpenRouter catch-sites format their errors as + * ``" API error (undefined): Request was aborted."`` (note the + * ``(undefined)`` parenthetical) and — crucially — do **not** rewrite + * inner-timeout failures, so a provider-agnostic ``/request was aborted/i`` + * predicate would erroneously retry their 30-minute deadline failures three + * additional times. Once those catch-sites grow the same + * ``innerTimeoutFired`` distinction the Anthropic one has, the pattern set + * here can be expanded to cover them too. + * + * This is the daemon-side counterpart to the vembda graceful-close behavior + * for upstream disconnects (LUM-1536) — together they collapse the 45 s + * silent-stall window the web client used to observe whenever Anthropic's + * stream was cut mid-token. + */ +const RETRYABLE_TRANSPORT_ABORT_PATTERNS = [ + /^anthropic api error:\s*request was aborted/i, +]; + function isRetryableStreamError(error: unknown): boolean { if (!(error instanceof ProviderError)) return false; if (error.statusCode !== undefined) return false; // has a real HTTP status — not a stream error @@ -84,6 +118,15 @@ function isRetryableProviderMessage(error: unknown): boolean { return RETRYABLE_PROVIDER_MESSAGE_PATTERNS.some((p) => p.test(error.message)); } +function isRetryableTransportAbort(error: unknown): boolean { + if (!(error instanceof ProviderError)) return false; + // Transport aborts surface with ``status === undefined`` (the SDK never + // saw an HTTP response). A real HTTP status here means a server error, + // which is handled by the status check. + if (error.statusCode !== undefined) return false; + return RETRYABLE_TRANSPORT_ABORT_PATTERNS.some((p) => p.test(error.message)); +} + function isRetryableError(error: unknown): boolean { // Context overflow is deterministic — retrying the same oversized prompt // will never succeed. Short-circuit before the generic 429/5xx check so @@ -103,6 +146,7 @@ function isRetryableError(error: unknown): boolean { } if (isRetryableProviderMessage(error)) return true; if (isRetryableStreamError(error)) return true; + if (isRetryableTransportAbort(error)) return true; return isRetryableNetworkError(error); } @@ -482,7 +526,9 @@ export class RetryProvider implements Provider { ? "provider_overloaded" : isRetryableStreamError(error) ? "stream_corruption" - : "network_error"; + : isRetryableTransportAbort(error) + ? "transport_abort" + : "network_error"; log.warn( { attempt: attempt + 1,