Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions assistant/src/__tests__/provider-error-scenarios.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,117 @@ describe("RetryProvider — streaming response handling", () => {
expect(callCount).toBe(2);
});

test("retries transport-aborted stream (Anthropic 'Request was aborted' with no abortReason)", async () => {
let callCount = 0;
const inner: Provider = {
name: "retry-transport-abort",
async sendMessage() {
callCount++;
if (callCount <= 1) {
// Mirrors the ProviderError shape produced by the catch-site in
// providers/anthropic/client.ts when the SDK reports
// ``Anthropic.APIError(status === undefined, message: "Request
// was aborted.")`` and the daemon's AbortController was NOT the
// cause (i.e. abortReason is undefined). Empirically the #1
// daemon error by a factor of 5x — 1,344 events in 4d on the
// SSE chat path, all of which used to surface as a 45s blank
// screen on the web client via LUM-1431.
throw new ProviderError(
"Anthropic API error: Request was aborted.",
"anthropic",
undefined,
);
}
return successResponse();
},
};
const provider = new RetryProvider(inner);
await provider.sendMessage(MESSAGES);
expect(callCount).toBe(2);
});

test("does NOT retry caller-aborted stream (abortReason set short-circuits retry)", async () => {
let callCount = 0;
const abortError = new ProviderError(
"Anthropic API error: Request was aborted.",
"anthropic",
undefined,
// Tagging abortReason exactly matches what the catch-site does when
// signal.aborted was true at the moment of failure — i.e. the
// daemon (or the user) cancelled the request, not the transport.
// The retry layer must respect this and surface the error
// immediately without consuming retry budget.
{ abortReason: "user-cancelled" },
);
const inner: Provider = {
name: "caller-abort",
async sendMessage() {
callCount++;
throw abortError;
},
};
const provider = new RetryProvider(inner);
await expect(provider.sendMessage(MESSAGES)).rejects.toBe(abortError);
expect(callCount).toBe(1);
});

test("does NOT retry inner-timeout stream (deterministic 30min deadline failure)", async () => {
let callCount = 0;
// When the inner streamTimeoutMs fires, the catch-site rewrites the
// message to "Anthropic stream timed out after Xs (inner
// streamTimeoutMs)" instead of "Request was aborted." That rewrite is
// what allows this branch to bypass the transport-abort retry —
// retrying a 30min-deadline failure would almost certainly hit the
// same deadline on the next attempt and waste retry budget.
const innerTimeoutError = new ProviderError(
"Anthropic API error: Anthropic stream timed out after 1800s (inner streamTimeoutMs)",
"anthropic",
undefined,
);
const inner: Provider = {
name: "inner-timeout",
async sendMessage() {
callCount++;
throw innerTimeoutError;
},
};
const provider = new RetryProvider(inner);
await expect(provider.sendMessage(MESSAGES)).rejects.toBe(
innerTimeoutError,
);
expect(callCount).toBe(1);
});

test("does NOT retry OpenAI/Gemini-shaped 'Request was aborted' (no inner-timeout rewrite at those catch-sites)", async () => {
// The OpenAI chat-completions, OpenAI responses, and Gemini catch-sites
// format their errors as `"<Provider> API error (undefined): Request
// was aborted."` (note the `(undefined)` parenthetical that the
// Anthropic catch-site intentionally omits) and — unlike the Anthropic
// catch-site — they do NOT rewrite their inner-streamTimeoutMs
// deadline failures. A provider-agnostic transport-abort predicate
// would burn three retries on what is by construction a deterministic
// 30-minute deadline failure that will fire again on every attempt.
// Scoping the predicate to the Anthropic message prefix avoids that
// wasted retry budget for non-Anthropic providers until their
// catch-sites grow the same `innerTimeoutFired` distinction.
const openaiAbortError = new ProviderError(
"OpenAI API error (undefined): Request was aborted.",
"openai",
undefined,
);
let callCount = 0;
const inner: Provider = {
name: "openai-aborted-stream",
async sendMessage() {
callCount++;
throw openaiAbortError;
},
};
const provider = new RetryProvider(inner);
await expect(provider.sendMessage(MESSAGES)).rejects.toBe(openaiAbortError);
expect(callCount).toBe(1);
});

test("events accumulate across retries (each attempt delivers events independently)", async () => {
let callCount = 0;
const inner: Provider = {
Expand Down
48 changes: 47 additions & 1 deletion assistant/src/providers/retry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,40 @@ const RETRYABLE_STREAM_PATTERNS = [
*/
const RETRYABLE_PROVIDER_MESSAGE_PATTERNS = [/overloaded/i];

/**
* Patterns that indicate the Anthropic provider SDK reported a transport-level
* abort (TCP close mid-stream, edge LB idle cutoff, Bun fetch deadline) rather
* than a caller-initiated cancellation or inner-timeout deadline. The SDK
* surfaces all three cases as ``Request was aborted`` with ``error.status ===
* undefined``; the catch-site in ``providers/anthropic/client.ts`` separates
* them by:
* - tagging caller cancellations with ``abortReason`` (short-circuits in
* {@link isRetryableError} before reaching this predicate)
* - rewriting the inner-timeout message to ``"Anthropic stream timed out
* after Xs (inner streamTimeoutMs)"`` (doesn't start with ``Anthropic API
* error:`` so it falls through to network-error classification)
* - leaving the transport-abort message verbatim as
* ``"Anthropic API error: Request was aborted."``
*
* Pattern is intentionally anchored to the Anthropic-specific message prefix.
* The OpenAI / Gemini / OpenRouter catch-sites format their errors as
* ``"<Provider> API error (undefined): Request was aborted."`` (note the
* ``(undefined)`` parenthetical) and — crucially — do **not** rewrite
* inner-timeout failures, so a provider-agnostic ``/request was aborted/i``
* predicate would erroneously retry their 30-minute deadline failures three
* additional times. Once those catch-sites grow the same
* ``innerTimeoutFired`` distinction the Anthropic one has, the pattern set
* here can be expanded to cover them too.
*
* This is the daemon-side counterpart to the vembda graceful-close behavior
* for upstream disconnects (LUM-1536) — together they collapse the 45 s
* silent-stall window the web client used to observe whenever Anthropic's
* stream was cut mid-token.
*/
const RETRYABLE_TRANSPORT_ABORT_PATTERNS = [
/^anthropic api error:\s*request was aborted/i,
];

function isRetryableStreamError(error: unknown): boolean {
if (!(error instanceof ProviderError)) return false;
if (error.statusCode !== undefined) return false; // has a real HTTP status — not a stream error
Expand All @@ -84,6 +118,15 @@ function isRetryableProviderMessage(error: unknown): boolean {
return RETRYABLE_PROVIDER_MESSAGE_PATTERNS.some((p) => p.test(error.message));
}

function isRetryableTransportAbort(error: unknown): boolean {
if (!(error instanceof ProviderError)) return false;
// Transport aborts surface with ``status === undefined`` (the SDK never
// saw an HTTP response). A real HTTP status here means a server error,
// which is handled by the status check.
if (error.statusCode !== undefined) return false;
return RETRYABLE_TRANSPORT_ABORT_PATTERNS.some((p) => p.test(error.message));

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid retrying other providers' timeout aborts

When a non-Anthropic provider's own createStreamTimeout() aborts the SDK call, the OpenAI/Gemini catch sites only preserve abortReason from the external caller signal and otherwise wrap non-API aborts as a ProviderError with statusCode === undefined; if that SDK reports the same Request was aborted text, this new provider-agnostic predicate will retry the deterministic 30-minute timeout three more times instead of surfacing it. Anthropic rewrites its inner timeout before this point, but the other providers do not, so this should be scoped to the Anthropic/OpenRouter-Anthropic transport-abort shape or those catch sites should tag/rewrite inner timeouts first.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch — confirmed and fixed in 55ad034.

Walked through the catch-sites:

  • providers/openai/chat-completions-provider.ts:338-382 — computes abortReason from the external signal?.aborted, so when the inner createStreamTimeout fires (timing out timeoutSignal, not signal), abortReason stays undefined. The OpenAI SDK throws APIUserAbortError extends APIError with status === undefined and message === "Request was aborted.", so the catch falls into the OpenAI.APIError branch and throws ProviderError("OpenAI API error (undefined): Request was aborted.", "openai", undefined, {}) — no message rewrite. ❌ would have retried.
  • providers/openai/responses-provider.ts:401-443 — same shape, providerLabel substituted. ❌ would have retried.
  • providers/gemini/client.ts:336-372 — same shape, "Gemini API error (undefined): ...". ❌ would have retried.
  • providers/openrouter/client.ts extends chat-completions-provider, so it inherits the OpenAI catch-site path. ❌ would have retried.
  • providers/anthropic/client.ts:1417-1523 — the only catch-site with the innerTimeoutFired check that rewrites the message to "Anthropic stream timed out after Xs (inner streamTimeoutMs)", ensuring inner-timeout failures don't match "Request was aborted". ✓ safe.

Tightened the pattern to /^anthropic api error:\s*request was aborted/i so it matches only the Anthropic catch-site's verbatim output. Non-Anthropic providers' aborted-stream errors (including their (undefined) parenthetical form) fall through to network-error classification, unchanged from before this PR.

Added a regression test (does NOT retry OpenAI/Gemini-shaped 'Request was aborted') that pins this — feeding "OpenAI API error (undefined): Request was aborted." through RetryProvider confirms callCount === 1 (zero retries).

The right long-term fix is for the OpenAI/Gemini catch-sites to grow the same innerTimeoutFired distinction, at which point the pattern set here can be expanded to cover them. Out of scope for LUM-1537 since (a) there's no telemetry showing transport aborts on those providers in our fleet (1,344/4d is Anthropic-specific), and (b) it requires touching three more provider files plus their tests.

}

function isRetryableError(error: unknown): boolean {
// Context overflow is deterministic — retrying the same oversized prompt
// will never succeed. Short-circuit before the generic 429/5xx check so
Expand All @@ -103,6 +146,7 @@ function isRetryableError(error: unknown): boolean {
}
if (isRetryableProviderMessage(error)) return true;
if (isRetryableStreamError(error)) return true;
if (isRetryableTransportAbort(error)) return true;
return isRetryableNetworkError(error);
}

Expand Down Expand Up @@ -482,7 +526,9 @@ export class RetryProvider implements Provider {
? "provider_overloaded"
: isRetryableStreamError(error)
? "stream_corruption"
: "network_error";
: isRetryableTransportAbort(error)
? "transport_abort"
: "network_error";
log.warn(
{
attempt: attempt + 1,
Expand Down
Loading