From 46feb06f7f28be481c6d9b037c9c2ae401d4f79a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20Emre=20Kabakc=C4=B1?= Date: Wed, 13 May 2026 06:05:40 +0100 Subject: [PATCH] fix(egress-judge): hard per-call timeout that fails closed and feeds the breaker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LLM egress judge was awaited synchronously by the HTTP proxy with no timeout independent of the model client's own transport timeout, so a hung judge call could stall an outbound request indefinitely. Add a per-call timeout (default 8s, configurable via EGRESS_JUDGE_TIMEOUT_MS or EgressJudgeOptions.judgeTimeoutMs) that, on expiry, fails the verdict closed (deny) and counts as a circuit-breaker failure — so a run of timeouts opens the breaker and subsequent calls short-circuit without invoking the model. Verdict cache untouched. --- .../unit/egress-judge-timeout.test.ts | 101 ++++++++++++++++++ .../src/gateway/proxy/egress-judge/judge.ts | 81 +++++++++++--- 2 files changed, 170 insertions(+), 12 deletions(-) create mode 100644 packages/server/src/__tests__/unit/egress-judge-timeout.test.ts diff --git a/packages/server/src/__tests__/unit/egress-judge-timeout.test.ts b/packages/server/src/__tests__/unit/egress-judge-timeout.test.ts new file mode 100644 index 000000000..4ed4ffb28 --- /dev/null +++ b/packages/server/src/__tests__/unit/egress-judge-timeout.test.ts @@ -0,0 +1,101 @@ +/** + * Egress-judge per-call timeout + circuit hygiene. + * + * The judge is awaited synchronously by the HTTP proxy when a `judge`-action + * rule matches, so a hung model call would otherwise stall an outbound + * request indefinitely. These tests pin: (1) a call that hangs past the + * timeout resolves to a deny verdict within ~timeout, and (2) consecutive + * timeouts count as breaker failures, so the circuit opens and later calls + * fail closed without touching the model. + */ +import { describe, expect, test } from "bun:test"; +import type { ResolvedJudgeRule } from "../../gateway/permissions/policy-store.js"; +import { EgressJudge } from "../../gateway/proxy/egress-judge/judge.js"; +import type { + JudgeClient, + JudgeVerdict, +} from "../../gateway/proxy/egress-judge/types.js"; + +class HangingClient implements JudgeClient { + calls = 0; + async judge(): Promise { + this.calls++; + // Never settles — the judge's own timeout must rescue the caller. + return new Promise(() => {}); + } +} + +function rule(overrides: Partial = {}): ResolvedJudgeRule { + return { + judgeName: "default", + policy: "allow only repos the user owns", + policyHash: "policy-hash-1", + ...overrides, + }; +} + +describe("EgressJudge timeout", () => { + test("a hung judge call fails closed within ~timeout", async () => { + const client = new HangingClient(); + const judge = new EgressJudge({ client, judgeTimeoutMs: 30 }); + const started = Date.now(); + const decision = await judge.decide( + { agentId: "agent-a", hostname: "api.github.com" }, + rule() + ); + const elapsed = Date.now() - started; + expect(decision.verdict).toBe("deny"); + expect(decision.source).toBe("judge-error"); + expect(decision.reason).toContain("timed out"); + expect(elapsed).toBeLessThan(500); + expect(client.calls).toBe(1); + }); + + test("consecutive timeouts trip the breaker and stop calling the model", async () => { + const client = new HangingClient(); + const judge = new EgressJudge({ + client, + judgeTimeoutMs: 20, + breakerFailureThreshold: 2, + breakerCooldownMs: 60_000, + }); + // Distinct hostnames so the verdict cache never short-circuits the path. + for (let i = 0; i < 5; i++) { + const decision = await judge.decide( + { agentId: "agent-a", hostname: `h${i}.example.com` }, + rule() + ); + expect(decision.verdict).toBe("deny"); + } + // First two calls time out and count as breaker failures; the breaker + // then opens and the remaining three short-circuit without the model. + expect(client.calls).toBe(2); + + const afterOpen = await judge.decide( + { agentId: "agent-a", hostname: "another.example.com" }, + rule() + ); + expect(afterOpen.verdict).toBe("deny"); + expect(afterOpen.source).toBe("circuit-open"); + expect(client.calls).toBe(2); + }); + + test("EGRESS_JUDGE_TIMEOUT_MS env var sets the default", async () => { + const prev = process.env.EGRESS_JUDGE_TIMEOUT_MS; + process.env.EGRESS_JUDGE_TIMEOUT_MS = "25"; + try { + const client = new HangingClient(); + const judge = new EgressJudge({ client }); + const started = Date.now(); + const decision = await judge.decide( + { agentId: "agent-a", hostname: "api.github.com" }, + rule() + ); + expect(decision.verdict).toBe("deny"); + expect(Date.now() - started).toBeLessThan(500); + } finally { + if (prev === undefined) delete process.env.EGRESS_JUDGE_TIMEOUT_MS; + else process.env.EGRESS_JUDGE_TIMEOUT_MS = prev; + } + }); +}); diff --git a/packages/server/src/gateway/proxy/egress-judge/judge.ts b/packages/server/src/gateway/proxy/egress-judge/judge.ts index 42f443c39..c09a4988d 100644 --- a/packages/server/src/gateway/proxy/egress-judge/judge.ts +++ b/packages/server/src/gateway/proxy/egress-judge/judge.ts @@ -14,6 +14,28 @@ const logger = createLogger("egress-judge"); */ const DEFAULT_JUDGE_MODEL = "claude-haiku-4-5-20251001"; +/** + * Hard ceiling on a single judge call, independent of the model client's + * own transport timeout. On expiry the call is abandoned, the verdict + * fails closed (deny), and the timeout counts as a circuit-breaker failure. + * Overridable via `EGRESS_JUDGE_TIMEOUT_MS` or `EgressJudgeOptions.judgeTimeoutMs`. + */ +const DEFAULT_JUDGE_TIMEOUT_MS = 8_000; + +function envTimeoutMs(): number | undefined { + const raw = process.env.EGRESS_JUDGE_TIMEOUT_MS; + if (!raw) return undefined; + const n = Number(raw); + return Number.isFinite(n) && n > 0 ? n : undefined; +} + +class JudgeTimeoutError extends Error { + constructor(timeoutMs: number) { + super(`Egress judge call exceeded ${timeoutMs}ms`); + this.name = "JudgeTimeoutError"; + } +} + export interface EgressJudgeOptions { client?: JudgeClient; /** Judge model identifier (overridable per-agent via `egressConfig.judgeModel`). */ @@ -26,6 +48,11 @@ export interface EgressJudgeOptions { breakerFailureThreshold?: number; /** Cooldown before the circuit half-opens again. Default: 30s. */ breakerCooldownMs?: number; + /** + * Per-call timeout. Default: `EGRESS_JUDGE_TIMEOUT_MS` env or 8s. On + * expiry the verdict fails closed and the breaker records a failure. + */ + judgeTimeoutMs?: number; } /** @@ -42,6 +69,7 @@ export class EgressJudge { private readonly breaker: CircuitBreaker; private readonly inFlight = new Map>(); private readonly defaultModel: string; + private readonly judgeTimeoutMs: number; private _client: JudgeClient | undefined; constructor(options: EgressJudgeOptions = {}) { @@ -54,6 +82,8 @@ export class EgressJudge { options.breakerCooldownMs ?? 30_000 ); this.defaultModel = options.defaultModel ?? DEFAULT_JUDGE_MODEL; + this.judgeTimeoutMs = + options.judgeTimeoutMs ?? envTimeoutMs() ?? DEFAULT_JUDGE_TIMEOUT_MS; this._client = options.client; } @@ -123,11 +153,13 @@ export class EgressJudge { const started = Date.now(); const model = rule.judgeModel ?? this.defaultModel; try { - const verdict = await this.client.judge({ - model, - systemPrompt: buildSystemPrompt(), - userPrompt: buildUserPrompt({ policy: rule.policy, request }), - }); + const verdict = await this.withTimeout( + this.client.judge({ + model, + systemPrompt: buildSystemPrompt(), + userPrompt: buildUserPrompt({ policy: rule.policy, request }), + }) + ); const latencyMs = Date.now() - started; this.breaker.onSuccess(rule.policyHash); this.cache.set(cacheKey, verdict); @@ -140,15 +172,24 @@ export class EgressJudge { }; } catch (err) { this.breaker.onFailure(rule.policyHash); - logger.error("Egress judge call failed — failing closed", { - policyHash: rule.policyHash, - hostname: request.hostname, - model, - error: err instanceof Error ? err.message : String(err), - }); + const timedOut = err instanceof JudgeTimeoutError; + logger.error( + timedOut + ? "Egress judge call timed out — failing closed" + : "Egress judge call failed — failing closed", + { + policyHash: rule.policyHash, + hostname: request.hostname, + model, + timeoutMs: timedOut ? this.judgeTimeoutMs : undefined, + error: err instanceof Error ? err.message : String(err), + } + ); return { verdict: "deny", - reason: "Judge call failed; request denied", + reason: timedOut + ? "Judge call timed out; request denied" + : "Judge call failed; request denied", source: "judge-error", latencyMs: Date.now() - started, policyHash: rule.policyHash, @@ -156,4 +197,20 @@ export class EgressJudge { }; } } + + /** + * Race the judge call against a hard timeout. The underlying client + * promise is left to settle on its own (we can't cancel it), but the + * caller sees a {@link JudgeTimeoutError} once the deadline passes. + */ + private withTimeout(promise: Promise): Promise { + let timer: ReturnType; + const timeout = new Promise((_, reject) => { + timer = setTimeout( + () => reject(new JudgeTimeoutError(this.judgeTimeoutMs)), + this.judgeTimeoutMs + ); + }); + return Promise.race([promise, timeout]).finally(() => clearTimeout(timer)); + } }