Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions packages/cli/src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,21 @@ if (!process.env.CLAUDE_API_KEY && !process.env.CLAUDE_CODE_OAUTH_TOKEN) {
}
}

// Warn if running from inside a Claude Code session. Nested Claude Code
// subprocess launches have a history of hanging (keychain/session/TTY state,
// upstream SDK quirks) even when Archon's subprocess env strip works. Users
// hitting this class of issue should use `archon serve` + web UI / HTTP API
// from a non-nested shell. See coleam00/Archon#1067.
if (process.env.CLAUDECODE === '1' && process.env.ARCHON_SUPPRESS_NESTED_CLAUDE_WARNING !== '1') {
console.warn(
'⚠ Detected CLAUDECODE=1 — you appear to be running `archon` from inside a Claude Code session.\n' +
' If workflows hang silently at dag_node_started, this is a known class of issue.\n' +
' Workaround: run `archon serve` from a regular shell and use the web UI or HTTP API.\n' +
' Details: https://github.com/coleam00/Archon/issues/1067\n' +
' Suppress this warning: export ARCHON_SUPPRESS_NESTED_CLAUDE_WARNING=1'
);
}

// DATABASE_URL is no longer required - SQLite will be used as default

// Import commands after dotenv is loaded
Expand Down
87 changes: 87 additions & 0 deletions packages/core/src/clients/claude.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1105,4 +1105,91 @@ describe('ClaudeClient', () => {
expect(spyScan).not.toHaveBeenCalled();
});
});

describe('first-event timeout', () => {
const originalTimeoutEnv = process.env.ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS;

beforeEach(() => {
process.env.ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS = '50';
});

afterEach(() => {
if (originalTimeoutEnv === undefined) {
delete process.env.ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS;
} else {
process.env.ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS = originalTimeoutEnv;
}
});

test('throws a descriptive error when SDK never yields a first message', async () => {
// Generator that sleeps forever — simulates a hung subprocess
mockQuery.mockImplementation(async function* () {
await new Promise(() => {
// never resolves; the abortController.abort() inside the generator's
// internal loop is what actually unblocks it in production. For the
// test, we simulate that by awaiting a promise that rejects on abort.
});
// unreachable
yield { type: 'assistant', message: { content: [{ type: 'text', text: 'x' }] } };
});

// The for-await loop in sendQuery() will be blocked on the generator's
// first `next()` call. The timeout fires, aborts the controller, and the
// SDK generator is expected to throw or return. Since our mock doesn't
// actually react to abort, we test that sendQuery() itself raises the
// descriptive timeout error. We use a very short timeout (50ms) to keep
// the test fast.
await expect(async () => {
for await (const _ of client.sendQuery('test', '/workspace')) {
// consume — should never produce any chunks
}
}).toThrow(/produced no output within/);

// The diagnostic dump should have been logged at error level
expect(mockLogger.error).toHaveBeenCalledWith(
expect.objectContaining({
timeoutMs: 50,
parentProcess: expect.objectContaining({
platform: process.platform,
}),
}),
'claude.first_event_timeout'
);
});

test('does not throw when SDK yields a message within the timeout', async () => {
mockQuery.mockImplementation(async function* () {
yield {
type: 'assistant',
message: { content: [{ type: 'text', text: 'fast response' }] },
};
});

const chunks = [];
for await (const chunk of client.sendQuery('test', '/workspace')) {
chunks.push(chunk);
}

expect(chunks).toHaveLength(1);
expect(chunks[0]).toEqual({ type: 'assistant', content: 'fast response' });
// The timeout should not have fired
expect(mockLogger.error).not.toHaveBeenCalledWith(
expect.anything(),
'claude.first_event_timeout'
);
});

test('the timeout error mentions issue #1067 for discoverability', async () => {
mockQuery.mockImplementation(async function* () {
await new Promise(() => {});
yield { type: 'assistant', message: { content: [{ type: 'text', text: 'x' }] } };
});

await expect(async () => {
for await (const _ of client.sendQuery('test', '/workspace')) {
// consume
}
}).toThrow(/coleam00\/Archon\/issues\/1067/);
});
});
});
137 changes: 136 additions & 1 deletion packages/core/src/clients/claude.ts
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,98 @@ const MAX_SUBPROCESS_RETRIES = 3;
/** Delay between retries in milliseconds */
const RETRY_BASE_DELAY_MS = 2000;

/** Timeout (ms) for the SDK's *first* yielded message. If the subprocess spawns
* but produces no output within this window, we abort it and emit a diagnostic
* dump instead of hanging forever. Defaults to 60s — normal first events arrive
* in <5s, so 60s catches real hangs without flagging legitimately slow starts.
* Override via ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS. See coleam00/Archon#1067. */
function getFirstEventTimeoutMs(): number {
const raw = process.env.ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS;
if (raw === undefined) return 60_000;
const parsed = Number.parseInt(raw, 10);
if (!Number.isFinite(parsed) || parsed <= 0) {
getLog().warn({ raw }, 'claude.invalid_first_event_timeout_env');
return 60_000;
}
return parsed;
}

/**
* Wrap an async iterable so that its first `next()` call is raced against a
* timeout. On timeout, `onTimeout()` is called to build and throw the error.
* Subsequent iterations are unaffected — only the *first* yield has a deadline.
*
* Uses Promise.race rather than relying on abortController so we unblock even
* if the source ignores abort (the exact pathological case we're defending).
*/
async function* withFirstMessageTimeout<T>(
source: AsyncIterable<T>,
timeoutMs: number,
onTimeout: () => Error
): AsyncGenerator<T> {
const it = source[Symbol.asyncIterator]();
let first = true;
while (true) {
let result: IteratorResult<T>;
if (first) {
first = false;
let timer: ReturnType<typeof setTimeout> | undefined;
try {
result = await Promise.race([
it.next(),
new Promise<never>((_, reject) => {
timer = setTimeout(() => {
reject(onTimeout());
}, timeoutMs);
}),
]);
} finally {
if (timer !== undefined) clearTimeout(timer);
}
} else {
result = await it.next();
}
if (result.done) return;
yield result.value;
}
// No `finally` with `await it.return()` — on the timeout path, the source
// generator is stuck inside an `await` that never resolves (that's literally
// the bug we're defending against). Calling `it.return()` would itself hang
// waiting for the generator to unwind. In production, the SDK's own abort
// handling cleans up; in tests, the stuck generator is GC'd.
}

/** Build a diagnostic snapshot for the hang case. Includes env var names
* (not values — secrets), SDK options (redacted), and process state. Logged
* at error level when the first-event timeout fires so the next reporter has
* actionable evidence instead of 30 minutes of silence. */
function buildFirstEventHangDiagnostics(params: {
cwd: string;
attempt: number;
subprocessEnvKeys: string[];
parentClaudeEnvKeys: string[];
model: string | undefined;
timeoutMs: number;
stderrLines: string[];
}): Record<string, unknown> {
return {
cwd: params.cwd,
attempt: params.attempt,
timeoutMs: params.timeoutMs,
model: params.model,
subprocessEnvKeys: params.subprocessEnvKeys.sort(),
parentClaudeEnvKeys: params.parentClaudeEnvKeys.sort(),
parentProcess: {
platform: process.platform,
uid: getProcessUid(),
isTTY: process.stdout.isTTY ?? false,
claudeCode: process.env.CLAUDECODE ?? null,
claudeCodeEntrypoint: process.env.CLAUDE_CODE_ENTRYPOINT ?? null,
},
stderrTail: params.stderrLines.slice(-20),
};
}

/** Patterns indicating rate limiting in stderr/error messages */
const RATE_LIMIT_PATTERNS = ['rate limit', 'too many requests', '429', 'overloaded'];

Expand Down Expand Up @@ -478,8 +570,43 @@ export class ClaudeClient implements IAssistantClient {
getLog().debug({ cwd, attempt }, 'starting_new_session');
}

// First-event timeout: if the SDK produces no output within
// `firstEventTimeoutMs`, log a diagnostic dump and throw. Uses Promise.race
// on the first iterator.next() so we unblock even if the SDK ignores abort
// (the pathological case we're defending against).
// See coleam00/Archon#1067.
const firstEventTimeoutMs = getFirstEventTimeoutMs();
let firstEventTimedOut = false;
const buildTimeoutError = (): Error => {
firstEventTimedOut = true;
const subprocessEnvKeys = Object.keys(options.env ?? {});
const parentClaudeEnvKeys = Object.keys(process.env).filter(
k => k === 'CLAUDECODE' || k.startsWith('CLAUDE_CODE_') || k.startsWith('ANTHROPIC_')
);
const diag = buildFirstEventHangDiagnostics({
cwd,
attempt,
subprocessEnvKeys,
parentClaudeEnvKeys,
model: requestOptions?.model,
timeoutMs: firstEventTimeoutMs,
stderrLines,
});
getLog().error(diag, 'claude.first_event_timeout');
controller.abort(); // best effort — Promise.race has already unblocked us
return new Error(
`Claude Code subprocess produced no output within ${firstEventTimeoutMs}ms. ` +
"See logs for 'claude.first_event_timeout' diagnostics. " +
'Details: https://github.com/coleam00/Archon/issues/1067'
);
};

try {
for await (const msg of query({ prompt, options })) {
for await (const msg of withFirstMessageTimeout(
query({ prompt, options }),
firstEventTimeoutMs,
buildTimeoutError
)) {
// Drain tool results captured by PostToolUse hook before processing the next message
while (toolResultQueue.length > 0) {
const tr = toolResultQueue.shift();
Expand Down Expand Up @@ -588,6 +715,14 @@ export class ClaudeClient implements IAssistantClient {
} catch (error) {
const err = error as Error;

// First-event timeout fired: the SDK never yielded a message within
// `firstEventTimeoutMs`. `buildTimeoutError()` already built, logged,
// and aborted. Re-throw the error as-is — no retry, because a wedged
// subprocess will produce the same hang next time.
if (firstEventTimedOut) {
throw err;
}

// Don't retry aborted queries
if (controller.signal.aborted) {
throw new Error('Query aborted');
Expand Down
Loading