From dfbb4acd43fb980e741f7fbaffc52b8513a64454 Mon Sep 17 00:00:00 2001
From: Cole Medin <cole@dynamous.ai>
Date: Sat, 18 Apr 2026 12:19:43 -0500
Subject: [PATCH] fix(workflows): fail loudly on SDK isError results (#1208)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, `dag-executor` only failed nodes/iterations when the SDK
returned an `error_max_budget_usd` result. Every other `isError: true`
subtype — including `error_during_execution` — was silently `break`ed
out of the stream with whatever partial output had accumulated, letting
failed runs masquerade as successful ones with empty output.

This is the most likely explanation for the "5-second crash" symptom in
#1208: iterations finish instantly with empty text, the loop keeps
going, and only the `claude.result_is_error` log tips the user off.

Changes:
- Capture the SDK's `errors: string[]` detail on result messages
  (previously discarded) and surface it through `MessageChunk.errors`.
- Log `errors`, `stopReason` alongside `errorSubtype` in
  `claude.result_is_error` so users can see what actually failed.
- Throw from both the general node path and the loop iteration path
  on any `isError: true` result, including the subtype and SDK errors
  detail in the thrown message.

Note: this does not implement auto-retry. See PR comments on #1121 and
the analysis on #1208 — a retry-with-fresh-session approach for loop
iterations is not obviously correct until we see what
`error_during_execution` actually carries in the reporter's env.
This change is the observability + fail-loud step that has to come
first so that signal is no longer silent.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/providers/src/claude/provider.ts   |  10 +-
 packages/providers/src/types.ts             |   2 +
 packages/workflows/src/dag-executor.test.ts | 116 ++++++++++++++++++++
 packages/workflows/src/dag-executor.ts      |  41 +++++++
 4 files changed, 168 insertions(+), 1 deletion(-)

diff --git a/packages/providers/src/claude/provider.ts b/packages/providers/src/claude/provider.ts
index 26935bf373..0821319317 100644
--- a/packages/providers/src/claude/provider.ts
+++ b/packages/providers/src/claude/provider.ts
@@ -740,6 +740,7 @@ async function* streamClaudeMessages(
         total_cost_usd?: number;
         stop_reason?: string | null;
         num_turns?: number;
+        errors?: string[];
         model_usage?: Record<
           string,
           {
@@ -751,9 +752,15 @@ async function* streamClaudeMessages(
         >;
       };
       const tokens = normalizeClaudeUsage(resultMsg.usage);
+      const sdkErrors = Array.isArray(resultMsg.errors) ? resultMsg.errors : undefined;
       if (resultMsg.is_error) {
         getLog().error(
-          { sessionId: resultMsg.session_id, errorSubtype: resultMsg.subtype },
+          {
+            sessionId: resultMsg.session_id,
+            errorSubtype: resultMsg.subtype,
+            stopReason: resultMsg.stop_reason,
+            errors: sdkErrors,
+          },
           'claude.result_is_error'
         );
       }
@@ -765,6 +772,7 @@ async function* streamClaudeMessages(
           ? { structuredOutput: resultMsg.structured_output }
           : {}),
         ...(resultMsg.is_error ? { isError: true, errorSubtype: resultMsg.subtype } : {}),
+        ...(resultMsg.is_error && sdkErrors?.length ? { errors: sdkErrors } : {}),
         ...(resultMsg.total_cost_usd !== undefined ? { cost: resultMsg.total_cost_usd } : {}),
         ...(resultMsg.stop_reason != null ? { stopReason: resultMsg.stop_reason } : {}),
         ...(resultMsg.num_turns !== undefined ? { numTurns: resultMsg.num_turns } : {}),
diff --git a/packages/providers/src/types.ts b/packages/providers/src/types.ts
index 63444ac112..9822abd261 100644
--- a/packages/providers/src/types.ts
+++ b/packages/providers/src/types.ts
@@ -72,6 +72,8 @@ export type MessageChunk =
       structuredOutput?: unknown;
       isError?: boolean;
       errorSubtype?: string;
+      /** SDK-provided error detail strings. Populated when isError is true. */
+      errors?: string[];
       cost?: number;
       stopReason?: string;
       numTurns?: number;
diff --git a/packages/workflows/src/dag-executor.test.ts b/packages/workflows/src/dag-executor.test.ts
index c5822197e5..0c745b39e5 100644
--- a/packages/workflows/src/dag-executor.test.ts
+++ b/packages/workflows/src/dag-executor.test.ts
@@ -3594,6 +3594,70 @@ describe('executeDagWorkflow -- resume with priorCompletedNodes', () => {
       expect(sessionArg).toBe('loop-session-1');
     });
 
+    it('loop iteration fails loudly when SDK returns error_during_execution', async () => {
+      // Regression test for #1208: previously the loop silently broke on isError
+      // results and kept iterating with empty output, producing "5-second crashes"
+      // that masqueraded as successful iterations.
+      mockSendQueryDag.mockImplementation(function* () {
+        yield {
+          type: 'result',
+          isError: true,
+          errorSubtype: 'error_during_execution',
+          errors: ['Subprocess crashed mid-turn'],
+          sessionId: 'bad-session',
+        };
+      });
+
+      const store = createMockStore();
+      const mockDeps = createMockDeps(store);
+      const platform = createMockPlatform();
+      const workflowRun = makeWorkflowRun();
+
+      await executeDagWorkflow(
+        mockDeps,
+        platform,
+        'conv-dag',
+        testDir,
+        {
+          name: 'loop-iteration-err',
+          nodes: [
+            {
+              id: 'work',
+              loop: {
+                prompt: 'Do the work. Say DONE.',
+                until: 'DONE',
+                max_iterations: 5,
+              },
+            },
+          ],
+        },
+        workflowRun,
+        'claude',
+        undefined,
+        join(testDir, 'artifacts'),
+        join(testDir, 'logs'),
+        'main',
+        'docs/',
+        minimalConfig
+      );
+
+      // Should fail after one iteration rather than burning through max_iterations
+      expect(mockSendQueryDag.mock.calls.length).toBe(1);
+      // The loop_iteration_failed event should carry the subtype and SDK errors detail
+      const eventCalls = (store.createWorkflowEvent as ReturnType<typeof mock>).mock.calls;
+      const iterFailedEvents = eventCalls.filter(
+        (call: unknown[]) =>
+          (call[0] as Record<string, unknown>).event_type === 'loop_iteration_failed'
+      );
+      expect(iterFailedEvents.length).toBeGreaterThan(0);
+      const failedData = (iterFailedEvents[0][0] as Record<string, unknown>).data as Record<
+        string,
+        unknown
+      >;
+      expect(failedData.error).toContain('error_during_execution');
+      expect(failedData.error).toContain('Subprocess crashed mid-turn');
+    });
+
     it('non-interactive loop is unaffected (no pause)', async () => {
       mockSendQueryDag.mockImplementation(function* () {
         yield { type: 'assistant', content: 'Still working...' };
@@ -4617,6 +4681,58 @@ describe('executeDagWorkflow -- Claude SDK advanced options', () => {
     expect(capMessage).toBeDefined();
   });
 
+  it('fails node when SDK returns error_during_execution result', async () => {
+    // Regression test for #1208: previously we only failed on error_max_budget_usd
+    // and silently broke on all other isError subtypes, letting failed nodes
+    // masquerade as successes with empty output.
+    mockSendQueryDag.mockImplementation(function* () {
+      yield {
+        type: 'result',
+        isError: true,
+        errorSubtype: 'error_during_execution',
+        errors: ['Tool call failed: permission denied'],
+        sessionId: 'sid-err',
+      };
+    });
+
+    const store = createMockStore();
+    const mockDeps = createMockDeps(store);
+    const platform = createMockPlatform();
+    const workflowRun = makeWorkflowRun();
+
+    await executeDagWorkflow(
+      mockDeps,
+      platform,
+      'conv-dag',
+      testDir,
+      {
+        name: 'err-exec-test',
+        nodes: [{ id: 'step1', command: 'my-cmd' }],
+      },
+      workflowRun,
+      'claude',
+      undefined,
+      join(testDir, 'artifacts'),
+      join(testDir, 'logs'),
+      'main',
+      'docs/',
+      minimalConfig
+    );
+
+    // The node_failed event should carry the subtype and SDK errors detail
+    const eventCalls = (store.createWorkflowEvent as ReturnType<typeof mock>).mock.calls;
+    const nodeFailedEvents = eventCalls.filter(
+      (call: unknown[]) => (call[0] as Record<string, unknown>).event_type === 'node_failed'
+    );
+    expect(nodeFailedEvents.length).toBeGreaterThan(0);
+    const failedData = (nodeFailedEvents[0][0] as Record<string, unknown>).data as Record<
+      string,
+      unknown
+    >;
+    expect(failedData.error).toContain('error_during_execution');
+    expect(failedData.error).toContain('permission denied');
+  });
+
   it('forwards workflow-level effort to node when no per-node override', async () => {
     const mockDeps = createMockDeps();
     const platform = createMockPlatform();
diff --git a/packages/workflows/src/dag-executor.ts b/packages/workflows/src/dag-executor.ts
index 3680af28b5..a1dd05564b 100644
--- a/packages/workflows/src/dag-executor.ts
+++ b/packages/workflows/src/dag-executor.ts
@@ -764,6 +764,25 @@ async function executeNodeInternal(
             `Node '${node.id}' exceeded cost cap${cap !== undefined ? ` of $${cap.toFixed(2)}` : ''}.`
           );
         }
+        // Fail loudly on any other SDK error result. Previously we broke out of
+        // the stream silently, producing empty/partial output without signaling
+        // failure — which let failed iterations masquerade as successes (#1208).
+        if (msg.isError) {
+          const subtype = msg.errorSubtype ?? 'unknown';
+          const errorsDetail = msg.errors?.length ? ` — ${msg.errors.join('; ')}` : '';
+          getLog().error(
+            {
+              nodeId: node.id,
+              errorSubtype: subtype,
+              errors: msg.errors,
+              sessionId: msg.sessionId,
+              stopReason: msg.stopReason,
+              durationMs: Date.now() - nodeStartTime,
+            },
+            'dag.node_sdk_error_result'
+          );
+          throw new Error(`Node '${node.id}' failed: SDK returned ${subtype}${errorsDetail}`);
+        }
         break; // Result is the "I'm done" signal — don't wait for subprocess to exit
       } else if (msg.type === 'system' && msg.content) {
         // Forward provider warnings (⚠️) and MCP connection failures to the user.
@@ -1626,6 +1645,28 @@ async function executeLoopNode(
           if (msg.numTurns !== undefined) {
             loopTotalNumTurns = (loopTotalNumTurns ?? 0) + msg.numTurns;
           }
+          // Fail the iteration loudly on SDK error results. Previously we broke
+          // silently, producing empty output and continuing to the next iteration —
+          // which made `error_during_execution` on resumed interactive loops look
+          // like a "5-second crash" that kept burning iterations (#1208).
+          if (msg.isError) {
+            const subtype = msg.errorSubtype ?? 'unknown';
+            const errorsDetail = msg.errors?.length ? ` — ${msg.errors.join('; ')}` : '';
+            getLog().error(
+              {
+                nodeId: node.id,
+                iteration: i,
+                errorSubtype: subtype,
+                errors: msg.errors,
+                sessionId: msg.sessionId,
+                stopReason: msg.stopReason,
+              },
+              'loop_node.iteration_sdk_error'
+            );
+            throw new Error(
+              `Loop '${node.id}' iteration ${String(i)} failed: SDK returned ${subtype}${errorsDetail}`
+            );
+          }
           break; // Result is the "I'm done" signal — don't wait for subprocess to exit
         } else if (msg.type === 'tool' && msg.toolName) {
           const now = Date.now();