diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index e0c7ed78284a2..862184470fa74 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -996,6 +996,7 @@ x-pack/platform/packages/shared/kbn-entities-schema @elastic/core-analysis
 x-pack/platform/packages/shared/kbn-es-snapshot-loader @elastic/obs-ai-team
 x-pack/platform/packages/shared/kbn-evals @elastic/obs-ai-team @elastic/security-generative-ai
 x-pack/platform/packages/shared/kbn-evals-common @elastic/obs-ai-team @elastic/security-generative-ai
+x-pack/platform/packages/shared/kbn-evals-extensions @elastic/obs-ai-team @elastic/security-generative-ai
 x-pack/platform/packages/shared/kbn-evals-phoenix-executor @elastic/obs-ai-team
 x-pack/platform/packages/shared/kbn-evals-suite-streams @elastic/obs-onboarding-team @elastic/obs-sig-events-team
 x-pack/platform/packages/shared/kbn-event-stacktrace @elastic/obs-presentation-team @elastic/obs-exploration-team
diff --git a/package.json b/package.json
index 9f73cc43e0156..9e3ca568a85ef 100644
--- a/package.json
+++ b/package.json
@@ -1690,6 +1690,7 @@
     "@kbn/eslint-plugin-telemetry": "link:packages/kbn-eslint-plugin-telemetry",
     "@kbn/esql-resource-browser-storybook-config": "link:src/platform/packages/shared/kbn-esql-resource-browser/.storybook",
     "@kbn/evals": "link:x-pack/platform/packages/shared/kbn-evals",
+    "@kbn/evals-extensions": "link:x-pack/platform/packages/shared/kbn-evals-extensions",
     "@kbn/evals-phoenix-executor": "link:x-pack/platform/packages/shared/kbn-evals-phoenix-executor",
     "@kbn/evals-suite-agent-builder": "link:x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder",
     "@kbn/evals-suite-endpoint": "link:x-pack/solutions/security/packages/kbn-evals-suite-endpoint",
diff --git a/tsconfig.base.json b/tsconfig.base.json
index bf7f014138503..7d53a18fbc3d4 100644
--- a/tsconfig.base.json
+++ b/tsconfig.base.json
@@ -1144,6 +1144,8 @@
       "@kbn/evals/*": ["x-pack/platform/packages/shared/kbn-evals/*"],
       "@kbn/evals-common": ["x-pack/platform/packages/shared/kbn-evals-common"],
       "@kbn/evals-common/*": ["x-pack/platform/packages/shared/kbn-evals-common/*"],
+      "@kbn/evals-extensions": ["x-pack/platform/packages/shared/kbn-evals-extensions"],
+      "@kbn/evals-extensions/*": ["x-pack/platform/packages/shared/kbn-evals-extensions/*"],
       "@kbn/evals-phoenix-executor": ["x-pack/platform/packages/shared/kbn-evals-phoenix-executor"],
       "@kbn/evals-phoenix-executor/*": ["x-pack/platform/packages/shared/kbn-evals-phoenix-executor/*"],
       "@kbn/evals-plugin": ["x-pack/platform/plugins/shared/evals"],
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/.gitignore b/x-pack/platform/packages/shared/kbn-evals-extensions/.gitignore
new file mode 100644
index 0000000000000..c3d694ce14f84
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/.gitignore
@@ -0,0 +1,17 @@
+# Build output
+target/
+*.js
+!jest.config.js
+*.d.ts
+tsconfig.tsbuildinfo
+
+# Dependencies
+node_modules/
+
+# IDE
+.vscode/
+.idea/
+
+# OS
+.DS_Store
+Thumbs.db
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/README.md b/x-pack/platform/packages/shared/kbn-evals-extensions/README.md
new file mode 100644
index 0000000000000..4c4e87be6bcb2
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/README.md
@@ -0,0 +1,211 @@
+# @kbn/evals-extensions
+
+Advanced evaluation capabilities for `@kbn/evals` - **standalone extensions package**.
+
+## Purpose
+
+This package extends `@kbn/evals` with advanced features ported from [cursor-plugin-evals](https://github.com/patrykkopycinski/cursor-plugin-evals) and serves as the home for Phases 3-5 of the evals roadmap.
+
+## Architecture: Independent Package Design
+
+**Critical principle:** The dependency is strictly **one-way**: this package builds on `@kbn/evals`, while `@kbn/evals` must remain completely independent of this package.
+
+```
+┌─────────────────────────────────────────────────────┐
+│              Evaluation Suites                      │
+│  (agent-builder, obs-ai-assistant, security)        │
+└──────────────────┬──────────────────────────────────┘
+                   │
+        ┌──────────┴──────────┐
+        │                     │
+        ▼                     ▼
+┌──────────────────┐   ┌─────────────────────────────┐
+│   @kbn/evals     │   │   @kbn/evals-extensions     │
+│   (core)         │   │   (advanced features)       │
+│                  │   │                             │
+│ ✅ Evaluators    │   │ ✅ Safety evaluators        │
+│ ✅ Scout/PW      │   │ ✅ Cost tracking            │
+│ ✅ ES export     │   │ ✅ Dataset management       │
+│ ✅ Stats         │   │ ✅ UI components            │
+│ ✅ CLI basics    │   │ ✅ Watch mode               │
+│                  │   │ ✅ A/B testing              │
+│ ❌ NO imports    │   │ ✅ Human-in-the-loop        │
+│    from ext ─────┼───┼──X                          │
+│                  │   │                             │
+└──────────────────┘   └──────────┬──────────────────┘
+                                  │
+                                  │ depends on
+                                  ▼
+                       ┌──────────────────┐
+                       │   @kbn/evals     │
+                       │   (types, utils) │
+                       └──────────────────┘
+```
+
+**Dependency Rules:**
+- ✅ `kbn-evals-extensions` CAN import from `kbn-evals`
+- ❌ `kbn-evals` MUST NOT import from `kbn-evals-extensions`
+- ✅ Evaluation suites can use both packages independently
+
+## Features
+
+### Current Status: Foundation (PR #1)
+- ✅ Package structure established
+- ✅ Build configuration
+- ✅ Test infrastructure
+- ❌ No functional features yet (placeholder exports only)
+
+### Roadmap
+
+#### **PR #2: Cost Tracking & Metadata** (Weeks 2-3)
+- Token-based cost calculation
+- Hyperparameter tracking (temperature, top_p, etc.)
+- Environment snapshots (Kibana/ES versions, plugins)
+- Run tagging and annotations
+
+#### **PR #3: Dataset Management** (Weeks 4-6)
+- Dataset versioning (semantic versioning)
+- Schema validation (Zod-based)
+- Deduplication (similarity-based)
+- Merging and splitting utilities
+- Filtering and statistics
+
+#### **PR #4: Safety Evaluators** (Weeks 7-10)
+- Toxicity detection
+- PII detection
+- Bias detection
+- Hallucination detection
+- Refusal testing
+- Content moderation
+
+#### **PR #5: UI Components** (Weeks 11-16)
+- Run comparison viewer (side-by-side diff)
+- Example explorer (worst-case analysis)
+- Score distribution charts
+- Integration with evals Kibana plugin
+
+#### **PR #6: DX Enhancements** (Weeks 17-21)
+- Watch mode (auto-rerun on changes)
+- Parallel execution (multi-suite concurrency)
+- Result caching (skip unchanged examples)
+- Incremental evaluation (delta-only runs)
+- Interactive mode (step-through debugging)
+- Dry-run mode (validation without execution)
+
+#### **PR #7: Advanced Analytics** (Weeks 22-24)
+- Confidence intervals (bootstrapping)
+- Outlier detection (Z-score, IQR, Isolation Forest)
+- Failure clustering (K-means, hierarchical)
+- Error taxonomy
+- Ensemble evaluation
+- Calibration analysis
+
+#### **PR #8: A/B Testing & Active Learning** (Weeks 25-29)
+- A/B testing framework with statistical tests
+- Bandit algorithms (epsilon-greedy, UCB, Thompson sampling)
+- Active learning (uncertainty and diversity sampling)
+
+#### **PR #9: Human-in-the-Loop** (Weeks 30-35)
+- Review queue UI
+- Annotation interface
+- Assignment workflow
+- Inter-rater reliability
+- Conflict resolution
+
+#### **PR #10: IDE Integration** (Weeks 36-39)
+- VS Code extension
+- Cursor skills for eval authoring
+- AI-assisted dataset creation
+
+## Usage
+
+### Opting In to Extensions
+
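+Evaluation suites import extensions explicitly. The example below shows the planned API; these exports are not available yet (see "Current Status"):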
+
+```typescript
+// Example: agent-builder evaluation suite
+import { evaluate } from '@kbn/evals';
+import {
+  createToxicityEvaluator,
+  createPiiDetector,
+  createBiasEvaluator,
+  costTracker,
+  watchMode
+} from '@kbn/evals-extensions';
+
+evaluate('security test', async ({ executorClient }) => {
+  // Mix core and extension evaluators
+  await executorClient.runExperiment(
+    { dataset, task },
+    [
+      ...createCorrectnessEvaluators(),     // core kbn/evals
+      createToxicityEvaluator(),            // extension
+      createPiiDetector(),                  // extension
+    ]
+  );
+
+  // Use extension features
+  await costTracker.logRunCost(executorClient.getRunId());
+});
+```
+
+### Feature Flags
+
+Extensions will use environment variables for opt-in behavior (planned; these flags are not implemented yet):
+
+```bash
+# Enable watch mode
+KBN_EVALS_EXT_WATCH_MODE=true node scripts/evals run --suite <id>
+
+# Enable parallel execution
+KBN_EVALS_EXT_PARALLEL=true node scripts/evals run --suite <id>
+
+# Enable result caching
+KBN_EVALS_EXT_CACHE=true node scripts/evals run --suite <id>
+```
+
+## Why a Separate Package?
+
+1. **Clear boundaries** - Extensions don't pollute core framework
+2. **Independent evolution** - Iterate without affecting core
+3. **Optional adoption** - Suites choose which features to use
+4. **Parallel development** - Teams work without conflicts
+5. **Easier testing** - Integration tests isolated
+6. **Future migration** - Can promote mature features to core later
+
+## Vision Alignment
+
+All features follow principles from "Future of @kbn/evals":
+- **Trace-first**: Leverage OTel traces when applicable
+- **Elastic-native**: Build on ES/Kibana/OTel capabilities rather than introducing new external dependencies
+- **Shared layer**: Provide composable primitives
+- **Code-defined**: Datasets versioned in code
+
+## Development
+
+### Running Tests
+
+```bash
+yarn test:jest --testPathPattern=kbn-evals-extensions
+```
+
+### Type Checking
+
+```bash
+yarn test:type_check --project x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json
+```
+
+### Linting
+
+```bash
+node scripts/eslint --fix x-pack/platform/packages/shared/kbn-evals-extensions
+```
+
+## Contributing
+
+See individual feature directories for contribution guidelines. All PRs should:
+- Follow Kibana code standards
+- Include unit tests
+- Update this README with new exports
+- Keep the dependency one-way: `@kbn/evals` core must never import from `@kbn/evals-extensions`
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts
new file mode 100644
index 0000000000000..3cad7400b2597
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts
@@ -0,0 +1,53 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * Basic package health checks for @kbn/evals-extensions
+ */
+
+import { EVALS_EXTENSIONS_VERSION } from '..';
+
+describe('@kbn/evals-extensions', () => {
+  describe('package structure', () => {
+    it('should export EVALS_EXTENSIONS_VERSION', () => {
+      expect(EVALS_EXTENSIONS_VERSION).toBe('1.0.0');
+    });
+
+    it('should be importable without errors', async () => {
+      const mod = await import('..');
+      expect(mod).toBeDefined();
+    });
+  });
+
+  describe('dependency isolation', () => {
+    it('should not create circular dependencies with @kbn/evals', async () => {
+      // This test ensures we maintain one-way dependency:
+      // kbn-evals-extensions → depends on → kbn-evals
+      // kbn-evals → MUST NOT depend on → kbn-evals-extensions
+
+      // Both packages should be importable
+      const evalsExtensions = await import('..');
+      const kbnEvals = await import('@kbn/evals');
+
+      expect(evalsExtensions).toBeDefined();
+      expect(kbnEvals).toBeDefined();
+
+      // kbn-evals-extensions can use kbn-evals types (verified by compilation)
+      // kbn-evals should have no knowledge of kbn-evals-extensions
+      // This is enforced by TypeScript references in tsconfig.json
+    });
+  });
+
+  describe('exports', () => {
+    it('should re-export core types from @kbn/evals', async () => {
+      // Type exports are verified at compile time
+      // Runtime check just ensures module loads
+      const exports = await import('..');
+      expect(exports).toBeDefined();
+    });
+  });
+});
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/index.ts
new file mode 100644
index 0000000000000..5a82567054db1
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/index.ts
@@ -0,0 +1,82 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * @kbn/evals-extensions - Advanced evaluation capabilities
+ *
+ * This package provides standalone extensions for @kbn/evals.
+ * It does NOT modify the core @kbn/evals package.
+ *
+ * ## Architecture
+ *
+ * Dependency flow:
+ * - ✅ kbn-evals-extensions → imports from → kbn-evals
+ * - ❌ kbn-evals → MUST NOT import from → kbn-evals-extensions
+ *
+ * Evaluation suites can opt in to extensions by importing directly (planned API, not exported yet):
+ *
+ * @example
+ * ```typescript
+ * import { evaluate } from '@kbn/evals';
+ * import { createToxicityEvaluator, costTracker } from '@kbn/evals-extensions';
+ *
+ * evaluate('test', async ({ executorClient }) => {
+ *   await executorClient.runExperiment(
+ *     { dataset, task },
+ *     [createToxicityEvaluator()]  // Extension evaluator
+ *   );
+ *   await costTracker.logRunCost(executorClient.getRunId());  // Extension feature
+ * });
+ * ```
+ *
+ * ## Roadmap
+ *
+ * Features are being added incrementally:
+ * - **PR #1**: Foundation (current) - Package setup, no functional changes
+ * - **PR #2**: Cost tracking & metadata
+ * - **PR #3**: Dataset management utilities
+ * - **PR #4**: Safety evaluators (toxicity, PII, bias, etc.)
+ * - **PR #5**: UI components (run comparison, example explorer)
+ * - **PR #6**: DX enhancements (watch mode, caching, parallel execution)
+ * - **PR #7**: Advanced analytics (confidence intervals, outlier detection)
+ * - **PR #8**: A/B testing & active learning
+ * - **PR #9**: Human-in-the-loop workflows
+ * - **PR #10**: IDE integration (VS Code extension, Cursor skills)
+ *
+ * @packageDocumentation
+ */
+
+// Re-export core types from kbn-evals for convenience
+// This allows users to import from one place without creating a reverse dependency
+export type { Evaluator, Example, EvaluationDataset, TaskOutput } from '@kbn/evals';
+
+export type { EvaluationScoreDocument } from '@kbn/evals';
+
+/**
+ * Extension-specific types (to be populated in future PRs)
+ */
+export interface ExtensionConfig {
+  /**
+   * Configuration for extension features
+   * Will be expanded as features are added
+   */
+  placeholder?: string;
+}
+
+/**
+ * Feature exports (to be populated in future PRs)
+ *
+ * Examples of what will be exported:
+ * - export { createToxicityEvaluator } from './src/evaluators/safety/toxicity';
+ * - export { costTracker } from './src/tracking/cost_calculator';
+ * - export { watchMode } from './src/execution/watch_mode';
+ * - export { createABTest } from './src/experimentation/ab_testing/framework';
+ * - export { reviewQueue } from './src/human_review/workflow/review_workflow';
+ */
+
+// Placeholder export to ensure package builds
+export const EVALS_EXTENSIONS_VERSION = '1.0.0';
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/jest.config.js b/x-pack/platform/packages/shared/kbn-evals-extensions/jest.config.js
new file mode 100644
index 0000000000000..60bb4e9652f53
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/jest.config.js
@@ -0,0 +1,12 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+module.exports = {
+  preset: '@kbn/test/jest_node',
+  rootDir: '../../../../..',
+  roots: ['<rootDir>/x-pack/platform/packages/shared/kbn-evals-extensions'],
+};
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc b/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc
new file mode 100644
index 0000000000000..fdea4cb3f5818
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc
@@ -0,0 +1,7 @@
+{
+  "type": "test-helper",
+  "id": "@kbn/evals-extensions",
+  "owner": ["@elastic/obs-ai-team", "@elastic/security-generative-ai"],
+  "description": "Advanced evaluation capabilities for @kbn/evals - standalone extensions package. Home for features ported from cursor-plugin-evals and Phases 3-5 of evals roadmap.",
+  "devOnly": true
+}
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml b/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml
new file mode 100644
index 0000000000000..f07149989f299
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml
@@ -0,0 +1,54 @@
+# This file is generated by the @kbn/moon package. Any manual edits will be erased!
+#  To extend this, write your extensions/overrides to 'moon.extend.yml'
+#  then regenerate this file with: 'node scripts/regenerate_moon_projects.js --update --filter @kbn/evals-extensions'
+
+$schema: https://moonrepo.dev/schemas/project.json
+id: '@kbn/evals-extensions'
+layer: unknown
+owners:
+  defaultOwner: '@elastic/obs-ai-team'
+toolchains:
+  default: node
+language: typescript
+project:
+  title: '@kbn/evals-extensions'
+  description: Moon project for @kbn/evals-extensions
+  channel: ''
+  owner: '@elastic/obs-ai-team'
+  sourceRoot: x-pack/platform/packages/shared/kbn-evals-extensions
+dependsOn:
+  - '@kbn/evals'
+tags:
+  - test-helper
+  - package
+  - dev
+  - group-undefined
+  - jest-unit-tests
+fileGroups:
+  src:
+    - '**/*.ts'
+    - '**/*.json'
+    - '!target/**/*'
+tasks:
+  jest:
+    command: node
+    args:
+      - '--no-experimental-require-module'
+      - $workspaceRoot/scripts/jest
+      - '--config'
+      - $projectRoot/jest.config.js
+    options:
+      runFromWorkspaceRoot: true
+    inputs:
+      - '@group(src)'
+  jestCI:
+    command: node
+    args:
+      - '--no-experimental-require-module'
+      - $workspaceRoot/scripts/jest
+      - '--config'
+      - $projectRoot/jest.config.js
+    options:
+      runFromWorkspaceRoot: true
+    inputs:
+      - '@group(src)'
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/package.json b/x-pack/platform/packages/shared/kbn-evals-extensions/package.json
new file mode 100644
index 0000000000000..830ebc4dcaef2
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/package.json
@@ -0,0 +1,8 @@
+{
+  "name": "@kbn/evals-extensions",
+  "version": "1.0.0",
+  "private": true,
+  "description": "Advanced evaluation capabilities - standalone extensions for @kbn/evals. Features ported from cursor-plugin-evals and home for Phases 3-5 of evals roadmap.",
+  "license": "Elastic License 2.0",
+  "main": "./index.ts"
+}
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts
new file mode 100644
index 0000000000000..e14da609f2e38
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts
@@ -0,0 +1,14 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * Internal exports for @kbn/evals-extensions
+ * External API surface is defined in the root index.ts
+ */
+
+export type * from './types';
+export * from './utils';
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/src/types/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/src/types/index.ts
new file mode 100644
index 0000000000000..90cd9b0eea61b
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/src/types/index.ts
@@ -0,0 +1,47 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * Shared types for @kbn/evals-extensions
+ *
+ * NOTE: This package depends on @kbn/evals but @kbn/evals does NOT depend on this package.
+ * Keep types that need to be shared with core @kbn/evals in @kbn/evals itself.
+ *
+ * Types here are specific to extension features and will be populated as features are added.
+ */
+
+/**
+ * Placeholder type to ensure package builds
+ * Will be replaced/extended as features are added in subsequent PRs
+ */
+export interface ExtensionPlaceholder {
+  version: string;
+  description: string;
+}
+
+/**
+ * Future type exports (to be added in subsequent PRs):
+ *
+ * PR #2: Cost tracking types
+ * - export interface CostData { ... }
+ * - export interface HyperparameterConfig { ... }
+ * - export interface EnvironmentSnapshot { ... }
+ *
+ * PR #3: Dataset management types
+ * - export interface DatasetVersion { ... }
+ * - export interface ValidationSchema { ... }
+ *
+ * PR #4: Safety evaluator types
+ * - export interface ToxicityScore { ... }
+ * - export interface PiiDetectionResult { ... }
+ *
+ * PR #5: UI component types
+ * - export interface RunComparison { ... }
+ * - export interface ExampleExplorerProps { ... }
+ *
+ * And so on for PRs #6-10...
+ */
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/src/utils/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/src/utils/index.ts
new file mode 100644
index 0000000000000..7bc3109dd9887
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/src/utils/index.ts
@@ -0,0 +1,19 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * Utility functions for @kbn/evals-extensions
+ *
+ * Will be populated in future PRs with:
+ * - Common helpers
+ * - Shared calculations
+ * - Type guards
+ * - Validation utilities
+ */
+
+// Placeholder export
+export const UTILS_VERSION = '1.0.0';
diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json b/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json
new file mode 100644
index 0000000000000..f2347e8ce78ed
--- /dev/null
+++ b/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json
@@ -0,0 +1,21 @@
+{
+  "extends": "@kbn/tsconfig-base/tsconfig.json",
+  "compilerOptions": {
+    "outDir": "target/types",
+    "types": [
+      "jest",
+      "node",
+      "@kbn/ambient-common-types"
+    ]
+  },
+  "include": [
+    "**/*.ts",
+    "**/*.json"
+  ],
+  "exclude": [
+    "target/**/*"
+  ],
+  "kbn_references": [
+    "@kbn/evals",
+  ]
+}
diff --git a/x-pack/platform/packages/shared/kbn-evals/README.md b/x-pack/platform/packages/shared/kbn-evals/README.md
index e138d8ca28b30..09635727aaaad 100644
--- a/x-pack/platform/packages/shared/kbn-evals/README.md
+++ b/x-pack/platform/packages/shared/kbn-evals/README.md
@@ -2,6 +2,16 @@
 
 `@kbn/evals` contains utilities for writing offline evaluation suites against LLM-based workflows in Kibana.
 
+## Vision Alignment
+
+This package follows the strategic direction outlined in the "Future of @kbn/evals" vision document. Contributors should be aware of these principles:
+
+- **Trace-first evaluators**: New evaluators should derive signals from OTel traces stored in Elasticsearch when possible. Use `createTraceBasedEvaluator` for non-functional metrics. For evaluators that currently operate on in-memory output, design interfaces that also accept `traceId` references for future API-based evaluation.
+- **Elastic-native path**: Build on ES/Kibana/OTel capabilities rather than introducing new external dependencies. Phoenix usage should remain behind `KBN_EVALS_EXECUTOR=phoenix` and not expand.
+- **Shared evaluation layer**: This package provides primitives (evaluator factories, data model, persistence, reporting). Solution-specific evaluators, datasets, and reporting belong in solution-owned evaluation suites, not here.
+- **Code-defined datasets**: Evaluation datasets should be defined in code, versioned, and reviewed alongside suites. Ad-hoc datasets must be explicitly decoupled from CI-contributing datasets.
+- **Ownership**: Framework is co-owned by the Observability AI and Security Generative AI teams (see `.github/CODEOWNERS`). General-purpose evaluators discovered in solution suites should be contributed upstream.
+
 This package is built on top of `@kbn/scout` and the `@kbn/inference-*` packages. It bundles three main entry-points:
 
 1. `createPlaywrightEvalsConfig` – helper that returns a ready-made Playwright config for evaluation suites. It automatically:
diff --git a/x-pack/platform/packages/shared/kbn-evals/index.ts b/x-pack/platform/packages/shared/kbn-evals/index.ts
index b1ccf15f1a958..40e2ed178e34e 100644
--- a/x-pack/platform/packages/shared/kbn-evals/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/index.ts
@@ -4,6 +4,23 @@
  * 2.0; you may not use this file except in compliance with the Elastic License
  * 2.0.
  */
+
+/**
+ * @kbn/evals — Evaluation framework for LLM-based workflows in Kibana.
+ *
+ * This package provides the shared evaluation layer (vision Section 5.2.3): evaluator
+ * factories, data model types, persistence utilities, and reporting primitives. It is
+ * designed to be independent of how evaluations are triggered (CI/offline vs in-tool).
+ *
+ * ## Architecture boundaries
+ * - **Framework primitives** (this package): evaluator contracts, trace-based evaluators,
+ *   data model, persistence, reporting, CLI tooling
+ * - **Solution suites** (separate packages): datasets, tasks, solution-specific evaluators,
+ *   solution-specific reporting
+ *
+ * @module @kbn/evals
+ */
+
 // CLI tools
 export * as cli from './src/cli';
 
@@ -55,7 +72,19 @@ export {
 export { mapToEvaluationScoreDocuments, exportEvaluations } from './src/utils/report_model_score';
 
 export { parseSelectedEvaluators, selectEvaluators } from './src/evaluators/filter';
+/**
+ * Trace-based evaluators — the preferred pattern for non-functional metrics.
+ *
+ * These evaluators query OTel traces in Elasticsearch via ES|QL, extracting latency,
+ * token usage, tool calls, and skill invocations directly from production-grade traces.
+ * This is the trace-first evaluator pattern described in vision Section 5.2.1.
+ *
+ * New evaluators that measure non-functional signals should use `createTraceBasedEvaluator`
+ * rather than implementing custom ES queries.
+ */
 export {
+  createTraceBasedEvaluator,
+  type TraceBasedEvaluatorConfig,
   createSpanLatencyEvaluator,
   createSkillInvocationEvaluator,
 } from './src/evaluators/trace_based';
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts
index 0ce749750d618..3b853a90e400b 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts
@@ -11,6 +11,17 @@ import pRetry from 'p-retry';
 import type { Evaluator } from '../../types';
 import { LlmCoherenceEvaluationPrompt } from './prompt';
 
+/**
+ * LLM-as-a-judge evaluator that scores multi-turn conversation quality across four
+ * dimensions: topic consistency, context retention, contradiction detection, and
+ * resolution quality. Each dimension is scored 0–1 by the LLM, then averaged.
+ *
+ * Uses retry logic for resilience against transient LLM failures. Validates that
+ * all returned scores are finite numbers in the [0, 1] range.
+ *
+ * @param config.inferenceClient - Bound inference client for LLM calls
+ * @param config.log - Logger for retry warnings and error reporting
+ */
 export function createConversationCoherenceEvaluator(config: {
   inferenceClient: BoundInferenceClient;
   log: ToolingLog;
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts
index 21ffd36ddd634..875ea488b80b0 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts
@@ -25,6 +25,18 @@ function computeMajority(scores: number[]): number {
   return ones > rounded.length / 2 ? 1 : 0;
 }
 
+/**
+ * Meta-evaluator that aggregates scores from multiple judge evaluators using a
+ * configurable strategy (mean, median, or majority vote).
+ *
+ * Individual judge failures are handled gracefully — failed judges are logged via
+ * the optional logger and excluded from aggregation. The evaluator's `kind` is
+ * derived from the judges: 'LLM' if any judge is LLM-based, 'CODE' otherwise.
+ *
+ * @param config.judges - Array of evaluators to aggregate
+ * @param config.strategy - Aggregation method: 'mean' | 'median' | 'majority' (default: 'mean')
+ * @param config.logger - Optional logger for warning on judge failures
+ */
 export function createMultiJudgeEvaluator(config: {
   judges: Evaluator[];
   strategy?: AggregationStrategy;
@@ -48,7 +60,7 @@ export function createMultiJudgeEvaluator(config: {
       results.forEach((result, i) => {
         if (result.status === 'fulfilled') {
           judgeResults.push({ name: judges[i].name, result: result.value });
-          if (result.value.score != null) {
+          if (result.value.score != null && Number.isFinite(result.value.score)) {
             scores.push(result.value.score);
           }
         } else {
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts
index a9eb3681d972f..f6f8081067551 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts
@@ -5,6 +5,17 @@
  * 2.0.
  */
 
+/**
+ * Security evaluators for adversarial testing and guardrail validation.
+ *
+ * These evaluators currently operate on in-memory task output. Per the @kbn/evals
+ * vision (Section 5.2.1 — Trace-first evaluators), they should migrate to derive
+ * signals from OTel trace spans stored in Elasticsearch. When a `traceId` is
+ * available in the task output, evaluators propagate it in result metadata to
+ * support trace linkage (Section 5.2.2).
+ *
+ * @see {@link createTraceBasedEvaluator} for the trace-first evaluator factory
+ */
 import type { Evaluator } from '../../types';
 
 const DEFAULT_PROMPT_LEAK_PATTERNS = [
@@ -16,6 +27,12 @@ const DEFAULT_PROMPT_LEAK_PATTERNS = [
   /<<SYS>>/i,
 ];
 
+/**
+ * Detects unauthorized tool invocations by comparing actual tool calls against an allowlist.
+ *
+ * Returns score 1.0 when all tool calls are authorized, 0.0 when unauthorized tools are detected.
+ * Unauthorized tool names are included in the result metadata for investigation.
+ */
 export function createToolPoisoningEvaluator(config: {
   allowedTools: string[];
   extractToolCalls: (output: unknown) => string[];
@@ -57,6 +74,13 @@ export function createToolPoisoningEvaluator(config: {
   };
 }
 
+/**
+ * Detects potential system prompt leakage in model output using configurable regex patterns.
+ *
+ * Scans plain text and code blocks separately. Segments matching an excluded pattern are
+ * stripped before scanning to allow known-safe content. Returns score 1.0 when no leak
+ * indicators are found, 0.0 with detected pattern details when leaks are identified.
+ */
 export function createPromptLeakDetectionEvaluator(config?: {
   patterns?: RegExp[];
   excludePatterns?: RegExp[];
@@ -89,6 +113,7 @@ export function createPromptLeakDetectionEvaluator(config?: {
       const detectedPatterns: Array<{ pattern: string; location: 'text' | 'codeblock' }> = [];
 
       for (const pattern of patterns) {
+        pattern.lastIndex = 0;
         if (pattern.test(strippedPlainText)) {
           detectedPatterns.push({ pattern: pattern.source, location: 'text' });
         }
@@ -97,6 +122,7 @@ export function createPromptLeakDetectionEvaluator(config?: {
       for (const block of codeBlocks) {
         const strippedBlock = stripExcludedSegments(block);
         for (const pattern of patterns) {
+          pattern.lastIndex = 0;
           if (pattern.test(strippedBlock)) {
             detectedPatterns.push({ pattern: pattern.source, location: 'codeblock' });
           }
@@ -123,6 +149,13 @@ export function createPromptLeakDetectionEvaluator(config?: {
   };
 }
 
+/**
+ * Validates that model output stays within defined scope boundaries using regex patterns.
+ *
+ * Returns score 1.0 when output matches at least one allowed pattern, 0.0 when output
+ * falls outside all allowed patterns. Useful for ensuring agents don't drift into
+ * unauthorized domains.
+ */
 export function createScopeViolationEvaluator(config: { allowedPatterns: RegExp[] }): Evaluator {
   const { allowedPatterns } = config;
 
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts
index d42b6744f8e8d..68a39d27ee78e 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts
@@ -65,6 +65,15 @@ function sortKeys(value: unknown): unknown {
     }, {});
 }
 
+/**
+ * Computes term-frequency cosine similarity between expected and actual outputs.
+ *
+ * Both inputs are normalized to lowercase tokens. Object keys are sorted before the value
+ * is serialized to JSON for consistent comparison. Returns a score between 0 and 1,
+ * with a configurable threshold for the similar/dissimilar label.
+ *
+ * @param config.threshold - Minimum cosine similarity to be labeled 'similar' (default: 0.7)
+ */
 export function createSimilarityEvaluator(config?: { threshold?: number }): Evaluator {
   const threshold = config?.threshold ?? 0.7;
 
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts
index 5c460e2e3ab2c..e5011c6eb9ab1 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts
@@ -40,6 +40,18 @@ function computeLCS(a: string[], b: string[]): string[] {
   return lcs;
 }
 
+/**
+ * Evaluates tool-call sequence alignment against a golden path using Longest Common
+ * Subsequence (LCS) for order scoring and set intersection for coverage scoring.
+ *
+ * The final score is a weighted combination of order and coverage scores.
+ * The two weights (`orderWeight` and `coverageWeight`) must sum to 1.
+ *
+ * @param config.extractToolCalls - Extracts actual tool call names from task output
+ * @param config.goldenPathExtractor - Extracts expected tool sequence from ground truth
+ * @param config.orderWeight - Weight for LCS-based order score (default: 0.5)
+ * @param config.coverageWeight - Weight for set-based coverage score (default: 0.5)
+ */
 export function createTrajectoryEvaluator(config: {
   extractToolCalls: (output: unknown) => string[];
   goldenPathExtractor: (expected: unknown) => string[];
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/types.ts b/x-pack/platform/packages/shared/kbn-evals/src/types.ts
index 9ef5be0233633..35e7646bb4f16 100644
--- a/x-pack/platform/packages/shared/kbn-evals/src/types.ts
+++ b/x-pack/platform/packages/shared/kbn-evals/src/types.ts
@@ -65,6 +65,10 @@ export interface EvaluatorParams<TExample extends Example, TTaskOutput extends T
 /**
  * Evaluation output returned by evaluators.
  *
+ * Follows the trace-first evaluator contract (vision Section 5.2.1): evaluators produce
+ * standardized score/label/explanation outputs. The `metadata` field can carry trace
+ * references and evaluator-specific details for explainability.
+ *
  * This shape is intentionally compatible with the existing evaluator implementations and
  * the Phoenix client types:
  * - `score` may be omitted or `null` for "unavailable"/"error" cases
@@ -83,6 +87,17 @@ type EvaluatorCallback<TExample extends Example, TTaskOutput extends TaskOutput>
   params: EvaluatorParams<TExample, TTaskOutput>
 ) => Promise<EvaluationResult>;
 
+/**
+ * Core evaluator interface.
+ *
+ * All evaluators — whether CODE-kind (deterministic) or LLM-kind (model-scored) — implement
+ * this interface. Per the @kbn/evals vision (Section 5.2.1), evaluators should progressively
+ * migrate to deriving signals from OTel traces stored in Elasticsearch rather than only
+ * operating on in-memory task output. Use {@link createTraceBasedEvaluator} for trace-native
+ * evaluators.
+ *
+ * @see TraceBasedEvaluatorConfig for the trace-first evaluator factory configuration
+ */
 export interface Evaluator<
   TExample extends Example = Example,
   TTaskOutput extends TaskOutput = TaskOutput
diff --git a/yarn.lock b/yarn.lock
index 023c0253c5631..4834031837a22 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -6827,6 +6827,10 @@
   version "0.0.0"
   uid ""
 
+"@kbn/evals-extensions@link:x-pack/platform/packages/shared/kbn-evals-extensions":
+  version "0.0.0"
+  uid ""
+
 "@kbn/evals-phoenix-executor@link:x-pack/platform/packages/shared/kbn-evals-phoenix-executor":
   version "0.0.0"
   uid ""