elastic · patrykkopycinski · Mar 27, 2026 · Mar 17, 2026 · Mar 18, 2026 · Mar 18, 2026
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -996,6 +996,7 @@ x-pack/platform/packages/shared/kbn-entities-schema @elastic/core-analysis
 x-pack/platform/packages/shared/kbn-es-snapshot-loader @elastic/obs-ai-team
 x-pack/platform/packages/shared/kbn-evals @elastic/obs-ai-team @elastic/security-generative-ai
 x-pack/platform/packages/shared/kbn-evals-common @elastic/obs-ai-team @elastic/security-generative-ai
+x-pack/platform/packages/shared/kbn-evals-extensions @elastic/obs-ai-team @elastic/security-generative-ai
 x-pack/platform/packages/shared/kbn-evals-phoenix-executor @elastic/obs-ai-team
 x-pack/platform/packages/shared/kbn-evals-suite-streams @elastic/obs-onboarding-team @elastic/obs-sig-events-team
 x-pack/platform/packages/shared/kbn-event-stacktrace @elastic/obs-presentation-team @elastic/obs-exploration-team

diff --git a/package.json b/package.json
@@ -1690,6 +1690,7 @@
     "@kbn/eslint-plugin-telemetry": "link:packages/kbn-eslint-plugin-telemetry",
     "@kbn/esql-resource-browser-storybook-config": "link:src/platform/packages/shared/kbn-esql-resource-browser/.storybook",
     "@kbn/evals": "link:x-pack/platform/packages/shared/kbn-evals",
+    "@kbn/evals-extensions": "link:x-pack/platform/packages/shared/kbn-evals-extensions",
     "@kbn/evals-phoenix-executor": "link:x-pack/platform/packages/shared/kbn-evals-phoenix-executor",
     "@kbn/evals-suite-agent-builder": "link:x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder",
     "@kbn/evals-suite-endpoint": "link:x-pack/solutions/security/packages/kbn-evals-suite-endpoint",

diff --git a/tsconfig.base.json b/tsconfig.base.json
@@ -1144,6 +1144,8 @@
       "@kbn/evals/*": ["x-pack/platform/packages/shared/kbn-evals/*"],
       "@kbn/evals-common": ["x-pack/platform/packages/shared/kbn-evals-common"],
       "@kbn/evals-common/*": ["x-pack/platform/packages/shared/kbn-evals-common/*"],
+      "@kbn/evals-extensions": ["x-pack/platform/packages/shared/kbn-evals-extensions"],
+      "@kbn/evals-extensions/*": ["x-pack/platform/packages/shared/kbn-evals-extensions/*"],
       "@kbn/evals-phoenix-executor": ["x-pack/platform/packages/shared/kbn-evals-phoenix-executor"],
       "@kbn/evals-phoenix-executor/*": ["x-pack/platform/packages/shared/kbn-evals-phoenix-executor/*"],
       "@kbn/evals-plugin": ["x-pack/platform/plugins/shared/evals"],

@@ -0,0 +1,17 @@
+# Build output
+target/
+*.js
+!jest.config.js
+*.d.ts
+tsconfig.tsbuildinfo
+
+# Dependencies
+node_modules/
+
+# IDE
+.vscode/
+.idea/
+
+# OS
+.DS_Store
+Thumbs.db
@@ -0,0 +1,211 @@
+# @kbn/evals-extensions
+
+Advanced evaluation capabilities for `@kbn/evals` - **standalone extensions package**.
+
+## Purpose
+
+This package extends `@kbn/evals` with advanced features ported from [cursor-plugin-evals](https://github.com/patrykkopycinski/cursor-plugin-evals) and serves as the home for Phases 3-5 of the evals roadmap.
+
+## Architecture: Independent Package Design
+
+**Critical principle:** `@kbn/evals` must stay **completely independent** of this package; the dependency is strictly one-way (extensions → core).
+
+```
+┌─────────────────────────────────────────────────────┐
+│              Evaluation Suites                      │
+│  (agent-builder, obs-ai-assistant, security)        │
+└──────────────────┬──────────────────────────────────┘
+                   │
+        ┌──────────┴──────────┐
+        │                     │
+        ▼                     ▼
+┌──────────────────┐   ┌─────────────────────────────┐
+│   @kbn/evals     │   │   @kbn/evals-extensions     │
+│   (core)         │   │   (advanced features)       │
+│                  │   │                             │
+│ ✅ Evaluators    │   │ ✅ Safety evaluators        │
+│ ✅ Scout/PW      │   │ ✅ Cost tracking            │
+│ ✅ ES export     │   │ ✅ Dataset management       │
+│ ✅ Stats         │   │ ✅ UI components            │
+│ ✅ CLI basics    │   │ ✅ Watch mode               │
+│                  │   │ ✅ A/B testing              │
+│ ❌ NO imports    │   │ ✅ Human-in-the-loop        │
+│    from ext ─────┼───┼──X                          │
+│                  │   │                             │
+└──────────────────┘   └──────────┬──────────────────┘
+                                  │
+                                  │ depends on
+                                  ▼
+                       ┌──────────────────┐
+                       │   @kbn/evals     │
+                       │   (types, utils) │
+                       └──────────────────┘
+```
+
+**Dependency Rules:**
+- ✅ `kbn-evals-extensions` CAN import from `kbn-evals`
+- ❌ `kbn-evals` MUST NOT import from `kbn-evals-extensions`
+- ✅ Evaluation suites can use both packages independently
+
+## Features
+
+### Current Status: Foundation (PR #1)
+- ✅ Package structure established
+- ✅ Build configuration
+- ✅ Test infrastructure
+- ❌ No functional features yet (placeholder exports only)
+
+### Roadmap
+
+#### **PR #2: Cost Tracking & Metadata** (Weeks 2-3)
+- Token-based cost calculation
+- Hyperparameter tracking (temperature, top_p, etc.)
+- Environment snapshots (Kibana/ES versions, plugins)
+- Run tagging and annotations
+
+#### **PR #3: Dataset Management** (Weeks 4-6)
+- Dataset versioning (semantic versioning)
+- Schema validation (Zod-based)
+- Deduplication (similarity-based)
+- Merging and splitting utilities
+- Filtering and statistics
+
+#### **PR #4: Safety Evaluators** (Weeks 7-10)
+- Toxicity detection
+- PII detection
+- Bias detection
+- Hallucination detection
+- Refusal testing
+- Content moderation
+
+#### **PR #5: UI Components** (Weeks 11-16)
+- Run comparison viewer (side-by-side diff)
+- Example explorer (worst-case analysis)
+- Score distribution charts
+- Integration with evals Kibana plugin
+
+#### **PR #6: DX Enhancements** (Weeks 17-21)
+- Watch mode (auto-rerun on changes)
+- Parallel execution (multi-suite concurrency)
+- Result caching (skip unchanged examples)
+- Incremental evaluation (delta-only runs)
+- Interactive mode (step-through debugging)
+- Dry-run mode (validation without execution)
+
+#### **PR #7: Advanced Analytics** (Weeks 22-24)
+- Confidence intervals (bootstrapping)
+- Outlier detection (Z-score, IQR, Isolation Forest)
+- Failure clustering (K-means, hierarchical)
+- Error taxonomy
+- Ensemble evaluation
+- Calibration analysis
+
+#### **PR #8: A/B Testing & Active Learning** (Weeks 25-29)
+- A/B testing framework with statistical tests
+- Bandit algorithms (epsilon-greedy, UCB, Thompson sampling)
+- Active learning (uncertainty and diversity sampling)
+
+#### **PR #9: Human-in-the-Loop** (Weeks 30-35)
+- Review queue UI
+- Annotation interface
+- Assignment workflow
+- Inter-rater reliability
+- Conflict resolution
+
+#### **PR #10: IDE Integration** (Weeks 36-39)
+- VS Code extension
+- Cursor skills for eval authoring
+- AI-assisted dataset creation
+
+## Usage
+
+### Opting In to Extensions
+
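+Evaluation suites import extensions explicitly. The example below illustrates the **planned** API; none of these extension exports exist yet (see Current Status):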
+
+```typescript
+// Example: agent-builder evaluation suite
+import { evaluate } from '@kbn/evals';
+import {
+  createToxicityEvaluator,
+  createPiiDetector,
+  createBiasEvaluator,
+  costTracker,
+  watchMode
+} from '@kbn/evals-extensions';
+
+evaluate('security test', async ({ executorClient }) => {
+  // Mix core and extension evaluators
+  await executorClient.runExperiment(
+    { dataset, task },
+    [
+      ...createCorrectnessEvaluators(),     // core kbn/evals
+      createToxicityEvaluator(),            // extension
+      createPiiDetector(),                  // extension
+    ]
+  );
+
+  // Use extension features
+  await costTracker.logRunCost(executorClient.getRunId());
+});
+```
+
+### Feature Flags
+
+Extensions will use environment variables for opt-in behavior (planned; not implemented in the foundation PR):
+
+```bash
+# Enable watch mode
+KBN_EVALS_EXT_WATCH_MODE=true node scripts/evals run --suite <id>
+
+# Enable parallel execution
+KBN_EVALS_EXT_PARALLEL=true node scripts/evals run --suite <id>
+
+# Enable result caching
+KBN_EVALS_EXT_CACHE=true node scripts/evals run --suite <id>
+```
+
+## Why a Separate Package?
+
+1. **Clear boundaries** - Extensions don't pollute core framework
+2. **Independent evolution** - Iterate without affecting core
+3. **Optional adoption** - Suites choose which features to use
+4. **Parallel development** - Teams work without conflicts
+5. **Easier testing** - Integration tests isolated
+6. **Future migration** - Can promote mature features to core later
+
+## Vision Alignment
+
+All features follow principles from "Future of @kbn/evals":
+- **Trace-first**: Leverage OTel traces when applicable
+- **Elastic-native**: No external dependencies
+- **Shared layer**: Provide composable primitives
+- **Code-defined**: Datasets versioned in code
+
+## Development
+
+### Running Tests
+
+```bash
+yarn test:jest --testPathPattern=kbn-evals-extensions
+```
+
+### Type Checking
+
+```bash
+yarn test:type_check --project x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json
+```
+
+### Linting
+
+```bash
+node scripts/eslint --fix x-pack/platform/packages/shared/kbn-evals-extensions
+```
+
+## Contributing
+
+See individual feature directories for contribution guidelines. All PRs should:
+- Follow Kibana code standards
+- Include unit tests
+- Update this README with new exports
+- Keep the dependency one-way: `@kbn/evals` core must never import from `@kbn/evals-extensions`
@@ -0,0 +1,53 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * Basic package health checks for @kbn/evals-extensions
+ */
+
+import { EVALS_EXTENSIONS_VERSION } from '..';
+
+describe('@kbn/evals-extensions', () => {
+  describe('package structure', () => {
+    it('should export EVALS_EXTENSIONS_VERSION', () => {
+      expect(EVALS_EXTENSIONS_VERSION).toBe('1.0.0');
+    });
+
+    it('should be importable without errors', async () => {
+      const mod = await import('..');
+      expect(mod).toBeDefined();
+    });
+  });
+
+  describe('dependency isolation', () => {
+    it('should not create circular dependencies with @kbn/evals', async () => {
+      // Smoke test for the intended one-way dependency:
+      // kbn-evals-extensions → depends on → kbn-evals
+      // kbn-evals → MUST NOT depend on → kbn-evals-extensions
+
+      // Both packages should be importable
+      const evalsExtensions = await import('..');
+      const kbnEvals = await import('@kbn/evals');
+
+      expect(evalsExtensions).toBeDefined();
+      expect(kbnEvals).toBeDefined();
+
+      // The assertions above only prove that both modules load side by side;
+      // they cannot detect a reverse import from kbn-evals into kbn-evals-extensions.
+      // NOTE(review): that rule is presumably enforced by tsconfig.json references — confirm.
+    });
+  });
+
+  describe('exports', () => {
+    it('should re-export core types from @kbn/evals', async () => {
+      // Type-only re-exports have no runtime value; the type checker verifies them.
+      // At runtime this can only assert that the module loads.
+      const exports = await import('..');
+      expect(exports).toBeDefined();
+    });
+  });
+});
@@ -0,0 +1,82 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * @kbn/evals-extensions - Advanced evaluation capabilities
+ *
+ * This package provides standalone extensions for @kbn/evals.
+ * It does NOT modify the core @kbn/evals package.
+ *
+ * ## Architecture
+ *
+ * Dependency flow:
+ * - ✅ kbn-evals-extensions → imports from → kbn-evals
+ * - ❌ kbn-evals → MUST NOT import from → kbn-evals-extensions
+ *
+ * Evaluation suites can opt in to extensions by importing directly (planned API; these exports do not exist yet):
+ *
+ * @example
+ * ```typescript
+ * import { evaluate } from '@kbn/evals';
+ * import { createToxicityEvaluator, costTracker } from '@kbn/evals-extensions';
+ *
+ * evaluate('test', async ({ executorClient }) => {
+ *   await executorClient.runExperiment(
+ *     { dataset, task },
+ *     [createToxicityEvaluator()]  // Extension evaluator
+ *   );
+ *   await costTracker.logRunCost(runId);  // Extension feature
+ * });
+ * ```
+ *
+ * ## Roadmap
+ *
+ * Features are being added incrementally:
+ * - **PR #1**: Foundation (current) - Package setup, no functional changes
+ * - **PR #2**: Cost tracking & metadata
+ * - **PR #3**: Dataset management utilities
+ * - **PR #4**: Safety evaluators (toxicity, PII, bias, etc.)
+ * - **PR #5**: UI components (run comparison, example explorer)
+ * - **PR #6**: DX enhancements (watch mode, caching, parallel execution)
+ * - **PR #7**: Advanced analytics (confidence intervals, outlier detection)
+ * - **PR #8**: A/B testing & active learning
+ * - **PR #9**: Human-in-the-loop workflows
+ * - **PR #10**: IDE integration (VS Code extension, Cursor skills)
+ *
+ * @packageDocumentation
+ */
+
+// Re-export core types from kbn-evals for convenience
+// This allows users to import from one place, but doesn't create reverse dependency
+export type { Evaluator, Example, EvaluationDataset, TaskOutput } from '@kbn/evals';
+
+export type { EvaluationScoreDocument } from '@kbn/evals';
+
+/**
+ * Configuration shape for extension features; currently a stub (real options arrive in future PRs)
+ */
+export interface ExtensionConfig {
+  /**
+   * Optional placeholder field; nothing in this module reads it yet.
+   * Will be expanded as features are added
+   */
+  placeholder?: string;
+}
+
+/**
+ * Feature exports (to be populated in future PRs)
+ *
+ * Examples of what will be exported:
+ * - export { createToxicityEvaluator } from './src/evaluators/safety/toxicity';
+ * - export { costTracker } from './src/tracking/cost_calculator';
+ * - export { watchMode } from './src/execution/watch_mode';
+ * - export { createABTest } from './src/experimentation/ab_testing/framework';
+ * - export { reviewQueue } from './src/human_review/workflow/review_workflow';
+ */
+
+// Placeholder version constant; the only runtime (non-type) export declared in this module.
+export const EVALS_EXTENSIONS_VERSION = '1.0.0';