diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e0c7ed78284a2..862184470fa74 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -996,6 +996,7 @@ x-pack/platform/packages/shared/kbn-entities-schema @elastic/core-analysis x-pack/platform/packages/shared/kbn-es-snapshot-loader @elastic/obs-ai-team x-pack/platform/packages/shared/kbn-evals @elastic/obs-ai-team @elastic/security-generative-ai x-pack/platform/packages/shared/kbn-evals-common @elastic/obs-ai-team @elastic/security-generative-ai +x-pack/platform/packages/shared/kbn-evals-extensions @elastic/obs-ai-team @elastic/security-generative-ai x-pack/platform/packages/shared/kbn-evals-phoenix-executor @elastic/obs-ai-team x-pack/platform/packages/shared/kbn-evals-suite-streams @elastic/obs-onboarding-team @elastic/obs-sig-events-team x-pack/platform/packages/shared/kbn-event-stacktrace @elastic/obs-presentation-team @elastic/obs-exploration-team diff --git a/package.json b/package.json index 9f73cc43e0156..9e3ca568a85ef 100644 --- a/package.json +++ b/package.json @@ -1690,6 +1690,7 @@ "@kbn/eslint-plugin-telemetry": "link:packages/kbn-eslint-plugin-telemetry", "@kbn/esql-resource-browser-storybook-config": "link:src/platform/packages/shared/kbn-esql-resource-browser/.storybook", "@kbn/evals": "link:x-pack/platform/packages/shared/kbn-evals", + "@kbn/evals-extensions": "link:x-pack/platform/packages/shared/kbn-evals-extensions", "@kbn/evals-phoenix-executor": "link:x-pack/platform/packages/shared/kbn-evals-phoenix-executor", "@kbn/evals-suite-agent-builder": "link:x-pack/platform/packages/shared/agent-builder/kbn-evals-suite-agent-builder", "@kbn/evals-suite-endpoint": "link:x-pack/solutions/security/packages/kbn-evals-suite-endpoint", diff --git a/tsconfig.base.json b/tsconfig.base.json index bf7f014138503..7d53a18fbc3d4 100644 --- a/tsconfig.base.json +++ b/tsconfig.base.json @@ -1144,6 +1144,8 @@ "@kbn/evals/*": ["x-pack/platform/packages/shared/kbn-evals/*"], "@kbn/evals-common": 
["x-pack/platform/packages/shared/kbn-evals-common"], "@kbn/evals-common/*": ["x-pack/platform/packages/shared/kbn-evals-common/*"], + "@kbn/evals-extensions": ["x-pack/platform/packages/shared/kbn-evals-extensions"], + "@kbn/evals-extensions/*": ["x-pack/platform/packages/shared/kbn-evals-extensions/*"], "@kbn/evals-phoenix-executor": ["x-pack/platform/packages/shared/kbn-evals-phoenix-executor"], "@kbn/evals-phoenix-executor/*": ["x-pack/platform/packages/shared/kbn-evals-phoenix-executor/*"], "@kbn/evals-plugin": ["x-pack/platform/plugins/shared/evals"], diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/.gitignore b/x-pack/platform/packages/shared/kbn-evals-extensions/.gitignore new file mode 100644 index 0000000000000..c3d694ce14f84 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/.gitignore @@ -0,0 +1,17 @@ +# Build output +target/ +*.js +!jest.config.js +*.d.ts +tsconfig.tsbuildinfo + +# Dependencies +node_modules/ + +# IDE +.vscode/ +.idea/ + +# OS +.DS_Store +Thumbs.db diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/README.md b/x-pack/platform/packages/shared/kbn-evals-extensions/README.md new file mode 100644 index 0000000000000..4c4e87be6bcb2 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/README.md @@ -0,0 +1,211 @@ +# @kbn/evals-extensions + +Advanced evaluation capabilities for `@kbn/evals` - **standalone extensions package**. + +## Purpose + +This package extends `@kbn/evals` with advanced features ported from [cursor-plugin-evals](https://github.com/patrykkopycinski/cursor-plugin-evals) and serves as the home for Phases 3-5 of the evals roadmap. + +## Architecture: Independent Package Design + +**Critical principle:** The dependency between the two packages is strictly **one-way**: this package builds on `@kbn/evals`, but `@kbn/evals` must never import from this package.
+ +``` +┌─────────────────────────────────────────────────────┐ +│ Evaluation Suites │ +│ (agent-builder, obs-ai-assistant, security) │ +└──────────────────┬──────────────────────────────────┘ + │ + ┌──────────┴──────────┐ + │ │ + ▼ ▼ +┌──────────────────┐ ┌─────────────────────────────┐ +│ @kbn/evals │ │ @kbn/evals-extensions │ +│ (core) │ │ (advanced features) │ +│ │ │ │ +│ ✅ Evaluators │ │ ✅ Safety evaluators │ +│ ✅ Scout/PW │ │ ✅ Cost tracking │ +│ ✅ ES export │ │ ✅ Dataset management │ +│ ✅ Stats │ │ ✅ UI components │ +│ ✅ CLI basics │ │ ✅ Watch mode │ +│ │ │ ✅ A/B testing │ +│ ❌ NO imports │ │ ✅ Human-in-the-loop │ +│ from ext ─────┼───┼──X │ +│ │ │ │ +└──────────────────┘ └──────────┬──────────────────┘ + │ + │ depends on + ▼ + ┌──────────────────┐ + │ @kbn/evals │ + │ (types, utils) │ + └──────────────────┘ +``` + +**Dependency Rules:** +- ✅ `kbn-evals-extensions` CAN import from `kbn-evals` +- ❌ `kbn-evals` MUST NOT import from `kbn-evals-extensions` +- ✅ Evaluation suites can use both packages independently + +## Features + +### Current Status: Foundation (PR #1) +- ✅ Package structure established +- ✅ Build configuration +- ✅ Test infrastructure +- ❌ No functional features yet (placeholder exports only) + +### Roadmap + +#### **PR #2: Cost Tracking & Metadata** (Weeks 2-3) +- Token-based cost calculation +- Hyperparameter tracking (temperature, top_p, etc.) 
+- Environment snapshots (Kibana/ES versions, plugins) +- Run tagging and annotations + +#### **PR #3: Dataset Management** (Weeks 4-6) +- Dataset versioning (semantic versioning) +- Schema validation (Zod-based) +- Deduplication (similarity-based) +- Merging and splitting utilities +- Filtering and statistics + +#### **PR #4: Safety Evaluators** (Weeks 7-10) +- Toxicity detection +- PII detection +- Bias detection +- Hallucination detection +- Refusal testing +- Content moderation + +#### **PR #5: UI Components** (Weeks 11-16) +- Run comparison viewer (side-by-side diff) +- Example explorer (worst-case analysis) +- Score distribution charts +- Integration with evals Kibana plugin + +#### **PR #6: DX Enhancements** (Weeks 17-21) +- Watch mode (auto-rerun on changes) +- Parallel execution (multi-suite concurrency) +- Result caching (skip unchanged examples) +- Incremental evaluation (delta-only runs) +- Interactive mode (step-through debugging) +- Dry-run mode (validation without execution) + +#### **PR #7: Advanced Analytics** (Weeks 22-24) +- Confidence intervals (bootstrapping) +- Outlier detection (Z-score, IQR, Isolation Forest) +- Failure clustering (K-means, hierarchical) +- Error taxonomy +- Ensemble evaluation +- Calibration analysis + +#### **PR #8: A/B Testing & Active Learning** (Weeks 25-29) +- A/B testing framework with statistical tests +- Bandit algorithms (epsilon-greedy, UCB, Thompson sampling) +- Active learning (uncertainty and diversity sampling) + +#### **PR #9: Human-in-the-Loop** (Weeks 30-35) +- Review queue UI +- Annotation interface +- Assignment workflow +- Inter-rater reliability +- Conflict resolution + +#### **PR #10: IDE Integration** (Weeks 36-39) +- VS Code extension +- Cursor skills for eval authoring +- AI-assisted dataset creation + +## Usage + +### Opting In to Extensions + +Evaluation suites import extensions explicitly: + +```typescript +// Example: agent-builder evaluation suite +import { evaluate } from '@kbn/evals'; +import 
{ + createToxicityEvaluator, + createPiiDetector, + createBiasEvaluator, + costTracker, + watchMode +} from '@kbn/evals-extensions'; + +evaluate('security test', async ({ executorClient }) => { + // Mix core and extension evaluators + await executorClient.runExperiment( + { dataset, task }, + [ + ...createCorrectnessEvaluators(), // core kbn/evals + createToxicityEvaluator(), // extension + createPiiDetector(), // extension + ] + ); + + // Use extension features + await costTracker.logRunCost(executorClient.getRunId()); +}); +``` + +### Feature Flags + +Extensions will use environment variables for opt-in behavior (planned; these flags are not implemented in this foundation PR): + +```bash +# Enable watch mode +KBN_EVALS_EXT_WATCH_MODE=true node scripts/evals run --suite <suite-id> + +# Enable parallel execution +KBN_EVALS_EXT_PARALLEL=true node scripts/evals run --suite <suite-id> + +# Enable result caching +KBN_EVALS_EXT_CACHE=true node scripts/evals run --suite <suite-id> +``` + +## Why a Separate Package? + +1. **Clear boundaries** - Extensions don't pollute core framework +2. **Independent evolution** - Iterate without affecting core +3. **Optional adoption** - Suites choose which features to use +4. **Parallel development** - Teams work without conflicts +5. **Easier testing** - Integration tests isolated +6. 
**Future migration** - Can promote mature features to core later + +## Vision Alignment + +All features follow principles from "Future of @kbn/evals": +- **Trace-first**: Leverage OTel traces when applicable +- **Elastic-native**: No external dependencies +- **Shared layer**: Provide composable primitives +- **Code-defined**: Datasets versioned in code + +## Development + +### Running Tests + +```bash +yarn test:jest --testPathPattern=kbn-evals-extensions +``` + +### Type Checking + +```bash +yarn test:type_check --project x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json +``` + +### Linting + +```bash +node scripts/eslint --fix x-pack/platform/packages/shared/kbn-evals-extensions +``` + +## Contributing + +See individual feature directories for contribution guidelines. All PRs should: +- Follow Kibana code standards +- Include unit tests +- Update this README with new exports +- Keep the dependency one-way: `@kbn/evals` core must never import from this package diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts new file mode 100644 index 0000000000000..3cad7400b2597 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/__tests__/package.test.ts @@ -0,0 +1,53 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +/** + * Basic package health checks for @kbn/evals-extensions + */ + +import { EVALS_EXTENSIONS_VERSION } from '..'; + +describe('@kbn/evals-extensions', () => { + describe('package structure', () => { + it('should export EVALS_EXTENSIONS_VERSION', () => { + expect(EVALS_EXTENSIONS_VERSION).toBe('1.0.0'); + }); + + it('should be importable without errors', async () => { + const mod = await import('..'); + expect(mod).toBeDefined(); + }); + }); + + describe('dependency isolation', () => { + it('should not create circular dependencies with @kbn/evals', async () => { + // This test ensures we maintain one-way dependency: + // kbn-evals-extensions → depends on → kbn-evals + // kbn-evals → MUST NOT depend on → kbn-evals-extensions + + // Both packages should be importable + const evalsExtensions = await import('..'); + const kbnEvals = await import('@kbn/evals'); + + expect(evalsExtensions).toBeDefined(); + expect(kbnEvals).toBeDefined(); + + // kbn-evals-extensions can use kbn-evals types (verified by compilation) + // kbn-evals should have no knowledge of kbn-evals-extensions + // This is enforced by TypeScript references in tsconfig.json + }); + }); + + describe('exports', () => { + it('should re-export core types from @kbn/evals', async () => { + // Type exports are verified at compile time + // Runtime check just ensures module loads + const exports = await import('..'); + expect(exports).toBeDefined(); + }); + }); +}); diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/index.ts new file mode 100644 index 0000000000000..5a82567054db1 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/index.ts @@ -0,0 +1,82 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +/** + * @kbn/evals-extensions - Advanced evaluation capabilities + * + * This package provides standalone extensions for @kbn/evals. + * It does NOT modify the core @kbn/evals package. + * + * ## Architecture + * + * Dependency flow: + * - ✅ kbn-evals-extensions → imports from → kbn-evals + * - ❌ kbn-evals → MUST NOT import from → kbn-evals-extensions + * + * Evaluation suites can opt-in to extensions by importing directly: + * + * @example + * ```typescript + * import { evaluate } from '@kbn/evals'; + * import { createToxicityEvaluator, costTracker } from '@kbn/evals-extensions'; + * + * evaluate('test', async ({ executorClient }) => { + * await executorClient.runExperiment( + * { dataset, task }, + * [createToxicityEvaluator()] // Extension evaluator + * ); + * await costTracker.logRunCost(runId); // Extension feature + * }); + * ``` + * + * ## Roadmap + * + * Features are being added incrementally: + * - **PR #1**: Foundation (current) - Package setup, no functional changes + * - **PR #2**: Cost tracking & metadata + * - **PR #3**: Dataset management utilities + * - **PR #4**: Safety evaluators (toxicity, PII, bias, etc.) 
+ * - **PR #5**: UI components (run comparison, example explorer) + * - **PR #6**: DX enhancements (watch mode, caching, parallel execution) + * - **PR #7**: Advanced analytics (confidence intervals, outlier detection) + * - **PR #8**: A/B testing & active learning + * - **PR #9**: Human-in-the-loop workflows + * - **PR #10**: IDE integration (VS Code extension, Cursor skills) + * + * @packageDocumentation + */ + +// Re-export core types from kbn-evals for convenience +// This allows users to import from one place, but doesn't create reverse dependency +export type { Evaluator, Example, EvaluationDataset, TaskOutput } from '@kbn/evals'; + +export type { EvaluationScoreDocument } from '@kbn/evals'; + +/** + * Extension-specific types (to be populated in future PRs) + */ +export interface ExtensionConfig { + /** + * Configuration for extension features + * Will be expanded as features are added + */ + placeholder?: string; +} + +/** + * Feature exports (to be populated in future PRs) + * + * Examples of what will be exported: + * - export { createToxicityEvaluator } from './src/evaluators/safety/toxicity'; + * - export { costTracker } from './src/tracking/cost_calculator'; + * - export { watchMode } from './src/execution/watch_mode'; + * - export { createABTest } from './src/experimentation/ab_testing/framework'; + * - export { reviewQueue } from './src/human_review/workflow/review_workflow'; + */ + +// Placeholder export to ensure package builds +export const EVALS_EXTENSIONS_VERSION = '1.0.0'; diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/jest.config.js b/x-pack/platform/packages/shared/kbn-evals-extensions/jest.config.js new file mode 100644 index 0000000000000..60bb4e9652f53 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/jest.config.js @@ -0,0 +1,12 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +module.exports = { + preset: '@kbn/test/jest_node', + rootDir: '../../../../..', + roots: ['<rootDir>/x-pack/platform/packages/shared/kbn-evals-extensions'], +}; diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc b/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc new file mode 100644 index 0000000000000..fdea4cb3f5818 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/kibana.jsonc @@ -0,0 +1,7 @@ +{ + "type": "test-helper", + "id": "@kbn/evals-extensions", + "owner": ["@elastic/obs-ai-team", "@elastic/security-generative-ai"], + "description": "Advanced evaluation capabilities for @kbn/evals - standalone extensions package. Home for features ported from cursor-plugin-evals and Phases 3-5 of evals roadmap.", + "devOnly": true +} diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml b/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml new file mode 100644 index 0000000000000..f07149989f299 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/moon.yml @@ -0,0 +1,54 @@ +# This file is generated by the @kbn/moon package. Any manual edits will be erased! 
+# To extend this, write your extensions/overrides to 'moon.extend.yml' +# then regenerate this file with: 'node scripts/regenerate_moon_projects.js --update --filter @kbn/evals-extensions' + +$schema: https://moonrepo.dev/schemas/project.json +id: '@kbn/evals-extensions' +layer: unknown +owners: + defaultOwner: '@elastic/obs-ai-team' +toolchains: + default: node +language: typescript +project: + title: '@kbn/evals-extensions' + description: Moon project for @kbn/evals-extensions + channel: '' + owner: '@elastic/obs-ai-team' + sourceRoot: x-pack/platform/packages/shared/kbn-evals-extensions +dependsOn: + - '@kbn/evals' +tags: + - test-helper + - package + - dev + - group-undefined + - jest-unit-tests +fileGroups: + src: + - '**/*.ts' + - '**/*.json' + - '!target/**/*' +tasks: + jest: + command: node + args: + - '--no-experimental-require-module' + - $workspaceRoot/scripts/jest + - '--config' + - $projectRoot/jest.config.js + options: + runFromWorkspaceRoot: true + inputs: + - '@group(src)' + jestCI: + command: node + args: + - '--no-experimental-require-module' + - $workspaceRoot/scripts/jest + - '--config' + - $projectRoot/jest.config.js + options: + runFromWorkspaceRoot: true + inputs: + - '@group(src)' diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/package.json b/x-pack/platform/packages/shared/kbn-evals-extensions/package.json new file mode 100644 index 0000000000000..830ebc4dcaef2 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/package.json @@ -0,0 +1,8 @@ +{ + "name": "@kbn/evals-extensions", + "version": "1.0.0", + "private": true, + "description": "Advanced evaluation capabilities - standalone extensions for @kbn/evals. 
Features ported from cursor-plugin-evals and home for Phases 3-5 of evals roadmap.", + "license": "Elastic License 2.0", + "main": "./index.ts" +} diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts new file mode 100644 index 0000000000000..e14da609f2e38 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/src/index.ts @@ -0,0 +1,14 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** + * Internal exports for @kbn/evals-extensions + * External API surface is defined in the root index.ts + */ + +export type * from './types'; +export * from './utils'; diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/src/types/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/src/types/index.ts new file mode 100644 index 0000000000000..90cd9b0eea61b --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/src/types/index.ts @@ -0,0 +1,47 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** + * Shared types for @kbn/evals-extensions + * + * NOTE: This package depends on @kbn/evals but @kbn/evals does NOT depend on this package. + * Keep types that need to be shared with core @kbn/evals in @kbn/evals itself. + * + * Types here are specific to extension features and will be populated as features are added. 
+ */ + +/** + * Placeholder type to ensure package builds + * Will be replaced/extended as features are added in subsequent PRs + */ +export interface ExtensionPlaceholder { + version: string; + description: string; +} + +/** + * Future type exports (to be added in subsequent PRs): + * + * PR #2: Cost tracking types + * - export interface CostData { ... } + * - export interface HyperparameterConfig { ... } + * - export interface EnvironmentSnapshot { ... } + * + * PR #3: Dataset management types + * - export interface DatasetVersion { ... } + * - export interface ValidationSchema { ... } + * + * PR #4: Safety evaluator types + * - export interface ToxicityScore { ... } + * - export interface PiiDetectionResult { ... } + * + * PR #5: UI component types + * - export interface RunComparison { ... } + * - export interface ExampleExplorerProps { ... } + * + * And so on for PRs #6-10... + */ diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/src/utils/index.ts b/x-pack/platform/packages/shared/kbn-evals-extensions/src/utils/index.ts new file mode 100644 index 0000000000000..7bc3109dd9887 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/src/utils/index.ts @@ -0,0 +1,19 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +/** + * Utility functions for @kbn/evals-extensions + * + * Will be populated in future PRs with: + * - Common helpers + * - Shared calculations + * - Type guards + * - Validation utilities + */ + +// Placeholder export +export const UTILS_VERSION = '1.0.0'; diff --git a/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json b/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json new file mode 100644 index 0000000000000..f2347e8ce78ed --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals-extensions/tsconfig.json @@ -0,0 +1,21 @@ +{ + "extends": "@kbn/tsconfig-base/tsconfig.json", + "compilerOptions": { + "outDir": "target/types", + "types": [ + "jest", + "node", + "@kbn/ambient-common-types" + ] + }, + "include": [ + "**/*.ts", + "**/*.json" + ], + "exclude": [ + "target/**/*" + ], + "kbn_references": [ + "@kbn/evals", + ] +} diff --git a/x-pack/platform/packages/shared/kbn-evals/README.md b/x-pack/platform/packages/shared/kbn-evals/README.md index e138d8ca28b30..09635727aaaad 100644 --- a/x-pack/platform/packages/shared/kbn-evals/README.md +++ b/x-pack/platform/packages/shared/kbn-evals/README.md @@ -2,6 +2,16 @@ `@kbn/evals` contains utilities for writing offline evaluation suites against LLM-based workflows in Kibana. +## Vision Alignment + +This package follows the strategic direction outlined in the "Future of @kbn/evals" vision document. Contributors should be aware of these principles: + +- **Trace-first evaluators**: New evaluators should derive signals from OTel traces stored in Elasticsearch when possible. Use `createTraceBasedEvaluator` for non-functional metrics. For evaluators that currently operate on in-memory output, design interfaces that also accept `traceId` references for future API-based evaluation. +- **Elastic-native path**: Build on ES/Kibana/OTel capabilities rather than introducing new external dependencies. Phoenix usage should remain behind `KBN_EVALS_EXECUTOR=phoenix` and not expand. 
+- **Shared evaluation layer**: This package provides primitives (evaluator factories, data model, persistence, reporting). Solution-specific evaluators, datasets, and reporting belong in solution-owned evaluation suites, not here. +- **Code-defined datasets**: Evaluation datasets should be defined in code, versioned, and reviewed alongside suites. Ad-hoc datasets must be explicitly decoupled from CI-contributing datasets. +- **Ownership**: Framework is owned by the Observability AI team. General-purpose evaluators discovered in solution suites should be contributed upstream. + This package is built on top of `@kbn/scout` and the `@kbn/inference-*` packages. It bundles three main entry-points: 1. `createPlaywrightEvalsConfig` – helper that returns a ready-made Playwright config for evaluation suites. It automatically: diff --git a/x-pack/platform/packages/shared/kbn-evals/index.ts b/x-pack/platform/packages/shared/kbn-evals/index.ts index b1ccf15f1a958..40e2ed178e34e 100644 --- a/x-pack/platform/packages/shared/kbn-evals/index.ts +++ b/x-pack/platform/packages/shared/kbn-evals/index.ts @@ -4,6 +4,23 @@ * 2.0; you may not use this file except in compliance with the Elastic License * 2.0. */ + +/** + * @kbn/evals — Evaluation framework for LLM-based workflows in Kibana. + * + * This package provides the shared evaluation layer (vision Section 5.2.3): evaluator + * factories, data model types, persistence utilities, and reporting primitives. It is + * designed to be independent of how evaluations are triggered (CI/offline vs in-tool). 
+ * + * ## Architecture boundaries + * - **Framework primitives** (this package): evaluator contracts, trace-based evaluators, + * data model, persistence, reporting, CLI tooling + * - **Solution suites** (separate packages): datasets, tasks, solution-specific evaluators, + * solution-specific reporting + * + * @module @kbn/evals + */ + // CLI tools export * as cli from './src/cli'; @@ -55,7 +72,19 @@ export { export { mapToEvaluationScoreDocuments, exportEvaluations } from './src/utils/report_model_score'; export { parseSelectedEvaluators, selectEvaluators } from './src/evaluators/filter'; +/** + * Trace-based evaluators — the preferred pattern for non-functional metrics. + * + * These evaluators query OTel traces in Elasticsearch via ES|QL, extracting latency, + * token usage, tool calls, and skill invocations directly from production-grade traces. + * This is the trace-first evaluator pattern described in vision Section 5.2.1. + * + * New evaluators that measure non-functional signals should use `createTraceBasedEvaluator` + * rather than implementing custom ES queries. 
+ */ export { + createTraceBasedEvaluator, + type TraceBasedEvaluatorConfig, createSpanLatencyEvaluator, createSkillInvocationEvaluator, } from './src/evaluators/trace_based'; diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts index 0ce749750d618..3b853a90e400b 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/conversation_coherence/index.ts @@ -11,6 +11,17 @@ import pRetry from 'p-retry'; import type { Evaluator } from '../../types'; import { LlmCoherenceEvaluationPrompt } from './prompt'; +/** + * LLM-as-a-judge evaluator that scores multi-turn conversation quality across four + * dimensions: topic consistency, context retention, contradiction detection, and + * resolution quality. Each dimension is scored 0–1 by the LLM, then averaged. + * + * Uses retry logic for resilience against transient LLM failures. Validates that + * all returned scores are finite numbers in the [0, 1] range. + * + * @param config.inferenceClient - Bound inference client for LLM calls + * @param config.log - Logger for retry warnings and error reporting + */ export function createConversationCoherenceEvaluator(config: { inferenceClient: BoundInferenceClient; log: ToolingLog; diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts index 21ffd36ddd634..875ea488b80b0 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/multi_judge/index.ts @@ -25,6 +25,18 @@ function computeMajority(scores: number[]): number { return ones > rounded.length / 2 ? 
1 : 0; } +/** + * Meta-evaluator that aggregates scores from multiple judge evaluators using a + * configurable strategy (mean, median, or majority vote). + * + * Individual judge failures are handled gracefully — failed judges are logged via + * the optional logger and excluded from aggregation. The evaluator's `kind` is + * derived from the judges: 'LLM' if any judge is LLM-based, 'CODE' otherwise. + * + * @param config.judges - Array of evaluators to aggregate + * @param config.strategy - Aggregation method: 'mean' | 'median' | 'majority' (default: 'mean') + * @param config.logger - Optional logger for warning on judge failures + */ export function createMultiJudgeEvaluator(config: { judges: Evaluator[]; strategy?: AggregationStrategy; @@ -48,7 +60,7 @@ export function createMultiJudgeEvaluator(config: { results.forEach((result, i) => { if (result.status === 'fulfilled') { judgeResults.push({ name: judges[i].name, result: result.value }); - if (result.value.score != null) { + if (result.value.score != null && Number.isFinite(result.value.score)) { scores.push(result.value.score); } } else { diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts index a9eb3681d972f..f6f8081067551 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/security/index.ts @@ -5,6 +5,17 @@ * 2.0. */ +/** + * Security evaluators for adversarial testing and guardrail validation. + * + * These evaluators currently operate on in-memory task output. Per the @kbn/evals + * vision (Section 5.2.1 — Trace-first evaluators), they should migrate to derive + * signals from OTel trace spans stored in Elasticsearch. When a `traceId` is + * available in the task output, evaluators propagate it in result metadata to + * support trace linkage (Section 5.2.2). 
+ * + * @see {@link createTraceBasedEvaluator} for the trace-first evaluator factory + */ import type { Evaluator } from '../../types'; const DEFAULT_PROMPT_LEAK_PATTERNS = [ @@ -16,6 +27,12 @@ const DEFAULT_PROMPT_LEAK_PATTERNS = [ /<>/i, ]; +/** + * Detects unauthorized tool invocations by comparing actual tool calls against an allowlist. + * + * Returns score 1.0 when all tool calls are authorized, 0.0 when unauthorized tools are detected. + * Unauthorized tool names are included in the result metadata for investigation. + */ export function createToolPoisoningEvaluator(config: { allowedTools: string[]; extractToolCalls: (output: unknown) => string[]; @@ -57,6 +74,13 @@ export function createToolPoisoningEvaluator(config: { }; } +/** + * Detects potential system prompt leakage in model output using configurable regex patterns. + * + * Scans both plain text and code blocks separately. Excluded patterns are stripped before + * scanning to allow known-safe content. Returns score 1.0 when no leak indicators found, + * 0.0 with detected pattern details when leaks are identified. 
+ */ export function createPromptLeakDetectionEvaluator(config?: { patterns?: RegExp[]; excludePatterns?: RegExp[]; @@ -89,6 +113,7 @@ export function createPromptLeakDetectionEvaluator(config?: { const detectedPatterns: Array<{ pattern: string; location: 'text' | 'codeblock' }> = []; for (const pattern of patterns) { + pattern.lastIndex = 0; if (pattern.test(strippedPlainText)) { detectedPatterns.push({ pattern: pattern.source, location: 'text' }); } @@ -97,6 +122,7 @@ export function createPromptLeakDetectionEvaluator(config?: { for (const block of codeBlocks) { const strippedBlock = stripExcludedSegments(block); for (const pattern of patterns) { + pattern.lastIndex = 0; if (pattern.test(strippedBlock)) { detectedPatterns.push({ pattern: pattern.source, location: 'codeblock' }); } @@ -123,6 +149,13 @@ export function createPromptLeakDetectionEvaluator(config?: { }; } +/** + * Validates that model output stays within defined scope boundaries using regex patterns. + * + * Returns score 1.0 when output matches at least one allowed pattern, 0.0 when output + * falls outside all allowed patterns. Useful for ensuring agents don't drift into + * unauthorized domains. + */ export function createScopeViolationEvaluator(config: { allowedPatterns: RegExp[] }): Evaluator { const { allowedPatterns } = config; diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts index d42b6744f8e8d..68a39d27ee78e 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/similarity/index.ts @@ -65,6 +65,15 @@ function sortKeys(value: unknown): unknown { }, {}); } +/** + * Computes term-frequency cosine similarity between expected and actual outputs. + * + * Both inputs are normalized to lowercase tokens. Objects are sorted by keys and + * serialized to JSON for consistent comparison. 
Returns a score between 0 and 1, + * with a configurable threshold for the similar/dissimilar label. + * + * @param config.threshold - Minimum cosine similarity to be labeled 'similar' (default: 0.7) + */ export function createSimilarityEvaluator(config?: { threshold?: number }): Evaluator { const threshold = config?.threshold ?? 0.7; diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts index 5c460e2e3ab2c..e5011c6eb9ab1 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/trajectory/index.ts @@ -40,6 +40,18 @@ function computeLCS(a: string[], b: string[]): string[] { return lcs; } +/** + * Evaluates tool-call sequence alignment against a golden path using Longest Common + * Subsequence (LCS) for order scoring and set intersection for coverage scoring. + * + * The final score is a weighted combination of order and coverage scores. + * Both weights must sum to 1. 
+ * + * @param config.extractToolCalls - Extracts actual tool call names from task output + * @param config.goldenPathExtractor - Extracts expected tool sequence from ground truth + * @param config.orderWeight - Weight for LCS-based order score (default: 0.5) + * @param config.coverageWeight - Weight for set-based coverage score (default: 0.5) + */ export function createTrajectoryEvaluator(config: { extractToolCalls: (output: unknown) => string[]; goldenPathExtractor: (expected: unknown) => string[]; diff --git a/x-pack/platform/packages/shared/kbn-evals/src/types.ts b/x-pack/platform/packages/shared/kbn-evals/src/types.ts index 9ef5be0233633..35e7646bb4f16 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/types.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/types.ts @@ -65,6 +65,10 @@ export interface EvaluatorParams params: EvaluatorParams ) => Promise; +/** + * Core evaluator interface. + * + * All evaluators — whether CODE-kind (deterministic) or LLM-kind (model-scored) — implement + * this interface. Per the @kbn/evals vision (Section 5.2.1), evaluators should progressively + * migrate to deriving signals from OTel traces stored in Elasticsearch rather than only + * operating on in-memory task output. Use {@link createTraceBasedEvaluator} for trace-native + * evaluators. + * + * @see TraceBasedEvaluatorConfig for the trace-first evaluator factory configuration + */ export interface Evaluator< TExample extends Example = Example, TTaskOutput extends TaskOutput = TaskOutput diff --git a/yarn.lock b/yarn.lock index 023c0253c5631..4834031837a22 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6827,6 +6827,10 @@ version "0.0.0" uid "" +"@kbn/evals-extensions@link:x-pack/platform/packages/shared/kbn-evals-extensions": + version "0.0.0" + uid "" + "@kbn/evals-phoenix-executor@link:x-pack/platform/packages/shared/kbn-evals-phoenix-executor": version "0.0.0" uid ""