diff --git a/scripts/evals.js b/scripts/evals.js index 886363f1d7e09..d9d81efe83ba5 100644 --- a/scripts/evals.js +++ b/scripts/evals.js @@ -431,6 +431,17 @@ function main() { return; } + if (hasFlag(args, '--local')) { + process.env.KBN_PEGGY_REQUIRE_HOOK_LOG ??= 'false'; + require('@kbn/setup-node-env'); + void require('@kbn/evals') + .injectLocalConnector(process.argv) + .then(function () { + return require('@kbn/evals').cli.run(); + }); + return; + } + process.env.KBN_PEGGY_REQUIRE_HOOK_LOG ??= 'false'; require('@kbn/setup-node-env'); void require('@kbn/evals').cli.run(); diff --git a/x-pack/platform/packages/shared/kbn-evals/README.md b/x-pack/platform/packages/shared/kbn-evals/README.md index e344d290fc3a9..b5b57ce108ebc 100644 --- a/x-pack/platform/packages/shared/kbn-evals/README.md +++ b/x-pack/platform/packages/shared/kbn-evals/README.md @@ -186,6 +186,52 @@ For convenience, `start` and `run` support shorter aliases: node scripts/evals start --suite agent-builder --model eis-gpt-4.1 --judge eis-claude-4-5-sonnet ``` +### Local model quick start (Ollama) + +Run evals entirely offline using a local LLM served by [Ollama](https://ollama.com). No cloud credentials required. + +#### 1. Install and pull a model + +```bash +brew install ollama && ollama pull <model> +``` + +Pick a model based on your available RAM: + +| RAM | Recommended model | Notes | +| ------ | ----------------- | ----------------------------------- | +| 16 GB | `qwen3:8b` | Fast; good for smoke-testing suites | +| 32 GB | `qwen3:14b` | Balanced quality / speed | +| 48 GB | `qwen3:32b` | Production-grade reasoning | +| 64 GB+ | `llama3.3:70b` | Highest quality for complex tasks | + +#### 2. Set the required timeout env var + +Local models are slower than cloud connectors. Raise the task timeout so evaluations do not time out mid-run: + +```bash +export EVAL_TASK_TIMEOUT_MS=600000 +``` + +#### 3. 
Run a suite + +```bash +node scripts/evals run --suite <suite-id> --local +``` + +`--local` detects the running Ollama instance, auto-wires a connector pointing at it, and runs the full suite against that model. + +#### `--local` vs `--dry-run` + +| Flag | Purpose | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--local` | Full eval run against a local Ollama runtime — all examples, all repetitions. | +| `--dry-run` | Smoke-test suite wiring without committing to a full run: samples **one example per dataset** and sets `EVALUATION_REPETITIONS=1`. Use this to verify datasets, connectors, and Playwright config before a long run. Works with any connector, not just `--local`. | + +#### Automated orchestration + +For model benchmarking, automated model selection, and multi-suite orchestration on a local runtime, use the `local-evals` skill in [`elastic-agent-builder-skill-dev`](https://github.com/elastic/elastic-agent-builder-skill-dev). That skill provides the full provisioning + benchmark + recommendation workflow that was intentionally kept out of `@kbn/evals` core. 
+ ### Evals CLI commands ```bash diff --git a/x-pack/platform/packages/shared/kbn-evals/index.ts b/x-pack/platform/packages/shared/kbn-evals/index.ts index 6cc39dc7762ba..48c369388effb 100644 --- a/x-pack/platform/packages/shared/kbn-evals/index.ts +++ b/x-pack/platform/packages/shared/kbn-evals/index.ts @@ -23,6 +23,7 @@ // CLI tools export * as cli from './src/cli'; +export { injectLocalConnector } from './src/cli/inject_local_connector'; export { evaluate } from './src/evaluate'; export type { DefaultEvaluators, ReportDisplayOptions } from './src/types'; diff --git a/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts b/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts index e58c1422fc193..08f4bf980aee0 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts @@ -247,13 +247,15 @@ export const runSuiteCmd: Command = { args.push(...positionals); } - const commandPreview = `${formatEnvPrefix(envOverrides)} node ${args.join(' ')}`.trim(); - log.info(`Running: ${commandPreview}`); - if (flagsReader.boolean('dry-run')) { - return; + envOverrides.EVALUATION_REPETITIONS = '1'; + envOverrides.EVALUATION_DRY_RUN = 'true'; + log.info('[DRY-RUN] sampling 1 example per dataset, repetitions=1'); } + const commandPreview = `${formatEnvPrefix(envOverrides)} node ${args.join(' ')}`.trim(); + log.info(`Running: ${commandPreview}`); + await new Promise((resolve, reject) => { const childEnv: Record = { ...process.env, ...envOverrides } as Record< string, diff --git a/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.test.ts b/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.test.ts new file mode 100644 index 0000000000000..680da531e1cd8 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.test.ts @@ -0,0 +1,123 @@ +/* + * Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { execFileSync } from 'node:child_process'; +import { injectLocalConnector } from './inject_local_connector'; + +// Swap execFileSync for a jest mock so the binary lookup never spawns a real process. +jest.mock('node:child_process', () => ({ + execFileSync: jest.fn(), +})); + +const mockExecFileSync = execFileSync as jest.Mock; + +describe('injectLocalConnector', () => { + let fetchSpy: jest.SpyInstance; + let stderrSpy: jest.SpyInstance; + + beforeEach(() => { + // Baseline: every fetch rejects (no server listening) and every execFileSync call throws (no runtime binary found). + fetchSpy = jest.spyOn(global, 'fetch').mockRejectedValue(new Error('connection refused')); + stderrSpy = jest.spyOn(process.stderr, 'write').mockImplementation(() => true); + mockExecFileSync.mockImplementation(() => { + throw new Error('command not found'); + }); + // Clear the connector env vars so each test starts from a clean slate. + delete process.env.KIBANA_TESTING_AI_CONNECTORS; + delete process.env.EVALUATION_CONNECTOR_ID; + }); + + afterEach(() => { + fetchSpy.mockRestore(); + stderrSpy.mockRestore(); + jest.clearAllMocks(); + }); + + describe('hard-fail when no local runtime is reachable', () => { + it('throws with an actionable error message', async () => { + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await expect(injectLocalConnector(args)).rejects.toThrow( + '--local requires a running local runtime, but none was detected' + ); + }); + + it('error message tells user to start Ollama or LM Studio', async () => { + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await expect(injectLocalConnector(args)).rejects.toThrow( + 'Start Ollama (`ollama serve`) or LM Studio' + ); + }); + + it('error message explains the fallback refusal', async () => { + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await expect(injectLocalConnector(args)).rejects.toThrow( + 'Refusing to silently fall back to the 
cloud connector' + ); + }); + + it('does not set env vars when no runtime found', async () => { + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await expect(injectLocalConnector(args)).rejects.toThrow(); + + expect(process.env.EVALUATION_CONNECTOR_ID).toBeUndefined(); + expect(process.env.KIBANA_TESTING_AI_CONNECTORS).toBeUndefined(); + }); + + it('strips --local from args before throwing', async () => { + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await expect(injectLocalConnector(args)).rejects.toThrow(); + + // --local was stripped from args before detection ran + expect(args).not.toContain('--local'); + }); + }); + + describe('happy path: runtime reachable with a loaded model', () => { + it('injects connector env vars when Ollama is running with a model', async () => { + fetchSpy + // first call: probeEndpoint(ollamaEndpoint) → ok + .mockResolvedValueOnce({ ok: true, status: 200 } as Response) + // second call: getOllamaModels → returns a model + .mockResolvedValueOnce({ + ok: true, + json: async () => ({ models: [{ name: 'llama3.2:3b', size: 2_000_000_000 }] }), + } as unknown as Response); + + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + await injectLocalConnector(args); + + expect(process.env.EVALUATION_CONNECTOR_ID).toBe('local-eval-model'); + expect(process.env.KIBANA_TESTING_AI_CONNECTORS).toBeDefined(); + + const decoded = JSON.parse( + Buffer.from(process.env.KIBANA_TESTING_AI_CONNECTORS!, 'base64').toString('utf-8') + ); + expect(decoded['local-eval-model'].config.defaultModel).toBe('llama3.2:3b'); + expect(decoded['local-eval-model'].config.apiUrl).toContain('/v1/chat/completions'); + }); + }); + + describe('hard-fail when ollama binary exists but server is not running', () => { + it('throws when binary is installed but server is not reachable', async () => { + // ollama binary exists + 
mockExecFileSync.mockImplementation((cmd: string, cmdArgs: string[]) => { + if (cmd === 'sh' && cmdArgs.includes('ollama')) { + return Buffer.from('/usr/local/bin/ollama'); + } + throw new Error('not found'); + }); + + // fetch still rejects (beforeEach default), so the server itself stays unreachable. + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await expect(injectLocalConnector(args)).rejects.toThrow( + '--local requires a running local runtime, but none was detected' + ); + }); + }); +}); diff --git a/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.ts b/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.ts new file mode 100644 index 0000000000000..a7049855fc6a1 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.ts @@ -0,0 +1,264 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */
+
+import { execFileSync } from 'node:child_process';
+
+// ---------------------------------------------------------------------------
+// Runtime detection helpers (inlined from @kbn/evals-local/src/local/detect)
+// ---------------------------------------------------------------------------
+
+/** Local LLM runtimes that detection knows how to look for. */
+type RuntimeType = 'ollama' | 'lm-studio';
+
+/** A model reported by the runtime; `size` is a human-readable string such as `2GB`. */
+interface LoadedModel {
+  name: string;
+  size?: string;
+}
+
+/** Outcome of runtime detection; `endpoint` is null when no server answered. */
+interface DetectionResult {
+  runtime: RuntimeType;
+  endpoint: string | null;
+  loadedModel: LoadedModel | null;
+  serverWasRunning: boolean;
+}
+
+/**
+ * Checks whether an HTTP endpoint answers with a 2xx status.
+ *
+ * @param url URL to GET.
+ * @param timeoutMs Abort the request after this many milliseconds.
+ * @returns `true` for a 2xx response; `false` for any other status, a network
+ *   error, or a timeout.
+ */
+async function probeEndpoint(url: string, timeoutMs = 3000): Promise<boolean> {
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), timeoutMs);
+  try {
+    const response = await fetch(url, { signal: controller.signal });
+    // `ok` already covers the whole 2xx range, 200 included.
+    return response.ok;
+  } catch {
+    return false;
+  } finally {
+    clearTimeout(timer);
+  }
+}
+
+/**
+ * Queries Ollama's `GET /api/ps` and returns the first model it lists, or
+ * `null` when the list is empty, the response is not OK, or the request
+ * fails or times out (5 s).
+ */
+async function getOllamaModels(endpoint: string): Promise<LoadedModel | null> {
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), 5000);
+  try {
+    const response = await fetch(`${endpoint}/api/ps`, { signal: controller.signal });
+    if (!response.ok) return null;
+    const data = (await response.json()) as { models?: Array<{ name: string; size?: number }> };
+    if (data.models && data.models.length > 0) {
+      const model = data.models[0];
+      return {
+        name: model.name,
+        size: model.size ?
`${Math.round(model.size / (1024 * 1024 * 1024))}GB` : undefined,
+      };
+    }
+    return null;
+  } catch {
+    return null;
+  } finally {
+    clearTimeout(timer);
+  }
+}
+
+/**
+ * Queries `GET /v1/models` and returns the first entry, or `null` when the
+ * list is empty, the response is not OK, or the request fails or times out (5 s).
+ */
+async function getLmStudioModel(endpoint: string): Promise<LoadedModel | null> {
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), 5000);
+  try {
+    const response = await fetch(`${endpoint}/v1/models`, { signal: controller.signal });
+    if (!response.ok) return null;
+    const data = (await response.json()) as { data?: Array<{ id: string }> };
+    if (data.data && data.data.length > 0) {
+      return { name: data.data[0].id };
+    }
+    return null;
+  } catch {
+    return null;
+  } finally {
+    clearTimeout(timer);
+  }
+}
+
+/** Returns true when `command -v <cmd>` succeeds, i.e. the shell can resolve `cmd`. */
+function commandExists(cmd: string): boolean {
+  // `command -v` is a shell builtin, so a shell IS spawned here. The script
+  // text is a fixed literal and `cmd` is handed over as positional parameter
+  // `$1` (the `_` fills `$0`); it is never spliced into the command string,
+  // so it cannot inject shell syntax even if a caller passes user input.
+  try {
+    execFileSync('sh', ['-c', 'command -v "$1"', '_', cmd], {
+      stdio: 'pipe',
+      timeout: 5000,
+    });
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Locates a local LLM runtime.
+ *
+ * With `customEndpoint`, the endpoint is taken on trust (no reachability
+ * probe): port 11434 is treated as Ollama, anything else as LM Studio, and the
+ * returned `endpoint` is the input normalised to end in `/v1`.
+ *
+ * Without it, the default Ollama address (localhost:11434) is probed first,
+ * then LM Studio (localhost:1234). When neither answers, `endpoint` is `null`
+ * and `runtime` reflects whichever CLI binary (`ollama`, then `lms`) is
+ * installed, defaulting to `ollama`.
+ */
+async function detect(customEndpoint?: string): Promise<DetectionResult> {
+  if (customEndpoint) {
+    let isOllama = false;
+    try {
+      const parsedUrl = new URL(customEndpoint);
+      isOllama = parsedUrl.port === '11434';
+    } catch {
+      isOllama = customEndpoint.includes(':11434');
+    }
+    const runtime: RuntimeType = isOllama ? 'ollama' : 'lm-studio';
+    // Trim trailing slashes BEFORE dropping an optional `/v1` suffix; otherwise
+    // inputs like `http://host:11434/` keep their slash and model discovery
+    // ends up requesting `//api/ps`.
+    const baseEndpoint = customEndpoint.replace(/\/+$/, '').replace(/\/v1$/, '');
+    const loadedModel = isOllama
+      ? await getOllamaModels(baseEndpoint)
+      : await getLmStudioModel(baseEndpoint);
+    return {
+      runtime,
+      endpoint: `${baseEndpoint}/v1`,
+      loadedModel,
+      serverWasRunning: true,
+    };
+  }
+
+  const ollamaEndpoint = 'http://localhost:11434';
+  const lmsEndpoint = 'http://localhost:1234';
+
+  const ollamaRunning = await probeEndpoint(ollamaEndpoint);
+  if (ollamaRunning) {
+    const loadedModel = await getOllamaModels(ollamaEndpoint);
+    return {
+      runtime: 'ollama',
+      endpoint: `${ollamaEndpoint}/v1`,
+      loadedModel,
+      serverWasRunning: true,
+    };
+  }
+
+  const lmsRunning = await probeEndpoint(`${lmsEndpoint}/v1/models`);
+  if (lmsRunning) {
+    const loadedModel = await getLmStudioModel(lmsEndpoint);
+    return {
+      runtime: 'lm-studio',
+      endpoint: `${lmsEndpoint}/v1`,
+      loadedModel,
+      serverWasRunning: true,
+    };
+  }
+
+  // No server answered: report which runtime is at least installed.
+  if (commandExists('ollama')) {
+    return {
+      runtime: 'ollama',
+      endpoint: null,
+      loadedModel: null,
+      serverWasRunning: false,
+    };
+  }
+
+  if (commandExists('lms')) {
+    return {
+      runtime: 'lm-studio',
+      endpoint: null,
+      loadedModel: null,
+      serverWasRunning: false,
+    };
+  }
+
+  return {
+    runtime: 'ollama',
+    endpoint: null,
+    loadedModel: null,
+    serverWasRunning: false,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Local connector env helpers (inlined from @kbn/evals-local/src/local/connector_factory)
+// ---------------------------------------------------------------------------
+
+const LOCAL_CONNECTOR_ID = 'local-eval-model';
+
+/**
+ * Publishes a single `.gen-ai` connector aimed at `<endpoint>/v1/chat/completions`
+ * through `KIBANA_TESTING_AI_CONNECTORS` (base64-encoded JSON) and selects it
+ * via `EVALUATION_CONNECTOR_ID`.
+ */
+function setLocalConnectorEnv(endpoint: string, modelName: string): void {
+  const normalized = endpoint.replace(/\/+$/, '');
+  const apiUrl = normalized.endsWith('/v1')
+    ?
`${normalized}/chat/completions`
+    : `${normalized}/v1/chat/completions`;
+
+  const config = {
+    [LOCAL_CONNECTOR_ID]: {
+      name: `Local: ${modelName}`,
+      actionTypeId: '.gen-ai',
+      config: {
+        apiUrl,
+        apiProvider: 'Other',
+        defaultModel: modelName,
+      },
+      secrets: {
+        apiKey: 'local-eval',
+      },
+    },
+  };
+
+  process.env.KIBANA_TESTING_AI_CONNECTORS = Buffer.from(JSON.stringify(config)).toString('base64');
+  process.env.EVALUATION_CONNECTOR_ID = LOCAL_CONNECTOR_ID;
+}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+// Writes to stderr so stdout stays free for the eval command's own output.
+const log = {
+  info: (msg: string) => process.stderr.write(`[evals-local] ${msg}\n`),
+};
+
+/**
+ * Lightweight connector injection for --local flag on any existing evals command.
+ * Probes a running local endpoint, discovers the model name, and sets env vars.
+ * Does NOT provision or teardown -- assumes the runtime is already running.
+ *
+ * @param args Argument vector (typically `process.argv`). It is rewritten IN
+ *   PLACE: `--local`, `--local-endpoint` and `--local-model` (given either as
+ *   `--flag value` or `--flag=value`) are removed so the downstream CLI never
+ *   sees them.
+ * @throws Error when `--local-endpoint` / `--local-model` has no value, or
+ *   when no local runtime endpoint could be detected.
+ */
+export async function injectLocalConnector(args: string[]): Promise<void> {
+  let localEndpoint: string | undefined;
+  let localModel: string | undefined;
+
+  const filteredArgs: string[] = [];
+  for (let i = 0; i < args.length; i++) {
+    const arg = args[i];
+    if (arg === '--local') continue;
+
+    // Accept both `--flag value` and `--flag=value`.
+    const eq = arg.indexOf('=');
+    const flag = eq === -1 ? arg : arg.slice(0, eq);
+    if (flag === '--local-endpoint' || flag === '--local-model') {
+      const value = eq === -1 ? args[++i] : arg.slice(eq + 1);
+      if (!value || value.startsWith('--')) {
+        throw new Error(`${flag} requires a value, got: ${value}`);
+      }
+      if (flag === '--local-endpoint') {
+        localEndpoint = value;
+      } else {
+        localModel = value;
+      }
+      continue;
+    }
+    filteredArgs.push(arg);
+  }
+
+  // Mutate the caller's array (not a copy) so process.argv itself is cleaned.
+  args.length = 0;
+  args.push(...filteredArgs);
+
+  const detection = await detect(localEndpoint);
+
+  if (!detection.endpoint) {
+    // Hard-fail rather than warn-and-return. The caller (scripts/evals.js)
+    // chains `.then(() => cli.run())` unconditionally on the resolved promise,
+    // so a silent return here would let @kbn/evals start with the default
+    // CLOUD connector — silently producing eval results that look like local
+    // model output but actually used Anthropic/OpenAI. That is a data-trust
+    // regression: a user reading the eval report cannot tell their local
+    // model was never invoked. Throw so the process exits visibly.
+    throw new Error(
+      '--local requires a running local runtime, but none was detected. ' +
+        'Start Ollama (`ollama serve`) or LM Studio. ' +
+        'Refusing to silently fall back to the cloud connector.'
+    );
+  }
+
+  const detectedModel = detection.loadedModel?.name;
+  const modelName = localModel ?? detectedModel ?? 'local-model';
+  if (localModel === undefined && detectedModel === undefined) {
+    // The placeholder name is unlikely to match a real model; say so up front
+    // instead of letting the first request fail with an opaque error.
+    log.info(
+      `No model reported by ${detection.runtime}; using placeholder "local-model". ` +
+        'Pass --local-model <name> to select one explicitly.'
+    );
+  }
+  setLocalConnectorEnv(detection.endpoint, modelName);
+  log.info(`Local connector injected: ${modelName} at ${detection.endpoint}`);
+}
diff --git a/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.test.ts b/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.test.ts index c90301749c47f..5ef85784f9de4 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.test.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.test.ts @@ -334,6 +334,62 @@ describe('KibanaEvalsClient', () => { expect(Object.values(ranExperiment.runs)).toHaveLength(1); }); + describe('EVALUATION_DRY_RUN', () => { + afterEach(() => { + delete process.env.EVALUATION_DRY_RUN; + }); + + it('slices dataset to the first example when EVALUATION_DRY_RUN=true', async () => { + process.env.EVALUATION_DRY_RUN = 'true'; + const client = createClient({ repetitions: 3 }); + + const dataset: EvaluationDataset = { + name: 'dry-run-ds', + description: 'desc', + examples: [ + { input: { q: 1 }, output: { expected: 1 } }, + { input: { q: 2 }, output: { expected: 2 } }, + { input: { q: 3 }, output: { expected: 3 } }, + ], + }; + + let taskCalls = 0; + const 
task = async () => { + taskCalls++; + return { ok: true }; + }; + + await client.runExperiment({ dataset, task }, []); + + // Only 1 example × 3 repetitions — not 3 × 3 + expect(taskCalls).toBe(3); + }); + + it('runs all examples when EVALUATION_DRY_RUN is not set', async () => { + const client = createClient({ repetitions: 1 }); + + const dataset: EvaluationDataset = { + name: 'ds', + description: 'desc', + examples: [ + { input: { q: 1 } }, + { input: { q: 2 } }, + { input: { q: 3 } }, + ], + }; + + let taskCalls = 0; + const task = async () => { + taskCalls++; + return { ok: true }; + }; + + await client.runExperiment({ dataset, task }, []); + + expect(taskCalls).toBe(3); + }); + }); + it('throws when trustUpstreamDataset=true without getDatasetByName', async () => { const client = createClient(); diff --git a/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.ts b/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.ts index 87ee8fb55e73b..ab0988be5a038 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.ts @@ -97,6 +97,11 @@ export class KibanaEvalsClient implements EvalsExecutorClient { const resolvedDataset = await this.resolveDataset(dataset, trustUpstreamDataset); await this.options.upsertDataset?.(resolvedDataset); + const isDryRun = process.env.EVALUATION_DRY_RUN === 'true'; + const effectiveExamples = isDryRun + ? resolvedDataset.examples.slice(0, 1) + : resolvedDataset.examples; + const datasetId = computeDatasetId(resolvedDataset.name); const experimentId = randomUUID(); const repetitions = this.options.repetitions ?? 
3; @@ -113,7 +118,7 @@ export class KibanaEvalsClient implements EvalsExecutorClient { ); for (let rep = 0; rep < repetitions; rep++) { - resolvedDataset.examples.forEach((example, exampleIndex) => { + effectiveExamples.forEach((example, exampleIndex) => { runJobs.push( limiter(async () => { const runKey = `${exampleIndex}-${rep}-${randomUUID()}`;