From 68e603b54cbba01db8f3f5e1bcd080bb093dcda2 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Tue, 12 May 2026 09:14:06 +0200 Subject: [PATCH 1/9] feat(kbn-evals): move inject_local_connector from kbn-evals-local Move src/cli/inject.ts from @kbn/evals-local into @kbn/evals as src/cli/inject_local_connector.ts. The file is made self-contained by inlining the runtime-detection helpers (probeEndpoint, getOllamaModels, getLmStudioModel, commandExists, detect) and the connector env-setter from connector_factory, both of which are being deleted in the broader kbn-evals-local retirement. The ModelRegistry dependency is dropped in favour of accepting --local-model values as plain model-name strings (bare-connector path). The three load-bearing behaviours are preserved verbatim: the hard-fail guard when no endpoint is detected, the process.argv strip-and-sync after --local removal, and the execFileSync-based commandExists call that prevents shell injection. Co-Authored-By: Claude Sonnet 4.6 --- .../src/cli/inject_local_connector.ts | 261 ++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.ts diff --git a/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.ts b/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.ts new file mode 100644 index 0000000000000..f92c124db48eb --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.ts @@ -0,0 +1,261 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { execFileSync } from 'node:child_process'; + +// --------------------------------------------------------------------------- +// Runtime detection helpers (inlined from @kbn/evals-local/src/local/detect) +// --------------------------------------------------------------------------- + +type RuntimeType = 'ollama' | 'lm-studio'; + +interface LoadedModel { + name: string; + size?: string; +} + +interface DetectionResult { + runtime: RuntimeType; + endpoint: string | null; + loadedModel: LoadedModel | null; + serverWasRunning: boolean; +} + +async function probeEndpoint(url: string, timeoutMs = 3000): Promise<boolean> { + try { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeoutMs); + const response = await fetch(url, { signal: controller.signal }); + clearTimeout(timer); + return response.ok || response.status === 200; + } catch { + return false; + } +} + +async function getOllamaModels(endpoint: string): Promise<LoadedModel | null> { + try { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), 5000); + const response = await fetch(`${endpoint}/api/ps`, { signal: controller.signal }); + clearTimeout(timer); + if (!response.ok) return null; + const data = (await response.json()) as { models?: Array<{ name: string; size?: number }> }; + if (data.models && data.models.length > 0) { + const model = data.models[0]; + return { + name: model.name, + size: model.size ? 
`${Math.round(model.size / (1024 * 1024 * 1024))}GB` : undefined, + }; + } + return null; + } catch { + return null; + } +} + +async function getLmStudioModel(endpoint: string): Promise<LoadedModel | null> { + try { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), 5000); + const response = await fetch(`${endpoint}/v1/models`, { signal: controller.signal }); + clearTimeout(timer); + if (!response.ok) return null; + const data = (await response.json()) as { data?: Array<{ id: string }> }; + if (data.data && data.data.length > 0) { + return { name: data.data[0].id }; + } + return null; + } catch { + return null; + } +} + +function commandExists(cmd: string): boolean { + // execFileSync with an argv array — `cmd` reaches `sh -c` only as the + // positional parameter "$1" and is never interpolated into the command string. + // Avoids shell-injection risk if a caller ever accepts the binary name from user input. + try { + execFileSync('sh', ['-c', 'command -v "$1"', '_', cmd], { + stdio: 'pipe', + timeout: 5000, + }); + return true; + } catch { + return false; + } +} + +async function detect(customEndpoint?: string): Promise<DetectionResult> { + if (customEndpoint) { + let isOllama = false; + try { + const parsedUrl = new URL(customEndpoint); + isOllama = parsedUrl.port === '11434'; + } catch { + isOllama = customEndpoint.includes(':11434'); + } + const runtime: RuntimeType = isOllama ? 'ollama' : 'lm-studio'; + const baseEndpoint = customEndpoint.replace(/\/v1\/?$/, ''); + const loadedModel = isOllama + ? await getOllamaModels(baseEndpoint) + : await getLmStudioModel(baseEndpoint); + return { + runtime, + endpoint: customEndpoint.replace(/\/+$/, '').endsWith('/v1') + ? 
customEndpoint.replace(/\/+$/, '') + : `${customEndpoint.replace(/\/+$/, '')}/v1`, + loadedModel, + serverWasRunning: true, + }; + } + + const ollamaEndpoint = 'http://localhost:11434'; + const lmsEndpoint = 'http://localhost:1234'; + + const ollamaRunning = await probeEndpoint(ollamaEndpoint); + if (ollamaRunning) { + const loadedModel = await getOllamaModels(ollamaEndpoint); + return { + runtime: 'ollama', + endpoint: `${ollamaEndpoint}/v1`, + loadedModel, + serverWasRunning: true, + }; + } + + const lmsRunning = await probeEndpoint(`${lmsEndpoint}/v1/models`); + if (lmsRunning) { + const loadedModel = await getLmStudioModel(lmsEndpoint); + return { + runtime: 'lm-studio', + endpoint: `${lmsEndpoint}/v1`, + loadedModel, + serverWasRunning: true, + }; + } + + if (commandExists('ollama')) { + return { + runtime: 'ollama', + endpoint: null, + loadedModel: null, + serverWasRunning: false, + }; + } + + if (commandExists('lms')) { + return { + runtime: 'lm-studio', + endpoint: null, + loadedModel: null, + serverWasRunning: false, + }; + } + + return { + runtime: 'ollama', + endpoint: null, + loadedModel: null, + serverWasRunning: false, + }; +} + +// --------------------------------------------------------------------------- +// Local connector env helpers (inlined from @kbn/evals-local/src/local/connector_factory) +// --------------------------------------------------------------------------- + +const LOCAL_CONNECTOR_ID = 'local-eval-model'; + +function setLocalConnectorEnv(endpoint: string, modelName: string): void { + const normalized = endpoint.replace(/\/+$/, ''); + const apiUrl = normalized.endsWith('/v1') + ? 
`${normalized}/chat/completions` + : `${normalized}/v1/chat/completions`; + + const config = { + [LOCAL_CONNECTOR_ID]: { + name: `Local: ${modelName}`, + actionTypeId: '.gen-ai', + config: { + apiUrl, + apiProvider: 'Other', + defaultModel: modelName, + }, + secrets: { + apiKey: 'local-eval', + }, + }, + }; + + process.env.KIBANA_TESTING_AI_CONNECTORS = Buffer.from(JSON.stringify(config)).toString('base64'); + process.env.EVALUATION_CONNECTOR_ID = LOCAL_CONNECTOR_ID; +} + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +const log = { + info: (msg: string) => process.stderr.write(`[evals-local] ${msg}\n`), +}; + +/** + * Lightweight connector injection for --local flag on any existing evals command. + * Probes a running local endpoint, discovers the model name, and sets env vars. + * Does NOT provision or teardown -- assumes the runtime is already running. + */ +export async function injectLocalConnector(args: string[]): Promise<void> { + let localEndpoint: string | undefined; + let localModel: string | undefined; + + const filteredArgs: string[] = []; + for (let i = 0; i < args.length; i++) { + if (args[i] === '--local') continue; + if (args[i] === '--local-endpoint') { + const value = args[++i]; + if (!value || value.startsWith('--')) { + throw new Error(`--local-endpoint requires a value, got: ${value}`); + } + localEndpoint = value; + continue; + } + if (args[i] === '--local-model') { + const value = args[++i]; + if (!value || value.startsWith('--')) { + throw new Error(`--local-model requires a value, got: ${value}`); + } + localModel = value; + continue; + } + filteredArgs.push(args[i]); + } + + args.length = 0; + args.push(...filteredArgs); + + const detection = await detect(localEndpoint); + + if (!detection.endpoint) { + // Hard-fail rather than warn-and-return. 
The caller (scripts/evals.js) + // chains `.then(() => cli.run())` unconditionally on the resolved promise, + // so a silent return here would let @kbn/evals start with the default + // CLOUD connector — silently producing eval results that look like local + // model output but actually used Anthropic/OpenAI. That is a data-trust + // regression: a user reading the eval report cannot tell their local + // model was never invoked. Throw so the process exits visibly. + throw new Error( + '--local requires a running local runtime, but none was detected. ' + + 'Start Ollama (`ollama serve`) or LM Studio. ' + + 'Refusing to silently fall back to the cloud connector.' + ); + } + + const modelName = localModel ?? detection.loadedModel?.name ?? 'local-model'; + setLocalConnectorEnv(detection.endpoint, modelName); + log.info(`Local connector injected: ${modelName} at ${detection.endpoint}`); +} From 6073da3ec6311aebb1b92c29a176d4fc9ee583aa Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Tue, 12 May 2026 09:32:22 +0200 Subject: [PATCH 2/9] feat(kbn-evals): export injectLocalConnector from package index Adds the named re-export so scripts/evals.js can require it directly from @kbn/evals instead of @kbn/evals-local. 
Co-Authored-By: Claude Sonnet 4.6 --- x-pack/platform/packages/shared/kbn-evals/index.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/x-pack/platform/packages/shared/kbn-evals/index.ts b/x-pack/platform/packages/shared/kbn-evals/index.ts index 6cc39dc7762ba..48c369388effb 100644 --- a/x-pack/platform/packages/shared/kbn-evals/index.ts +++ b/x-pack/platform/packages/shared/kbn-evals/index.ts @@ -23,6 +23,7 @@ // CLI tools export * as cli from './src/cli'; +export { injectLocalConnector } from './src/cli/inject_local_connector'; export { evaluate } from './src/evaluate'; export type { DefaultEvaluators, ReportDisplayOptions } from './src/types'; From a8cf0fd4d742510331da80cbd8653d1ba338756b Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Tue, 12 May 2026 09:37:14 +0200 Subject: [PATCH 3/9] feat(evals): route --local flag through @kbn/evals injectLocalConnector When scripts/evals.js is invoked with --local, call injectLocalConnector(process.argv) from @kbn/evals before handing off to cli.run(). Passing process.argv directly (not a slice) lets the function strip --local / --local-endpoint / --local-model in-place so cli.run() sees a clean argv. The .then() chain ensures cli.run() only starts after connector env vars are set. 
Co-Authored-By: Claude Sonnet 4.6 --- scripts/evals.js | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/evals.js b/scripts/evals.js index 886363f1d7e09..d9d81efe83ba5 100644 --- a/scripts/evals.js +++ b/scripts/evals.js @@ -431,6 +431,17 @@ function main() { return; } + if (hasFlag(args, '--local')) { + process.env.KBN_PEGGY_REQUIRE_HOOK_LOG ??= 'false'; + require('@kbn/setup-node-env'); + void require('@kbn/evals') + .injectLocalConnector(process.argv) + .then(function () { + return require('@kbn/evals').cli.run(); + }); + return; + } + process.env.KBN_PEGGY_REQUIRE_HOOK_LOG ??= 'false'; require('@kbn/setup-node-env'); void require('@kbn/evals').cli.run(); From 7b638f0c422ac91e895244a1fcc15f23a0bb10f8 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Tue, 12 May 2026 09:54:08 +0200 Subject: [PATCH 4/9] feat(kbn-evals): wire --dry-run flag to set env overrides and print banner Replace the early-return stub with env-var injection so --dry-run falls through to spawn Playwright: sets EVALUATION_REPETITIONS=1 and EVALUATION_DRY_RUN=true in envOverrides, prints the '[DRY-RUN] sampling 1 example per dataset, repetitions=1' banner, then proceeds to the existing spawn block. 
Co-Authored-By: Claude Sonnet 4.6 --- .../packages/shared/kbn-evals/src/cli/commands/run.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts b/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts index e58c1422fc193..6bf2b41bfa65a 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts @@ -251,7 +251,9 @@ export const runSuiteCmd: Command = { log.info(`Running: ${commandPreview}`); if (flagsReader.boolean('dry-run')) { - return; + envOverrides.EVALUATION_REPETITIONS = '1'; + envOverrides.EVALUATION_DRY_RUN = 'true'; + log.info('[DRY-RUN] sampling 1 example per dataset, repetitions=1'); } await new Promise((resolve, reject) => { From 511e23c3f2b0288dc0df45e4e468c44f665bfa0b Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Tue, 12 May 2026 10:06:43 +0200 Subject: [PATCH 5/9] feat(kbn-evals): slice dataset examples to first when EVALUATION_DRY_RUN=true In KibanaEvalsClient.runExperiment(), check process.env.EVALUATION_DRY_RUN and slice resolvedDataset.examples to [examples[0]] before the run loop. This wires the --dry-run flag end-to-end: CLI sets EVALUATION_DRY_RUN=true, Playwright inherits it, and the executor limits each dataset to one example. 
Co-Authored-By: Claude Sonnet 4.6 --- .../shared/kbn-evals/src/kibana_evals_executor/client.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.ts b/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.ts index 87ee8fb55e73b..ab0988be5a038 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.ts @@ -97,6 +97,11 @@ export class KibanaEvalsClient implements EvalsExecutorClient { const resolvedDataset = await this.resolveDataset(dataset, trustUpstreamDataset); await this.options.upsertDataset?.(resolvedDataset); + const isDryRun = process.env.EVALUATION_DRY_RUN === 'true'; + const effectiveExamples = isDryRun + ? resolvedDataset.examples.slice(0, 1) + : resolvedDataset.examples; + const datasetId = computeDatasetId(resolvedDataset.name); const experimentId = randomUUID(); const repetitions = this.options.repetitions ?? 3; @@ -113,7 +118,7 @@ export class KibanaEvalsClient implements EvalsExecutorClient { ); for (let rep = 0; rep < repetitions; rep++) { - resolvedDataset.examples.forEach((example, exampleIndex) => { + effectiveExamples.forEach((example, exampleIndex) => { runJobs.push( limiter(async () => { const runKey = `${exampleIndex}-${rep}-${randomUUID()}`; From e8b3626212008a7f62a92a0013efdd8af58e66e0 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Tue, 12 May 2026 10:14:44 +0200 Subject: [PATCH 6/9] docs(kbn-evals): add Local model quick start section to README Covers: one-line Ollama install, one model recommendation per RAM tier (16/32/48/64 GB+), EVAL_TASK_TIMEOUT_MS=600000 requirement, --local vs --dry-run guidance, and pointer to elastic-agent-builder-skill-dev for advanced benchmarking orchestration. 
Co-Authored-By: Claude Sonnet 4.6 --- .../packages/shared/kbn-evals/README.md | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/x-pack/platform/packages/shared/kbn-evals/README.md b/x-pack/platform/packages/shared/kbn-evals/README.md index e344d290fc3a9..b5b57ce108ebc 100644 --- a/x-pack/platform/packages/shared/kbn-evals/README.md +++ b/x-pack/platform/packages/shared/kbn-evals/README.md @@ -186,6 +186,52 @@ For convenience, `start` and `run` support shorter aliases: node scripts/evals start --suite agent-builder --model eis-gpt-4.1 --judge eis-claude-4-5-sonnet ``` +### Local model quick start (Ollama) + +Run evals entirely offline using a local LLM served by [Ollama](https://ollama.com). No cloud credentials required. + +#### 1. Install and pull a model + +```bash +brew install ollama && ollama pull <model> +``` + +Pick a model based on your available RAM: + +| RAM | Recommended model | Notes | +| ------ | ----------------- | ----------------------------------- | +| 16 GB | `qwen3:8b` | Fast; good for smoke-testing suites | +| 32 GB | `qwen3:14b` | Balanced quality / speed | +| 48 GB | `qwen3:32b` | Production-grade reasoning | +| 64 GB+ | `llama3.3:70b` | Highest quality for complex tasks | + +#### 2. Set the required timeout env var + +Local models are slower than cloud connectors. Raise the task timeout so evaluations do not time out mid-run: + +```bash +export EVAL_TASK_TIMEOUT_MS=600000 +``` + +#### 3. Run a suite + +```bash +node scripts/evals run --suite <suite-id> --local +``` + +`--local` detects the running Ollama instance, auto-wires a connector pointing at it, and runs the full suite against that model. + +#### `--local` vs `--dry-run` + +| Flag | Purpose | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--local` | Full eval run against a local Ollama runtime — all examples, all repetitions. 
| +| `--dry-run` | Smoke-test suite wiring without committing to a full run: samples **one example per dataset** and sets `EVALUATION_REPETITIONS=1`. Use this to verify datasets, connectors, and Playwright config before a long run. Works with any connector, not just `--local`. | + +#### Automated orchestration + +For model benchmarking, automated model selection, and multi-suite orchestration on a local runtime, use the `local-evals` skill in [`elastic-agent-builder-skill-dev`](https://github.com/elastic/elastic-agent-builder-skill-dev). That skill provides the full provisioning + benchmark + recommendation workflow that was intentionally kept out of `@kbn/evals` core. + ### Evals CLI commands ```bash From 30e8d500f2bbe7561652f9670db3ac940ed6b6b4 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Tue, 12 May 2026 11:08:09 +0200 Subject: [PATCH 7/9] fix(kbn-evals): include dry-run env overrides in Running: command preview MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the --dry-run envOverrides block above the commandPreview snapshot so EVALUATION_REPETITIONS=1 and EVALUATION_DRY_RUN=true appear in the logged "Running: ..." line. Previously the preview was built before the dry-run mutation, making the logged command unreproducible when copy-pasted. Runtime behavior is unchanged — spawn() always received the correct env. Smoke test verified: node scripts/evals run --suite agent-builder --dry-run --local prints the [DRY-RUN] banner, shows all overrides in the Running: line, and Playwright starts 12 tests (1 per dataset spec) with EVALUATION_REPETITIONS=1 and EVALUATION_DRY_RUN=true set. 
Co-Authored-By: Claude Sonnet 4.6 --- .../packages/shared/kbn-evals/src/cli/commands/run.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts b/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts index 6bf2b41bfa65a..08f4bf980aee0 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/cli/commands/run.ts @@ -247,15 +247,15 @@ export const runSuiteCmd: Command = { args.push(...positionals); } - const commandPreview = `${formatEnvPrefix(envOverrides)} node ${args.join(' ')}`.trim(); - log.info(`Running: ${commandPreview}`); - if (flagsReader.boolean('dry-run')) { envOverrides.EVALUATION_REPETITIONS = '1'; envOverrides.EVALUATION_DRY_RUN = 'true'; log.info('[DRY-RUN] sampling 1 example per dataset, repetitions=1'); } + const commandPreview = `${formatEnvPrefix(envOverrides)} node ${args.join(' ')}`.trim(); + log.info(`Running: ${commandPreview}`); + await new Promise((resolve, reject) => { const childEnv: Record = { ...process.env, ...envOverrides } as Record< string, From d3c5cc348ae560a676e0aa887f1e3fa94f5220df Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Tue, 12 May 2026 11:14:27 +0200 Subject: [PATCH 8/9] test(kbn-evals): smoke-test --local hard-fail when no runtime detected; fix timer leak Adds inject_local_connector.test.ts with 7 unit tests covering the hard-fail path (no Ollama, no LM Studio, no binary installed), the env-var injection happy path, and the binary-installed-but-not-running path. All assertions verified: throws with actionable message, strips --local from args before detection, never sets EVALUATION_CONNECTOR_ID or KIBANA_TESTING_AI_CONNECTORS when no runtime is found. Also fixes a timer resource leak in probeEndpoint / getOllamaModels / getLmStudioModel: clearTimeout was only called on the success path; moved to finally{} so it fires on rejection too. 
This eliminated the "Jest did not exit" open-handle warning that surfaced during test authoring. Co-Authored-By: Claude Sonnet 4.6 --- .../src/cli/inject_local_connector.test.ts | 123 ++++++++++++++++++ .../src/cli/inject_local_connector.ts | 21 +-- 2 files changed, 135 insertions(+), 9 deletions(-) create mode 100644 x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.test.ts diff --git a/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.test.ts b/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.test.ts new file mode 100644 index 0000000000000..680da531e1cd8 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.test.ts @@ -0,0 +1,123 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { execFileSync } from 'node:child_process'; +import { injectLocalConnector } from './inject_local_connector'; + +jest.mock('node:child_process', () => ({ + execFileSync: jest.fn(), +})); + +const mockExecFileSync = execFileSync as jest.Mock; + +describe('injectLocalConnector', () => { + let fetchSpy: jest.SpyInstance; + let stderrSpy: jest.SpyInstance; + + beforeEach(() => { + fetchSpy = jest.spyOn(global, 'fetch').mockRejectedValue(new Error('connection refused')); + stderrSpy = jest.spyOn(process.stderr, 'write').mockImplementation(() => true); + mockExecFileSync.mockImplementation(() => { + throw new Error('command not found'); + }); + delete process.env.KIBANA_TESTING_AI_CONNECTORS; + delete process.env.EVALUATION_CONNECTOR_ID; + }); + + afterEach(() => { + fetchSpy.mockRestore(); + stderrSpy.mockRestore(); + jest.clearAllMocks(); + }); + + describe('hard-fail when no local runtime is reachable', () => { + it('throws with an actionable error message', async () => { + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await expect(injectLocalConnector(args)).rejects.toThrow( + '--local requires a running local runtime, but none was detected' + ); + }); + + it('error message tells user to start Ollama or LM Studio', async () => { + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await expect(injectLocalConnector(args)).rejects.toThrow( + 'Start Ollama (`ollama serve`) or LM Studio' + ); + }); + + it('error message explains the fallback refusal', async () => { + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await expect(injectLocalConnector(args)).rejects.toThrow( + 'Refusing to silently fall back to the cloud connector' + ); + }); + + it('does not set env vars when no runtime found', async () => { + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await 
expect(injectLocalConnector(args)).rejects.toThrow(); + + expect(process.env.EVALUATION_CONNECTOR_ID).toBeUndefined(); + expect(process.env.KIBANA_TESTING_AI_CONNECTORS).toBeUndefined(); + }); + + it('strips --local from args before throwing', async () => { + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await expect(injectLocalConnector(args)).rejects.toThrow(); + + // --local was stripped from args before detection ran + expect(args).not.toContain('--local'); + }); + }); + + describe('happy path: runtime reachable with a loaded model', () => { + it('injects connector env vars when Ollama is running with a model', async () => { + fetchSpy + // first call: probeEndpoint(ollamaEndpoint) → ok + .mockResolvedValueOnce({ ok: true, status: 200 } as Response) + // second call: getOllamaModels → returns a model + .mockResolvedValueOnce({ + ok: true, + json: async () => ({ models: [{ name: 'llama3.2:3b', size: 2_000_000_000 }] }), + } as unknown as Response); + + const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + await injectLocalConnector(args); + + expect(process.env.EVALUATION_CONNECTOR_ID).toBe('local-eval-model'); + expect(process.env.KIBANA_TESTING_AI_CONNECTORS).toBeDefined(); + + const decoded = JSON.parse( + Buffer.from(process.env.KIBANA_TESTING_AI_CONNECTORS!, 'base64').toString('utf-8') + ); + expect(decoded['local-eval-model'].config.defaultModel).toBe('llama3.2:3b'); + expect(decoded['local-eval-model'].config.apiUrl).toContain('/v1/chat/completions'); + }); + }); + + describe('hard-fail when ollama binary exists but server is not running', () => { + it('throws when binary is installed but server is not reachable', async () => { + // ollama binary exists + mockExecFileSync.mockImplementation((cmd: string, cmdArgs: string[]) => { + if (cmd === 'sh' && cmdArgs.includes('ollama')) { + return Buffer.from('/usr/local/bin/ollama'); + } + throw new Error('not found'); + }); + + 
const args = ['node', 'scripts/evals.js', 'run', '--suite', 'agent-builder', '--local']; + + await expect(injectLocalConnector(args)).rejects.toThrow( + '--local requires a running local runtime, but none was detected' + ); + }); + }); +}); diff --git a/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.ts b/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.ts index f92c124db48eb..a7049855fc6a1 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/cli/inject_local_connector.ts @@ -26,23 +26,23 @@ interface DetectionResult { } async function probeEndpoint(url: string, timeoutMs = 3000): Promise<boolean> { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeoutMs); try { - const controller = new AbortController(); - const timer = setTimeout(() => controller.abort(), timeoutMs); const response = await fetch(url, { signal: controller.signal }); - clearTimeout(timer); return response.ok || response.status === 200; } catch { return false; + } finally { + clearTimeout(timer); } } async function getOllamaModels(endpoint: string): Promise<LoadedModel | null> { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), 5000); try { - const controller = new AbortController(); - const timer = setTimeout(() => controller.abort(), 5000); const response = await fetch(`${endpoint}/api/ps`, { signal: controller.signal }); - clearTimeout(timer); if (!response.ok) return null; const data = (await response.json()) as { models?: Array<{ name: string; size?: number }> }; if (data.models && data.models.length > 0) { @@ -55,15 +55,16 @@ async function getOllamaModels(endpoint: string): Promise<LoadedModel | null> { return null; } catch { return null; + } finally { + clearTimeout(timer); } } async function getLmStudioModel(endpoint: string): Promise<LoadedModel | null> { + const controller = new AbortController(); + const timer = 
setTimeout(() => controller.abort(), 5000); try { - const controller = new AbortController(); - const timer = setTimeout(() => controller.abort(), 5000); const response = await fetch(`${endpoint}/v1/models`, { signal: controller.signal }); - clearTimeout(timer); if (!response.ok) return null; const data = (await response.json()) as { data?: Array<{ id: string }> }; if (data.data && data.data.length > 0) { @@ -72,6 +73,8 @@ async function getLmStudioModel(endpoint: string): Promise { return null; } catch { return null; + } finally { + clearTimeout(timer); } } From 29047105be62caa5b5eb4b352089eb5b2d23f7d5 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Tue, 12 May 2026 11:30:54 +0200 Subject: [PATCH 9/9] test(kbn-evals): add EVALUATION_DRY_RUN slicing coverage to KibanaEvalsClient Verifies that runExperiment() limits execution to the first example when EVALUATION_DRY_RUN=true (regardless of repetitions), and runs all examples when the var is absent. Co-Authored-By: Claude Sonnet 4.6 --- .../src/kibana_evals_executor/client.test.ts | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.test.ts b/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.test.ts index c90301749c47f..5ef85784f9de4 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.test.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/kibana_evals_executor/client.test.ts @@ -334,6 +334,62 @@ describe('KibanaEvalsClient', () => { expect(Object.values(ranExperiment.runs)).toHaveLength(1); }); + describe('EVALUATION_DRY_RUN', () => { + afterEach(() => { + delete process.env.EVALUATION_DRY_RUN; + }); + + it('slices dataset to the first example when EVALUATION_DRY_RUN=true', async () => { + process.env.EVALUATION_DRY_RUN = 'true'; + const client = createClient({ repetitions: 3 }); + + const dataset: EvaluationDataset = { + name: 'dry-run-ds', + 
description: 'desc', + examples: [ + { input: { q: 1 }, output: { expected: 1 } }, + { input: { q: 2 }, output: { expected: 2 } }, + { input: { q: 3 }, output: { expected: 3 } }, + ], + }; + + let taskCalls = 0; + const task = async () => { + taskCalls++; + return { ok: true }; + }; + + await client.runExperiment({ dataset, task }, []); + + // Only 1 example × 3 repetitions — not 3 × 3 + expect(taskCalls).toBe(3); + }); + + it('runs all examples when EVALUATION_DRY_RUN is not set', async () => { + const client = createClient({ repetitions: 1 }); + + const dataset: EvaluationDataset = { + name: 'ds', + description: 'desc', + examples: [ + { input: { q: 1 } }, + { input: { q: 2 } }, + { input: { q: 3 } }, + ], + }; + + let taskCalls = 0; + const task = async () => { + taskCalls++; + return { ok: true }; + }; + + await client.runExperiment({ dataset, task }, []); + + expect(taskCalls).toBe(3); + }); + }); + it('throws when trustUpstreamDataset=true without getDatasetByName', async () => { const client = createClient();