Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions scripts/evals.js
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,17 @@ function main() {
return;
}

if (hasFlag(args, '--local')) {
process.env.KBN_PEGGY_REQUIRE_HOOK_LOG ??= 'false';
require('@kbn/setup-node-env');
void require('@kbn/evals')
.injectLocalConnector(process.argv)
.then(function () {
return require('@kbn/evals').cli.run();
});
return;
}

process.env.KBN_PEGGY_REQUIRE_HOOK_LOG ??= 'false';
require('@kbn/setup-node-env');
void require('@kbn/evals').cli.run();
Expand Down
46 changes: 46 additions & 0 deletions x-pack/platform/packages/shared/kbn-evals/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,52 @@ For convenience, `start` and `run` support shorter aliases:
node scripts/evals start --suite agent-builder --model eis-gpt-4.1 --judge eis-claude-4-5-sonnet
```

### Local model quick start (Ollama)

Run evals entirely offline using a local LLM served by [Ollama](https://ollama.com). No cloud credentials required.

#### 1. Install and pull a model

```bash
brew install ollama && ollama pull <model>
```

Pick a model based on your available RAM:

| RAM | Recommended model | Notes |
| ------ | ----------------- | ----------------------------------- |
| 16 GB | `qwen3:8b` | Fast; good for smoke-testing suites |
| 32 GB | `qwen3:14b` | Balanced quality / speed |
| 48 GB | `qwen3:32b` | Production-grade reasoning |
| 64 GB+ | `llama3.3:70b` | Highest quality for complex tasks |

#### 2. Set the required timeout env var

Local models are slower than cloud connectors. Raise the task timeout (the value below is 600000 ms, i.e. 10 minutes) so evaluations do not time out mid-run:

```bash
export EVAL_TASK_TIMEOUT_MS=600000
```

#### 3. Run a suite

```bash
node scripts/evals run --suite <suite-id> --local
```

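`--local` detects the running Ollama instance, auto-wires a connector pointing at it, and runs the full suite against that model. If no local runtime is reachable, the command fails with an error (asking you to start Ollama with `ollama serve`, or LM Studio) instead of silently falling back to the cloud connector, so make sure the server is running before you launch a suite.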

#### `--local` vs `--dry-run`

| Flag | Purpose |
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `--local` | Full eval run against a local Ollama runtime — all examples, all repetitions. |
| `--dry-run` | Smoke-test suite wiring without committing to a full run: samples **one example per dataset** and sets `EVALUATION_REPETITIONS=1`. Use this to verify datasets, connectors, and Playwright config before a long run. Works with any connector, not just `--local`. |

#### Automated orchestration

For model benchmarking, automated model selection, and multi-suite orchestration on a local runtime, use the `local-evals` skill in [`elastic-agent-builder-skill-dev`](https://github.com/elastic/elastic-agent-builder-skill-dev). That skill provides the full provisioning + benchmark + recommendation workflow that was intentionally kept out of `@kbn/evals` core.

### Evals CLI commands

```bash
Expand Down
1 change: 1 addition & 0 deletions x-pack/platform/packages/shared/kbn-evals/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

// CLI tools
export * as cli from './src/cli';
export { injectLocalConnector } from './src/cli/inject_local_connector';

export { evaluate } from './src/evaluate';
export type { DefaultEvaluators, ReportDisplayOptions } from './src/types';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -247,13 +247,15 @@ export const runSuiteCmd: Command<void> = {
args.push(...positionals);
}

const commandPreview = `${formatEnvPrefix(envOverrides)} node ${args.join(' ')}`.trim();
log.info(`Running: ${commandPreview}`);

if (flagsReader.boolean('dry-run')) {
return;
envOverrides.EVALUATION_REPETITIONS = '1';
envOverrides.EVALUATION_DRY_RUN = 'true';
log.info('[DRY-RUN] sampling 1 example per dataset, repetitions=1');
}

const commandPreview = `${formatEnvPrefix(envOverrides)} node ${args.join(' ')}`.trim();
log.info(`Running: ${commandPreview}`);

await new Promise<void>((resolve, reject) => {
const childEnv: Record<string, string> = { ...process.env, ...envOverrides } as Record<
string,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { execFileSync } from 'node:child_process';
import { injectLocalConnector } from './inject_local_connector';

// Replace `node:child_process` with a stub module that exposes only
// `execFileSync`, backed by a Jest mock function, so each test decides what a
// call to it returns or throws.
jest.mock('node:child_process', () => ({
execFileSync: jest.fn(),
}));

// Typed handle to the mocked `execFileSync`, used by the tests below to install
// per-test implementations.
const mockExecFileSync = execFileSync as jest.Mock;

describe('injectLocalConnector', () => {
  let fetchMock: jest.SpyInstance;
  let stderrMock: jest.SpyInstance;

  /**
   * Returns a fresh argv array for `node scripts/evals.js run --suite agent-builder --local`.
   * A new array is built on every call because the function under test is
   * expected to mutate the array it receives.
   */
  const buildLocalArgv = (): string[] => [
    'node',
    'scripts/evals.js',
    'run',
    '--suite',
    'agent-builder',
    '--local',
  ];

  beforeEach(() => {
    // Start every test from a clean environment.
    delete process.env.KIBANA_TESTING_AI_CONNECTORS;
    delete process.env.EVALUATION_CONNECTOR_ID;

    // Default world: every `execFileSync` call fails, every `fetch` rejects,
    // and anything written to stderr is swallowed.
    mockExecFileSync.mockImplementation(() => {
      throw new Error('command not found');
    });
    stderrMock = jest.spyOn(process.stderr, 'write').mockImplementation(() => true);
    fetchMock = jest.spyOn(global, 'fetch').mockRejectedValue(new Error('connection refused'));
  });

  afterEach(() => {
    fetchMock.mockRestore();
    stderrMock.mockRestore();
    jest.clearAllMocks();
  });

  describe('hard-fail when no local runtime is reachable', () => {
    it('throws with an actionable error message', async () => {
      const pending = injectLocalConnector(buildLocalArgv());

      await expect(pending).rejects.toThrow(
        '--local requires a running local runtime, but none was detected'
      );
    });

    it('error message tells user to start Ollama or LM Studio', async () => {
      const pending = injectLocalConnector(buildLocalArgv());

      await expect(pending).rejects.toThrow('Start Ollama (`ollama serve`) or LM Studio');
    });

    it('error message explains the fallback refusal', async () => {
      const pending = injectLocalConnector(buildLocalArgv());

      await expect(pending).rejects.toThrow(
        'Refusing to silently fall back to the cloud connector'
      );
    });

    it('does not set env vars when no runtime found', async () => {
      await expect(injectLocalConnector(buildLocalArgv())).rejects.toThrow();

      const { EVALUATION_CONNECTOR_ID, KIBANA_TESTING_AI_CONNECTORS } = process.env;
      expect(EVALUATION_CONNECTOR_ID).toBeUndefined();
      expect(KIBANA_TESTING_AI_CONNECTORS).toBeUndefined();
    });

    it('strips --local from args before throwing', async () => {
      const argv = buildLocalArgv();

      await expect(injectLocalConnector(argv)).rejects.toThrow();

      // The flag must already be gone from the caller's array by the time the
      // rejection surfaces.
      expect(argv).not.toContain('--local');
    });
  });

  describe('happy path: runtime reachable with a loaded model', () => {
    it('injects connector env vars when Ollama is running with a model', async () => {
      // First fetch: the endpoint probe succeeds.
      const probeOk = { ok: true, status: 200 } as Response;
      // Second fetch: the model listing reports a single model.
      const modelListing = {
        ok: true,
        json: async () => ({ models: [{ name: 'llama3.2:3b', size: 2_000_000_000 }] }),
      } as unknown as Response;
      fetchMock.mockResolvedValueOnce(probeOk).mockResolvedValueOnce(modelListing);

      await injectLocalConnector(buildLocalArgv());

      expect(process.env.EVALUATION_CONNECTOR_ID).toBe('local-eval-model');
      expect(process.env.KIBANA_TESTING_AI_CONNECTORS).toBeDefined();

      // The connectors env var is base64-encoded JSON keyed by connector id.
      const connectorsJson = Buffer.from(
        process.env.KIBANA_TESTING_AI_CONNECTORS!,
        'base64'
      ).toString('utf-8');
      const { config } = JSON.parse(connectorsJson)['local-eval-model'];
      expect(config.defaultModel).toBe('llama3.2:3b');
      expect(config.apiUrl).toContain('/v1/chat/completions');
    });
  });

  describe('hard-fail when ollama binary exists but server is not running', () => {
    it('throws when binary is installed but server is not reachable', async () => {
      // Simulate an installed binary: an `sh` invocation whose arguments
      // include `ollama` returns a path; every other command still fails.
      mockExecFileSync.mockImplementation((cmd: string, cmdArgs: string[]) => {
        if (cmd !== 'sh' || !cmdArgs.includes('ollama')) {
          throw new Error('not found');
        }
        return Buffer.from('/usr/local/bin/ollama');
      });

      // `fetch` keeps rejecting (the default), so the server is unreachable.
      await expect(injectLocalConnector(buildLocalArgv())).rejects.toThrow(
        '--local requires a running local runtime, but none was detected'
      );
    });
  });
});
Loading
Loading