elastic · patrykkopycinski · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/scripts/evals.js b/scripts/evals.js
@@ -431,6 +431,27 @@ function main() {
     return;
   }
 
+  if (hasFlag(args, '--local')) {
+    process.env.KBN_PEGGY_REQUIRE_HOOK_LOG ??= 'false';
+    require('@kbn/setup-node-env');
+    // The local connector has to be injected before the CLI starts. The injection
+    // rejects when no local runtime is reachable; report that as a plain message
+    // and a non-zero exit code instead of leaving the rejection unhandled.
+    void require('@kbn/evals')
+      .injectLocalConnector(process.argv)
+      .then(
+        function () {
+          return require('@kbn/evals').cli.run();
+        },
+        function (error) {
+          const message = error instanceof Error ? error.message : String(error);
+          process.stderr.write(message + '\n');
+          process.exitCode = 1;
+        }
+      );
+    return;
+  }
+
   process.env.KBN_PEGGY_REQUIRE_HOOK_LOG ??= 'false';
   require('@kbn/setup-node-env');
   void require('@kbn/evals').cli.run();

@@ -186,6 +186,52 @@ For convenience, `start` and `run` support shorter aliases:
 node scripts/evals start --suite agent-builder --model eis-gpt-4.1 --judge eis-claude-4-5-sonnet
 ```
 
+### Local model quick start (Ollama)
+
+Run evals entirely offline using a local LLM served by [Ollama](https://ollama.com). No cloud credentials required.
+
+#### 1. Install and pull a model
+
+```bash
+brew install ollama && ollama pull <model>
+```
+
+Pick a model based on your available RAM:
+
+| RAM    | Recommended model | Notes                               |
+| ------ | ----------------- | ----------------------------------- |
+| 16 GB  | `qwen3:8b`        | Fast; good for smoke-testing suites |
+| 32 GB  | `qwen3:14b`       | Balanced quality / speed            |
+| 48 GB  | `qwen3:32b`       | Production-grade reasoning          |
+| 64 GB+ | `llama3.3:70b`    | Highest quality for complex tasks   |
+
+#### 2. Set the required timeout env var
+
+Local models are slower than cloud connectors. Raise the task timeout so evaluations do not time out mid-run (the example below sets it to 600000 ms, i.e. 10 minutes):
+
+```bash
+export EVAL_TASK_TIMEOUT_MS=600000
+```
+
+#### 3. Run a suite
+
+```bash
+node scripts/evals run --suite <suite-id> --local
+```
+
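+`--local` detects the running Ollama instance, auto-wires a connector pointing at it, and runs the full suite against that model. If no local runtime is reachable, the command fails with an error instead of silently falling back to a cloud connector.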
+
+#### `--local` vs `--dry-run`
+
+| Flag        | Purpose                                                                                                                                                         |
+| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `--local`   | Full eval run against a local Ollama runtime — all examples, all repetitions.                                                                                   |
+| `--dry-run` | Smoke-test suite wiring without committing to a full run: samples **one example per dataset** and sets `EVALUATION_REPETITIONS=1` and `EVALUATION_DRY_RUN=true`. Use this to verify datasets, connectors, and Playwright config before a long run. Works with any connector, including the one that `--local` injects. |
+
+#### Automated orchestration
+
+For model benchmarking, automated model selection, and multi-suite orchestration on a local runtime, use the `local-evals` skill in [`elastic-agent-builder-skill-dev`](https://github.com/elastic/elastic-agent-builder-skill-dev). That skill provides the full provisioning + benchmark + recommendation workflow that was intentionally kept out of `@kbn/evals` core.
+
 ### Evals CLI commands
 
 ```bash

@@ -23,6 +23,7 @@
 
 // CLI tools
 export * as cli from './src/cli';
+export { injectLocalConnector } from './src/cli/inject_local_connector';
 
 export { evaluate } from './src/evaluate';
 export type { DefaultEvaluators, ReportDisplayOptions } from './src/types';

@@ -247,13 +247,15 @@ export const runSuiteCmd: Command<void> = {
       args.push(...positionals);
     }
 
-    const commandPreview = `${formatEnvPrefix(envOverrides)} node ${args.join(' ')}`.trim();
-    log.info(`Running: ${commandPreview}`);
-
     if (flagsReader.boolean('dry-run')) {
-      return;
+      envOverrides.EVALUATION_REPETITIONS = '1';
+      envOverrides.EVALUATION_DRY_RUN = 'true';
+      log.info('[DRY-RUN] sampling 1 example per dataset, repetitions=1');
     }
 
+    const commandPreview = `${formatEnvPrefix(envOverrides)} node ${args.join(' ')}`.trim();
+    log.info(`Running: ${commandPreview}`);
+
     await new Promise<void>((resolve, reject) => {
       const childEnv: Record<string, string> = { ...process.env, ...envOverrides } as Record<
         string,

@@ -0,0 +1,121 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { execFileSync } from 'node:child_process';
+import { injectLocalConnector } from './inject_local_connector';
+
+jest.mock('node:child_process', () => ({
+  execFileSync: jest.fn(),
+}));
+
+const execFileSyncMock = execFileSync as jest.Mock;
+
+const NO_RUNTIME_MESSAGE = '--local requires a running local runtime, but none was detected';
+
+// Each test gets its own argv because `injectLocalConnector` mutates the array it receives.
+const buildLocalArgv = (): string[] => [
+  'node',
+  'scripts/evals.js',
+  'run',
+  '--suite',
+  'agent-builder',
+  '--local',
+];
+
+describe('injectLocalConnector', () => {
+  let fetchMock: jest.SpyInstance;
+  let stderrMock: jest.SpyInstance;
+
+  // Baseline for every test: fetch rejects, no command can be executed, connector env is unset.
+  beforeEach(() => {
+    fetchMock = jest.spyOn(global, 'fetch').mockRejectedValue(new Error('connection refused'));
+    stderrMock = jest.spyOn(process.stderr, 'write').mockImplementation(() => true);
+    execFileSyncMock.mockImplementation(() => {
+      throw new Error('command not found');
+    });
+    delete process.env.KIBANA_TESTING_AI_CONNECTORS;
+    delete process.env.EVALUATION_CONNECTOR_ID;
+  });
+
+  afterEach(() => {
+    fetchMock.mockRestore();
+    stderrMock.mockRestore();
+    jest.clearAllMocks();
+  });
+
+  describe('hard-fail when no local runtime is reachable', () => {
+    it.each([
+      ['throws with an actionable error message', NO_RUNTIME_MESSAGE],
+      [
+        'error message tells user to start Ollama or LM Studio',
+        'Start Ollama (`ollama serve`) or LM Studio',
+      ],
+      [
+        'error message explains the fallback refusal',
+        'Refusing to silently fall back to the cloud connector',
+      ],
+    ])('%s', async (_title, expectedFragment) => {
+      await expect(injectLocalConnector(buildLocalArgv())).rejects.toThrow(expectedFragment);
+    });
+
+    it('does not set env vars when no runtime found', async () => {
+      await expect(injectLocalConnector(buildLocalArgv())).rejects.toThrow();
+
+      const { EVALUATION_CONNECTOR_ID, KIBANA_TESTING_AI_CONNECTORS } = process.env;
+      expect(EVALUATION_CONNECTOR_ID).toBeUndefined();
+      expect(KIBANA_TESTING_AI_CONNECTORS).toBeUndefined();
+    });
+
+    it('strips --local from args before throwing', async () => {
+      const argv = buildLocalArgv();
+
+      await expect(injectLocalConnector(argv)).rejects.toThrow();
+
+      // The caller's array no longer carries the flag, even though the call rejected.
+      expect(argv).not.toContain('--local');
+    });
+  });
+
+  describe('happy path: runtime reachable with a loaded model', () => {
+    it('injects connector env vars when Ollama is running with a model', async () => {
+      // Two successful responses are queued in order: the first is meant for the endpoint
+      // probe, the second for the model listing, which reports a single model.
+      const probeResponse = { ok: true, status: 200 } as Response;
+      const modelsResponse = {
+        ok: true,
+        json: async () => ({ models: [{ name: 'llama3.2:3b', size: 2_000_000_000 }] }),
+      } as unknown as Response;
+      fetchMock.mockResolvedValueOnce(probeResponse).mockResolvedValueOnce(modelsResponse);
+
+      await injectLocalConnector(buildLocalArgv());
+
+      expect(process.env.EVALUATION_CONNECTOR_ID).toBe('local-eval-model');
+      expect(process.env.KIBANA_TESTING_AI_CONNECTORS).toBeDefined();
+
+      const encoded = process.env.KIBANA_TESTING_AI_CONNECTORS!;
+      const connectors = JSON.parse(Buffer.from(encoded, 'base64').toString('utf-8'));
+      const { config } = connectors['local-eval-model'];
+      expect(config.defaultModel).toBe('llama3.2:3b');
+      expect(config.apiUrl).toContain('/v1/chat/completions');
+    });
+  });
+
+  describe('hard-fail when ollama binary exists but server is not running', () => {
+    it('throws when binary is installed but server is not reachable', async () => {
+      // Only an `sh` invocation whose arguments include `ollama` succeeds; all others throw.
+      execFileSyncMock.mockImplementation((cmd: string, cmdArgs: string[]) => {
+        const looksUpOllama = cmd === 'sh' && cmdArgs.includes('ollama');
+        if (!looksUpOllama) {
+          throw new Error('not found');
+        }
+        return Buffer.from('/usr/local/bin/ollama');
+      });
+
+      await expect(injectLocalConnector(buildLocalArgv())).rejects.toThrow(NO_RUNTIME_MESSAGE);
+    });
+  });
+});