diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/correctness/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/correctness/index.ts index dbb2f8398e663..545626304068a 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/correctness/index.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/correctness/index.ts @@ -154,7 +154,7 @@ export function createQuantitativeCorrectnessEvaluators(): Evaluator[] { score, label: summaryText, explanation: summaryText, - metadata: metadata ?? undefined, + metadata: { ...(metadata ?? {}), correctnessAnalysis }, }; }, kind: 'LLM', diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/groundedness/index.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/groundedness/index.ts index cb6538236a55e..3270796778474 100644 --- a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/groundedness/index.ts +++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/groundedness/index.ts @@ -135,7 +135,7 @@ export function createQuantitativeGroundednessEvaluator(): Evaluator { score, label: summaryText, explanation: summaryText, - metadata: metadata ?? undefined, + metadata: { ...(metadata ?? {}), groundednessAnalysis }, }; }, kind: 'LLM', diff --git a/x-pack/platform/packages/shared/kbn-evals/src/evaluators/groundedness/scoring.test.ts b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/groundedness/scoring.test.ts new file mode 100644 index 0000000000000..c3a559b9b33fc --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/groundedness/scoring.test.ts @@ -0,0 +1,92 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
import { calculateGroundednessScore } from './scoring';
import type { GroundednessAnalysis } from './types';

/**
 * The two claim fields these tests vary; every other claim field is filler.
 * NOTE(review): the original parameter type lost its generic arguments in
 * extraction — this `Pick` is a reconstruction, confirm against ./types.
 */
type ClaimSpec = Pick<GroundednessAnalysis['analysis'][number], 'verdict' | 'centrality'>;

/** Expands verdict/centrality pairs into a complete analysis fixture. */
function buildAnalysis(claims: ClaimSpec[]): GroundednessAnalysis {
  const analysis = claims.map(({ verdict, centrality }) => ({
    claim: 'test claim',
    centrality,
    centrality_reason: '',
    verdict,
    evidence: undefined,
    explanation: '',
  }));
  return { summary_verdict: 'GROUNDED', analysis };
}

/** Scores a fixture built from the given claims. */
const scoreOf = (...claims: ClaimSpec[]): number =>
  calculateGroundednessScore(buildAnalysis(claims));

describe('calculateGroundednessScore', () => {
  it('returns 0 when analysis is empty', () => {
    const emptyAnalysis: GroundednessAnalysis = { summary_verdict: 'GROUNDED', analysis: [] };
    expect(calculateGroundednessScore(emptyAnalysis)).toBe(0);
  });

  it('returns 1 when all claims are FULLY_SUPPORTED', () => {
    const score = scoreOf(
      { verdict: 'FULLY_SUPPORTED', centrality: 'central' },
      { verdict: 'FULLY_SUPPORTED', centrality: 'peripheral' }
    );
    expect(score).toBe(1);
  });

  it('uses peripheral weight for NOT_FOUND peripheral claim', () => {
    expect(scoreOf({ verdict: 'NOT_FOUND', centrality: 'peripheral' })).toBeCloseTo(0.5);
  });

  it('uses central weight for NOT_FOUND central claim', () => {
    expect(scoreOf({ verdict: 'NOT_FOUND', centrality: 'central' })).toBeCloseTo(0.1);
  });

  it('does not zero the score when LLM returns NOT_FOUND verdict (regression: schema/scoring key mismatch)', () => {
    // Regression guard: the scoring map used to be keyed by `NOT_IN_GROUND_TRUTH`
    // while the judge schema (./prompt.ts, ./types.ts) emits `NOT_FOUND`. One
    // peripheral NOT_FOUND next to supported claims must not drag the geometric
    // mean down to 0.
    const score = scoreOf(
      { verdict: 'FULLY_SUPPORTED', centrality: 'central' },
      { verdict: 'FULLY_SUPPORTED', centrality: 'central' },
      { verdict: 'NOT_FOUND', centrality: 'peripheral' }
    );
    expect(score).toBeGreaterThan(0);
  });

  it('returns 0 when a CONTRADICTED central claim is present (geometric mean zero-propagation)', () => {
    const score = scoreOf(
      { verdict: 'FULLY_SUPPORTED', centrality: 'central' },
      { verdict: 'CONTRADICTED', centrality: 'central' }
    );
    expect(score).toBe(0);
  });

  it('applies peripheral weight (0.1) for CONTRADICTED peripheral claim', () => {
    expect(scoreOf({ verdict: 'CONTRADICTED', centrality: 'peripheral' })).toBeCloseTo(0.1);
  });

  it('weights UNGROUNDED_BUT_DISCLOSED less harshly than NOT_FOUND', () => {
    const disclosedScore = scoreOf({ verdict: 'UNGROUNDED_BUT_DISCLOSED', centrality: 'central' });
    const notFoundScore = scoreOf({ verdict: 'NOT_FOUND', centrality: 'central' });
    expect(disclosedScore).toBeGreaterThan(notFoundScore);
  });

  it('computes geometric mean across mixed verdicts', () => {
    // PARTIALLY_SUPPORTED central (0.9) * FULLY_SUPPORTED (1.0) -> sqrt(0.9) ≈ 0.9487
    const score = scoreOf(
      { verdict: 'PARTIALLY_SUPPORTED', centrality: 'central' },
      { verdict: 'FULLY_SUPPORTED', centrality: 'central' }
    );
    expect(score).toBeCloseTo(Math.sqrt(0.9));
  });
});
b/x-pack/platform/packages/shared/kbn-evals/src/evaluators/groundedness/scoring.ts @@ -7,7 +7,8 @@ import type { GroundednessAnalysis } from './types'; -// Scoring weights based on the severity of each error type +// Scoring weights based on the severity of each error type. +// Keys must match the `verdict` enum emitted by the LLM judge (see ./prompt.ts and ./types.ts). const CLAIM_FACTUAL_SCORE_MAP = { FULLY_SUPPORTED: 1.0, PARTIALLY_SUPPORTED: { @@ -18,7 +19,7 @@ const CLAIM_FACTUAL_SCORE_MAP = { central: 0.0, peripheral: 0.1, }, - NOT_IN_GROUND_TRUTH: { + NOT_FOUND: { central: 0.1, peripheral: 0.5, }, @@ -43,7 +44,7 @@ export function calculateGroundednessScore(groundednessAnalysis: GroundednessAna let productOfScores = 1.0; for (const claim of analysis) { - const verdict = claim.verdict || 'NOT_IN_GROUND_TRUTH'; + const verdict = claim.verdict || 'NOT_FOUND'; const centrality = claim.centrality || 'peripheral'; const scoreMapEntry = CLAIM_FACTUAL_SCORE_MAP[verdict as keyof typeof CLAIM_FACTUAL_SCORE_MAP]; diff --git a/x-pack/solutions/security/packages/kbn-evals-suite-entity-analytics/README.md b/x-pack/solutions/security/packages/kbn-evals-suite-entity-analytics/README.md index 59a75d87be6a6..849af5cb3bc3d 100644 --- a/x-pack/solutions/security/packages/kbn-evals-suite-entity-analytics/README.md +++ b/x-pack/solutions/security/packages/kbn-evals-suite-entity-analytics/README.md @@ -146,6 +146,7 @@ Prompt-to-spec mapping showing which strategy doc prompts are covered by which s | V2 | Entity Store V2 get_entity routing | `v2/entity_store_v2_get_entity.spec.ts` | | V2 | Entity Store V2 search_entities routing | `v2/entity_store_v2_search_entities.spec.ts` | | V2 | Entity Store V2 multi-skill routing | `v2/entity_store_v2_multi_skill.spec.ts` | +| V2 | `security.entity` attachment side-effects (single card, table, not-found) | `v2/entity_attachment_side_effect.spec.ts` | ## Adding New Tests @@ -155,6 +156,13 @@ To add new evaluation tests: 2. 
Use the `evaluate` fixture from `src/evaluate.ts` 3. Define your dataset with `examples` containing `input` and `output` fields 4. Use `criteria` in the output for criteria-based evaluation +5. Use `attachments` in the output to assert conversation-level attachments + (e.g. `security.entity` side-effects) persisted during the run. Each entry + supports `{ type, shape?: 'single' | 'table', entityId?, entityType?, + minEntities?, count?: { exact?|min?|max? }, criteria? }`. Count-based + assertions (including `count.exact: 0` for negative checks) are evaluated + deterministically; `criteria` delegates to the LLM judge over the matched + payload. Example: diff --git a/x-pack/solutions/security/packages/kbn-evals-suite-entity-analytics/evals/v2/entity_attachment_side_effect.spec.ts b/x-pack/solutions/security/packages/kbn-evals-suite-entity-analytics/evals/v2/entity_attachment_side_effect.spec.ts new file mode 100644 index 0000000000000..262ff6bc8af62 --- /dev/null +++ b/x-pack/solutions/security/packages/kbn-evals-suite-entity-analytics/evals/v2/entity_attachment_side_effect.spec.ts @@ -0,0 +1,244 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { tags } from '@kbn/scout-security'; +import { getEntitiesAlias, ENTITY_LATEST } from '@kbn/entity-store/common'; +import { hashEuid } from '@kbn/entity-store/common/domain/euid'; +import { evaluate } from '../../src/evaluate'; + +/** + * Entity Store V2 - attachment side-effect evals. + * + * Validates that the entity-analytics skill persists the expected + * `security.entity` conversation attachment as a side effect of tool + * execution. + * + * Seed strategy (fast path, follows `highlights_v2.ts`): + * - Install V2 engines only (creates latest alias + indices). 
/**
 * Entity Store V2 - attachment side-effect evals.
 *
 * Validates that the entity-analytics skill persists the expected
 * `security.entity` conversation attachment as a side effect of tool
 * execution.
 *
 * Seed strategy (fast path, follows `highlights_v2.ts`):
 *   - Install V2 engines only (creates latest alias + indices).
 *   - Bulk-index two user entities directly into the latest alias.
 *   - Skip the real extractor + maintainer pipeline because our assertion
 *     only needs entities to be resolvable by `security.get_entity` /
 *     `security.search_entities`, which read from the latest alias.
 *
 * Seeded entities:
 *   user:attach-alice — asset.criticality high_impact
 *   user:attach-bob   — asset.criticality medium_impact
 *
 * Total beforeAll runtime: ~5s on a dev box (vs ~5min for the full real
 * pipeline with rules/alerts/force-extraction).
 */
evaluate.describe(
  'SIEM Entity Analytics V2 Skill - Attachment Side-Effects',
  { tag: tags.serverless.security.complete },
  () => {
    const aliceEuid = 'user:attach-alice';
    const bobEuid = 'user:attach-bob';

    evaluate.beforeAll(async ({ log, esClient: es, supertest }) => {
      log.info('[attachment-evals] beforeAll: POST /api/security/entity_store/install');
      const installRes = await supertest
        .post('/api/security/entity_store/install')
        .set('kbn-xsrf', 'true')
        .set('x-elastic-internal-origin', 'Kibana')
        .set('elastic-api-version', '2023-10-31')
        .send({ entityTypes: ['user', 'host'] });
      log.info(
        `[attachment-evals] beforeAll: install responded status=${
          installRes.status
        } body=${JSON.stringify(installRes.body)}`
      );
      if (installRes.status !== 200 && installRes.status !== 201) {
        throw new Error(
          `Entity Store V2 install failed (${installRes.status}): ${JSON.stringify(
            installRes.body
          )}`
        );
      }

      // Wait until status=running so the latest alias exists before we bulk
      // index into it. 120s ceiling aligns with `highlights_v2.ts`; normally
      // completes in ~5–15s on dev hardware.
      //
      // The polling helper treats every exception from the callback as a
      // transient failure and keeps retrying, so throwing from inside the
      // callback cannot abort the wait. A terminal `error` status is therefore
      // recorded here and raised once polling has stopped.
      let errorStateBody: unknown;
      await waitForCondition(
        async () => {
          const res = await supertest
            .get('/api/security/entity_store/status')
            .set('elastic-api-version', '2023-10-31');
          if (res.status !== 200) return false;
          const status = (res.body as { status?: string }).status;
          if (status === 'error') {
            errorStateBody = res.body;
            return true; // stop polling; the failure is reported below
          }
          return status === 'running';
        },
        { label: 'entity store v2 status=running', timeoutMs: 120_000, log }
      );
      if (errorStateBody !== undefined) {
        throw new Error(`Entity Store V2 is in error state: ${JSON.stringify(errorStateBody)}`);
      }

      log.info('[attachment-evals] beforeAll: bulk indexing seeded entities into latest alias');
      const latestAlias = getEntitiesAlias(ENTITY_LATEST, 'default');
      const now = new Date().toISOString();
      const bulkResponse = await es.bulk({
        refresh: true,
        operations: [
          { index: { _index: latestAlias, _id: hashEuid(aliceEuid) } },
          {
            '@timestamp': now,
            entity: { id: aliceEuid, EngineMetadata: { Type: 'user' } },
            user: { name: 'attach-alice' },
            asset: { criticality: 'high_impact' },
          },
          { index: { _index: latestAlias, _id: hashEuid(bobEuid) } },
          {
            '@timestamp': now,
            entity: { id: bobEuid, EngineMetadata: { Type: 'user' } },
            user: { name: 'attach-bob' },
            asset: { criticality: 'medium_impact' },
          },
        ],
      });
      // The bulk API reports per-item failures in the response body rather
      // than by rejecting the request. Without this check a failed seed would
      // only surface later as confusing eval failures.
      if (bulkResponse.errors) {
        const itemErrors = bulkResponse.items
          .map((item) => item.index?.error)
          .filter((itemError) => itemError !== undefined);
        throw new Error(
          `Seeding entities into "${latestAlias}" failed: ${JSON.stringify(itemErrors)}`
        );
      }

      log.info('[attachment-evals] beforeAll: setup complete');
    });

    evaluate.afterAll(async ({ log, quickApiClient }) => {
      // Teardown is best-effort: a failure is logged rather than failing the run.
      try {
        await quickApiClient.deleteEntityEngines({ query: { delete_data: true } });
      } catch (err) {
        log.warning(`deleteEntityEngines failed during teardown: ${(err as Error).message}`);
      }
    });

    evaluate('entity store v2: attachment side-effects', async ({ evaluateDataset }) => {
      await evaluateDataset({
        dataset: {
          name: 'entity-analytics-v2: attachment side-effects',
          description:
            'Validates the security.entity attachment created as a side effect of get_entity / search_entities tool calls against seeded entity store V2 data.',
          examples: [
            // Single-entity card path.
            {
              input: {
                question: "Tell me about user attach-alice's current risk profile",
              },
              output: {
                criteria: [
                  'Summarise the entity profile for user attach-alice.',
                  'Do not fabricate entity data.',
                ],
                toolCalls: [
                  {
                    id: 'security.get_entity',
                    criteria: [
                      'The tool is called with an entityId matching "attach-alice" (prefixed or non-prefixed form).',
                    ],
                  },
                ],
                attachments: [
                  {
                    type: 'security.entity',
                    shape: 'single',
                    entityType: 'user',
                    entityId: 'attach-alice',
                    count: { min: 1 },
                  },
                ],
              },
              metadata: { query_intent: 'Factual' },
            },

            // Multi-entity table path.
            {
              input: {
                question: 'Which users have the highest risk scores right now?',
              },
              output: {
                criteria: [
                  'Return a list of users from the entity store including attach-alice and attach-bob, or clearly state that data is unavailable.',
                  'Do not fabricate entity data.',
                ],
                toolCalls: [
                  {
                    id: 'security.search_entities',
                    criteria: [
                      'The tool is called with parameters that sort or filter by risk score.',
                    ],
                  },
                ],
                attachments: [
                  {
                    type: 'security.entity',
                    shape: 'table',
                    minEntities: 2,
                    count: { min: 1 },
                  },
                ],
              },
              metadata: { query_intent: 'Factual' },
            },

            // Negative case: no attachment persisted when entity cannot be resolved.
            {
              input: {
                question: "Tell me about the entity zzz_missing_999xyz's current risk profile",
              },
              output: {
                criteria: [
                  'Clearly state that the entity zzz_missing_999xyz was not found.',
                  'Do not fabricate entity data.',
                ],
                toolCalls: [
                  {
                    id: 'security.get_entity',
                    criteria: [
                      'The tool is called with an entityId matching "zzz_missing_999xyz" or equivalent.',
                    ],
                  },
                ],
                attachments: [
                  {
                    type: 'security.entity',
                    count: { exact: 0 },
                  },
                ],
              },
              metadata: { query_intent: 'Factual' },
            },
          ],
        },
      });
    });
  }
);
/**
 * Minimal polling helper. Pulled inline rather than dragging in a dependency
 * on FTR's `retry` service (not available in the evals fixture graph).
 *
 * Calls `check` repeatedly until it resolves to `true` or `timeoutMs` elapses.
 * An exception thrown by `check` is treated as a transient failure: it is
 * logged as a warning and polling continues, so a callback cannot abort the
 * wait by throwing.
 *
 * `check` always runs at least once, even with a zero timeout, and the pause
 * between attempts is clamped to the time remaining so the wait does not
 * overshoot `timeoutMs` by a full interval.
 *
 * @param check Probe to poll; resolve `true` to stop waiting.
 * @param options.label Human-readable name used in warnings and the timeout error.
 * @param options.timeoutMs Maximum total time to keep polling, in milliseconds.
 * @param options.intervalMs Pause between attempts, in milliseconds (default 2000).
 * @param options.log Logger that receives a warning for every failed attempt.
 * @throws Error when the deadline passes without `check` resolving `true`. The
 *   message includes the last failure reported by `check`, if there was one.
 */
async function waitForCondition(
  check: () => Promise<boolean>,
  {
    label,
    timeoutMs,
    intervalMs = 2000,
    log,
  }: {
    label: string;
    timeoutMs: number;
    intervalMs?: number;
    log: { warning: (m: string) => void };
  }
): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  let lastFailure: string | undefined;

  for (;;) {
    try {
      if (await check()) return;
    } catch (err) {
      // Anything can be thrown, so do not assume an Error instance.
      lastFailure = err instanceof Error ? err.message : String(err);
      log.warning(`${label} check threw: ${lastFailure}`);
    }

    const remainingMs = deadline - Date.now();
    if (remainingMs <= 0) break;
    await new Promise<void>((resolve) => setTimeout(resolve, Math.min(intervalMs, remainingMs)));
  }

  // Surface the most recent failure so a timeout caused by a persistent error
  // can be told apart from one caused by a condition that never became true.
  const detail = lastFailure === undefined ? '' : ` (last error: ${lastFailure})`;
  throw new Error(`Timed out waiting for: ${label}${detail}`);
}
/**
 * Conversation-level attachment record as returned by
 * `GET /api/agent_builder/conversations/{id}/attachments`. Mirrors the shape
 * of `VersionedAttachment` in `@kbn/agent-builder-common/attachments` but kept
 * local here so the evals suite does not pull in that package for a single
 * structural type.
 */
export interface AttachmentRecord {
  id: string;
  // Attachment type discriminator, e.g. `security.entity`.
  type: string;
  // Version number of the entry in `versions` that holds the current payload.
  // The evaluator looks the entry up by this value, not by array position.
  current_version: number;
  versions: Array<{
    version: number;
    // Type-specific payload; typed `unknown` here and narrowed by consumers.
    data: unknown;
    created_at?: string;
    content_hash?: string;
  }>;
  description?: string;
  // The attachments evaluator skips records where this is exactly `false`;
  // an absent flag is treated as active.
  active?: boolean;
  hidden?: boolean;
  origin?: string;
}
/**
 * Loads the conversation-level attachments that exist after the latest
 * converse call (e.g. the `security.entity` attachment persisted by
 * `security.get_entity`).
 *
 * Deliberately best-effort: a missing conversation id or any fetch failure
 * yields an empty array, so attachment assertions report "no attachments
 * found" instead of the run failing at fetch time. Failures are logged as a
 * warning.
 *
 * @param conversationId Id of the conversation whose attachments to load.
 * @returns The `results` array from the API response, or `[]`.
 */
private fetchAttachments = async (conversationId: string): Promise<AttachmentRecord[]> => {
  if (!conversationId) {
    return [];
  }

  const attachmentsPath = `/api/agent_builder/conversations/${encodeURIComponent(
    conversationId
  )}/attachments`;

  try {
    const response: { results?: AttachmentRecord[] } = await this.fetch(attachmentsPath, {
      method: 'GET',
      version: '2023-10-31',
    });
    return response.results ?? [];
  } catch (error) {
    // Only a real Error is attached as the cause; other thrown values are dropped.
    const cause = error instanceof Error ? error : undefined;
    const failure = new Error(
      `Failed to fetch attachments for conversation "${conversationId}"`,
      { cause }
    );
    this.log.warning(failure);
    return [];
  }
};
/**
 * Assertion against a conversation-level attachment persisted as a side effect
 * of tool execution. Used by the attachments evaluator to validate that (for
 * example) `security.get_entity` registered a `security.entity` attachment
 * with the expected payload shape.
 *
 * Semantics:
 *   - `type` is required and matched exactly against `attachment.type`.
 *   - `shape` narrows the payload on `attachment.versions[current_version].data`:
 *       - `single` expects `EntityAttachmentSingleData` (identifier + identifierType).
 *       - `table`  expects `EntityAttachmentMultiData` (entities[]).
 *   - `entityId` / `entityType` match the attachment identifier (single payload
 *     only). `entityId` matches either raw or stripped-prefix form.
 *   - `minEntities` / `count` bound the number of matching attachments and/or
 *     entries in a table payload.
 *   - `criteria` delegates free-form payload assertions to the LLM judge via the
 *     standard `evaluators.criteria(...)` pathway — the payload JSON is passed
 *     as the evaluator output.
 */
interface AttachmentAssertion {
  // Compared with strict equality against the record's `type`.
  type: string;
  // When set, attachments whose classified payload shape differs are excluded.
  shape?: 'single' | 'table';
  // Compared case-insensitively against the payload's identifier fields.
  entityId?: string;
  // Compared against the payload's `identifierType`.
  entityType?: 'host' | 'user' | 'service' | 'generic';
  // Only table payloads can satisfy this; it is a lower bound on `entities.length`.
  minEntities?: number;
  // Bounds on how many attachments survive the filters above. `exact: 0`
  // expresses a negative assertion and passes without consulting `criteria`.
  count?: { min?: number; max?: number; exact?: number };
  // Evaluated by the LLM judge only after the deterministic checks pass.
  criteria?: string[];
}
/**
 * Returns the current-version payload for an attachment, or `undefined` when
 * no entry in `versions` carries the record's `current_version` number. The
 * entry is located by its `version` value rather than by array position.
 */
const getCurrentAttachmentData = (attachment: AttachmentRecord): unknown => {
  const current = attachment.versions.find((v) => v.version === attachment.current_version);
  return current?.data;
};

/** Fields extracted from an attachment payload for use by the matcher. */
interface AttachmentPayloadShape {
  shape?: 'single' | 'table';
  identifier?: string;
  identifierType?: string;
  entityStoreId?: string;
  entitiesCount?: number;
}

/**
 * Classifies an attachment payload into `single` / `table` shape and extracts
 * the identifier fields used by the matcher. Mirrors the discriminator logic
 * in `EntityAttachmentData` (single: `identifier` + `identifierType`;
 * table: `entities: []`).
 *
 * @param data Raw payload of the attachment's current version.
 * @returns The classified shape, or an empty object when `data` is not an
 *   object or matches neither discriminator.
 */
const classifyPayload = (data: unknown): AttachmentPayloadShape => {
  if (!data || typeof data !== 'object') {
    return {};
  }
  const obj = data as Record<string, unknown>;
  // `entities` is checked first, so a payload carrying both discriminators is a table.
  if (Array.isArray(obj.entities)) {
    return { shape: 'table', entitiesCount: obj.entities.length };
  }
  if (typeof obj.identifier === 'string' && typeof obj.identifierType === 'string') {
    return {
      shape: 'single',
      identifier: obj.identifier,
      identifierType: obj.identifierType,
      entityStoreId: typeof obj.entityStoreId === 'string' ? obj.entityStoreId : undefined,
    };
  }
  return {};
};

/** Drops everything up to and including the first `:`; ids without one are returned as-is. */
const withoutTypePrefix = (id: string): string => {
  const separatorIndex = id.indexOf(':');
  return separatorIndex === -1 ? id : id.slice(separatorIndex + 1);
};

/**
 * Checks whether an `entityId` assertion matches an attachment identifier.
 * Tolerant of the `{type}:` prefix so specs can assert either the canonical
 * EUID ("user:jsmith123") or the bare identity value ("jsmith123") — the tool
 * strips the prefix on single payloads (see `stripEntityIdPrefix`).
 *
 * The tolerance applies in both directions: a prefixed expectation matches a
 * bare payload value, and a bare expectation matches a payload that only
 * carries the prefixed form. Two values with different prefixes are never
 * matched to each other directly. Comparison is case-insensitive.
 *
 * @param expected The `entityId` from the assertion.
 * @param shape Classified payload whose `identifier` / `entityStoreId` are candidates.
 * @returns `true` when any candidate matches; `false` when there are no candidates.
 */
const entityIdMatches = (expected: string, shape: AttachmentPayloadShape): boolean => {
  const candidates = [shape.identifier, shape.entityStoreId].filter(
    (v): v is string => typeof v === 'string'
  );
  const expectedLower = expected.toLowerCase();
  const expectedBare = withoutTypePrefix(expectedLower);
  return candidates.some((candidate) => {
    const candidateLower = candidate.toLowerCase();
    return (
      candidateLower === expectedLower ||
      candidateLower === expectedBare ||
      // A bare expectation against a payload that only has the prefixed EUID.
      withoutTypePrefix(candidateLower) === expectedLower
    );
  });
};
/**
 * Selects the attachments that satisfy every filter on the assertion.
 *
 * Records are first narrowed by exact `type` and by not being explicitly
 * inactive (`active === false`). Each survivor's current payload is then
 * classified and checked against `shape`, `entityType`, `entityId` and
 * `minEntities`. `minEntities` can only be satisfied by a table payload.
 *
 * @returns Each matching record paired with its classified payload shape.
 */
const findMatchingAttachments = (
  assertion: AttachmentAssertion,
  attachments: AttachmentRecord[]
): Array<{ attachment: AttachmentRecord; shape: AttachmentPayloadShape }> => {
  return attachments
    .filter((a) => a.type === assertion.type && a.active !== false)
    .map((attachment) => ({
      attachment,
      shape: classifyPayload(getCurrentAttachmentData(attachment)),
    }))
    .filter(({ shape }) => {
      if (assertion.shape && shape.shape !== assertion.shape) return false;
      if (assertion.entityType && shape.identifierType !== assertion.entityType) return false;
      if (assertion.entityId && !entityIdMatches(assertion.entityId, shape)) return false;
      if (assertion.minEntities !== undefined) {
        if (shape.shape !== 'table') return false;
        if ((shape.entitiesCount ?? 0) < assertion.minEntities) return false;
      }
      return true;
    });
};

/** Renders the bounds that are set on a `count` assertion, e.g. `min=1, max=3`. */
const formatCountBounds = (count: NonNullable<AttachmentAssertion['count']>): string => {
  const parts: string[] = [];
  if (count.exact !== undefined) parts.push(`exact=${count.exact}`);
  if (count.min !== undefined) parts.push(`min=${count.min}`);
  if (count.max !== undefined) parts.push(`max=${count.max}`);
  return parts.join(', ');
};

/** True when at least one of `exact` / `min` / `max` is actually set. */
const hasAnyCountBound = (count: AttachmentAssertion['count']): boolean =>
  count !== undefined &&
  (count.exact !== undefined || count.min !== undefined || count.max !== undefined);

/**
 * Evaluates one attachment assertion against the attachments of a run.
 *
 * Order of checks:
 *   1. With `count` bounds set, the number of matches must satisfy them.
 *      `exact: 0` is a negative assertion and passes without looking at
 *      `criteria`.
 *   2. Without any bound — `count` absent or an empty `count: {}` — at least
 *      one match is required. An empty `count` object must not weaken the
 *      assertion into one that passes with zero matches.
 *   3. Without `criteria` the assertion passes at this point.
 *   4. Otherwise the matched payloads go to the LLM judge as the evaluator
 *      output, and its score/label/explanation are returned.
 *
 * @param assertion The expectation to check.
 * @param attachments All attachments recorded for the run.
 * @param evaluators Evaluator factory; only `criteria` is used, and only in step 4.
 * @param input Example input, forwarded to the judge.
 * @param metadata Example metadata, forwarded to the judge.
 */
const evaluateAttachmentAssertion = async (
  assertion: AttachmentAssertion,
  attachments: AttachmentRecord[],
  evaluators: DefaultEvaluators,
  input: DatasetExample['input'],
  metadata: DatasetExample['metadata']
): Promise<EvaluationResult> => {
  const matches = findMatchingAttachments(assertion, attachments);
  const count = hasAnyCountBound(assertion.count) ? assertion.count : undefined;

  // Count-based assertion (including `count.exact: 0` for negative assertions).
  if (count) {
    const n = matches.length;
    const exactFail = count.exact !== undefined && n !== count.exact;
    const minFail = count.min !== undefined && n < count.min;
    const maxFail = count.max !== undefined && n > count.max;
    if (exactFail || minFail || maxFail) {
      return {
        score: 0,
        label: 'FAIL',
        explanation: `Expected ${formatCountBounds(count)} attachments of type "${
          assertion.type
        }" matching shape/entity filters, found ${n}.`,
      };
    }
    // If exact=0, pass here — no criteria to judge.
    if (count.exact === 0) {
      return {
        score: 1,
        label: 'PASS',
        explanation: `Confirmed 0 attachments of type "${assertion.type}" matching filters.`,
      };
    }
  } else if (matches.length === 0) {
    // Default: require at least one match. Listing the types that were seen
    // helps tell "nothing was persisted" apart from "wrong type persisted".
    const typesSeen = [...new Set(attachments.map((a) => a.type))].join(', ') || '(none)';
    return {
      score: 0,
      label: 'FAIL',
      explanation: `No attachment matched type="${assertion.type}"${
        assertion.shape ? ` shape=${assertion.shape}` : ''
      }${assertion.entityId ? ` entityId=${assertion.entityId}` : ''}${
        assertion.entityType ? ` entityType=${assertion.entityType}` : ''
      }. Types present: [${typesSeen}].`,
    };
  }

  if (!assertion.criteria || assertion.criteria.length === 0) {
    return {
      score: 1,
      label: 'PASS',
      explanation: `Attachment assertion satisfied (${matches.length} match${
        matches.length === 1 ? '' : 'es'
      } of type "${assertion.type}").`,
    };
  }

  // Delegate free-form payload assertions to the LLM judge. Feed the matched
  // attachment data in as the evaluator `output` so the judge can reason over
  // the actual payload JSON.
  const payloadOutput = {
    attachments: matches.map(({ attachment, shape }) => ({
      id: attachment.id,
      type: attachment.type,
      shape: shape.shape,
      data: getCurrentAttachmentData(attachment),
    })),
  };
  const criteriaResult = await evaluators.criteria(assertion.criteria).evaluate({
    input,
    expected: { criteria: assertion.criteria },
    output: payloadOutput,
    metadata,
  });
  return {
    score: criteriaResult.score ?? null,
    label: criteriaResult.label ?? 'PASS',
    explanation: `Matched ${matches.length} attachment(s). ${criteriaResult.explanation ?? ''}`,
  };
};

/**
 * Builds the `Attachments` evaluator, which checks every attachment assertion
 * on an example's expected output against the attachments recorded for the run.
 *
 * Assertions are evaluated one after another and merged with
 * `combineEvaluationResults`. An example without attachment assertions passes.
 *
 * NOTE(review): `Evaluator` / `EvaluateOpts` appear without type arguments in
 * this view; any generic parameters may have been lost in extraction — confirm
 * against the neighbouring evaluators in this file.
 */
const createAttachmentsEvaluator = ({
  evaluators,
}: {
  evaluators: DefaultEvaluators;
}): Evaluator => {
  return {
    name: 'Attachments',
    kind: 'LLM' as const,
    evaluate: async ({ input, output, expected, metadata }: EvaluateOpts) => {
      const assertions = expected.attachments ?? [];
      if (assertions.length === 0) {
        return {
          score: 1,
          label: 'PASS',
          explanation: 'No attachment assertions specified.',
        };
      }

      const attachments = output.attachments ?? [];
      const results: EvaluationResult[] = [];
      for (const assertion of assertions) {
        results.push(
          await evaluateAttachmentAssertion(assertion, attachments, evaluators, input, metadata)
        );
      }
      return combineEvaluationResults(results);
    },
  };
};