Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ export function createQuantitativeCorrectnessEvaluators(): Evaluator[] {
score,
label: summaryText,
explanation: summaryText,
metadata: metadata ?? undefined,
metadata: { ...(metadata ?? {}), correctnessAnalysis },
};
},
kind: 'LLM',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ export function createQuantitativeGroundednessEvaluator(): Evaluator {
score,
label: summaryText,
explanation: summaryText,
metadata: metadata ?? undefined,
metadata: { ...(metadata ?? {}), groundednessAnalysis },
};
},
kind: 'LLM',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { calculateGroundednessScore } from './scoring';
import type { GroundednessAnalysis } from './types';

/**
 * Builds a `GroundednessAnalysis` test fixture: a fixed 'GROUNDED' summary
 * verdict plus one claim entry per requested verdict/centrality pair, with
 * placeholder text in every field the scoring function ignores.
 */
const buildAnalysis = (
  claims: Array<Pick<GroundednessAnalysis['analysis'][number], 'verdict' | 'centrality'>>
): GroundednessAnalysis => {
  const analysis = claims.map(({ verdict, centrality }) => ({
    claim: 'test claim',
    centrality,
    centrality_reason: '',
    verdict,
    evidence: undefined,
    explanation: '',
  }));
  return { summary_verdict: 'GROUNDED', analysis };
};

describe('calculateGroundednessScore', () => {
it('returns 0 when analysis is empty', () => {
expect(calculateGroundednessScore({ summary_verdict: 'GROUNDED', analysis: [] })).toBe(0);
});

it('returns 1 when all claims are FULLY_SUPPORTED', () => {
const analysis = buildAnalysis([
{ verdict: 'FULLY_SUPPORTED', centrality: 'central' },
{ verdict: 'FULLY_SUPPORTED', centrality: 'peripheral' },
]);
expect(calculateGroundednessScore(analysis)).toBe(1);
});

it('uses peripheral weight for NOT_FOUND peripheral claim', () => {
const analysis = buildAnalysis([{ verdict: 'NOT_FOUND', centrality: 'peripheral' }]);
expect(calculateGroundednessScore(analysis)).toBeCloseTo(0.5);
});

it('uses central weight for NOT_FOUND central claim', () => {
const analysis = buildAnalysis([{ verdict: 'NOT_FOUND', centrality: 'central' }]);
expect(calculateGroundednessScore(analysis)).toBeCloseTo(0.1);
});

it('does not zero the score when LLM returns NOT_FOUND verdict (regression: schema/scoring key mismatch)', () => {
// Reproduces the bug where the scoring map key was `NOT_IN_GROUND_TRUTH`
// but the LLM judge schema (./prompt.ts, ./types.ts) emits `NOT_FOUND`.
// A single peripheral NOT_FOUND alongside supported claims should not collapse
// the geometric mean to 0.
const analysis = buildAnalysis([
{ verdict: 'FULLY_SUPPORTED', centrality: 'central' },
{ verdict: 'FULLY_SUPPORTED', centrality: 'central' },
{ verdict: 'NOT_FOUND', centrality: 'peripheral' },
]);
expect(calculateGroundednessScore(analysis)).toBeGreaterThan(0);
});

it('returns 0 when a CONTRADICTED central claim is present (geometric mean zero-propagation)', () => {
const analysis = buildAnalysis([
{ verdict: 'FULLY_SUPPORTED', centrality: 'central' },
{ verdict: 'CONTRADICTED', centrality: 'central' },
]);
expect(calculateGroundednessScore(analysis)).toBe(0);
});

it('applies peripheral weight (0.1) for CONTRADICTED peripheral claim', () => {
const analysis = buildAnalysis([{ verdict: 'CONTRADICTED', centrality: 'peripheral' }]);
expect(calculateGroundednessScore(analysis)).toBeCloseTo(0.1);
});

it('weights UNGROUNDED_BUT_DISCLOSED less harshly than NOT_FOUND', () => {
const ungrounded = calculateGroundednessScore(
buildAnalysis([{ verdict: 'UNGROUNDED_BUT_DISCLOSED', centrality: 'central' }])
);
const notFound = calculateGroundednessScore(
buildAnalysis([{ verdict: 'NOT_FOUND', centrality: 'central' }])
);
expect(ungrounded).toBeGreaterThan(notFound);
});

it('computes geometric mean across mixed verdicts', () => {
// PARTIALLY_SUPPORTED central (0.9) * FULLY_SUPPORTED (1.0) -> sqrt(0.9) ≈ 0.9487
const analysis = buildAnalysis([
{ verdict: 'PARTIALLY_SUPPORTED', centrality: 'central' },
{ verdict: 'FULLY_SUPPORTED', centrality: 'central' },
]);
expect(calculateGroundednessScore(analysis)).toBeCloseTo(Math.sqrt(0.9));
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

import type { GroundednessAnalysis } from './types';

// Scoring weights based on the severity of each error type
// Scoring weights based on the severity of each error type.
// Keys must match the `verdict` enum emitted by the LLM judge (see ./prompt.ts and ./types.ts).
const CLAIM_FACTUAL_SCORE_MAP = {
FULLY_SUPPORTED: 1.0,
PARTIALLY_SUPPORTED: {
Expand All @@ -18,7 +19,7 @@ const CLAIM_FACTUAL_SCORE_MAP = {
central: 0.0,
peripheral: 0.1,
},
NOT_IN_GROUND_TRUTH: {
NOT_FOUND: {
central: 0.1,
peripheral: 0.5,
},
Expand All @@ -43,7 +44,7 @@ export function calculateGroundednessScore(groundednessAnalysis: GroundednessAna

let productOfScores = 1.0;
for (const claim of analysis) {
const verdict = claim.verdict || 'NOT_IN_GROUND_TRUTH';
const verdict = claim.verdict || 'NOT_FOUND';
const centrality = claim.centrality || 'peripheral';

const scoreMapEntry = CLAIM_FACTUAL_SCORE_MAP[verdict as keyof typeof CLAIM_FACTUAL_SCORE_MAP];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ Prompt-to-spec mapping showing which strategy doc prompts are covered by which s
| V2 | Entity Store V2 get_entity routing | `v2/entity_store_v2_get_entity.spec.ts` |
| V2 | Entity Store V2 search_entities routing | `v2/entity_store_v2_search_entities.spec.ts` |
| V2 | Entity Store V2 multi-skill routing | `v2/entity_store_v2_multi_skill.spec.ts` |
| V2 | `security.entity` attachment side-effects (single card, table, not-found) | `v2/entity_attachment_side_effect.spec.ts` |

## Adding New Tests

Expand All @@ -155,6 +156,13 @@ To add new evaluation tests:
2. Use the `evaluate` fixture from `src/evaluate.ts`
3. Define your dataset with `examples` containing `input` and `output` fields
4. Use `criteria` in the output for criteria-based evaluation
5. Use `attachments` in the output to assert conversation-level attachments
(e.g. `security.entity` side-effects) persisted during the run. Each entry
supports `{ type, shape?: 'single' | 'table', entityId?, entityType?,
minEntities?, count?: { exact?|min?|max? }, criteria? }`. Count-based
assertions (including `count.exact: 0` for negative checks) are evaluated
deterministically; `criteria` delegates to the LLM judge over the matched
payload.

Example:

Expand Down
Loading
Loading