Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ export function createQuantitativeCorrectnessEvaluators(): Evaluator[] {
score,
label: summaryText,
explanation: summaryText,
metadata: metadata ?? undefined,
metadata: { ...(metadata ?? {}), correctnessAnalysis },
};
},
kind: 'LLM',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ export function createQuantitativeGroundednessEvaluator(): Evaluator {
score,
label: summaryText,
explanation: summaryText,
metadata: metadata ?? undefined,
metadata: { ...(metadata ?? {}), groundednessAnalysis },
};
},
kind: 'LLM',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { calculateGroundednessScore } from './scoring';
import type { GroundednessAnalysis } from './types';

/**
 * Builds a `GroundednessAnalysis` test fixture: a fixed 'GROUNDED' summary
 * verdict plus one claim entry per requested verdict/centrality pair, with
 * placeholder text in every field the scoring function ignores.
 */
const buildAnalysis = (
  claims: Array<Pick<GroundednessAnalysis['analysis'][number], 'verdict' | 'centrality'>>
): GroundednessAnalysis => {
  const analysis = claims.map(({ verdict, centrality }) => ({
    claim: 'test claim',
    centrality,
    centrality_reason: '',
    verdict,
    evidence: undefined,
    explanation: '',
  }));
  return { summary_verdict: 'GROUNDED', analysis };
};

describe('calculateGroundednessScore', () => {
it('returns 0 when analysis is empty', () => {
expect(calculateGroundednessScore({ summary_verdict: 'GROUNDED', analysis: [] })).toBe(0);
});

it('returns 1 when all claims are FULLY_SUPPORTED', () => {
const analysis = buildAnalysis([
{ verdict: 'FULLY_SUPPORTED', centrality: 'central' },
{ verdict: 'FULLY_SUPPORTED', centrality: 'peripheral' },
]);
expect(calculateGroundednessScore(analysis)).toBe(1);
});

it('uses peripheral weight for NOT_FOUND peripheral claim', () => {
const analysis = buildAnalysis([{ verdict: 'NOT_FOUND', centrality: 'peripheral' }]);
expect(calculateGroundednessScore(analysis)).toBeCloseTo(0.5);
});

it('uses central weight for NOT_FOUND central claim', () => {
const analysis = buildAnalysis([{ verdict: 'NOT_FOUND', centrality: 'central' }]);
expect(calculateGroundednessScore(analysis)).toBeCloseTo(0.1);
});

it('does not zero the score when LLM returns NOT_FOUND verdict (regression: schema/scoring key mismatch)', () => {
// Reproduces the bug where the scoring map key was `NOT_IN_GROUND_TRUTH`
// but the LLM judge schema (./prompt.ts, ./types.ts) emits `NOT_FOUND`.
// A single peripheral NOT_FOUND alongside supported claims should not collapse
// the geometric mean to 0.
const analysis = buildAnalysis([
{ verdict: 'FULLY_SUPPORTED', centrality: 'central' },
{ verdict: 'FULLY_SUPPORTED', centrality: 'central' },
{ verdict: 'NOT_FOUND', centrality: 'peripheral' },
]);
expect(calculateGroundednessScore(analysis)).toBeGreaterThan(0);
});

it('returns 0 when a CONTRADICTED central claim is present (geometric mean zero-propagation)', () => {
const analysis = buildAnalysis([
{ verdict: 'FULLY_SUPPORTED', centrality: 'central' },
{ verdict: 'CONTRADICTED', centrality: 'central' },
]);
expect(calculateGroundednessScore(analysis)).toBe(0);
});

it('applies peripheral weight (0.1) for CONTRADICTED peripheral claim', () => {
const analysis = buildAnalysis([{ verdict: 'CONTRADICTED', centrality: 'peripheral' }]);
expect(calculateGroundednessScore(analysis)).toBeCloseTo(0.1);
});

it('weights UNGROUNDED_BUT_DISCLOSED less harshly than NOT_FOUND', () => {
const ungrounded = calculateGroundednessScore(
buildAnalysis([{ verdict: 'UNGROUNDED_BUT_DISCLOSED', centrality: 'central' }])
);
const notFound = calculateGroundednessScore(
buildAnalysis([{ verdict: 'NOT_FOUND', centrality: 'central' }])
);
expect(ungrounded).toBeGreaterThan(notFound);
});

it('computes geometric mean across mixed verdicts', () => {
// PARTIALLY_SUPPORTED central (0.9) * FULLY_SUPPORTED (1.0) -> sqrt(0.9) ≈ 0.9487
const analysis = buildAnalysis([
{ verdict: 'PARTIALLY_SUPPORTED', centrality: 'central' },
{ verdict: 'FULLY_SUPPORTED', centrality: 'central' },
]);
expect(calculateGroundednessScore(analysis)).toBeCloseTo(Math.sqrt(0.9));
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

import type { GroundednessAnalysis } from './types';

// Scoring weights based on the severity of each error type
// Scoring weights based on the severity of each error type.
// Keys must match the `verdict` enum emitted by the LLM judge (see ./prompt.ts and ./types.ts).
const CLAIM_FACTUAL_SCORE_MAP = {
FULLY_SUPPORTED: 1.0,
PARTIALLY_SUPPORTED: {
Expand All @@ -18,7 +19,7 @@ const CLAIM_FACTUAL_SCORE_MAP = {
central: 0.0,
peripheral: 0.1,
},
NOT_IN_GROUND_TRUTH: {
NOT_FOUND: {
central: 0.1,
peripheral: 0.5,
},
Expand All @@ -43,7 +44,7 @@ export function calculateGroundednessScore(groundednessAnalysis: GroundednessAna

let productOfScores = 1.0;
for (const claim of analysis) {
const verdict = claim.verdict || 'NOT_IN_GROUND_TRUTH';
const verdict = claim.verdict || 'NOT_FOUND';
const centrality = claim.centrality || 'peripheral';

const scoreMapEntry = CLAIM_FACTUAL_SCORE_MAP[verdict as keyof typeof CLAIM_FACTUAL_SCORE_MAP];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ Prompt-to-spec mapping showing which strategy doc prompts are covered by which s
| V2 | Entity Store V2 get_entity routing | `v2/entity_store_v2_get_entity.spec.ts` |
| V2 | Entity Store V2 search_entities routing | `v2/entity_store_v2_search_entities.spec.ts` |
| V2 | Entity Store V2 multi-skill routing | `v2/entity_store_v2_multi_skill.spec.ts` |
| V2 | `security.entity` attachment side-effects (single card, table, not-found) | `v2/entity_attachment_side_effect.spec.ts` |

## Adding New Tests

Expand All @@ -155,6 +156,13 @@ To add new evaluation tests:
2. Use the `evaluate` fixture from `src/evaluate.ts`
3. Define your dataset with `examples` containing `input` and `output` fields
4. Use `criteria` in the output for criteria-based evaluation
5. Use `attachments` in the output to assert conversation-level attachments
(e.g. `security.entity` side-effects) persisted during the run. Each entry
supports `{ type, shape?: 'single' | 'table', entityId?, entityType?,
minEntities?, count?: { exact?|min?|max? }, criteria? }`. Count-based
assertions (including `count.exact: 0` for negative checks) are evaluated
deterministically; `criteria` delegates to the LLM judge over the matched
payload.

Example:

Expand Down
Loading
Loading