-
Notifications
You must be signed in to change notification settings - Fork 8.6k
Added AI Insight evals #263561
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added AI Insight evals #263561
Changes from all commits
02bb9b8
7f52eec
4514d4d
1516726
c37247f
1d25c96
efa8e2b
41748fa
0ad08aa
3d54c2e
6df64b2
0ddddea
0d0d216
f5bdf67
a3c11a5
c7e4e58
0c165f0
3f0f558
c9db5c4
8ddcc71
c6318c0
d742c53
2341965
1e3dbe8
827b0a8
b1eb01e
c9dc50d
e0ee499
25cfd76
6f71e26
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,117 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the Elastic License | ||
| * 2.0; you may not use this file except in compliance with the Elastic License | ||
| * 2.0. | ||
| */ | ||
|
|
||
| import { tags } from '@kbn/scout'; | ||
| import type { LoadResult } from '@kbn/es-snapshot-loader'; | ||
| import type { LogInsightParams } from '../../src/clients/ai_insight_client'; | ||
| import { | ||
| replayObservabilityDataStreams, | ||
| cleanObservabilityDataStreams, | ||
| } from '../../src/data_generators/replay'; | ||
| import { getLogScenarios, type LogScenario } from '../../src/scenarios/log_scenarios'; | ||
| import { evaluate } from './evaluate_ai_insights'; | ||
|
|
||
| const INDEX_REFRESH_WAIT_MS = 2500; /* Milliseconds to pause (via setTimeout) after replaying a snapshot and before searching it. */ | ||
|
|
||
| const scenarios = getLogScenarios(); /* All log scenarios; each one gets its own evaluation suite below. */ | ||
|
|
||
| for (const scenario of scenarios) { | ||
| createScenarioTest(scenario); /* Callable before its definition because function declarations are hoisted. */ | ||
| } | ||
|
|
||
| function createScenarioTest(scenario: LogScenario) { /* Registers one evaluation suite for the given scenario: replay its snapshot, find a matching log document, evaluate the log AI insight against scenario.expectedOutput, then clean up. */ | ||
| evaluate.describe( | ||
| `Log AI Insights - ${scenario.id} (${scenario.snapshotName})`, | ||
| { tag: tags.serverless.observability.complete }, | ||
| () => { | ||
| let logDocId: string; /* Assigned in beforeAll; read by the evaluation below. */ | ||
| let logIndex: string; /* Assigned in beforeAll; read by the evaluation below. */ | ||
| let replayResult: LoadResult; /* Assigned in beforeAll; handed to the cleanup in afterAll. */ | ||
|
|
||
||
| evaluate.beforeAll(async ({ esClient, log }) => { | ||
| log.info(`Replaying scenario: ${scenario.id}`); | ||
| replayResult = await replayObservabilityDataStreams( | ||
| esClient, | ||
| log, | ||
| scenario.snapshotName, | ||
| scenario.gcs | ||
| ); | ||
|
|
||
||
| log.debug('Waiting to make sure all indices are refreshed'); | ||
| await new Promise((resolve) => setTimeout(resolve, INDEX_REFRESH_WAIT_MS)); /* Fixed pause, not an explicit refresh call. NOTE(review): confirm the delay is long enough on slow clusters. */ | ||
|
|
||
||
| log.info( | ||
| `Querying for log: service=${scenario.logQuery.serviceName}, pattern="${scenario.logQuery.messagePattern}"` | ||
| ); | ||
|
|
||
||
| const logResponse = await esClient.search({ | ||
| index: scenario.logQuery.index, | ||
| query: { | ||
| bool: { | ||
| filter: [{ term: { 'service.name': scenario.logQuery.serviceName } }], /* Restrict to the scenario's service. */ | ||
| should: [ | ||
| { match_phrase: { message: scenario.logQuery.messagePattern } }, | ||
| { match_phrase: { 'exception.message': scenario.logQuery.messagePattern } }, | ||
| ], | ||
| minimum_should_match: 1, /* The pattern must match message or exception.message. */ | ||
| }, | ||
| }, | ||
| sort: [{ '@timestamp': 'desc' }], /* Newest match first. */ | ||
| size: 1, | ||
| _source: false, /* Only _id and _index of the hit are used below. */ | ||
| }); | ||
|
|
||
||
| const logDoc = logResponse.hits.hits[0]; | ||
| if (!logDoc) { | ||
| throw new Error( | ||
| `No log found for scenario ${scenario.id} (service: ${scenario.logQuery.serviceName}, pattern: "${scenario.logQuery.messagePattern}")` | ||
| ); | ||
| } | ||
|
|
||
||
| if (!logDoc._id || !logDoc._index) { | ||
| throw new Error(`Log document missing _id or _index for scenario ${scenario.id}`); | ||
| } | ||
| logDocId = logDoc._id; | ||
| logIndex = logDoc._index; | ||
| log.info(`Found log document: ${logIndex}/${logDocId}`); | ||
| }); | ||
|
|
||
||
| evaluate( | ||
| `Log AI insight correctness (${scenario.id}, ${scenario.snapshotName})`, | ||
| async ({ aiInsightClient, evaluateDataset }) => { | ||
| await evaluateDataset<LogInsightParams>({ | ||
| getInsight: (params) => aiInsightClient.getLogInsight(params), | ||
| dataset: { | ||
| name: `ai insights: log analysis (${scenario.id}, ${scenario.snapshotName})`, | ||
| description: `Evaluates correctness of log AI insight summaries for ${scenario.id} (snapshot: ${scenario.snapshotName})`, | ||
| examples: [ /* A single example per scenario: the log document located in beforeAll. */ | ||
| { | ||
| input: { | ||
| requestPayload: { | ||
| index: logIndex, | ||
| id: logDocId, | ||
| }, | ||
| question: | ||
| 'Analyze this log entry and provide a summary explaining what it means, identify where it originated, assess the root cause and impact, and recommend next steps.', | ||
| }, | ||
| output: { | ||
| expected: scenario.expectedOutput, | ||
| }, | ||
| }, | ||
| ], | ||
| }, | ||
| }); | ||
| } | ||
| ); | ||
|
|
||
||
| evaluate.afterAll(async ({ esClient, log }) => { | ||
| log.debug('Cleaning up indices'); | ||
| await cleanObservabilityDataStreams(esClient, replayResult, log); /* NOTE(review): replayResult is never assigned if the replay in beforeAll threw; confirm the cleanup helper tolerates that. */ | ||
| }); | ||
| } | ||
| ); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,9 +6,10 @@ | |
| */ | ||
|
|
||
| import type { AlertScenario } from './types'; | ||
| import { PAYMENT_SERVICE_GCS } from './constants'; | ||
| import { PAYMENT_SERVICE_GCS, PAYMENT_UNREACHABLE_GCS } from './constants'; | ||
|
|
||
| const PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID = 'payment-error-count-alert'; | ||
| const PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID = 'payment-unreachable-alert'; | ||
|
|
||
| const PAYMENT_ALERT_EXPECTED_OUTPUT = `- Summary: A single handled error was detected in the payment service, specifically related to an invalid token during a payment request. The error appears isolated, with no evidence of broader anomalies or downstream impact. | ||
|
|
||
|
|
@@ -26,6 +27,21 @@ const PAYMENT_ALERT_EXPECTED_OUTPUT = `- Summary: A single handled error was d | |
| - Validate that the error is properly handled and does not impact payment processing for valid tokens. | ||
| - If no further errors occur, monitor for recurrence but no urgent action is required. If errors increase, investigate token validation logic and upstream authentication flows.`; | ||
|
|
||
| const PAYMENT_UNREACHABLE_ALERT_EXPECTED = `- Summary: An APM error count alert fired for the frontend service because the payment service is unreachable. The checkout flow fails with a gRPC Unavailable error ("name resolver error: produced zero addresses") when attempting to charge a card via the payment service. This is a connectivity or infrastructure failure, not an application code defect. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [Question] Just curious: did you tweak the expected responses for all insights based on our preferences/expectations, or is this an actual response from the AI Insight API?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good question, @SrdjanLL! I took an answer from Claude Opus and tweaked the wording a bit. |
||
|
|
||
| - Assessment: The payment service is entirely unreachable from the checkout service — DNS or name resolution returns zero addresses for the payment endpoint. This causes all checkout attempts to fail, resulting in user-facing errors propagated through the frontend. The \`paymentUnreachable\` feature flag in flagd is the most likely cause if this is a test environment; otherwise, this indicates a real infrastructure issue (service down, DNS failure, network partition). | ||
|
|
||
| - Related signals: | ||
|
|
||
| - Errors: "failed to charge card: could not charge the card: rpc error: code = Unavailable desc = name resolver error: produced zero addresses" (apmErrors, last seen within alert window, Direct) — all checkout/payment flows fail. | ||
| - Anomalies: Payment service absent from traces (apmServiceSummary, alert window, Direct) — the payment service is not running or not reachable. | ||
| - Downstream: checkout and frontend-proxy report errors due to payment unavailability (apmServiceTopology, Indirect). | ||
| - Immediate actions: | ||
|
|
||
| 1. Verify the payment service is running, healthy, and reachable from the checkout service's network. | ||
| 2. Check DNS resolution for the payment service endpoint from within the checkout service's environment. | ||
| 3. If using the \`paymentUnreachable\` feature flag, verify its state in flagd and disable it if unintentional.`; | ||
|
|
||
| export const ALERT_SCENARIOS: Record<string, AlertScenario> = { | ||
| [PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID]: { | ||
| id: PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID, | ||
|
|
@@ -56,6 +72,35 @@ export const ALERT_SCENARIOS: Record<string, AlertScenario> = { | |
| }, | ||
| expectedOutput: PAYMENT_ALERT_EXPECTED_OUTPUT, | ||
| }, | ||
| [PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID]: { | ||
| id: PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID, | ||
| description: 'APM error count alert for frontend when payment service is unreachable', | ||
| snapshotName: 'payment-unreachable', | ||
| gcs: PAYMENT_UNREACHABLE_GCS, | ||
| alertRule: { | ||
| ruleParams: { | ||
| consumer: 'apm', | ||
| enabled: true, | ||
| name: 'Error count threshold - payment unreachable', | ||
| rule_type_id: 'apm.error_rate', | ||
| tags: [], | ||
| params: { | ||
| threshold: 1, | ||
| windowSize: 5, | ||
| windowUnit: 'm', | ||
| serviceName: 'frontend', | ||
| environment: 'ENVIRONMENT_ALL', | ||
| groupBy: ['service.name', 'service.environment'], | ||
| }, | ||
| actions: [], | ||
| schedule: { | ||
| interval: '1m', | ||
| }, | ||
| }, | ||
| alertsIndex: '.alerts-observability.apm.alerts-default', | ||
| }, | ||
| expectedOutput: PAYMENT_UNREACHABLE_ALERT_EXPECTED, | ||
| }, | ||
| }; | ||
|
|
||
| export const getAlertScenarios = (): AlertScenario[] => Object.values(ALERT_SCENARIOS); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So I assume this was the root cause of us not having AI Insights responses in the task-under-evaluation payloads?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, that was exactly it :)