Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
02bb9b8
Added Error AI Insight evals for Product Catalog failure
yuliia-fryshko Apr 15, 2026
7f52eec
Changes from node scripts/eslint_all_files --no-cache --fix
kibanamachine Apr 15, 2026
4514d4d
added scenarios for Error Ai Insights
yuliia-fryshko Apr 16, 2026
1516726
Added evals for Logs AI Insights
yuliia-fryshko Apr 16, 2026
c37247f
corrected logs messages, added Alerts AI Insights evals
yuliia-fryshko Apr 17, 2026
1d25c96
Changes from node scripts/eslint_all_files --no-cache --fix
kibanamachine Apr 17, 2026
efa8e2b
replace hardcoded sleeps with polling in alert_insight.spec.ts
yuliia-fryshko Apr 21, 2026
41748fa
edited ground truth for logs ai insights
yuliia-fryshko Apr 21, 2026
0ad08aa
fixed ai insights response parsing
yuliia-fryshko Apr 23, 2026
3d54c2e
using pRetry here for polling with exponential backoff
yuliia-fryshko Apr 27, 2026
6df64b2
skip failing test suite (#262047)
kibanamachine Apr 28, 2026
0ddddea
fixed test
yuliia-fryshko Apr 28, 2026
0d0d216
Merge branch 'main' into ai-insight-evals-505
yuliia-fryshko Apr 29, 2026
f5bdf67
move the request outside of the pRetry's block
yuliia-fryshko Apr 29, 2026
a3c11a5
adjusted dataset and ground truth for high cpu alert ai insight
yuliia-fryshko Apr 30, 2026
c7e4e58
Merge branch 'main' into ai-insight-evals-505
yuliia-fryshko Apr 30, 2026
0c165f0
Merge branch 'main' into ai-insight-evals-505
yuliia-fryshko May 4, 2026
3f0f558
Merge branch 'main' into ai-insight-evals-505
yuliia-fryshko May 4, 2026
c9db5c4
increased polling time for alert ai insight
yuliia-fryshko May 4, 2026
8ddcc71
fixed tests; added refresh replayed source indices so APM rules see data
yuliia-fryshko May 4, 2026
c6318c0
Changes from node scripts/eslint_all_files --no-cache --fix
kibanamachine May 4, 2026
d742c53
added delay for alert ai tests
yuliia-fryshko May 4, 2026
2341965
Merge branch 'main' into ai-insight-evals-505
yuliia-fryshko May 4, 2026
1e3dbe8
Merge branch 'main' into ai-insight-evals-505
yuliia-fryshko May 5, 2026
827b0a8
fix tests on CI
yuliia-fryshko May 5, 2026
b1eb01e
fix test
yuliia-fryshko May 5, 2026
c9dc50d
removed failed scenario, will cover it separately
yuliia-fryshko May 5, 2026
e0ee499
Merge branch 'main' into ai-insight-evals-505
yuliia-fryshko May 5, 2026
25cfd76
removed unused methods
yuliia-fryshko May 5, 2026
6f71e26
Merge branch 'main' into ai-insight-evals-505
yuliia-fryshko May 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ function createScenarioTest(scenario: AlertScenario) {
esClient.deleteByQuery({
index: scenario.alertRule.alertsIndex,
query: { match_all: {} },
conflicts: 'proceed',
refresh: true,
}),
...(ruleId
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@ function createScenarioTest(scenario: ApmErrorScenario) {
let replayResult: LoadResult;

evaluate.beforeAll(async ({ esClient, log }) => {
end = moment().toISOString();
start = moment().subtract(15, 'minutes').toISOString();

log.info(`Replaying scenario: ${scenario.id}`);
replayResult = await replayObservabilityDataStreams(
esClient,
Expand All @@ -50,6 +47,9 @@ function createScenarioTest(scenario: ApmErrorScenario) {
log.debug('Waiting to make sure all indices are refreshed');
await new Promise((resolve) => setTimeout(resolve, INDEX_REFRESH_WAIT_MS));

end = moment().toISOString();
start = moment().subtract(15, 'minutes').toISOString();

log.info(`Querying for APM error: ${scenario.errorQuery.errorMessage}`);
const errorsResponse = await esClient.search({
index: 'logs-*',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { tags } from '@kbn/scout';
import type { LoadResult } from '@kbn/es-snapshot-loader';
import type { LogInsightParams } from '../../src/clients/ai_insight_client';
import {
replayObservabilityDataStreams,
cleanObservabilityDataStreams,
} from '../../src/data_generators/replay';
import { getLogScenarios, type LogScenario } from '../../src/scenarios/log_scenarios';
import { evaluate } from './evaluate_ai_insights';

/**
 * Fixed pause, in milliseconds, applied after a snapshot replay before the
 * suite queries for the target log document.
 */
const INDEX_REFRESH_WAIT_MS = 2500;

// Register one evaluation suite per log scenario. `createScenarioTest` is a
// hoisted function declaration, so calling it ahead of its definition is fine.
for (const logScenario of getLogScenarios()) {
  createScenarioTest(logScenario);
}

/**
 * Registers the evaluation suite for a single log scenario.
 *
 * The suite replays the scenario snapshot, looks up the most recent log
 * document matching the scenario's service and message pattern, asks the log
 * AI insight endpoint about that document, and finally cleans up the replayed
 * data streams.
 *
 * @param scenario Log scenario describing the snapshot, the log query and the
 *   expected insight output.
 */
function createScenarioTest(scenario: LogScenario) {
  // Shared "<id>, <snapshot>" fragment used in the test title and dataset name.
  const scenarioLabel = `${scenario.id}, ${scenario.snapshotName}`;
  const pause = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

  evaluate.describe(
    `Log AI Insights - ${scenario.id} (${scenario.snapshotName})`,
    { tag: tags.serverless.observability.complete },
    () => {
      // Populated by beforeAll and consumed by the test and afterAll hooks.
      let targetDocId: string;
      let targetIndex: string;
      let loadedSnapshot: LoadResult;

      evaluate.beforeAll(async ({ esClient, log }) => {
        log.info(`Replaying scenario: ${scenario.id}`);
        loadedSnapshot = await replayObservabilityDataStreams(
          esClient,
          log,
          scenario.snapshotName,
          scenario.gcs
        );

        log.debug('Waiting to make sure all indices are refreshed');
        await pause(INDEX_REFRESH_WAIT_MS);

        const { index, serviceName, messagePattern } = scenario.logQuery;
        log.info(`Querying for log: service=${serviceName}, pattern="${messagePattern}"`);

        // The pattern may live in either the plain message or the exception message.
        const messageClauses = [
          { match_phrase: { message: messagePattern } },
          { match_phrase: { 'exception.message': messagePattern } },
        ];

        const searchResult = await esClient.search({
          index,
          query: {
            bool: {
              filter: [{ term: { 'service.name': serviceName } }],
              should: messageClauses,
              minimum_should_match: 1,
            },
          },
          // Only the newest matching document is needed, and only its metadata.
          sort: [{ '@timestamp': 'desc' }],
          size: 1,
          _source: false,
        });

        const [latestHit] = searchResult.hits.hits;
        if (!latestHit) {
          throw new Error(
            `No log found for scenario ${scenario.id} (service: ${serviceName}, pattern: "${messagePattern}")`
          );
        }

        const { _id: hitId, _index: hitIndex } = latestHit;
        if (!hitId || !hitIndex) {
          throw new Error(`Log document missing _id or _index for scenario ${scenario.id}`);
        }

        targetDocId = hitId;
        targetIndex = hitIndex;
        log.info(`Found log document: ${targetIndex}/${targetDocId}`);
      });

      evaluate(
        `Log AI insight correctness (${scenarioLabel})`,
        async ({ aiInsightClient, evaluateDataset }) => {
          const example = {
            input: {
              requestPayload: {
                index: targetIndex,
                id: targetDocId,
              },
              question:
                'Analyze this log entry and provide a summary explaining what it means, identify where it originated, assess the root cause and impact, and recommend next steps.',
            },
            output: {
              expected: scenario.expectedOutput,
            },
          };

          await evaluateDataset<LogInsightParams>({
            getInsight: (params) => aiInsightClient.getLogInsight(params),
            dataset: {
              name: `ai insights: log analysis (${scenarioLabel})`,
              description: `Evaluates correctness of log AI insight summaries for ${scenario.id} (snapshot: ${scenario.snapshotName})`,
              examples: [example],
            },
          });
        }
      );

      evaluate.afterAll(async ({ esClient, log }) => {
        log.debug('Cleaning up indices');
        await cleanObservabilityDataStreams(esClient, loadedSnapshot, log);
      });
    }
  );
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,82 @@ export interface ErrorInsightParams {
environment?: string;
}

/**
 * Request payload for the log AI insight endpoint; it is JSON-serialized as the
 * POST body by `AiInsightClient.getLogInsight`.
 */
export interface LogInsightParams {
/** Index holding the log document (the log eval spec passes a search hit's `_index`). */
index: string;
/** Identifier of the log document (the log eval spec passes a search hit's `_id`). */
id: string;
}

// SSE field names including the colon. The single space after the colon is
// optional in an event stream, so it is stripped separately when present.
const EVENT_PREFIX = 'event:';
const DATA_PREFIX = 'data:';

/** One decoded SSE event: its `event` name and its JSON-decoded `data` object. */
interface SseEvent {
  type: string;
  data: Record<string, unknown>;
}

/**
 * Returns the value of an SSE field line, or `undefined` when the line does not
 * start with `prefix`. One leading space after the colon is removed.
 */
function readSseField(line: string, prefix: string): string | undefined {
  if (!line.startsWith(prefix)) {
    return undefined;
  }
  const value = line.slice(prefix.length);
  return value.startsWith(' ') ? value.slice(1) : value;
}

/** Narrows an unknown value to a string, falling back to the empty string. */
function asString(value: unknown): string {
  return typeof value === 'string' ? value : '';
}

/**
 * Decodes one blank-line-delimited SSE block. Returns `null` when the block has
 * no `event` field, no `data` field, or a payload that is not a JSON object.
 */
function parseSseBlock(block: string): SseEvent | null {
  let type: string | undefined;
  const dataLines: string[] = [];

  for (const rawLine of block.split('\n')) {
    const line = rawLine.trim();

    const eventValue = readSseField(line, EVENT_PREFIX);
    if (eventValue !== undefined) {
      // Keep the first event name seen in the block.
      if (type === undefined) {
        type = eventValue.trim();
      }
      continue;
    }

    const dataValue = readSseField(line, DATA_PREFIX);
    if (dataValue !== undefined) {
      dataLines.push(dataValue);
    }
  }

  if (type === undefined || dataLines.length === 0) {
    return null;
  }

  try {
    // Multiple `data` lines in one event form a single payload joined by newlines.
    const parsed: unknown = JSON.parse(dataLines.join('\n'));
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      return null;
    }
    return { type, data: parsed as Record<string, unknown> };
  } catch {
    // Best effort by design: a malformed event is skipped, not fatal.
    return null;
  }
}

/**
 * The AI insight endpoints return SSE (Server-Sent Events) streams.
 * This parses the raw SSE text into the summary and context fields.
 *
 * - `context` is taken from the first `context` event.
 * - `summary` is taken from the first `chatCompletionMessage` event; when that
 *   is missing or empty, the `content` of all `chatCompletionChunk` events is
 *   concatenated in stream order instead.
 *
 * LF, CRLF and CR line endings are all accepted.
 *
 * Note from the PR review discussion: returning the stream without this parsing
 * was the root cause of AI Insight responses being absent from the
 * task-under-evaluation payloads.
 *
 * @param raw Response body as returned by the HTTP handler; non-string values
 *   are converted with `String()`.
 * @returns The extracted summary and context; either is `''` when not found.
 */
function parseSseResponse(raw: unknown): AiInsightResponse {
  const text = typeof raw === 'string' ? raw : String(raw);

  const events = text
    // Normalize CRLF / CR so blank-line splitting works for every line-ending style.
    .replace(/\r\n?/g, '\n')
    .split(/\n{2,}/)
    .map((block) => parseSseBlock(block))
    .filter((event): event is SseEvent => event !== null);

  const context = asString(events.find((e) => e.type === 'context')?.data.context);
  const summary = asString(events.find((e) => e.type === 'chatCompletionMessage')?.data.content);

  if (summary) {
    return { summary, context };
  }

  // No final message event: rebuild the summary from the streamed chunks.
  const streamed = events
    .filter((e) => e.type === 'chatCompletionChunk')
    .map((e) => asString(e.data.content))
    .join('');

  return { summary: streamed, context };
}

/**
 * Client for the Observability Agent Builder AI insight endpoints.
 *
 * Every method POSTs its parameters as a JSON body and runs the raw response
 * through `parseSseResponse`, so callers receive the extracted summary and
 * context rather than the raw stream.
 */
export class AiInsightClient {
  /**
   * @param fetch HTTP handler used to issue the requests.
   */
  constructor(private readonly fetch: HttpHandler) {}

  /** Requests an AI insight from the `alert` endpoint. */
  async getAlertInsight(params: AlertInsightParams): Promise<AiInsightResponse> {
    return this.requestInsight('/internal/observability_agent_builder/ai_insights/alert', params);
  }

  /** Requests an AI insight from the `error` endpoint. */
  async getErrorInsight(params: ErrorInsightParams): Promise<AiInsightResponse> {
    return this.requestInsight('/internal/observability_agent_builder/ai_insights/error', params);
  }

  /** Requests an AI insight from the `log` endpoint. */
  async getLogInsight(params: LogInsightParams): Promise<AiInsightResponse> {
    return this.requestInsight('/internal/observability_agent_builder/ai_insights/log', params);
  }

  /**
   * POSTs `params` as JSON to `path` and parses the response.
   *
   * @param path Endpoint path to call.
   * @param params Request parameters, serialized with `JSON.stringify`.
   * @returns The summary and context extracted by `parseSseResponse`.
   */
  private async requestInsight(path: string, params: object): Promise<AiInsightResponse> {
    const raw = await this.fetch(path, {
      method: 'POST',
      body: JSON.stringify(params),
    });
    return parseSseResponse(raw);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
*/

import type { AlertScenario } from './types';
import { PAYMENT_SERVICE_GCS } from './constants';
import { PAYMENT_SERVICE_GCS, PAYMENT_UNREACHABLE_GCS } from './constants';

const PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID = 'payment-error-count-alert';
const PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID = 'payment-unreachable-alert';

const PAYMENT_ALERT_EXPECTED_OUTPUT = `- Summary: A single handled error was detected in the payment service, specifically related to an invalid token during a payment request. The error appears isolated, with no evidence of broader anomalies or downstream impact.

Expand All @@ -26,6 +27,21 @@ const PAYMENT_ALERT_EXPECTED_OUTPUT = `- Summary: A single handled error was d
- Validate that the error is properly handled and does not impact payment processing for valid tokens.
- If no further errors occur, monitor for recurrence but no urgent action is required. If errors increase, investigate token validation logic and upstream authentication flows.`;

const PAYMENT_UNREACHABLE_ALERT_EXPECTED = `- Summary: An APM error count alert fired for the frontend service because the payment service is unreachable. The checkout flow fails with a gRPC Unavailable error ("name resolver error: produced zero addresses") when attempting to charge a card via the payment service. This is a connectivity or infrastructure failure, not an application code defect.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Question] Just curious: did you tweak the expected responses for all insights based on our preferences/expectations, or is this an actual response from the AI Insight API?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question, @SrdjanLL! I took an answer from Claude Opus and tweaked the wording a bit.


- Assessment: The payment service is entirely unreachable from the checkout service — DNS or name resolution returns zero addresses for the payment endpoint. This causes all checkout attempts to fail, resulting in user-facing errors propagated through the frontend. The \`paymentUnreachable\` feature flag in flagd is the most likely cause if this is a test environment; otherwise, this indicates a real infrastructure issue (service down, DNS failure, network partition).

- Related signals:

- Errors: "failed to charge card: could not charge the card: rpc error: code = Unavailable desc = name resolver error: produced zero addresses" (apmErrors, last seen within alert window, Direct) — all checkout/payment flows fail.
- Anomalies: Payment service absent from traces (apmServiceSummary, alert window, Direct) — the payment service is not running or not reachable.
- Downstream: checkout and frontend-proxy report errors due to payment unavailability (apmServiceTopology, Indirect).
- Immediate actions:

1. Verify the payment service is running, healthy, and reachable from the checkout service's network.
2. Check DNS resolution for the payment service endpoint from within the checkout service's environment.
3. If using the \`paymentUnreachable\` feature flag, verify its state in flagd and disable it if unintentional.`;

export const ALERT_SCENARIOS: Record<string, AlertScenario> = {
[PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID]: {
id: PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID,
Expand Down Expand Up @@ -56,6 +72,35 @@ export const ALERT_SCENARIOS: Record<string, AlertScenario> = {
},
expectedOutput: PAYMENT_ALERT_EXPECTED_OUTPUT,
},
[PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID]: {
id: PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID,
description: 'APM error count alert for frontend when payment service is unreachable',
snapshotName: 'payment-unreachable',
gcs: PAYMENT_UNREACHABLE_GCS,
alertRule: {
ruleParams: {
consumer: 'apm',
enabled: true,
name: 'Error count threshold - payment unreachable',
rule_type_id: 'apm.error_rate',
tags: [],
params: {
threshold: 1,
windowSize: 5,
windowUnit: 'm',
serviceName: 'frontend',
environment: 'ENVIRONMENT_ALL',
groupBy: ['service.name', 'service.environment'],
},
actions: [],
schedule: {
interval: '1m',
},
},
alertsIndex: '.alerts-observability.apm.alerts-default',
},
expectedOutput: PAYMENT_UNREACHABLE_ALERT_EXPECTED,
},
};

/**
 * Lists every alert scenario registered in `ALERT_SCENARIOS`, in the record's
 * insertion order.
 */
export const getAlertScenarios = (): AlertScenario[] => {
  return Object.values(ALERT_SCENARIOS);
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,18 @@ export const PAYMENT_SERVICE_GCS: GcsConfig = {
bucket: GCS_BUCKET,
basePath: 'otel-demo/payment-service-failures',
};

/**
 * GCS location (bucket and base path) of the `payment-unreachable` snapshot;
 * the payment-unreachable alert scenario passes it as its `gcs` config.
 */
export const PAYMENT_UNREACHABLE_GCS: GcsConfig = {
bucket: GCS_BUCKET,
basePath: 'otel-demo/payment-unreachable',
};

/**
 * GCS location (bucket and base path) of the `product-catalog` snapshot;
 * the product-catalog-failure APM error scenario passes it as its `gcs` config.
 */
export const PRODUCT_CATALOG_GCS: GcsConfig = {
bucket: GCS_BUCKET,
basePath: 'otel-demo/product-catalog',
};

/**
 * GCS location (bucket and base path) under `otel-demo/ad-high-cpu`.
 * NOTE(review): presumably backs a high-CPU scenario for the ad service — confirm against its consumer.
 */
export const AD_HIGH_CPU_GCS: GcsConfig = {
bucket: GCS_BUCKET,
basePath: 'otel-demo/ad-high-cpu',
};
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,12 @@
* 2.0.
*/

import type { ApmErrorScenario, GcsConfig } from './types';
import { GCS_BUCKET, PAYMENT_SERVICE_GCS } from './constants';
import type { ApmErrorScenario } from './types';
import { PAYMENT_SERVICE_GCS, PAYMENT_UNREACHABLE_GCS, PRODUCT_CATALOG_GCS } from './constants';

const PAYMENT_SERVICE_FAILURE_SCENARIO_ID = 'payment-service-failure';
const PAYMENT_UNREACHABLE_SCENARIO_ID = 'payment-unreachable';

const PAYMENT_UNREACHABLE_GCS: GcsConfig = {
bucket: GCS_BUCKET,
basePath: 'otel-demo/payment-unreachable',
};
const PRODUCT_CATALOG_FAILURE_SCENARIO_ID = 'product-catalog-failure';

const PAYMENT_ERROR_EXPECTED_OUTPUT = `- Error summary:
The payment service failed to process a charge request due to an "Invalid token" error, as indicated by the handled exception in the payment service and corroborated by error propagation through checkout and frontend services.
Expand Down Expand Up @@ -62,6 +58,26 @@ const PAYMENT_UNREACHABLE_EXPECTED_OUTPUT = `- Error summary:
- Why is the payment service unreachable (deployment, scaling, network partition)?
- Are there recent changes to service discovery, configuration, or infrastructure that could have broken connectivity?`;

const PRODUCT_CATALOG_FAILURE_EXPECTED_OUTPUT = `- Error summary:
The frontend fails with "failed to prepare order: failed to get product #OLJCESPC7Z" because the \`product-catalog\` service returns a gRPC Internal error ("Product Catalog Fail Feature Flag Enabled") when retrieving that specific product. The root cause is the \`productCatalogFailure\` feature flag being enabled, which causes a deliberate fault injection in the product catalog service for product \`OLJCESPC7Z\`.

- Failure pinpoint:

- The error is observed in the \`frontend\` service when preparing an order. It propagates from \`checkout\`, which calls \`product-catalog\` to validate cart items. The \`product-catalog\` service's \`GetProduct\` RPC fails for product ID \`OLJCESPC7Z\` with gRPC status Internal and message "Error: Product Catalog Fail Feature Flag Enabled".
- The failure originates in the \`product-catalog\` service, which evaluates the \`productCatalogFailure\` feature flag via the flagd provider. When the flag is enabled, the service intentionally rejects requests for this specific product. The feature flag evaluation itself succeeds (flagd dependency is healthy).
- This is a deliberate fault injection, not a code defect or infrastructure failure.
- Impact:

- Any request that requires fetching product \`OLJCESPC7Z\` (product detail pages, checkout with this item in cart, recommendations including this product) will fail while the feature flag remains enabled.
- Other products are unaffected; \`ListProducts\` and \`SearchProducts\` do not check this flag.
- Multiple services in the trace report errors: \`product-catalog\`, \`checkout\`, \`frontend\`, and \`frontend-proxy\`, indicating user-facing impact on orders containing this product.
- Immediate actions:

1. Disable the \`productCatalogFailure\` feature flag in the flagd configuration (\`demo.flagd.json\`) or set its \`defaultVariant\` to \`"off"\` to restore normal behavior.
2. Verify the flag state via the flagd OFREP API or management interface to confirm it is currently enabled.
3. Review recent changes to \`demo.flagd.json\` or flagd targeting rules to determine if the flag was enabled intentionally (e.g., chaos testing) or accidentally.
4. Monitor the \`product-catalog\` service error rate after toggling the flag to confirm the errors stop.`;

export const APM_ERROR_SCENARIOS: Record<string, ApmErrorScenario> = {
[PAYMENT_SERVICE_FAILURE_SCENARIO_ID]: {
id: PAYMENT_SERVICE_FAILURE_SCENARIO_ID,
Expand All @@ -86,6 +102,18 @@ export const APM_ERROR_SCENARIOS: Record<string, ApmErrorScenario> = {
},
expectedOutput: PAYMENT_UNREACHABLE_EXPECTED_OUTPUT,
},
[PRODUCT_CATALOG_FAILURE_SCENARIO_ID]: {
id: PRODUCT_CATALOG_FAILURE_SCENARIO_ID,
description:
'Product catalog service fails on product OLJCESPC7Z due to productCatalogFailure feature flag',
snapshotName: 'product-catalog',
gcs: PRODUCT_CATALOG_GCS,
errorQuery: {
errorMessage: 'failed to prepare order: failed to get product #"OLJCESPC7Z"',
serviceName: 'checkout',
},
expectedOutput: PRODUCT_CATALOG_FAILURE_EXPECTED_OUTPUT,
},
};

/**
 * Lists every APM error scenario registered in `APM_ERROR_SCENARIOS`, in the
 * record's insertion order.
 */
export const getErrorScenarios = (): ApmErrorScenario[] => {
  return Object.values(APM_ERROR_SCENARIOS);
};
Expand Down
Loading
Loading