Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .buildkite/pipelines/evals/evals.suites.json
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,15 @@
"ciLabels": ["evals:pci-compliance"],
"serverConfigSet": "evals_pci_compliance"
},
{
"id": "pci-compliance-autonomous",
"name": "PCI DSS v4.0.1 Compliance (autonomous skill variant)",
"slackChannel": "#security-defend-workflows-tests",
"configPath": "x-pack/solutions/security/packages/kbn-evals-suite-pci-compliance/playwright.config.ts",
"tags": ["security", "pci-compliance", "autonomous"],
"ciLabels": ["evals:pci-compliance-autonomous"],
"serverConfigSet": "evals_pci_compliance_autonomous"
},
{
"id": "security-automatic-migrations",
"name": "Security Automatic Migrations",
Expand Down
25 changes: 25 additions & 0 deletions .buildkite/pipelines/evals/llm_evals.yml
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,31 @@ steps:
EVAL_INCLUDE_EIS_MODELS: '1'
EVAL_MODEL_GROUPS: *weekly_eis_core_models
EVAL_SERVER_CONFIG_SET: 'evals_pci_compliance'
EVAL_PCI_VARIANT: 'handwritten'
timeout_in_minutes: 60
agents:
image: family/kibana-ubuntu-2404
imageProject: elastic-images-prod
provider: gcp
machineType: n2-standard-8
preemptible: true
retry:
automatic:
- exit_status: '-1'
limit: 3

- label: 'Evals: PCI Compliance (autonomous skill variant)'
key: kbn-evals-weekly-pci-compliance-autonomous
command: bash .buildkite/scripts/steps/evals/run_suite.sh
env:
KBN_EVALS: '1'
FTR_EIS_CCM: '1'
EVAL_SUITE_ID: 'pci-compliance-autonomous'
EVAL_FANOUT: '1'
EVAL_INCLUDE_EIS_MODELS: '1'
EVAL_MODEL_GROUPS: *weekly_eis_core_models
EVAL_SERVER_CONFIG_SET: 'evals_pci_compliance_autonomous'
EVAL_PCI_VARIANT: 'autonomous'
timeout_in_minutes: 60
agents:
image: family/kibana-ubuntu-2404
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

import type { ScoutServerConfig } from '../../../../../types';
import { servers as evalsTracingConfig } from '../../evals_tracing/stateful/classic.stateful.config';

/**
 * Scout stateful server config set for the **autonomously-architected** variant of the
 * PCI DSS v4.0.1 compliance skill eval.
 *
 * Everything is inherited from the evals tracing config; two Kibana server args are
 * appended on top of the inherited `serverArgs`:
 *  - the Agent Builder experimental-features UI setting override, and
 *  - an `enableExperimental` list that turns on `pciComplianceAutonomousAgentBuilder` and
 *    carries a `disable:pciComplianceAgentBuilder` entry for the hand-written skill, so
 *    the agent router is meant to see only one PCI skill (keeps the comparison clean).
 *
 * Pair this config set with `EVAL_PCI_VARIANT=autonomous` when running the eval suite so
 * outputs and side-by-side reports are labelled correctly.
 *
 * Usage:
 *   node scripts/scout start-server \
 *     --arch stateful --domain classic --serverConfigSet evals_pci_compliance_autonomous
 *
 *   EVAL_PCI_VARIANT=autonomous node scripts/evals start --suite pci-compliance
 */
export const servers: ScoutServerConfig = {
  ...evalsTracingConfig,
  kbnTestServer: {
    ...evalsTracingConfig.kbnTestServer,
    // `concat` returns a new array, so the inherited tracing args are left untouched.
    serverArgs: evalsTracingConfig.kbnTestServer.serverArgs.concat([
      '--uiSettings.overrides.agentBuilder:experimentalFeatures=true',
      // Only the autonomous variant should be active. Per the original author's note the
      // hand-written `pciComplianceAgentBuilder` flag defaults to `true` in
      // `experimental_features.ts` (TODO confirm), hence the explicit `disable:` entry
      // that switches it back off for this config set.
      `--xpack.securitySolution.enableExperimental=${JSON.stringify([
        'pciComplianceAutonomousAgentBuilder',
        'disable:pciComplianceAgentBuilder',
      ])}`,
    ]),
  },
};
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ export function getExtraKbnOpts(installDir: string | undefined, isServerless: bo

return [
'--dev',
'--no-dev-config',
// Opt-in escape hatch: when SCOUT_READ_DEV_CONFIG=true, `--no-dev-config` is
// omitted so config/kibana.dev.yml (e.g. preconfigured AI connectors defined
// there) reaches the Scout-managed Kibana process. In every other case the
// upstream behaviour (`--no-dev-config`) is kept.
...(process.env.SCOUT_READ_DEV_CONFIG === 'true' ? [] : ['--no-dev-config']),
'--no-dev-credentials',
isServerless
? '--server.versioned.versionResolution=newest'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@ export const AGENT_BUILDER_BUILTIN_TOOLS = [
`${internalNamespaces.security}.pci_scope_discovery`,
`${internalNamespaces.security}.pci_compliance`,
`${internalNamespaces.security}.pci_field_mapper`,
// Autonomous-architected PCI tool bundle (per cycle-17 architect blueprint).
// Registered independently of the hand-written variant so the autonomous skill
// can be validated as a true end-to-end skill+tool autonomous stack.
`${internalNamespaces.security}.pci_autonomous_scope_discovery`,
`${internalNamespaces.security}.pci_autonomous_compliance_check`,
`${internalNamespaces.security}.pci_autonomous_scorecard_report`,
`${internalNamespaces.security}.pci_autonomous_field_mapper`,

// Streams
`${internalNamespaces.streams}.inspect_streams`,
Expand Down Expand Up @@ -135,6 +142,7 @@ export const AGENT_BUILDER_BUILTIN_SKILLS = [
'detection-rule-edit',
'threat-hunting',
'pci-compliance',
'pci-compliance-autonomous',

// O11Y
'observability.rca',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ export interface ModelDefinition {
provider: ModelProvider;
family: ModelFamily;
contextWindow: number;
/**
* `false` for models that reject the `temperature` inference parameter
* (e.g. Bedrock surfaces `temperature is deprecated for this model` for
* Claude Opus 4.7). Treated as `true` when omitted to preserve existing
* behavior for models we have not explicitly classified.
*/
supportsTemperature?: boolean;
}

/**
Expand Down Expand Up @@ -167,6 +174,17 @@ export const knownModels: ModelDefinition[] = [
family: ModelFamily.Claude,
contextWindow: 200000,
},
{
// Claude Opus 4.7 (released Nov 2025). On Bedrock the model returns
// `temperature is deprecated for this model` if the param is sent, so we
// mark it as not supporting temperature; downstream callers omit the
// parameter and let the provider default apply.
id: 'claude-opus-4-7',
provider: ModelProvider.Anthropic,
family: ModelFamily.Claude,
contextWindow: 200000,
supportsTemperature: false,
},
// OpenAI o-series reasoning models
{
id: 'o3-mini',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
* 2.0.
*/
import type { InferenceConnector } from '@kbn/inference-common';
import { InferenceConnectorType } from '@kbn/inference-common';
import { InferenceConnectorType, getModelDefinition } from '@kbn/inference-common';

const OPENAI_MODELS_WITHOUT_TEMPERATURE = ['o1', 'o3', 'gpt-5'];

Expand Down Expand Up @@ -48,6 +48,17 @@ export const getTemperatureIfValid = (
}
}

// Bedrock (and any provider whose model registry marks the model as
// temperature-incompatible) — omit the parameter so the provider's default
// applies. e.g. Bedrock returns a 400 with "temperature is deprecated for
// this model" for Claude Opus 4.7.
if (model) {
const definition = getModelDefinition(model);
if (definition?.supportsTemperature === false) {
return {};
}
}

if (temperature === undefined || temperature < 0) return {};

// Else, use the temperature from the request
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ import type {
} from '@kbn/connector-schemas/bedrock';
import { initDashboard } from '../lib/gen_ai/create_gen_ai_dashboard';
import {
bedrockModelSupportsTemperature,
extractRegionId,
formatBedrockBody,
parseContent,
Expand Down Expand Up @@ -386,10 +387,19 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
}: InvokeAIRawActionParams,
connectorUsageCollector: ConnectorUsageCollector
): Promise<IncomingMessage> {
const effectiveModel = model ?? this.model;
const res = (await this.streamApi(
{
body: JSON.stringify(
formatBedrockBody({ messages, stopSequences, system, temperature, tools, toolChoice })
formatBedrockBody({
messages,
stopSequences,
system,
temperature,
tools,
toolChoice,
model: effectiveModel,
})
),
model,
signal,
Expand Down Expand Up @@ -423,6 +433,7 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
}: InvokeAIActionParams,
connectorUsageCollector: ConnectorUsageCollector
): Promise<InvokeAIActionResponse> {
const effectiveModel = model ?? this.model;
const res = (await this.runApi(
{
body: JSON.stringify(
Expand All @@ -434,6 +445,7 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
maxTokens,
tools,
toolChoice,
model: effectiveModel,
})
),
model,
Expand Down Expand Up @@ -461,13 +473,17 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
}: InvokeAIRawActionParams,
connectorUsageCollector: ConnectorUsageCollector
): Promise<InvokeAIRawActionResponse> {
const effectiveModel = model ?? this.model;
// Newer Bedrock Claude variants (e.g. Opus 4.7) 400 when `temperature` is
// present in the payload — strip it for those model ids.
const includeTemperature = bedrockModelSupportsTemperature(effectiveModel);
const res = await this.runApi(
{
body: JSON.stringify({
messages,
stop_sequences: stopSequences,
system,
temperature,
...(includeTemperature ? { temperature } : {}),
max_tokens: maxTokens,
tools,
tool_choice: toolChoice,
Expand Down Expand Up @@ -550,10 +566,16 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
}
: undefined;

// Some Bedrock models (e.g. Claude Opus 4.7) reject `temperature`
// outright. The inference plugin omits the value via
// `getTemperatureIfValid`; for direct callers we also gate it here based
// on the connector's model id.
const includeTemperature =
temperature !== undefined && bedrockModelSupportsTemperature(modelId);
const request: ConverseRequest = {
messages,
inferenceConfig: {
temperature,
...(includeTemperature ? { temperature } : {}),
stopSequences,
maxTokens,
},
Expand Down Expand Up @@ -605,10 +627,15 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
}
: undefined;

// See `_converse` for context — newer Claude models on Bedrock 400 if
// `temperature` is sent. Mirror the same conditional spread here so
// streaming and non-streaming paths stay aligned.
const includeTemperature =
temperature !== undefined && bedrockModelSupportsTemperature(modelId);
const request: ConverseStreamRequest = {
messages,
inferenceConfig: {
temperature,
...(includeTemperature ? { temperature } : {}),
stopSequences,
maxTokens,
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,37 @@ import { SmithyMessageDecoderStream } from '@smithy/eventstream-codec';
import { DEFAULT_TOKEN_LIMIT } from '@kbn/connector-schemas/bedrock';
import type { BedrockMessage, BedrockToolChoice } from '@kbn/connector-schemas/bedrock';

/**
 * Lower-case fragments of Bedrock model ids for which the connector must not
 * send the `temperature` inference parameter (Bedrock is reported to answer
 * with HTTP 400, "`temperature` is deprecated for this model"). Keep the list
 * short and append-only.
 *
 * The canonical classification lives in `@kbn/inference-common`
 * (`known_models.ts`, `supportsTemperature: false`); this local copy exists so
 * callers that hit the connector sub-actions directly (e.g. `invokeAI`) are
 * covered without a cross-plugin dependency.
 */
const BEDROCK_MODEL_FRAGMENTS_WITHOUT_TEMPERATURE = ['claude-opus-4-7'];

/**
 * Tells whether `temperature` may be included in a request for the given
 * Bedrock model id.
 *
 * @param model Bedrock model id; matching is case-insensitive and by substring.
 * @returns `false` when the id contains one of the known fragments, `true`
 * otherwise — including when no model id (or an empty one) is supplied.
 */
export const bedrockModelSupportsTemperature = (model?: string): boolean => {
  // Without a model id there is nothing to match against: assume support.
  if (!model) {
    return true;
  }

  const lowered = model.toLowerCase();
  for (const fragment of BEDROCK_MODEL_FRAGMENTS_WITHOUT_TEMPERATURE) {
    if (lowered.includes(fragment)) {
      return false;
    }
  }
  return true;
};

export const formatBedrockBody = ({
messages,
stopSequences,
temperature = 0,
temperature,
system,
maxTokens = DEFAULT_TOKEN_LIMIT,
tools,
toolChoice,
model,
}: {
messages: BedrockMessage[];
stopSequences?: string[];
Expand All @@ -26,15 +49,25 @@ export const formatBedrockBody = ({
system?: string;
tools?: Array<{ name: string; description: string }>;
toolChoice?: BedrockToolChoice;
}) => ({
anthropic_version: 'bedrock-2023-05-31',
...ensureMessageFormat(messages, system),
max_tokens: maxTokens,
stop_sequences: stopSequences,
temperature,
tools,
tool_choice: toolChoice,
});
/**
* Bedrock model id (e.g. `us.anthropic.claude-opus-4-7`). When provided, the
* helper omits `temperature` if the model is known to reject it. For every
* other case (including when `model` is omitted) a missing `temperature`
* falls back to the legacy default of `0` for backward compat.
*/
model?: string;
}) => {
const includeTemperature = bedrockModelSupportsTemperature(model);
const effectiveTemperature = includeTemperature ? temperature ?? 0 : undefined;
return {
anthropic_version: 'bedrock-2023-05-31',
...ensureMessageFormat(messages, system),
max_tokens: maxTokens,
stop_sequences: stopSequences,
...(effectiveTemperature !== undefined ? { temperature: effectiveTemperature } : {}),
tools,
tool_choice: toolChoice,
};
};

interface FormattedBedrockMessage {
role: string;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Local eval-result outputs from compare_variants.sh / build_comparison_html.mjs.
# Each run drops Playwright/eval JSON artefacts into runs/<variant>/ for the
# HTML builder to read. Don't commit them — comparison.html (the rendered
# snapshot) is checked in instead.
runs/
Loading