patrykkopycinski · patrykkopycinski · May 10, 2026 · May 10, 2026 · May 11, 2026 · May 11, 2026
diff --git a/.buildkite/pipelines/evals/evals.suites.json b/.buildkite/pipelines/evals/evals.suites.json
@@ -179,6 +179,15 @@
       "ciLabels": ["evals:pci-compliance"],
       "serverConfigSet": "evals_pci_compliance"
     },
+    {
+      "id": "pci-compliance-autonomous",
+      "name": "PCI DSS v4.0.1 Compliance (autonomous skill variant)",
+      "slackChannel": "#security-defend-workflows-tests",
+      "configPath": "x-pack/solutions/security/packages/kbn-evals-suite-pci-compliance/playwright.config.ts",
+      "tags": ["security", "pci-compliance", "autonomous"],
+      "ciLabels": ["evals:pci-compliance-autonomous"],
+      "serverConfigSet": "evals_pci_compliance_autonomous"
+    },
     {
       "id": "security-automatic-migrations",
       "name": "Security Automatic Migrations",

diff --git a/.buildkite/pipelines/evals/llm_evals.yml b/.buildkite/pipelines/evals/llm_evals.yml
@@ -253,6 +253,31 @@ steps:
           EVAL_INCLUDE_EIS_MODELS: '1'
           EVAL_MODEL_GROUPS: *weekly_eis_core_models
           EVAL_SERVER_CONFIG_SET: 'evals_pci_compliance'
+          EVAL_PCI_VARIANT: 'handwritten'
+        timeout_in_minutes: 60
+        agents:
+          image: family/kibana-ubuntu-2404
+          imageProject: elastic-images-prod
+          provider: gcp
+          machineType: n2-standard-8
+          preemptible: true
+        retry:
+          automatic:
+            - exit_status: '-1'
+              limit: 3
+
+      - label: 'Evals: PCI Compliance (autonomous skill variant)'
+        key: kbn-evals-weekly-pci-compliance-autonomous
+        command: bash .buildkite/scripts/steps/evals/run_suite.sh
+        env:
+          KBN_EVALS: '1'
+          FTR_EIS_CCM: '1'
+          EVAL_SUITE_ID: 'pci-compliance-autonomous'
+          EVAL_FANOUT: '1'
+          EVAL_INCLUDE_EIS_MODELS: '1'
+          EVAL_MODEL_GROUPS: *weekly_eis_core_models
+          EVAL_SERVER_CONFIG_SET: 'evals_pci_compliance_autonomous'
+          EVAL_PCI_VARIANT: 'autonomous'
         timeout_in_minutes: 60
         agents:
           image: family/kibana-ubuntu-2404

diff --git a/...s/configs/config_sets/evals_pci_compliance_autonomous/stateful/classic.stateful.config.ts b/...s/configs/config_sets/evals_pci_compliance_autonomous/stateful/classic.stateful.config.ts
@@ -0,0 +1,47 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+import type { ScoutServerConfig } from '../../../../../types';
+import { servers as evalsTracingConfig } from '../../evals_tracing/stateful/classic.stateful.config';
+
+/**
+ * Custom Scout stateful server configuration for the **autonomously-architected** PCI DSS
+ * v4.0.1 compliance skill eval variant. Enables the Agent Builder experimental features UI
+ * setting and ONLY the autonomous skill flag (the hand-written `pciComplianceAgentBuilder`
+ * is explicitly disabled here so the agent router has only one PCI skill to choose
+ * from — keeping the comparison clean).
+ *
+ * Pair this config set with `EVAL_PCI_VARIANT=autonomous` when running the eval suite to
+ * label outputs and side-by-side reports correctly.
+ *
+ * Usage:
+ *   node scripts/scout start-server \
+ *     --arch stateful --domain classic --serverConfigSet evals_pci_compliance_autonomous
+ *
+ *   EVAL_PCI_VARIANT=autonomous node scripts/evals start --suite pci-compliance-autonomous
+ */
+export const servers: ScoutServerConfig = {
+  ...evalsTracingConfig,
+  kbnTestServer: {
+    ...evalsTracingConfig.kbnTestServer,
+    serverArgs: [
+      ...evalsTracingConfig.kbnTestServer.serverArgs,
+      '--uiSettings.overrides.agentBuilder:experimentalFeatures=true',
+      // Explicitly enable ONLY the autonomous variant. The handwritten flag
+      // `pciComplianceAgentBuilder` defaults to `true` in
+      // `experimental_features.ts`, so we must override it back to `false` here
+      // (via the `disable:`-prefixed entry below) to keep the agent router's PCI
+      // skill choice cleanly isolated to the autonomous variant.
+      `--xpack.securitySolution.enableExperimental=${JSON.stringify([
+        'pciComplianceAutonomousAgentBuilder',
+        'disable:pciComplianceAgentBuilder',
+      ])}`,
+    ],
+  },
+};
diff --git a/src/platform/packages/shared/kbn-scout/src/servers/run_kibana_server.ts b/src/platform/packages/shared/kbn-scout/src/servers/run_kibana_server.ts
@@ -33,7 +33,10 @@ export function getExtraKbnOpts(installDir: string | undefined, isServerless: bo
 
   return [
     '--dev',
-    '--no-dev-config',
+    // `--no-dev-config` remains the default. Setting SCOUT_READ_DEV_CONFIG=true
+    // (intended for local runs) drops the flag so config/kibana.dev.yml is read
+    // and preconfigured AI connectors defined there reach the Scout-managed Kibana.
+    ...(process.env.SCOUT_READ_DEV_CONFIG === 'true' ? [] : ['--no-dev-config']),
     '--no-dev-credentials',
     isServerless
       ? '--server.versioned.versionResolution=newest'

diff --git a/x-pack/platform/packages/shared/agent-builder/agent-builder-server/allow_lists.ts b/x-pack/platform/packages/shared/agent-builder/agent-builder-server/allow_lists.ts
@@ -50,6 +50,13 @@ export const AGENT_BUILDER_BUILTIN_TOOLS = [
   `${internalNamespaces.security}.pci_scope_discovery`,
   `${internalNamespaces.security}.pci_compliance`,
   `${internalNamespaces.security}.pci_field_mapper`,
+  // Autonomous-architected PCI tool bundle (per cycle-17 architect blueprint).
+  // Registered independently of the hand-written variant so the autonomous skill
+  // can be validated as a true end-to-end skill+tool autonomous stack.
+  `${internalNamespaces.security}.pci_autonomous_scope_discovery`,
+  `${internalNamespaces.security}.pci_autonomous_compliance_check`,
+  `${internalNamespaces.security}.pci_autonomous_scorecard_report`,
+  `${internalNamespaces.security}.pci_autonomous_field_mapper`,
 
   // Streams
   `${internalNamespaces.streams}.inspect_streams`,
@@ -135,6 +142,7 @@ export const AGENT_BUILDER_BUILTIN_SKILLS = [
   'detection-rule-edit',
   'threat-hunting',
   'pci-compliance',
+  'pci-compliance-autonomous',
 
   // O11Y
   'observability.rca',

diff --git a/x-pack/platform/packages/shared/ai-infra/inference-common/src/connectors/known_models.ts b/x-pack/platform/packages/shared/ai-infra/inference-common/src/connectors/known_models.ts
@@ -12,6 +12,13 @@ export interface ModelDefinition {
   provider: ModelProvider;
   family: ModelFamily;
   contextWindow: number;
+  /**
+   * `false` for models that reject the `temperature` inference parameter
+   * (e.g. Bedrock surfaces `temperature is deprecated for this model` for
+   * Claude Opus 4.7). Treated as `true` when omitted to preserve existing
+   * behavior for models we have not explicitly classified.
+   */
+  supportsTemperature?: boolean;
 }
 
 /**
@@ -167,6 +174,17 @@ export const knownModels: ModelDefinition[] = [
     family: ModelFamily.Claude,
     contextWindow: 200000,
   },
+  {
+    // Claude Opus 4.7 (released Nov 2025). On Bedrock the model returns
+    // `temperature is deprecated for this model` if the param is sent, so we
+    // mark it as not supporting temperature; downstream callers omit the
+    // parameter and let the provider default apply.
+    id: 'claude-opus-4-7',
+    provider: ModelProvider.Anthropic,
+    family: ModelFamily.Claude,
+    contextWindow: 200000,
+    supportsTemperature: false,
+  },
   // OpenAI o-series reasoning models
   {
     id: 'o3-mini',

diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/get_temperature.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/get_temperature.ts
@@ -5,7 +5,7 @@
  * 2.0.
  */
 import type { InferenceConnector } from '@kbn/inference-common';
-import { InferenceConnectorType } from '@kbn/inference-common';
+import { InferenceConnectorType, getModelDefinition } from '@kbn/inference-common';
 
 const OPENAI_MODELS_WITHOUT_TEMPERATURE = ['o1', 'o3', 'gpt-5'];
 
@@ -48,6 +48,17 @@ export const getTemperatureIfValid = (
     }
   }
 
+  // Bedrock (and any provider whose model registry marks the model as
+  // temperature-incompatible) — omit the parameter so the provider's default
+  // applies. e.g. Bedrock returns a 400 with "temperature is deprecated for
+  // this model" for Claude Opus 4.7.
+  if (model) {
+    const definition = getModelDefinition(model);
+    if (definition?.supportsTemperature === false) {
+      return {};
+    }
+  }
+
   if (temperature === undefined || temperature < 0) return {};
 
   // Else, use the temperature from the request

diff --git a/x-pack/platform/plugins/shared/stack_connectors/server/connector_types/bedrock/bedrock.ts b/x-pack/platform/plugins/shared/stack_connectors/server/connector_types/bedrock/bedrock.ts
@@ -57,6 +57,7 @@ import type {
 } from '@kbn/connector-schemas/bedrock';
 import { initDashboard } from '../lib/gen_ai/create_gen_ai_dashboard';
 import {
+  bedrockModelSupportsTemperature,
   extractRegionId,
   formatBedrockBody,
   parseContent,
@@ -386,10 +387,19 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
     }: InvokeAIRawActionParams,
     connectorUsageCollector: ConnectorUsageCollector
   ): Promise<IncomingMessage> {
+    const effectiveModel = model ?? this.model;
     const res = (await this.streamApi(
       {
         body: JSON.stringify(
-          formatBedrockBody({ messages, stopSequences, system, temperature, tools, toolChoice })
+          formatBedrockBody({
+            messages,
+            stopSequences,
+            system,
+            temperature,
+            tools,
+            toolChoice,
+            model: effectiveModel,
+          })
         ),
         model,
         signal,
@@ -423,6 +433,7 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
     }: InvokeAIActionParams,
     connectorUsageCollector: ConnectorUsageCollector
   ): Promise<InvokeAIActionResponse> {
+    const effectiveModel = model ?? this.model;
     const res = (await this.runApi(
       {
         body: JSON.stringify(
@@ -434,6 +445,7 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
             maxTokens,
             tools,
             toolChoice,
+            model: effectiveModel,
           })
         ),
         model,
@@ -461,13 +473,17 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
     }: InvokeAIRawActionParams,
     connectorUsageCollector: ConnectorUsageCollector
   ): Promise<InvokeAIRawActionResponse> {
+    const effectiveModel = model ?? this.model;
+    // Newer Bedrock Claude variants (e.g. Opus 4.7) respond with HTTP 400 when
+    // `temperature` is present in the payload, so omit it for those model ids.
+    const includeTemperature = bedrockModelSupportsTemperature(effectiveModel);
     const res = await this.runApi(
       {
         body: JSON.stringify({
           messages,
           stop_sequences: stopSequences,
           system,
-          temperature,
+          ...(includeTemperature ? { temperature } : {}),
           max_tokens: maxTokens,
           tools,
           tool_choice: toolChoice,
@@ -550,10 +566,16 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
           }
         : undefined;
 
+    // Some Bedrock models (e.g. Claude Opus 4.7) reject `temperature`
+    // outright. The inference plugin omits the value via
+    // `getTemperatureIfValid`; for direct callers we also gate it here based
+    // on the connector's model id.
+    const includeTemperature =
+      temperature !== undefined && bedrockModelSupportsTemperature(modelId);
     const request: ConverseRequest = {
       messages,
       inferenceConfig: {
-        temperature,
+        ...(includeTemperature ? { temperature } : {}),
         stopSequences,
         maxTokens,
       },
@@ -605,10 +627,15 @@ The Kibana Connector in use may need to be reconfigured with an updated Amazon B
           }
         : undefined;
 
+    // See `_converse` for context — newer Claude models on Bedrock return HTTP
+    // 400 if `temperature` is sent. Mirror the same conditional spread here so
+    // streaming and non-streaming paths stay aligned.
+    const includeTemperature =
+      temperature !== undefined && bedrockModelSupportsTemperature(modelId);
     const request: ConverseStreamRequest = {
       messages,
       inferenceConfig: {
-        temperature,
+        ...(includeTemperature ? { temperature } : {}),
         stopSequences,
         maxTokens,
       },

diff --git a/x-pack/platform/plugins/shared/stack_connectors/server/connector_types/bedrock/utils.ts b/x-pack/platform/plugins/shared/stack_connectors/server/connector_types/bedrock/utils.ts
@@ -9,14 +9,37 @@ import { SmithyMessageDecoderStream } from '@smithy/eventstream-codec';
 import { DEFAULT_TOKEN_LIMIT } from '@kbn/connector-schemas/bedrock';
 import type { BedrockMessage, BedrockToolChoice } from '@kbn/connector-schemas/bedrock';
 
+/**
+ * Substrings of Bedrock model IDs that reject the `temperature` inference
+ * parameter and return HTTP 400 ("`temperature` is deprecated for this
+ * model"). The connector strips `temperature` from outgoing payloads when the
+ * configured model matches one of these fragments. Keep this list small and
+ * append-only.
+ *
+ * The inference plugin maintains the canonical list in
+ * `@kbn/inference-common` (`known_models.ts`, `supportsTemperature: false`).
+ * This local guard avoids a cross-plugin dependency for callers that hit the
+ * connector sub-actions directly (e.g. `invokeAI`).
+ */
+const BEDROCK_MODEL_FRAGMENTS_WITHOUT_TEMPERATURE = ['claude-opus-4-7'];
+
+export const bedrockModelSupportsTemperature = (model?: string): boolean => {
+  // No model id to inspect: treat `temperature` as supported.
+  if (!model) return true;
+  const lowered = model.toLowerCase();
+  // Supported only when none of the known-rejecting fragments occurs in the id.
+  return BEDROCK_MODEL_FRAGMENTS_WITHOUT_TEMPERATURE.every((bad) => !lowered.includes(bad));
+};
+
 export const formatBedrockBody = ({
   messages,
   stopSequences,
-  temperature = 0,
+  temperature,
   system,
   maxTokens = DEFAULT_TOKEN_LIMIT,
   tools,
   toolChoice,
+  model,
 }: {
   messages: BedrockMessage[];
   stopSequences?: string[];
@@ -26,15 +49,25 @@ export const formatBedrockBody = ({
   system?: string;
   tools?: Array<{ name: string; description: string }>;
   toolChoice?: BedrockToolChoice;
-}) => ({
-  anthropic_version: 'bedrock-2023-05-31',
-  ...ensureMessageFormat(messages, system),
-  max_tokens: maxTokens,
-  stop_sequences: stopSequences,
-  temperature,
-  tools,
-  tool_choice: toolChoice,
-});
+  /**
+   * Bedrock model id (e.g. `us.anthropic.claude-opus-4-7`). `temperature` is
+   * omitted from the body when the id matches a model known to reject it;
+   * otherwise (including when `model` is omitted) an unset `temperature` defaults to `0`.
+   */
+  model?: string;
+}) => {
+  const includeTemperature = bedrockModelSupportsTemperature(model);
+  const effectiveTemperature = includeTemperature ? temperature ?? 0 : undefined;
+  return {
+    anthropic_version: 'bedrock-2023-05-31',
+    ...ensureMessageFormat(messages, system),
+    max_tokens: maxTokens,
+    stop_sequences: stopSequences,
+    ...(effectiveTemperature !== undefined ? { temperature: effectiveTemperature } : {}),
+    tools,
+    tool_choice: toolChoice,
+  };
+};
 
 interface FormattedBedrockMessage {
   role: string;

diff --git a/x-pack/solutions/security/packages/kbn-evals-suite-pci-compliance/.gitignore b/x-pack/solutions/security/packages/kbn-evals-suite-pci-compliance/.gitignore
@@ -0,0 +1,5 @@
+# Local eval-result outputs from compare_variants.sh / build_comparison_html.mjs.
+# Each run drops Playwright/eval JSON artefacts into runs/<variant>/ for the
+# HTML builder to read. Don't commit them — comparison.html (the rendered
+# snapshot) is checked in instead.
+runs/