elastic · patrykkopycinski · Mar 18, 2026 · Mar 18, 2026
diff --git a/package.json b/package.json
@@ -1687,6 +1687,7 @@
     "@kbn/evals-suite-obs-ai-assistant": "link:x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant",
     "@kbn/evals-suite-observability-ai": "link:x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai",
     "@kbn/evals-suite-security-ai-rules": "link:x-pack/solutions/security/packages/kbn-evals-suite-security-ai-rules",
+    "@kbn/evals-suite-security-detection-engineering": "link:x-pack/solutions/security/packages/kbn-evals-suite-security-detection-engineering",
     "@kbn/evals-suite-streams": "link:x-pack/platform/packages/shared/kbn-evals-suite-streams",
     "@kbn/expect": "link:src/platform/packages/shared/kbn-expect",
     "@kbn/extract-plugin-translations": "link:packages/kbn-extract-plugin-translations",

diff --git a/tsconfig.base.json b/tsconfig.base.json
@@ -1154,6 +1154,8 @@
       "@kbn/evals-suite-observability-ai/*": ["x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/*"],
       "@kbn/evals-suite-security-ai-rules": ["x-pack/solutions/security/packages/kbn-evals-suite-security-ai-rules"],
       "@kbn/evals-suite-security-ai-rules/*": ["x-pack/solutions/security/packages/kbn-evals-suite-security-ai-rules/*"],
+      "@kbn/evals-suite-security-detection-engineering": ["x-pack/solutions/security/packages/kbn-evals-suite-security-detection-engineering"],
+      "@kbn/evals-suite-security-detection-engineering/*": ["x-pack/solutions/security/packages/kbn-evals-suite-security-detection-engineering/*"],
       "@kbn/evals-suite-streams": ["x-pack/platform/packages/shared/kbn-evals-suite-streams"],
       "@kbn/evals-suite-streams/*": ["x-pack/platform/packages/shared/kbn-evals-suite-streams/*"],
       "@kbn/event-annotation-common": ["src/platform/packages/shared/kbn-event-annotation-common"],

@@ -125,6 +125,14 @@
         "evals:entity-analytics"
       ],
       "serverConfigSet": "evals_entity_analytics"
+    },
+    {
+      "id": "security-detection-engineering",
+      "name": "Security Detection Engineering",
+      "slackChannel": "#security-detection-engine-alerts",
+      "configPath": "x-pack/solutions/security/packages/kbn-evals-suite-security-detection-engineering/playwright.config.ts",
+      "tags": ["security", "detection-engineering"],
+      "ciLabels": ["evals:security-detection-engineering"]
     }
   ]
 }
diff --git a/...urity/packages/kbn-evals-suite-security-detection-engineering/datasets/skill_selection.ts b/...urity/packages/kbn-evals-suite-security-detection-engineering/datasets/skill_selection.ts
@@ -0,0 +1,116 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+export const skillSelectionExamples = [ // Examples passed as the dataset in evals/skill_selection.spec.ts: a user question plus response criteria and tool ids.
+  {
+    input: {
+      question: 'Show me all enabled detection rules sorted by risk score',
+    },
+    output: {
+      criteria: [ // natural-language statements about the response; NOTE(review): presumably scored by the evaluator - confirm
+        'The response references detection rules and their properties.',
+        'The response includes information about rule status and risk scores.',
+      ],
+      toolCalls: [ // NOTE(review): presumably the tool(s) the skill is expected to invoke for this question - confirm against the evaluator
+        {
+          id: 'security.find_rules',
+        },
+      ],
+    },
+    metadata: { category: 'rule-search', difficulty: 'easy' }, // NOTE(review): looks like labelling only - confirm whether category/difficulty affect scoring
+  },
+  {
+    input: {
+      question: 'What MITRE ATT&CK techniques do we currently have detection coverage for?',
+    },
+    output: {
+      criteria: [
+        'The response discusses MITRE ATT&CK coverage.',
+        'The response references specific techniques or tactics.',
+      ],
+      toolCalls: [
+        {
+          id: 'security.coverage_overview',
+        },
+      ],
+    },
+    metadata: { category: 'coverage-analysis', difficulty: 'easy' },
+  },
+  {
+    input: {
+      question: 'Are any of our detection rules failing? Show me rules with execution errors.',
+    },
+    output: {
+      criteria: [
+        'The response discusses rule execution health.',
+        'The response identifies rules with errors or performance issues.',
+      ],
+      toolCalls: [
+        {
+          id: 'security.rule_monitoring',
+        },
+      ],
+    },
+    metadata: { category: 'rule-monitoring', difficulty: 'easy' },
+  },
+  {
+    input: {
+      question:
+        'I need to add an exception for the rule "Suspicious PowerShell Execution" to exclude our admin scripts.',
+    },
+    output: {
+      criteria: [
+        'The response addresses creating an exception for the specified rule.',
+        'The response guides the user on defining exception conditions.',
+      ],
+      toolCalls: [
+        {
+          id: 'security.manage_exceptions',
+        },
+      ],
+    },
+    metadata: { category: 'exception-management', difficulty: 'medium' },
+  },
+  {
+    input: {
+      question:
+        'Test this ES|QL detection query against the last 24 hours of data: FROM logs-endpoint.events.process-* | WHERE process.name == "mimikatz.exe"',
+    },
+    output: {
+      criteria: [
+        'The response provides results or analysis from previewing the rule query.',
+        'The response discusses the query execution against live data.',
+      ],
+      toolCalls: [
+        {
+          id: 'security.preview_rule',
+        },
+      ],
+    },
+    metadata: { category: 'rule-preview', difficulty: 'medium' },
+  },
+  {
+    input: {
+      question: 'Enable all disabled rules that target credential access techniques.',
+    },
+    output: {
+      criteria: [
+        'The response addresses enabling rules related to credential access.',
+        'The response references bulk rule management operations.',
+      ],
+      toolCalls: [ // the only example in this array that lists two tool ids; NOTE(review): confirm whether order matters to the evaluator
+        {
+          id: 'security.find_rules',
+        },
+        {
+          id: 'security.manage_rules',
+        },
+      ],
+    },
+    metadata: { category: 'rule-management', difficulty: 'medium' },
+  },
+];
diff --git a/...urity/packages/kbn-evals-suite-security-detection-engineering/datasets/tool_invocation.ts b/...urity/packages/kbn-evals-suite-security-detection-engineering/datasets/tool_invocation.ts
@@ -0,0 +1,114 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+export const toolInvocationExamples = [ // Examples passed as the dataset in evals/tool_invocation.spec.ts; each toolCalls entry carries its own criteria list in addition to the tool id.
+  {
+    input: {
+      question: 'Find all rules of type "eql" that are currently disabled',
+    },
+    output: {
+      criteria: ['The response lists EQL rules that are disabled.'], // response-level criteria, kept separate from the per-tool-call criteria below
+      toolCalls: [
+        {
+          id: 'security.find_rules',
+          criteria: [ // statements about the tool call itself (its filters/parameters) rather than the final response
+            'The tool call should filter for rules with type "eql".',
+            'The tool call should filter for rules that are disabled.',
+          ],
+        },
+      ],
+    },
+    metadata: { category: 'tool-invocation', difficulty: 'easy' }, // every example in this array uses category 'tool-invocation'; only difficulty varies
+  },
+  {
+    input: {
+      question:
+        'Preview this KQL query for the last 7 days: host.os.type:windows AND process.name:("cmd.exe" OR "powershell.exe") AND event.action:"start"',
+    },
+    output: {
+      criteria: [
+        'The response includes a preview of the rule query results.',
+        'The response discusses the number of matches found.',
+      ],
+      toolCalls: [
+        {
+          id: 'security.preview_rule',
+          criteria: [
+            'The tool call should use a "query" rule type with "kuery" language.',
+            'The tool call should set the timeframe to cover 7 days.',
+            'The tool call should include the provided KQL query.',
+          ],
+        },
+      ],
+    },
+    metadata: { category: 'tool-invocation', difficulty: 'medium' },
+  },
+  {
+    input: {
+      question:
+        'Get the MITRE ATT&CK coverage overview filtered to only Persistence and Privilege Escalation tactics',
+    },
+    output: {
+      criteria: [
+        'The response provides MITRE ATT&CK coverage information.',
+        'The response focuses on Persistence and Privilege Escalation tactics.',
+      ],
+      toolCalls: [
+        {
+          id: 'security.coverage_overview',
+          criteria: [
+            'The tool call should filter by the Persistence (TA0003) and Privilege Escalation (TA0004) tactics.',
+          ],
+        },
+      ],
+    },
+    metadata: { category: 'tool-invocation', difficulty: 'medium' },
+  },
+  {
+    input: {
+      question:
+        'Check if there are any duplicate or overlapping exceptions across our rules for the process "svchost.exe"',
+    },
+    output: {
+      criteria: [
+        'The response addresses exception overlap analysis.',
+        'The response discusses exceptions related to svchost.exe.',
+      ],
+      toolCalls: [
+        {
+          id: 'security.manage_exceptions',
+          criteria: [
+            'The tool call should use the find_overlaps or find operation.', // either operation name satisfies this criterion as written
+            'The tool call should reference svchost.exe in the query parameters.',
+          ],
+        },
+      ],
+    },
+    metadata: { category: 'tool-invocation', difficulty: 'hard' },
+  },
+  {
+    input: {
+      question: 'Show me rules that have been failing in the last 24 hours with gap-related issues',
+    },
+    output: {
+      criteria: [
+        'The response identifies rules with execution issues.',
+        'The response discusses gap-related problems in rule execution.',
+      ],
+      toolCalls: [
+        {
+          id: 'security.rule_monitoring',
+          criteria: [ // NOTE(review): the question asks about gap-related issues but neither tool-call criterion mentions gaps - confirm this is intentional
+            'The tool call should request rules with errors or warnings.',
+            'The tool call should filter for recent timeframe (last 24 hours).',
+          ],
+        },
+      ],
+    },
+    metadata: { category: 'tool-invocation', difficulty: 'medium' },
+  },
+];
diff --git a/...rity/packages/kbn-evals-suite-security-detection-engineering/datasets/workflow_quality.ts b/...rity/packages/kbn-evals-suite-security-detection-engineering/datasets/workflow_quality.ts
@@ -0,0 +1,81 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+export const workflowQualityExamples = [ // Multi-step workflow examples: each question asks for several actions and lists two or three tool ids. NOTE(review): no spec consuming this dataset is visible in this diff - confirm it is wired up.
+  {
+    input: {
+      question:
+        'I want to improve our detection coverage for Lateral Movement techniques. Analyze what we have now, identify gaps, and suggest what prebuilt rules we should enable.',
+    },
+    output: {
+      criteria: [ // four response-level statements per example in this array
+        'The response analyzes current MITRE ATT&CK coverage for Lateral Movement.',
+        'The response identifies specific gaps in detection coverage.',
+        'The response recommends prebuilt rules to fill the gaps.',
+        'The response provides actionable steps to improve coverage.',
+      ],
+      toolCalls: [{ id: 'security.coverage_overview' }, { id: 'security.find_rules' }], // tool ids only, no per-call criteria; NOTE(review): confirm whether order matters
+    },
+    metadata: { category: 'e2e-workflow', difficulty: 'hard' }, // every example in this array uses category 'e2e-workflow'
+  },
+  {
+    input: {
+      question:
+        'The rule "Brute Force Detected" is generating too many false positives. Help me investigate why and tune it — check its recent execution, find any existing exceptions, and suggest improvements.',
+    },
+    output: {
+      criteria: [
+        'The response investigates the noisy rule by checking its execution health.',
+        'The response examines existing exceptions for the rule.',
+        'The response provides specific tuning recommendations.',
+        'The response offers actionable steps to reduce false positives.',
+      ],
+      toolCalls: [
+        { id: 'security.find_rules' },
+        { id: 'security.rule_monitoring' },
+        { id: 'security.manage_exceptions' },
+      ],
+    },
+    metadata: { category: 'e2e-workflow', difficulty: 'hard' },
+  },
+  {
+    input: {
+      question:
+        'We just got new endpoint logs from a Linux fleet. Help me set up detection coverage: check what prebuilt rules are available for Linux, review our current coverage gaps for Initial Access and Execution on Linux, and install relevant prebuilt rules.',
+    },
+    output: {
+      criteria: [
+        'The response identifies prebuilt rules relevant to Linux endpoints.',
+        'The response analyzes MITRE coverage gaps for Initial Access and Execution.',
+        'The response suggests installing specific prebuilt rules.',
+        'The response considers the Linux platform context throughout.',
+      ],
+      toolCalls: [
+        { id: 'security.find_rules' },
+        { id: 'security.coverage_overview' },
+        { id: 'security.manage_rules' }, // NOTE(review): presumably the tool that covers installing prebuilt rules - confirm
+      ],
+    },
+    metadata: { category: 'e2e-workflow', difficulty: 'hard' },
+  },
+  {
+    input: {
+      question:
+        'Give me a health check on all our active detection rules — find any that are failing, have performance issues, or have gap warnings. Prioritize the critical and high severity ones.',
+    },
+    output: {
+      criteria: [
+        'The response provides a health overview of active detection rules.',
+        'The response identifies failing rules and performance issues.',
+        'The response prioritizes critical and high severity rules.',
+        'The response suggests remediation steps for unhealthy rules.',
+      ],
+      toolCalls: [{ id: 'security.rule_monitoring' }, { id: 'security.find_rules' }],
+    },
+    metadata: { category: 'e2e-workflow', difficulty: 'medium' }, // the only example here marked 'medium'; the other three are 'hard'
+  },
+];
diff --git a/...ity/packages/kbn-evals-suite-security-detection-engineering/evals/skill_selection.spec.ts b/...ity/packages/kbn-evals-suite-security-detection-engineering/evals/skill_selection.spec.ts
@@ -0,0 +1,25 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { evaluate } from '../src/evaluate';
+import { skillSelectionExamples } from '../datasets/skill_selection';
+
+evaluate.describe('Detection Engineering Skill - Tool Selection', () => { // Suite containing the single skill-selection eval defined below.
+  evaluate(
+    'selects correct tools for detection engineering queries',
+    async ({ evaluateDataset }) => { // evaluateDataset is destructured from the argument that evaluate (../src/evaluate) passes to this callback
+      await evaluateDataset({ // only a dataset is passed here; NOTE(review): presumably the task and evaluators are configured in ../src/evaluate - confirm
+        dataset: {
+          name: 'detection-engineering: skill selection',
+          description:
+            'Validates that the detection engineering skill invokes the correct tools for different query types',
+          examples: skillSelectionExamples, // imported from ../datasets/skill_selection
+        },
+      });
+    }
+  );
+});
diff --git a/...ity/packages/kbn-evals-suite-security-detection-engineering/evals/tool_invocation.spec.ts b/...ity/packages/kbn-evals-suite-security-detection-engineering/evals/tool_invocation.spec.ts
@@ -0,0 +1,22 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { evaluate } from '../src/evaluate';
+import { toolInvocationExamples } from '../datasets/tool_invocation';
+
+evaluate.describe('Detection Engineering Skill - Tool Invocation', () => { // Suite containing the single tool-invocation eval defined below.
+  evaluate('invokes tools with correct parameters', async ({ evaluateDataset }) => { // evaluateDataset is destructured from the argument that evaluate (../src/evaluate) passes to this callback
+    await evaluateDataset({ // only a dataset is passed here; NOTE(review): presumably the task and evaluators are configured in ../src/evaluate - confirm
+      dataset: {
+        name: 'detection-engineering: tool invocation',
+        description:
+          'Validates that the detection engineering skill invokes tools with correct parameters and criteria',
+        examples: toolInvocationExamples, // imported from ../datasets/tool_invocation
+      },
+    });
+  });
+});