Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -1687,6 +1687,7 @@
"@kbn/evals-suite-obs-ai-assistant": "link:x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant",
"@kbn/evals-suite-observability-ai": "link:x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai",
"@kbn/evals-suite-security-ai-rules": "link:x-pack/solutions/security/packages/kbn-evals-suite-security-ai-rules",
"@kbn/evals-suite-security-detection-engineering": "link:x-pack/solutions/security/packages/kbn-evals-suite-security-detection-engineering",
"@kbn/evals-suite-streams": "link:x-pack/platform/packages/shared/kbn-evals-suite-streams",
"@kbn/expect": "link:src/platform/packages/shared/kbn-expect",
"@kbn/extract-plugin-translations": "link:packages/kbn-extract-plugin-translations",
Expand Down
2 changes: 2 additions & 0 deletions tsconfig.base.json
Original file line number Diff line number Diff line change
Expand Up @@ -1154,6 +1154,8 @@
"@kbn/evals-suite-observability-ai/*": ["x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/*"],
"@kbn/evals-suite-security-ai-rules": ["x-pack/solutions/security/packages/kbn-evals-suite-security-ai-rules"],
"@kbn/evals-suite-security-ai-rules/*": ["x-pack/solutions/security/packages/kbn-evals-suite-security-ai-rules/*"],
"@kbn/evals-suite-security-detection-engineering": ["x-pack/solutions/security/packages/kbn-evals-suite-security-detection-engineering"],
"@kbn/evals-suite-security-detection-engineering/*": ["x-pack/solutions/security/packages/kbn-evals-suite-security-detection-engineering/*"],
"@kbn/evals-suite-streams": ["x-pack/platform/packages/shared/kbn-evals-suite-streams"],
"@kbn/evals-suite-streams/*": ["x-pack/platform/packages/shared/kbn-evals-suite-streams/*"],
"@kbn/event-annotation-common": ["src/platform/packages/shared/kbn-event-annotation-common"],
Expand Down
8 changes: 8 additions & 0 deletions x-pack/platform/packages/shared/kbn-evals/evals.suites.json
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,14 @@
"evals:entity-analytics"
],
"serverConfigSet": "evals_entity_analytics"
},
{
"id": "security-detection-engineering",
"name": "Security Detection Engineering",
"slackChannel": "#security-detection-engine-alerts",
"configPath": "x-pack/solutions/security/packages/kbn-evals-suite-security-detection-engineering/playwright.config.ts",
"tags": ["security", "detection-engineering"],
"ciLabels": ["evals:security-detection-engineering"]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

/**
 * Ids of the tools referenced by the skill-selection examples, collected in
 * one place so each id string is spelled exactly once.
 */
const skillSelectionToolIds = {
  findRules: 'security.find_rules',
  coverageOverview: 'security.coverage_overview',
  ruleMonitoring: 'security.rule_monitoring',
  manageExceptions: 'security.manage_exceptions',
  previewRule: 'security.preview_rule',
  manageRules: 'security.manage_rules',
};

// Rule search: a plain listing request should map to the rule finder.
const enabledRulesByRiskScore = {
  input: { question: 'Show me all enabled detection rules sorted by risk score' },
  output: {
    criteria: [
      'The response references detection rules and their properties.',
      'The response includes information about rule status and risk scores.',
    ],
    toolCalls: [{ id: skillSelectionToolIds.findRules }],
  },
  metadata: { category: 'rule-search', difficulty: 'easy' },
};

// Coverage analysis: a MITRE ATT&CK coverage question.
const currentMitreCoverage = {
  input: {
    question: 'What MITRE ATT&CK techniques do we currently have detection coverage for?',
  },
  output: {
    criteria: [
      'The response discusses MITRE ATT&CK coverage.',
      'The response references specific techniques or tactics.',
    ],
    toolCalls: [{ id: skillSelectionToolIds.coverageOverview }],
  },
  metadata: { category: 'coverage-analysis', difficulty: 'easy' },
};

// Rule monitoring: a question about failing rules / execution errors.
const rulesWithExecutionErrors = {
  input: {
    question: 'Are any of our detection rules failing? Show me rules with execution errors.',
  },
  output: {
    criteria: [
      'The response discusses rule execution health.',
      'The response identifies rules with errors or performance issues.',
    ],
    toolCalls: [{ id: skillSelectionToolIds.ruleMonitoring }],
  },
  metadata: { category: 'rule-monitoring', difficulty: 'easy' },
};

// Exception management: adding an exception to a named rule.
const addExceptionToNamedRule = {
  input: {
    question:
      'I need to add an exception for the rule "Suspicious PowerShell Execution" to exclude our admin scripts.',
  },
  output: {
    criteria: [
      'The response addresses creating an exception for the specified rule.',
      'The response guides the user on defining exception conditions.',
    ],
    toolCalls: [{ id: skillSelectionToolIds.manageExceptions }],
  },
  metadata: { category: 'exception-management', difficulty: 'medium' },
};

// Rule preview: testing an ES|QL query over a time window.
const previewEsqlDetectionQuery = {
  input: {
    question:
      'Test this ES|QL detection query against the last 24 hours of data: FROM logs-endpoint.events.process-* | WHERE process.name == "mimikatz.exe"',
  },
  output: {
    criteria: [
      'The response provides results or analysis from previewing the rule query.',
      'The response discusses the query execution against live data.',
    ],
    toolCalls: [{ id: skillSelectionToolIds.previewRule }],
  },
  metadata: { category: 'rule-preview', difficulty: 'medium' },
};

// Rule management: the only example here that lists two tools (find, then manage).
const enableCredentialAccessRules = {
  input: {
    question: 'Enable all disabled rules that target credential access techniques.',
  },
  output: {
    criteria: [
      'The response addresses enabling rules related to credential access.',
      'The response references bulk rule management operations.',
    ],
    toolCalls: [
      { id: skillSelectionToolIds.findRules },
      { id: skillSelectionToolIds.manageRules },
    ],
  },
  metadata: { category: 'rule-management', difficulty: 'medium' },
};

/**
 * Skill-selection examples for the detection engineering skill.
 *
 * Every entry has the same shape:
 *  - `input.question`: the user prompt,
 *  - `output.criteria`: statements describing the expected response,
 *  - `output.toolCalls`: `{ id }` objects naming the expected tools,
 *  - `metadata`: `category` and `difficulty` labels.
 *
 * The order of entries is part of the exported value and is kept stable.
 */
export const skillSelectionExamples = [
  enabledRulesByRiskScore,
  currentMitreCoverage,
  rulesWithExecutionErrors,
  addExceptionToNamedRule,
  previewEsqlDetectionQuery,
  enableCredentialAccessRules,
];
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

// Every example in this dataset carries the same metadata category.
const TOOL_INVOCATION_CATEGORY = 'tool-invocation';

// Expected call for "disabled EQL rules": both filters must be present.
const findDisabledEqlRulesCall = {
  id: 'security.find_rules',
  criteria: [
    'The tool call should filter for rules with type "eql".',
    'The tool call should filter for rules that are disabled.',
  ],
};

// Expected call for the KQL preview: rule type/language, timeframe and query.
const previewKqlQueryCall = {
  id: 'security.preview_rule',
  criteria: [
    'The tool call should use a "query" rule type with "kuery" language.',
    'The tool call should set the timeframe to cover 7 days.',
    'The tool call should include the provided KQL query.',
  ],
};

// Expected call for the tactic-filtered coverage overview.
const coverageByTacticCall = {
  id: 'security.coverage_overview',
  criteria: [
    'The tool call should filter by the Persistence (TA0003) and Privilege Escalation (TA0004) tactics.',
  ],
};

// Expected call for the svchost.exe exception-overlap check.
const findOverlappingExceptionsCall = {
  id: 'security.manage_exceptions',
  criteria: [
    'The tool call should use the find_overlaps or find operation.',
    'The tool call should reference svchost.exe in the query parameters.',
  ],
};

// Expected call for recently failing rules.
const recentFailingRulesCall = {
  id: 'security.rule_monitoring',
  criteria: [
    'The tool call should request rules with errors or warnings.',
    'The tool call should filter for recent timeframe (last 24 hours).',
  ],
};

/**
 * Tool-invocation examples for the detection engineering skill.
 *
 * Each entry pairs a user question (`input.question`) with response-level
 * criteria (`output.criteria`) and exactly one expected tool call
 * (`output.toolCalls[0]`), which carries its own `criteria` describing the
 * parameters the call should use. `metadata` labels the example with a
 * category and a difficulty.
 */
export const toolInvocationExamples = [
  {
    input: { question: 'Find all rules of type "eql" that are currently disabled' },
    output: {
      criteria: ['The response lists EQL rules that are disabled.'],
      toolCalls: [findDisabledEqlRulesCall],
    },
    metadata: { category: TOOL_INVOCATION_CATEGORY, difficulty: 'easy' },
  },
  {
    input: {
      question:
        'Preview this KQL query for the last 7 days: host.os.type:windows AND process.name:("cmd.exe" OR "powershell.exe") AND event.action:"start"',
    },
    output: {
      criteria: [
        'The response includes a preview of the rule query results.',
        'The response discusses the number of matches found.',
      ],
      toolCalls: [previewKqlQueryCall],
    },
    metadata: { category: TOOL_INVOCATION_CATEGORY, difficulty: 'medium' },
  },
  {
    input: {
      question:
        'Get the MITRE ATT&CK coverage overview filtered to only Persistence and Privilege Escalation tactics',
    },
    output: {
      criteria: [
        'The response provides MITRE ATT&CK coverage information.',
        'The response focuses on Persistence and Privilege Escalation tactics.',
      ],
      toolCalls: [coverageByTacticCall],
    },
    metadata: { category: TOOL_INVOCATION_CATEGORY, difficulty: 'medium' },
  },
  {
    input: {
      question:
        'Check if there are any duplicate or overlapping exceptions across our rules for the process "svchost.exe"',
    },
    output: {
      criteria: [
        'The response addresses exception overlap analysis.',
        'The response discusses exceptions related to svchost.exe.',
      ],
      toolCalls: [findOverlappingExceptionsCall],
    },
    metadata: { category: TOOL_INVOCATION_CATEGORY, difficulty: 'hard' },
  },
  {
    input: {
      question: 'Show me rules that have been failing in the last 24 hours with gap-related issues',
    },
    output: {
      criteria: [
        'The response identifies rules with execution issues.',
        'The response discusses gap-related problems in rule execution.',
      ],
      toolCalls: [recentFailingRulesCall],
    },
    metadata: { category: TOOL_INVOCATION_CATEGORY, difficulty: 'medium' },
  },
];
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

// All workflow scenarios share this metadata category.
const WORKFLOW_CATEGORY = 'e2e-workflow';

// Scenario: analyze Lateral Movement coverage, find gaps, recommend prebuilt rules.
const improveLateralMovementCoverage = {
  input: {
    question:
      'I want to improve our detection coverage for Lateral Movement techniques. Analyze what we have now, identify gaps, and suggest what prebuilt rules we should enable.',
  },
  output: {
    criteria: [
      'The response analyzes current MITRE ATT&CK coverage for Lateral Movement.',
      'The response identifies specific gaps in detection coverage.',
      'The response recommends prebuilt rules to fill the gaps.',
      'The response provides actionable steps to improve coverage.',
    ],
    toolCalls: ['security.coverage_overview', 'security.find_rules'].map((id) => ({ id })),
  },
  metadata: { category: WORKFLOW_CATEGORY, difficulty: 'hard' },
};

// Scenario: investigate and tune a rule that produces too many false positives.
const tuneNoisyBruteForceRule = {
  input: {
    question:
      'The rule "Brute Force Detected" is generating too many false positives. Help me investigate why and tune it — check its recent execution, find any existing exceptions, and suggest improvements.',
  },
  output: {
    criteria: [
      'The response investigates the noisy rule by checking its execution health.',
      'The response examines existing exceptions for the rule.',
      'The response provides specific tuning recommendations.',
      'The response offers actionable steps to reduce false positives.',
    ],
    toolCalls: [
      'security.find_rules',
      'security.rule_monitoring',
      'security.manage_exceptions',
    ].map((id) => ({ id })),
  },
  metadata: { category: WORKFLOW_CATEGORY, difficulty: 'hard' },
};

// Scenario: set up detection coverage for newly onboarded Linux endpoint logs.
const onboardLinuxFleet = {
  input: {
    question:
      'We just got new endpoint logs from a Linux fleet. Help me set up detection coverage: check what prebuilt rules are available for Linux, review our current coverage gaps for Initial Access and Execution on Linux, and install relevant prebuilt rules.',
  },
  output: {
    criteria: [
      'The response identifies prebuilt rules relevant to Linux endpoints.',
      'The response analyzes MITRE coverage gaps for Initial Access and Execution.',
      'The response suggests installing specific prebuilt rules.',
      'The response considers the Linux platform context throughout.',
    ],
    toolCalls: [
      'security.find_rules',
      'security.coverage_overview',
      'security.manage_rules',
    ].map((id) => ({ id })),
  },
  metadata: { category: WORKFLOW_CATEGORY, difficulty: 'hard' },
};

// Scenario: health check across active rules, prioritizing critical/high severity.
const activeRulesHealthCheck = {
  input: {
    question:
      'Give me a health check on all our active detection rules — find any that are failing, have performance issues, or have gap warnings. Prioritize the critical and high severity ones.',
  },
  output: {
    criteria: [
      'The response provides a health overview of active detection rules.',
      'The response identifies failing rules and performance issues.',
      'The response prioritizes critical and high severity rules.',
      'The response suggests remediation steps for unhealthy rules.',
    ],
    toolCalls: ['security.rule_monitoring', 'security.find_rules'].map((id) => ({ id })),
  },
  metadata: { category: WORKFLOW_CATEGORY, difficulty: 'medium' },
};

/**
 * End-to-end workflow examples for the detection engineering skill.
 *
 * Each entry holds a multi-step user request (`input.question`), four
 * response criteria (`output.criteria`), the ids of the tools expected to be
 * involved (`output.toolCalls`, as `{ id }` objects in the listed order) and
 * `metadata` with a category and difficulty.
 */
export const workflowQualityExamples = [
  improveLateralMovementCoverage,
  tuneNoisyBruteForceRule,
  onboardLinuxFleet,
  activeRulesHealthCheck,
];
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { evaluate } from '../src/evaluate';
import { skillSelectionExamples } from '../datasets/skill_selection';

// Eval suite: passes the skill-selection examples to `evaluateDataset` as one dataset.
evaluate.describe('Detection Engineering Skill - Tool Selection', () => {
  evaluate(
    'selects correct tools for detection engineering queries',
    // NOTE(review): the argument is presumably a Playwright-style fixtures object;
    // the object-destructuring form is kept as-is — confirm before changing it.
    async ({ evaluateDataset }) => {
      // Built inside the callback so a fresh object is created on every run.
      const dataset = {
        name: 'detection-engineering: skill selection',
        description:
          'Validates that the detection engineering skill invokes the correct tools for different query types',
        examples: skillSelectionExamples,
      };

      await evaluateDataset({ dataset });
    }
  );
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { evaluate } from '../src/evaluate';
import { toolInvocationExamples } from '../datasets/tool_invocation';

// Eval suite: passes the tool-invocation examples to `evaluateDataset` as one dataset.
evaluate.describe('Detection Engineering Skill - Tool Invocation', () => {
  evaluate(
    'invokes tools with correct parameters',
    // NOTE(review): the argument is presumably a Playwright-style fixtures object;
    // the object-destructuring form is kept as-is — confirm before changing it.
    async ({ evaluateDataset }) => {
      // Built inside the callback so a fresh object is created on every run.
      const dataset = {
        name: 'detection-engineering: tool invocation',
        description:
          'Validates that the detection engineering skill invokes tools with correct parameters and criteria',
        examples: toolInvocationExamples,
      };

      await evaluateDataset({ dataset });
    }
  );
});
Loading
Loading