Skip to content

Commit dbac173

Browse files
authored
Merge pull request #41 from asamal4/arg-val-regex-check
switch to regex check for tool arg value
2 parents 2bbafdd + b888ebf commit dbac173

File tree

3 files changed

+67
-10
lines changed

3 files changed

+67
-10
lines changed

lsc_agent_eval/README.md

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ A framework for evaluating AI agent performance.
1010
- `action_eval`: Script-based evaluation using verification script (similar to [k8s-bench](https://github.com/GoogleCloudPlatform/kubectl-ai/tree/main/k8s-bench))
1111
- `response_eval:sub-string`: Simple substring matching evaluation (ALL keywords must be present in response; case-insensitive)
1212
- `response_eval:accuracy`: LLM-based evaluation using a judge model. Result is either accurate or not in comparison to expected response
13-
- `tool_eval`: Tool call evaluation comparing expected vs actual tool calls with arguments
13+
- `tool_eval`: Tool call evaluation comparing expected vs actual tool calls and their arguments. Argument values are currently matched only by a case-sensitive regex pattern search (not by exact comparison)
1414
- **Setup/Cleanup Scripts**: Support for running setup and cleanup scripts before/after evaluation
1515
- **Result Tracking**: Result tracking with CSV output and JSON statistics
1616
- **Standalone Package**: Can be installed and used independently of the main lightspeed-core-evaluation package
@@ -66,7 +66,7 @@ Each evaluation within a conversation can include:
6666
- `eval_types`: List of evaluation types to run (action_eval, tool_eval, response_eval:sub-string, response_eval:accuracy)
6767
- `expected_response`: Expected response (for response_eval:accuracy evaluation)
6868
- `expected_keywords`: Keywords to look for (for response_eval:sub-string evaluation)
69-
- `expected_tool_calls`: Expected tool call sequences (list of lists) with tool_name and arguments (for tool_eval)
69+
- `expected_tool_calls`: Expected tool call sequences (list of lists) with tool_name and arguments (for tool_eval). Each expected argument value is treated as a regex pattern and searched for in the actual argument value
7070
- `eval_verify_script`: Verification script (for action_eval evaluation)
7171
- `description`: Description of the evaluation (Optional)
7272

@@ -133,6 +133,13 @@ Note: `eval_id` can't contain duplicate values within a conversation group. But
133133
oc_get_args: [namespaces, openshift-lightspeed]
134134
expected_keywords: ["yes", "openshift-lightspeed"]
135135
description: Tool call with argument validation and response verification
136+
- eval_id: eval3
137+
eval_query: get the log for the abc-pod
138+
eval_types: [tool_eval]
139+
expected_tool_calls:
140+
- - tool_name: get_logs
141+
arguments:
142+
oc_get_args: abc-\w+
136143

137144
# Single-turn Conversations
138145
- conversation_group: conv3

lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/tool_call_eval.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Tool call evaluation utilities."""
22

33
import logging
4+
import re
45
from typing import Any, Callable
56

67
logger = logging.getLogger(__name__)
@@ -68,7 +69,7 @@ def _compare_single_tool_call(expected: dict[str, Any], actual: dict[str, Any])
6869

6970

7071
def _compare_tool_arguments(expected: dict[str, Any], actual: dict[str, Any]) -> bool:
71-
"""Compare tool arguments."""
72+
"""Compare tool arguments name & value (regex pattern for the value)."""
7273
if not isinstance(expected, dict) or not isinstance(actual, dict):
7374
logger.debug(
7475
"Argument type mismatch: expected dict, got %s and %s",
@@ -77,19 +78,33 @@ def _compare_tool_arguments(expected: dict[str, Any], actual: dict[str, Any]) ->
7778
)
7879
return False
7980

80-
# Direct comparison is not done to have better debugging ability
81+
# Compare each expected argument
8182
for key, expected_value in expected.items():
8283
if key not in actual:
8384
logger.debug("Missing argument key: '%s'", key)
8485
return False
86+
8587
actual_value = actual[key]
86-
if expected_value != actual_value:
88+
89+
expected_str = str(expected_value)
90+
actual_str = str(actual_value)
91+
92+
# Use regex search for flexible matching
93+
# This is a quick work-around, enhance this to use both regex & exact match.
94+
try:
95+
if not re.search(expected_str, actual_str):
96+
logger.debug(
97+
"Argument value mismatch for '%s': pattern '%s' not found in '%s'",
98+
key,
99+
expected_str,
100+
actual_str,
101+
)
102+
return False
103+
except re.error as e:
87104
logger.debug(
88-
"Argument value mismatch for '%s': expected %s, got %s",
89-
key,
90-
expected_value,
91-
actual_value,
105+
"Invalid regex pattern '%s' for key '%s': %s", expected_str, key, e
92106
)
107+
# If regex is invalid, fail the comparison
93108
return False
94109

95110
# Check for extra keys in actual

lsc_agent_eval/tests/core/agent_goal_eval/test_tool_call_eval.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def test_wrong_argument_value(self):
9393

9494
# Check that the specific argument mismatch was logged
9595
mock_logger.debug.assert_any_call(
96-
"Argument value mismatch for '%s': expected %s, got %s",
96+
"Argument value mismatch for '%s': pattern '%s' not found in '%s'",
9797
"image",
9898
"nginx",
9999
"apache",
@@ -125,3 +125,38 @@ def test_missing_arguments_field(self):
125125
actual = [[{"tool_name": "simple_call"}]]
126126

127127
assert compare_tool_calls(expected, actual)
128+
129+
def test_multiple_arguments_with_regex(self):
130+
"""Test multiple arguments where some use regex patterns."""
131+
expected = [
132+
[
133+
{
134+
"tool_name": "get_log",
135+
"arguments": {
136+
"name": "pod-\\d+",
137+
"kind": "pod",
138+
"namespace": "default",
139+
},
140+
}
141+
]
142+
]
143+
actual = [
144+
[
145+
{
146+
"tool_name": "get_log",
147+
"arguments": {
148+
"name": "pod-123",
149+
"kind": "pod",
150+
"namespace": "default",
151+
},
152+
}
153+
]
154+
]
155+
assert compare_tool_calls(expected, actual) is True
156+
157+
def test_invalid_regex_fails(self):
158+
"""Test invalid regex patterns."""
159+
expected = [[{"tool_name": "oc_get", "arguments": {"name": "["}}]]
160+
actual = [[{"tool_name": "oc_get", "arguments": {"name": "["}}]]
161+
# Should fail due to invalid regex pattern
162+
assert compare_tool_calls(expected, actual) is False

0 commit comments

Comments
 (0)