Merge pull request #41 from asamal4/arg-val-regex-check

tisnik · web-flow · commit dbac173a9ee1 · 2025-09-02T11:52:45.000+02:00
switch to regex check for tool arg value
diff --git a/lsc_agent_eval/README.md b/lsc_agent_eval/README.md
@@ -10,7 +10,7 @@ A framework for evaluating AI agent performance.
   - `action_eval`: Script-based evaluation using verification script (similar to [k8s-bench](https://github.com/GoogleCloudPlatform/kubectl-ai/tree/main/k8s-bench))
   - `response_eval:sub-string`: Simple substring matching evaluation (ALL keywords must be present in response; case-insensitive)
   - `response_eval:accuracy`: LLM-based evaluation using a judge model. Result is either accurate or not in comparison to expected response
-  - `tool_eval`: Tool call evaluation comparing expected vs actual tool calls with arguments
+  - `tool_eval`: Tool call evaluation comparing expected vs actual tool calls with arguments. Each expected argument value is treated as a regex pattern (case-sensitive) and searched for in the actual value
 - **Setup/Cleanup Scripts**: Support for running setup and cleanup scripts before/after evaluation
 - **Result Tracking**: Result tracking with CSV output and JSON statistics
 - **Standalone Package**: Can be installed and used independently of the main lightspeed-core-evaluation package
@@ -66,7 +66,7 @@ Each evaluation within a conversation can include:
 - `eval_types`: List of evaluation types to run (action_eval, tool_eval, response_eval:sub-string, response_eval:accuracy)
 - `expected_response`: Expected response (for response_eval:accuracy evaluation)
 - `expected_keywords`: Keywords to look for (for response_eval:sub-string evaluation)
-- `expected_tool_calls`: Expected tool call sequences (list of lists) with tool_name and arguments (for tool_eval)
+- `expected_tool_calls`: Expected tool call sequences (list of lists) with tool_name and arguments (for tool_eval); each argument value is matched as a regex pattern
 - `eval_verify_script`: Verification script (for action_eval evaluation)
 - `description`: Description of the evaluation (Optional)
 
@@ -133,6 +133,13 @@ Note: `eval_id` can't contain duplicate values within a conversation group. But
               oc_get_args: [namespaces, openshift-lightspeed]
       expected_keywords: ["yes", "openshift-lightspeed"]
       description: Tool call with argument validation and response verification
+    - eval_id: eval3
+      eval_query: get the log for the abc-pod
+      eval_types: [tool_eval]
+      expected_tool_calls:
+        - - tool_name: get_logs
+            arguments:
+              oc_get_args: abc-\w+
 
 # Single-turn Conversations
 - conversation_group: conv3
diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/tool_call_eval.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/tool_call_eval.py
@@ -1,6 +1,7 @@
 """Tool call evaluation utilities."""
 
 import logging
+import re
 from typing import Any, Callable
 
 logger = logging.getLogger(__name__)
@@ -68,7 +69,7 @@ def _compare_single_tool_call(expected: dict[str, Any], actual: dict[str, Any])
 
 
 def _compare_tool_arguments(expected: dict[str, Any], actual: dict[str, Any]) -> bool:
-    """Compare tool arguments."""
+    """Compare tool argument names and values (each value is matched as a regex pattern)."""
     if not isinstance(expected, dict) or not isinstance(actual, dict):
         logger.debug(
             "Argument type mismatch: expected dict, got %s and %s",
@@ -77,19 +78,33 @@ def _compare_tool_arguments(expected: dict[str, Any], actual: dict[str, Any]) ->
         )
         return False
 
-    # Direct comparison is not done to have better debugging ability
+    # Compare each expected argument
     for key, expected_value in expected.items():
         if key not in actual:
             logger.debug("Missing argument key: '%s'", key)
             return False
+
         actual_value = actual[key]
-        if expected_value != actual_value:
+
+        expected_str = str(expected_value)
+        actual_str = str(actual_value)
+
+        # Use regex search for flexible matching
+        # TODO: this is a quick workaround; extend it to support both regex and exact matching.
+        try:
+            if not re.search(expected_str, actual_str):
+                logger.debug(
+                    "Argument value mismatch for '%s': pattern '%s' not found in '%s'",
+                    key,
+                    expected_str,
+                    actual_str,
+                )
+                return False
+        except re.error as e:
             logger.debug(
-                "Argument value mismatch for '%s': expected %s, got %s",
-                key,
-                expected_value,
-                actual_value,
+                "Invalid regex pattern '%s' for key '%s': %s", expected_str, key, e
             )
+            # If regex is invalid, fail the comparison
             return False
 
     # Check for extra keys in actual
diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_tool_call_eval.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_tool_call_eval.py
@@ -93,7 +93,7 @@ def test_wrong_argument_value(self):
 
             # Check that the specific argument mismatch was logged
             mock_logger.debug.assert_any_call(
-                "Argument value mismatch for '%s': expected %s, got %s",
+                "Argument value mismatch for '%s': pattern '%s' not found in '%s'",
                 "image",
                 "nginx",
                 "apache",
@@ -125,3 +125,38 @@ def test_missing_arguments_field(self):
         actual = [[{"tool_name": "simple_call"}]]
 
         assert compare_tool_calls(expected, actual)
+
+    def test_multiple_arguments_with_regex(self):
+        """Test multiple arguments where some use regex patterns."""
+        expected = [
+            [
+                {
+                    "tool_name": "get_log",
+                    "arguments": {
+                        "name": "pod-\\d+",
+                        "kind": "pod",
+                        "namespace": "default",
+                    },
+                }
+            ]
+        ]
+        actual = [
+            [
+                {
+                    "tool_name": "get_log",
+                    "arguments": {
+                        "name": "pod-123",
+                        "kind": "pod",
+                        "namespace": "default",
+                    },
+                }
+            ]
+        ]
+        assert compare_tool_calls(expected, actual) is True
+
+    def test_invalid_regex_fails(self):
+        """Test invalid regex patterns."""
+        expected = [[{"tool_name": "oc_get", "arguments": {"name": "["}}]]
+        actual = [[{"tool_name": "oc_get", "arguments": {"name": "["}}]]
+        # Should fail due to invalid regex pattern
+        assert compare_tool_calls(expected, actual) is False