Skip to content

Commit f31911f

Browse files
authored
Merge pull request #93 from asamal4/keyword-eval
Add keyword eval metric
2 parents 39d7119 + 4e6ba3e commit f31911f

File tree

9 files changed

+504
-0
lines changed

9 files changed

+504
-0
lines changed

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ lightspeed-eval --system-config config/system_api_disabled.yaml --eval-data conf
8686
- Response Evaluation
8787
- [`answer_correctness`](src/lightspeed_evaluation/core/metrics/custom.py)
8888
- [`intent_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Evaluates whether the response demonstrates the expected intent or purpose
89+
- [`keywords_eval`](src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py) - Keywords evaluation with alternatives (ALL keywords must match, case insensitive)
8990
- Tool Evaluation
9091
- [`tool_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Validates tool calls and arguments with regex pattern matching
9192
- **Script-based**
@@ -149,6 +150,9 @@ metrics_metadata:
149150

150151
"custom:tool_eval":
151152
description: "Tool call evaluation comparing expected vs actual tool calls (regex for arguments)"
153+
154+
"custom:keywords_eval": # Binary evaluation (0 or 1)
155+
description: "Keywords evaluation (ALL match) with sequential alternate checking (case insensitive)"
152156

153157
conversation_level:
154158
"deepeval:conversation_completeness":
@@ -226,12 +230,14 @@ embedding:
226230
contexts:
227231
- OpenShift Virtualization is an extension of the OpenShift ...
228232
attachments: [] # Attachments (Optional)
233+
expected_keywords: [["virtualization"], ["openshift"]] # For keywords_eval evaluation
229234
expected_response: OpenShift Virtualization is an extension of the OpenShift Container Platform that allows running virtual machines alongside containers
230235
expected_intent: "explain a concept" # Expected intent for intent evaluation
231236

232237
# Per-turn metrics (overrides system defaults)
233238
turn_metrics:
234239
- "ragas:faithfulness"
240+
- "custom:keywords_eval"
235241
- "custom:answer_correctness"
236242
- "custom:intent_eval"
237243

@@ -289,6 +295,7 @@ embedding:
289295
| `response` | string | 📋 | Actual response from system | ✅ (if API enabled) |
290296
| `contexts` | list[string] | 📋 | Context information for evaluation | ✅ (if API enabled) |
291297
| `attachments` | list[string] | ❌ | Attachments | ❌ |
298+
| `expected_keywords` | list[list[string]] | 📋 | Expected keywords for keyword evaluation (list of alternatives) | ❌ |
292299
| `expected_response` | string | 📋 | Expected response for comparison | ❌ |
293300
| `expected_intent` | string | 📋 | Expected intent for intent evaluation| ❌ |
294301
| `expected_tool_calls` | list[list[list[dict]]] | 📋 | Expected tool call sequences (multiple alternative sets) | ❌ |
@@ -300,6 +307,7 @@ embedding:
300307
> 📋 **Required based on metrics**: Some fields are required only when using specific metrics
301308

302309
Examples
310+
> - `expected_keywords`: Required for `custom:keywords_eval` (case insensitive matching)
303311
> - `expected_response`: Required for `custom:answer_correctness`
304312
> - `expected_intent`: Required for `custom:intent_eval`
305313
> - `expected_tool_calls`: Required for `custom:tool_eval` (multiple alternative sets format)

config/system.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ metrics_metadata:
7676
description: "Is what we retrieved actually relevant to user query?"
7777

7878
# Custom metrics
79+
"custom:keywords_eval": # boolean eval (either 0 or 1)
80+
description: "Keywords (ALL) matching evaluation with alternative sets"
81+
7982
"custom:answer_correctness":
8083
threshold: 0.75
8184
description: "Correctness vs expected answer using custom LLM evaluation"

src/lightspeed_evaluation/core/metrics/custom/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Custom metrics components package."""
22

33
from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics
4+
from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
45
from lightspeed_evaluation.core.metrics.custom.prompts import (
56
ANSWER_CORRECTNESS_PROMPT,
67
INTENT_EVALUATION_PROMPT,
@@ -9,6 +10,7 @@
910

1011
__all__ = [
1112
"CustomMetrics",
13+
"evaluate_keywords",
1214
"evaluate_tool_calls",
1315
# Prompts
1416
"ANSWER_CORRECTNESS_PROMPT",

src/lightspeed_evaluation/core/metrics/custom/custom.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
ANSWER_CORRECTNESS_PROMPT,
1010
INTENT_EVALUATION_PROMPT,
1111
)
12+
from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
1213
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
1314
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
1415
from lightspeed_evaluation.core.system.exceptions import LLMError
@@ -28,6 +29,7 @@ def __init__(self, llm_manager: LLMManager):
2829
)
2930

3031
self.supported_metrics = {
32+
"keywords_eval": evaluate_keywords,
3133
"answer_correctness": self._evaluate_answer_correctness,
3234
"intent_eval": self._evaluate_intent,
3335
"tool_eval": self._evaluate_tool_calls,
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""Keywords evaluation utilities."""
2+
3+
from typing import Any, Optional
4+
5+
from lightspeed_evaluation.core.models import TurnData
6+
7+
8+
def _validate_inputs(
    is_conversation: bool, turn_data: Optional[TurnData]
) -> Optional[tuple[Optional[float], str]]:
    """Check preconditions for keywords evaluation.

    Returns an early (score, reason) result when evaluation cannot
    proceed, or None when all inputs are usable. A missing response is
    a scored failure (0.0); the other cases yield no score at all.
    """
    if is_conversation:
        # Keywords matching only applies to a single turn's response.
        early = (None, "Keywords eval is a turn-level metric")
    elif turn_data is None:
        early = (None, "TurnData is required for keywords eval evaluation")
    elif not turn_data.expected_keywords:
        early = (None, "No expected keywords provided for keywords eval evaluation")
    elif not turn_data.response:
        # Nothing to search in: definitive failure rather than a skip.
        early = (0.0, "No response provided for keywords eval evaluation")
    else:
        early = None
    return early
25+
26+
27+
def _check_keyword_list(
28+
keyword_list: list[str], response_lower: str
29+
) -> tuple[list[str], bool]:
30+
"""Check if all keywords in a list match the response."""
31+
matched_keywords = []
32+
all_matched = True
33+
34+
for keyword in keyword_list:
35+
if keyword.lower() in response_lower:
36+
matched_keywords.append(keyword)
37+
else:
38+
all_matched = False
39+
40+
return matched_keywords, all_matched
41+
42+
43+
def _create_success_result(
44+
list_index: int, matched_keywords: list[str]
45+
) -> tuple[float, str]:
46+
"""Create success result for keywords evaluation."""
47+
matched_str = ", ".join(f"'{kw}'" for kw in matched_keywords)
48+
reason = (
49+
f"Keywords eval successful: Option {list_index + 1} - "
50+
f"all keywords matched: {matched_str}"
51+
)
52+
return 1.0, reason
53+
54+
55+
def _create_failure_result(
56+
expected_keywords: list[list[str]], response_lower: str
57+
) -> tuple[float, str]:
58+
"""Create failure result for keywords evaluation."""
59+
failed_details = []
60+
61+
for list_index, keyword_list in enumerate(expected_keywords):
62+
matched_keywords, _ = _check_keyword_list(keyword_list, response_lower)
63+
unmatched_keywords = [
64+
kw for kw in keyword_list if kw.lower() not in response_lower
65+
]
66+
67+
if unmatched_keywords:
68+
unmatched_str = ", ".join(f"'{kw}'" for kw in unmatched_keywords)
69+
matched_str = (
70+
", ".join(f"'{kw}'" for kw in matched_keywords)
71+
if matched_keywords
72+
else "none"
73+
)
74+
failed_details.append(
75+
f"Option {list_index + 1}: unmatched [{unmatched_str}], matched [{matched_str}]"
76+
)
77+
78+
reason = f"Keywords eval failed: All options failed - {'; '.join(failed_details)}"
79+
return 0.0, reason
80+
81+
82+
def evaluate_keywords(
    _conv_data: Any,
    _turn_idx: Optional[int],
    turn_data: Optional[TurnData],
    is_conversation: bool,
) -> tuple[Optional[float], str]:
    """Binary keyword check: 1.0 when any alternative set fully matches.

    Alternative keyword sets are tried in order; the first set whose
    keywords all appear (case-insensitive substring match) in the
    response wins. If no set fully matches, the score is 0.0.

    Args:
        _conv_data: Conversation data (unused)
        _turn_idx: Turn index (unused)
        turn_data: Turn data containing response and expected keywords
        is_conversation: Whether this is conversation-level evaluation

    Returns:
        tuple: (score: float, reason: str)
            - score: 1.0 if any keyword list has all keywords matched,
              0.0 otherwise (None when evaluation is not applicable)
            - reason: Detailed explanation of evaluation results
    """
    early_exit = _validate_inputs(is_conversation, turn_data)
    if early_exit:
        return early_exit

    # Narrow the Optionals for type-checkers; _validate_inputs already
    # ruled these out, so this branch is effectively unreachable.
    if (
        turn_data is None
        or turn_data.response is None
        or turn_data.expected_keywords is None
    ):
        return None, "Invalid turn data after validation"

    haystack = turn_data.response.lower()
    alternatives = turn_data.expected_keywords

    # Sequential check: the first fully-matching alternative wins.
    for idx, group in enumerate(alternatives):
        matched, complete = _check_keyword_list(group, haystack)
        if complete:
            return _create_success_result(idx, matched)

    # No alternative matched in full.
    return _create_failure_result(alternatives, haystack)

src/lightspeed_evaluation/core/models/data.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ class TurnData(BaseModel):
5353
contexts: Optional[list[str]] = Field(
5454
default=None, min_length=1, description="Contexts"
5555
)
56+
expected_keywords: Optional[list[list[str]]] = Field(
57+
default=None,
58+
description="Expected keywords for keyword evaluation (list of alternatives)",
59+
)
5660
expected_response: Optional[str] = Field(
5761
default=None, min_length=1, description="Expected response for comparison"
5862
)
@@ -89,6 +93,36 @@ def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]:
8993
v = _validate_and_deduplicate_metrics(v, "Turn metric")
9094
return v
9195

96+
@field_validator("expected_keywords")
@classmethod
def validate_expected_keywords(
    cls, v: Optional[list[list[str]]]
) -> Optional[list[list[str]]]:
    """Validate expected keywords when provided.

    Ensures the value is a list of non-empty lists of non-blank
    strings; ``None`` (field not supplied) is passed through as-is.
    Raises ValueError (surfaced by pydantic as a validation error)
    on any malformed entry.
    """
    if v is None:
        return None

    # Defensive re-check: pydantic normally enforces the declared
    # type already, but this guards direct calls / lax configs.
    if not isinstance(v, list):
        raise ValueError("expected_keywords must be a list of lists")

    # Validate each alternative group
    for i, keyword_group in enumerate(v):
        if not isinstance(keyword_group, list):
            raise ValueError(f"expected_keywords[{i}] must be a list of strings")

        # An empty group would trivially match every response.
        if not keyword_group:
            raise ValueError(f"expected_keywords[{i}] cannot be empty")

        for j, keyword in enumerate(keyword_group):
            if not isinstance(keyword, str):
                raise ValueError(f"expected_keywords[{i}][{j}] must be a string")
            # Whitespace-only keywords would match almost anything.
            if not keyword.strip():
                raise ValueError(
                    f"expected_keywords[{i}][{j}] cannot be empty or whitespace"
                )

    return v
125+
92126
@field_validator("expected_tool_calls", mode="before")
93127
@classmethod
94128
def validate_expected_tool_calls(

src/lightspeed_evaluation/core/system/validator.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@
4040
"required_fields": ["response", "contexts"],
4141
"description": "requires 'response' and 'contexts' fields",
4242
},
43+
"custom:keywords_eval": {
44+
"required_fields": ["response", "expected_keywords"],
45+
"description": "requires 'response' and 'expected_keywords' fields",
46+
},
4347
"custom:answer_correctness": {
4448
"required_fields": ["response", "expected_response"],
4549
"description": "requires 'response' and 'expected_response' fields",

0 commit comments

Comments
 (0)