8 changes: 8 additions & 0 deletions README.md
@@ -86,6 +86,7 @@ lightspeed-eval --system-config config/system_api_disabled.yaml --eval-data conf
- Response Evaluation
  - [`answer_correctness`](src/lightspeed_evaluation/core/metrics/custom.py)
  - [`intent_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Evaluates whether the response demonstrates the expected intent or purpose
  - [`keywords_eval`](src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py) - Keyword evaluation against alternative keyword sets (ALL keywords in a set must match, case-insensitive)
- Tool Evaluation
  - [`tool_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Validates tool calls and arguments with regex pattern matching
- **Script-based**
@@ -149,6 +150,9 @@ metrics_metadata:

"custom:tool_eval":
description: "Tool call evaluation comparing expected vs actual tool calls (regex for arguments)"

"custom:keywords_eval": # Binary evaluation (0 or 1)
description: "Keywords evaluation (ALL match) with sequential alternate checking (case insensitive)"

conversation_level:
"deepeval:conversation_completeness":
@@ -226,12 +230,14 @@ embedding:
      contexts:
        - OpenShift Virtualization is an extension of the OpenShift ...
      attachments: [] # Attachments (Optional)
      expected_keywords: [["virtualization"], ["openshift"]] # For keywords_eval evaluation
      expected_response: OpenShift Virtualization is an extension of the OpenShift Container Platform that allows running virtual machines alongside containers
      expected_intent: "explain a concept" # Expected intent for intent evaluation

      # Per-turn metrics (overrides system defaults)
      turn_metrics:
        - "ragas:faithfulness"
        - "custom:keywords_eval"
        - "custom:answer_correctness"
        - "custom:intent_eval"

@@ -289,6 +295,7 @@ embedding:
| `response` | string | 📋 | Actual response from system | ✅ (if API enabled) |
| `contexts` | list[string] | 📋 | Context information for evaluation | ✅ (if API enabled) |
| `attachments` | list[string] | ❌ | Attachments | ❌ |
| `expected_keywords` | list[list[string]] | 📋 | Expected keywords for keyword evaluation (list of alternative keyword sets) | ❌ |
| `expected_response` | string | 📋 | Expected response for comparison | ❌ |
| `expected_intent` | string | 📋 | Expected intent for intent evaluation | ❌ |
| `expected_tool_calls` | list[list[list[dict]]] | 📋 | Expected tool call sequences (multiple alternative sets) | ❌ |
@@ -300,6 +307,7 @@ embedding:
> 📋 **Required based on metrics**: Some fields are required only when using specific metrics

Examples
> - `expected_keywords`: Required for `custom:keywords_eval` (case-insensitive matching)
> - `expected_response`: Required for `custom:answer_correctness`
> - `expected_intent`: Required for `custom:intent_eval`
> - `expected_tool_calls`: Required for `custom:tool_eval` (multiple alternative sets format)
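
As an illustration of the `expected_keywords` semantics described above, here is a minimal standalone Python sketch. The response text and keyword values are hypothetical; it mirrors the new metric's matching logic rather than calling it:

```python
# Sketch of keywords_eval semantics: alternative keyword sets are tried in
# order, and the metric passes (score 1.0) as soon as one set has ALL of its
# keywords present in the response (case-insensitive substring match).
response = "OpenShift Virtualization lets you run virtual machines alongside containers."
expected_keywords = [["virtual machine", "containers"], ["openshift"]]  # hypothetical data

response_lower = response.lower()
score = 0.0
for option in expected_keywords:
    if all(keyword.lower() in response_lower for keyword in option):
        score = 1.0  # first fully matched alternative wins
        break

print(score)  # 1.0 -> "virtual machine" and "containers" both appear in the response
```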
3 changes: 3 additions & 0 deletions config/system.yaml
@@ -76,6 +76,9 @@ metrics_metadata:
description: "Is what we retrieved actually relevant to user query?"

# Custom metrics
"custom:keywords_eval": # boolean eval (either 0 or 1)
description: "Keywords (ALL) matching evaluation with alternative sets"

"custom:answer_correctness":
threshold: 0.75
description: "Correctness vs expected answer using custom LLM evaluation"
2 changes: 2 additions & 0 deletions src/lightspeed_evaluation/core/metrics/custom/__init__.py
@@ -1,6 +1,7 @@
"""Custom metrics components package."""

from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics
from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
from lightspeed_evaluation.core.metrics.custom.prompts import (
    ANSWER_CORRECTNESS_PROMPT,
    INTENT_EVALUATION_PROMPT,
@@ -9,6 +10,7 @@

__all__ = [
"CustomMetrics",
"evaluate_keywords",
"evaluate_tool_calls",
# Prompts
"ANSWER_CORRECTNESS_PROMPT",
2 changes: 2 additions & 0 deletions src/lightspeed_evaluation/core/metrics/custom/custom.py
@@ -9,6 +9,7 @@
    ANSWER_CORRECTNESS_PROMPT,
    INTENT_EVALUATION_PROMPT,
)
from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
from lightspeed_evaluation.core.system.exceptions import LLMError
@@ -28,6 +29,7 @@ def __init__(self, llm_manager: LLMManager):
        )

        self.supported_metrics = {
            "keywords_eval": evaluate_keywords,
            "answer_correctness": self._evaluate_answer_correctness,
            "intent_eval": self._evaluate_intent,
            "tool_eval": self._evaluate_tool_calls,
129 changes: 129 additions & 0 deletions src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py
@@ -0,0 +1,129 @@
"""Keywords evaluation utilities."""

from typing import Any, Optional

from lightspeed_evaluation.core.models import TurnData


def _validate_inputs(
is_conversation: bool, turn_data: Optional[TurnData]
) -> Optional[tuple[Optional[float], str]]:
"""Validate inputs for keywords evaluation."""
if is_conversation:
return None, "Keywords eval is a turn-level metric"

if turn_data is None:
return None, "TurnData is required for keywords eval evaluation"

if not turn_data.expected_keywords:
return None, "No expected keywords provided for keywords eval evaluation"

if not turn_data.response:
return 0.0, "No response provided for keywords eval evaluation"

return None


def _check_keyword_list(
keyword_list: list[str], response_lower: str
) -> tuple[list[str], bool]:
"""Check if all keywords in a list match the response."""
matched_keywords = []
all_matched = True

for keyword in keyword_list:
if keyword.lower() in response_lower:
matched_keywords.append(keyword)
else:
all_matched = False

return matched_keywords, all_matched


def _create_success_result(
list_index: int, matched_keywords: list[str]
) -> tuple[float, str]:
"""Create success result for keywords evaluation."""
matched_str = ", ".join(f"'{kw}'" for kw in matched_keywords)
reason = (
f"Keywords eval successful: Option {list_index + 1} - "
f"all keywords matched: {matched_str}"
)
return 1.0, reason


def _create_failure_result(
expected_keywords: list[list[str]], response_lower: str
) -> tuple[float, str]:
"""Create failure result for keywords evaluation."""
failed_details = []

for list_index, keyword_list in enumerate(expected_keywords):
matched_keywords, _ = _check_keyword_list(keyword_list, response_lower)
unmatched_keywords = [
kw for kw in keyword_list if kw.lower() not in response_lower
]

if unmatched_keywords:
unmatched_str = ", ".join(f"'{kw}'" for kw in unmatched_keywords)
matched_str = (
", ".join(f"'{kw}'" for kw in matched_keywords)
if matched_keywords
else "none"
)
failed_details.append(
f"Option {list_index + 1}: unmatched [{unmatched_str}], matched [{matched_str}]"
)

reason = f"Keywords eval failed: All options failed - {'; '.join(failed_details)}"
return 0.0, reason


def evaluate_keywords(
_conv_data: Any,
_turn_idx: Optional[int],
turn_data: Optional[TurnData],
is_conversation: bool,
) -> tuple[Optional[float], str]:
"""Evaluate keywords using substring matching with sequential list checking.

Logic: Check first option - if all keywords match, evaluation succeeds.
If first option fails, try next alternative, and so on.
If all alternatives fail, evaluation fails.

Args:
_conv_data: Conversation data (unused)
_turn_idx: Turn index (unused)
turn_data: Turn data containing response and expected keywords
is_conversation: Whether this is conversation-level evaluation

Returns:
tuple: (score: float, reason: str)
- score: 1.0 if any keyword list has all keywords matched, 0.0 otherwise
- reason: Detailed explanation of evaluation results
"""
# Validate inputs
validation_result = _validate_inputs(is_conversation, turn_data)
if validation_result:
return validation_result

if (
turn_data is None
or turn_data.response is None
or turn_data.expected_keywords is None
):
return None, "Invalid turn data after validation"

response_lower = turn_data.response.lower()

# Check each expected keywords list
for list_index, keyword_list in enumerate(turn_data.expected_keywords):
matched_keywords, all_matched = _check_keyword_list(
keyword_list, response_lower
)

if all_matched:
return _create_success_result(list_index, matched_keywords)

# If we reach here, all alternatives failed
return _create_failure_result(turn_data.expected_keywords, response_lower)
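
For reference, a rough usage sketch of the new metric function. The `TurnData` construction below is an assumption for illustration only; field names other than `response` and `expected_keywords`, and any additional required fields, may differ in the actual model:

```python
# Hypothetical driver for the keywords_eval metric; the TurnData fields used
# here (e.g. `query`) are assumed and the real model may require more fields.
from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
from lightspeed_evaluation.core.models import TurnData

turn = TurnData(
    query="What is OpenShift Virtualization?",  # assumed field
    response="OpenShift Virtualization runs virtual machines alongside containers.",
    expected_keywords=[["virtualization"], ["openshift"]],
)

score, reason = evaluate_keywords(None, None, turn, is_conversation=False)
print(score, reason)  # expected: 1.0 "Keywords eval successful: Option 1 - ..."
```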
34 changes: 34 additions & 0 deletions src/lightspeed_evaluation/core/models/data.py
@@ -53,6 +53,10 @@ class TurnData(BaseModel):
    contexts: Optional[list[str]] = Field(
        default=None, min_length=1, description="Contexts"
    )
    expected_keywords: Optional[list[list[str]]] = Field(
        default=None,
        description="Expected keywords for keyword evaluation (list of alternatives)",
    )
    expected_response: Optional[str] = Field(
        default=None, min_length=1, description="Expected response for comparison"
    )
@@ -89,6 +93,36 @@ def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]:
        v = _validate_and_deduplicate_metrics(v, "Turn metric")
        return v

    @field_validator("expected_keywords")
    @classmethod
    def validate_expected_keywords(
        cls, v: Optional[list[list[str]]]
    ) -> Optional[list[list[str]]]:
        """Validate expected keywords when provided."""
        if v is None:
            return None

        if not isinstance(v, list):
            raise ValueError("expected_keywords must be a list of lists")

        # Validate each alternative group
        for i, keyword_group in enumerate(v):
            if not isinstance(keyword_group, list):
                raise ValueError(f"expected_keywords[{i}] must be a list of strings")

            if not keyword_group:
                raise ValueError(f"expected_keywords[{i}] cannot be empty")

            for j, keyword in enumerate(keyword_group):
                if not isinstance(keyword, str):
                    raise ValueError(f"expected_keywords[{i}][{j}] must be a string")
                if not keyword.strip():
                    raise ValueError(
                        f"expected_keywords[{i}][{j}] cannot be empty or whitespace"
                    )

        return v

    @field_validator("expected_tool_calls", mode="before")
    @classmethod
    def validate_expected_tool_calls(
4 changes: 4 additions & 0 deletions src/lightspeed_evaluation/core/system/validator.py
@@ -40,6 +40,10 @@
"required_fields": ["response", "contexts"],
"description": "requires 'response' and 'contexts' fields",
},
"custom:keywords_eval": {
"required_fields": ["response", "expected_keywords"],
"description": "requires 'response' and 'expected_keywords' fields",
},
"custom:answer_correctness": {
"required_fields": ["response", "expected_response"],
"description": "requires 'response' and 'expected_response' fields",