diff --git a/README.md b/README.md index 5aedc683..83d1caee 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ lightspeed-eval --system-config config/system_api_disabled.yaml --eval-data conf - Response Evaluation - [`answer_correctness`](src/lightspeed_evaluation/core/metrics/custom.py) - [`intent_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Evaluates whether the response demonstrates the expected intent or purpose + - [`keywords_eval`](src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py) - Keywords evaluation with alternatives (ALL keywords must match, case insensitive) - Tool Evaluation - [`tool_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Validates tool calls and arguments with regex pattern matching - **Script-based** @@ -149,6 +150,9 @@ metrics_metadata: "custom:tool_eval": description: "Tool call evaluation comparing expected vs actual tool calls (regex for arguments)" + + "custom:keywords_eval": # Binary evaluation (0 or 1) + description: "Keywords evaluation (ALL match) with sequential alternate checking (case insensitive)" conversation_level: "deepeval:conversation_completeness": @@ -226,12 +230,14 @@ embedding: contexts: - OpenShift Virtualization is an extension of the OpenShift ... 
attachments: [] # Attachments (Optional) + expected_keywords: [["virtualization"], ["openshift"]] # For keywords_eval: alternative keyword groups (ALL keywords in one group must match) expected_response: OpenShift Virtualization is an extension of the OpenShift Container Platform that allows running virtual machines alongside containers expected_intent: "explain a concept" # Expected intent for intent evaluation # Per-turn metrics (overrides system defaults) turn_metrics: - "ragas:faithfulness" + - "custom:keywords_eval" - "custom:answer_correctness" - "custom:intent_eval" @@ -289,6 +295,7 @@ embedding: | `response` | string | 📋 | Actual response from system | ✅ (if API enabled) | | `contexts` | list[string] | 📋 | Context information for evaluation | ✅ (if API enabled) | | `attachments` | list[string] | ❌ | Attachments | ❌ | +| `expected_keywords` | list[list[string]] | 📋 | Expected keywords for keyword evaluation (list of alternative keyword groups; all keywords in a group must match) | ❌ | | `expected_response` | string | 📋 | Expected response for comparison | ❌ | | `expected_intent` | string | 📋 | Expected intent for intent evaluation| ❌ | | `expected_tool_calls` | list[list[list[dict]]] | 📋 | Expected tool call sequences (multiple alternative sets) | ❌ | @@ -300,6 +307,7 @@ embedding: > 📋 **Required based on metrics**: Some fields are required only when using specific metrics Examples +> - `expected_keywords`: Required for `custom:keywords_eval` (case-insensitive substring matching) > - `expected_response`: Required for `custom:answer_correctness` > - `expected_intent`: Required for `custom:intent_eval` > - `expected_tool_calls`: Required for `custom:tool_eval` (multiple alternative sets format) diff --git a/config/system.yaml b/config/system.yaml index 24cab1f2..435ec687 100644 --- a/config/system.yaml +++ b/config/system.yaml @@ -76,6 +76,9 @@ metrics_metadata: description: "Is what we retrieved actually relevant to user query?" 
# Custom metrics + "custom:keywords_eval": # boolean eval (either 0 or 1) + description: "Keywords (ALL) matching evaluation with alternative sets" + "custom:answer_correctness": threshold: 0.75 description: "Correctness vs expected answer using custom LLM evaluation" diff --git a/src/lightspeed_evaluation/core/metrics/custom/__init__.py b/src/lightspeed_evaluation/core/metrics/custom/__init__.py index a9985573..112dafa5 100644 --- a/src/lightspeed_evaluation/core/metrics/custom/__init__.py +++ b/src/lightspeed_evaluation/core/metrics/custom/__init__.py @@ -1,6 +1,7 @@ """Custom metrics components package.""" from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics +from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords from lightspeed_evaluation.core.metrics.custom.prompts import ( ANSWER_CORRECTNESS_PROMPT, INTENT_EVALUATION_PROMPT, @@ -9,6 +10,7 @@ __all__ = [ "CustomMetrics", + "evaluate_keywords", "evaluate_tool_calls", # Prompts "ANSWER_CORRECTNESS_PROMPT", diff --git a/src/lightspeed_evaluation/core/metrics/custom/custom.py b/src/lightspeed_evaluation/core/metrics/custom/custom.py index 079329db..e2639638 100644 --- a/src/lightspeed_evaluation/core/metrics/custom/custom.py +++ b/src/lightspeed_evaluation/core/metrics/custom/custom.py @@ -9,6 +9,7 @@ ANSWER_CORRECTNESS_PROMPT, INTENT_EVALUATION_PROMPT, ) +from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls from lightspeed_evaluation.core.models import EvaluationScope, TurnData from lightspeed_evaluation.core.system.exceptions import LLMError @@ -28,6 +29,7 @@ def __init__(self, llm_manager: LLMManager): ) self.supported_metrics = { + "keywords_eval": evaluate_keywords, "answer_correctness": self._evaluate_answer_correctness, "intent_eval": self._evaluate_intent, "tool_eval": self._evaluate_tool_calls, diff --git 
a/src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py b/src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py new file mode 100644 index 00000000..b8876547 --- /dev/null +++ b/src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py @@ -0,0 +1,129 @@ +"""Keywords evaluation utilities.""" + +from typing import Any, Optional + +from lightspeed_evaluation.core.models import TurnData + + +def _validate_inputs( + is_conversation: bool, turn_data: Optional[TurnData] +) -> Optional[tuple[Optional[float], str]]: + """Validate inputs for keywords evaluation.""" + if is_conversation: + return None, "Keywords eval is a turn-level metric" + + if turn_data is None: + return None, "TurnData is required for keywords eval evaluation" + + if not turn_data.expected_keywords: + return None, "No expected keywords provided for keywords eval evaluation" + + if not turn_data.response: + return 0.0, "No response provided for keywords eval evaluation" + + return None + + +def _check_keyword_list( + keyword_list: list[str], response_lower: str +) -> tuple[list[str], bool]: + """Check if all keywords in a list match the response.""" + matched_keywords = [] + all_matched = True + + for keyword in keyword_list: + if keyword.lower() in response_lower: + matched_keywords.append(keyword) + else: + all_matched = False + + return matched_keywords, all_matched + + +def _create_success_result( + list_index: int, matched_keywords: list[str] +) -> tuple[float, str]: + """Create success result for keywords evaluation.""" + matched_str = ", ".join(f"'{kw}'" for kw in matched_keywords) + reason = ( + f"Keywords eval successful: Option {list_index + 1} - " + f"all keywords matched: {matched_str}" + ) + return 1.0, reason + + +def _create_failure_result( + expected_keywords: list[list[str]], response_lower: str +) -> tuple[float, str]: + """Create failure result for keywords evaluation.""" + failed_details = [] + + for list_index, keyword_list in enumerate(expected_keywords): + 
matched_keywords, _ = _check_keyword_list(keyword_list, response_lower) + unmatched_keywords = [ + kw for kw in keyword_list if kw.lower() not in response_lower + ] + + if unmatched_keywords: + unmatched_str = ", ".join(f"'{kw}'" for kw in unmatched_keywords) + matched_str = ( + ", ".join(f"'{kw}'" for kw in matched_keywords) + if matched_keywords + else "none" + ) + failed_details.append( + f"Option {list_index + 1}: unmatched [{unmatched_str}], matched [{matched_str}]" + ) + + reason = f"Keywords eval failed: All options failed - {'; '.join(failed_details)}" + return 0.0, reason + + +def evaluate_keywords( + _conv_data: Any, + _turn_idx: Optional[int], + turn_data: Optional[TurnData], + is_conversation: bool, +) -> tuple[Optional[float], str]: + """Evaluate keywords using substring matching with sequential list checking. + + Logic: Check first option - if all keywords match, evaluation succeeds. + If first option fails, try next alternative, and so on. + If all alternatives fail, evaluation fails. 
+ + Args: + _conv_data: Conversation data (unused) + _turn_idx: Turn index (unused) + turn_data: Turn data containing response and expected keywords + is_conversation: Whether this is conversation-level evaluation + + Returns: + tuple: (score: float, reason: str) + - score: 1.0 if any keyword list has all keywords matched, 0.0 otherwise + - reason: Detailed explanation of evaluation results + """ + # Validate inputs + validation_result = _validate_inputs(is_conversation, turn_data) + if validation_result: + return validation_result + + if ( + turn_data is None + or turn_data.response is None + or turn_data.expected_keywords is None + ): + return None, "Invalid turn data after validation" + + response_lower = turn_data.response.lower() + + # Check each expected keywords list + for list_index, keyword_list in enumerate(turn_data.expected_keywords): + matched_keywords, all_matched = _check_keyword_list( + keyword_list, response_lower + ) + + if all_matched: + return _create_success_result(list_index, matched_keywords) + + # If we reach here, all alternatives failed + return _create_failure_result(turn_data.expected_keywords, response_lower) diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py index 3041b025..d7a692a4 100644 --- a/src/lightspeed_evaluation/core/models/data.py +++ b/src/lightspeed_evaluation/core/models/data.py @@ -53,6 +53,10 @@ class TurnData(BaseModel): contexts: Optional[list[str]] = Field( default=None, min_length=1, description="Contexts" ) + expected_keywords: Optional[list[list[str]]] = Field( + default=None, + description="Expected keywords for keyword evaluation (list of alternatives)", + ) expected_response: Optional[str] = Field( default=None, min_length=1, description="Expected response for comparison" ) @@ -89,6 +93,36 @@ def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]: v = _validate_and_deduplicate_metrics(v, "Turn metric") return v + 
@field_validator("expected_keywords") + @classmethod + def validate_expected_keywords( + cls, v: Optional[list[list[str]]] + ) -> Optional[list[list[str]]]: + """Validate expected keywords when provided.""" + if v is None: + return None + + if not isinstance(v, list): + raise ValueError("expected_keywords must be a list of lists") + + # Validate each alternative group + for i, keyword_group in enumerate(v): + if not isinstance(keyword_group, list): + raise ValueError(f"expected_keywords[{i}] must be a list of strings") + + if not keyword_group: + raise ValueError(f"expected_keywords[{i}] cannot be empty") + + for j, keyword in enumerate(keyword_group): + if not isinstance(keyword, str): + raise ValueError(f"expected_keywords[{i}][{j}] must be a string") + if not keyword.strip(): + raise ValueError( + f"expected_keywords[{i}][{j}] cannot be empty or whitespace" + ) + + return v + @field_validator("expected_tool_calls", mode="before") @classmethod def validate_expected_tool_calls( diff --git a/src/lightspeed_evaluation/core/system/validator.py b/src/lightspeed_evaluation/core/system/validator.py index bc5b3aba..98705d1f 100644 --- a/src/lightspeed_evaluation/core/system/validator.py +++ b/src/lightspeed_evaluation/core/system/validator.py @@ -40,6 +40,10 @@ "required_fields": ["response", "contexts"], "description": "requires 'response' and 'contexts' fields", }, + "custom:keywords_eval": { + "required_fields": ["response", "expected_keywords"], + "description": "requires 'response' and 'expected_keywords' fields", + }, "custom:answer_correctness": { "required_fields": ["response", "expected_response"], "description": "requires 'response' and 'expected_response' fields", diff --git a/tests/unit/core/metrics/test_keywords_eval.py b/tests/unit/core/metrics/test_keywords_eval.py new file mode 100644 index 00000000..df140918 --- /dev/null +++ b/tests/unit/core/metrics/test_keywords_eval.py @@ -0,0 +1,189 @@ +"""Tests for keywords eval metric.""" + +from 
lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords +from lightspeed_evaluation.core.models import TurnData + + +class TestKeywordsEval: + """Test cases for keywords eval metric.""" + + def test_keywords_eval_first_list_all_matched(self): + """Test successful keywords evaluation when first list has all keywords matched.""" + turn_data = TurnData( + turn_id="test_turn", + query="Test query", + response="This response contains openshift-monitoring and yes it exists", + expected_keywords=[ + ["yes", "openshift-monitoring"], # Option 1: Both keywords should match + ["confirmed", "monitoring"], # Option 2: Should not be checked + ], + ) + + score, reason = evaluate_keywords(None, 0, turn_data, False) + + assert score == 1.0 + assert "Keywords eval successful: Option 1" in reason + assert "all keywords matched: 'yes', 'openshift-monitoring'" in reason + + def test_keywords_eval_first_list_fails_second_succeeds(self): + """Test keywords evaluation when first list fails but second list succeeds.""" + turn_data = TurnData( + turn_id="test_turn", + query="Test query", + response="This response contains monitoring and confirmed status", + expected_keywords=[ + [ + "yes", + "openshift-monitoring", + ], # Option 1: "yes" missing, "openshift-monitoring" missing + ["monitoring", "confirmed"], # Option 2: Both should match + ], + ) + + score, reason = evaluate_keywords(None, 0, turn_data, False) + + assert score == 1.0 + assert "Keywords eval successful: Option 2" in reason + assert "all keywords matched: 'monitoring', 'confirmed'" in reason + + def test_keywords_eval_all_lists_fail(self): + """Test keywords evaluation when all lists fail.""" + turn_data = TurnData( + turn_id="test_turn", + query="Test query", + response="This response contains nothing relevant", + expected_keywords=[ + ["yes", "openshift-monitoring"], # Option 1: Both missing + ["confirmed", "monitoring"], # Option 2: Both missing + ], + ) + + score, reason = evaluate_keywords(None, 
0, turn_data, False) + + assert score == 0.0 + assert "Keywords eval failed: All options failed" in reason + assert ( + "Option 1: unmatched ['yes', 'openshift-monitoring'], matched [none]" + in reason + ) + assert ( + "Option 2: unmatched ['confirmed', 'monitoring'], matched [none]" in reason + ) + + def test_keywords_eval_partial_match_in_failed_list(self): + """Test keywords evaluation with partial matches in failed lists.""" + turn_data = TurnData( + turn_id="test_turn", + query="Test query", + response="This response contains monitoring but no confirmation", + expected_keywords=[ + ["yes", "confirmed"], # Option 1: Both missing + [ + "monitoring", + "openshift", + ], # Option 2: "monitoring" matches, "openshift" missing + ], + ) + + score, reason = evaluate_keywords(None, 0, turn_data, False) + + assert score == 0.0 + assert "Keywords eval failed: All options failed" in reason + assert "Option 1: unmatched ['yes', 'confirmed'], matched [none]" in reason + assert "Option 2: unmatched ['openshift'], matched ['monitoring']" in reason + + def test_keywords_eval_case_insensitive(self): + """Test that keywords evaluation is case insensitive.""" + turn_data = TurnData( + turn_id="test_turn", + query="Test query", + response="This response contains YES and OPENSHIFT-MONITORING", + expected_keywords=[ + ["yes", "openshift-monitoring"] # Should match despite case differences + ], + ) + + score, reason = evaluate_keywords(None, 0, turn_data, False) + + assert score == 1.0 + assert "Keywords eval successful: Option 1" in reason + assert "all keywords matched: 'yes', 'openshift-monitoring'" in reason + + def test_keywords_eval_substring_matching(self): + """Test that keywords evaluation works with substring matching.""" + turn_data = TurnData( + turn_id="test_turn", + query="Test query", + response="The openshift-monitoring-operator is running successfully", + expected_keywords=[ + [ + "monitoring", + "success", + ] # Should match "monitoring" in 
"openshift-monitoring-operator" + ], + ) + + score, reason = evaluate_keywords(None, 0, turn_data, False) + + assert score == 1.0 + assert "Keywords eval successful: Option 1" in reason + assert "all keywords matched: 'monitoring', 'success'" in reason + + def test_keywords_eval_no_expected_keywords(self): + """Test keywords evaluation when no expected keywords provided.""" + turn_data = TurnData( + turn_id="test_turn", + query="Test query", + response="Some response", + expected_keywords=None, + ) + + score, reason = evaluate_keywords(None, 0, turn_data, False) + + assert score is None + assert "No expected keywords provided" in reason + + def test_keywords_eval_no_response(self): + """Test keywords evaluation when no response provided.""" + turn_data = TurnData( + turn_id="test_turn", + query="Test query", + response=None, + expected_keywords=[["yes"], ["monitoring"]], + ) + + score, reason = evaluate_keywords(None, 0, turn_data, False) + + assert score == 0.0 + assert "No response provided" in reason + + def test_keywords_eval_empty_response(self): + """Test keywords evaluation with empty response.""" + # Create turn data with valid response first, then modify it + turn_data = TurnData( + turn_id="test_turn", + query="Test query", + response="valid response", + expected_keywords=[["yes"], ["monitoring"]], + ) + # Manually set response to empty to bypass validation + turn_data.response = "" + + score, reason = evaluate_keywords(None, 0, turn_data, False) + + assert score == 0.0 + assert "No response provided" in reason + + def test_keywords_eval_conversation_level_error(self): + """Test that keywords_eval returns error for conversation-level evaluation.""" + score, reason = evaluate_keywords(None, None, None, True) + + assert score is None + assert "Keywords eval is a turn-level metric" in reason + + def test_keywords_eval_no_turn_data(self): + """Test keywords evaluation when no turn data provided.""" + score, reason = evaluate_keywords(None, 0, None, False) + + 
assert score is None + assert "TurnData is required" in reason diff --git a/tests/unit/core/models/test_data.py b/tests/unit/core/models/test_data.py index b6d86ff9..4c306dac 100644 --- a/tests/unit/core/models/test_data.py +++ b/tests/unit/core/models/test_data.py @@ -332,3 +332,136 @@ def test_is_single_set_format_detection(self): assert expected is not None assert len(expected) == 1 # One alternative set assert len(expected[0]) == 2 # Two sequences in that set + + +class TestTurnDataKeywordsValidation: + """Test cases for expected_keywords validation in TurnData.""" + + def test_valid_expected_keywords_single_group(self): + """Test valid expected_keywords with single group.""" + turn_data = TurnData( + turn_id="test_turn", + query="Test query", + expected_keywords=[["keyword1", "keyword2"]], + ) + + assert turn_data.expected_keywords == [["keyword1", "keyword2"]] + + def test_valid_expected_keywords_multiple_groups(self): + """Test valid expected_keywords with multiple groups.""" + turn_data = TurnData( + turn_id="test_turn", + query="Test query", + expected_keywords=[ + ["yes", "confirmed"], + ["monitoring", "namespace"], + ["success", "complete"], + ], + ) + + assert len(turn_data.expected_keywords) == 3 + assert turn_data.expected_keywords[0] == ["yes", "confirmed"] + assert turn_data.expected_keywords[1] == ["monitoring", "namespace"] + assert turn_data.expected_keywords[2] == ["success", "complete"] + + def test_valid_expected_keywords_none(self): + """Test that None is valid for expected_keywords.""" + turn_data = TurnData( + turn_id="test_turn", query="Test query", expected_keywords=None + ) + + assert turn_data.expected_keywords is None + + def test_invalid_expected_keywords_not_list(self): + """Test that non-list expected_keywords raises ValidationError.""" + with pytest.raises(ValidationError) as exc_info: + TurnData( + turn_id="test_turn", query="Test query", expected_keywords="not_a_list" + ) + + assert "Input should be a valid list" in 
str(exc_info.value) + + def test_invalid_expected_keywords_inner_not_list(self): + """Test that non-list inner elements raise ValidationError.""" + with pytest.raises(ValidationError) as exc_info: + TurnData( + turn_id="test_turn", + query="Test query", + expected_keywords=["not_a_list", ["valid_list"]], + ) + + assert "Input should be a valid list" in str(exc_info.value) + + def test_invalid_expected_keywords_empty_inner_list(self): + """Test that empty inner lists raise ValidationError.""" + with pytest.raises(ValidationError) as exc_info: + TurnData( + turn_id="test_turn", + query="Test query", + expected_keywords=[[], ["valid_list"]], + ) + + assert "expected_keywords[0] cannot be empty" in str(exc_info.value) + + def test_invalid_expected_keywords_non_string_element(self): + """Test that non-string elements in inner lists raise ValidationError.""" + with pytest.raises(ValidationError) as exc_info: + TurnData( + turn_id="test_turn", + query="Test query", + expected_keywords=[["valid_string", 123]], + ) + + assert "Input should be a valid string" in str(exc_info.value) + + def test_invalid_expected_keywords_empty_string_element(self): + """Test that empty string elements raise ValidationError.""" + with pytest.raises(ValidationError) as exc_info: + TurnData( + turn_id="test_turn", + query="Test query", + expected_keywords=[["valid_string", ""]], + ) + + assert "expected_keywords[0][1] cannot be empty or whitespace" in str( + exc_info.value + ) + + def test_invalid_expected_keywords_whitespace_only_element(self): + """Test that whitespace-only string elements raise ValidationError.""" + with pytest.raises(ValidationError) as exc_info: + TurnData( + turn_id="test_turn", + query="Test query", + expected_keywords=[["valid_string", " "]], + ) + + assert "expected_keywords[0][1] cannot be empty or whitespace" in str( + exc_info.value + ) + + def test_complex_valid_expected_keywords(self): + """Test complex but valid expected_keywords structure.""" + turn_data = 
TurnData( + turn_id="test_turn", + query="Test query", + expected_keywords=[ + ["yes", "confirmed", "affirmative"], + [ + "openshift-monitoring", + "monitoring namespace", + ], + [ + "created successfully", + "creation complete", + "successfully created", + ], + ["pod", "container", "workload"], + ], + ) + + assert len(turn_data.expected_keywords) == 4 + assert len(turn_data.expected_keywords[0]) == 3 + assert len(turn_data.expected_keywords[1]) == 2 + assert len(turn_data.expected_keywords[2]) == 3 + assert len(turn_data.expected_keywords[3]) == 3