Add unit tests for the keywords eval metric and TurnData expected_keywords validation

asamal4 · asamal4 · commit ab6a85ab9fd4 · 2025-11-06T19:20:42.000+05:30
diff --git a/tests/unit/core/metrics/test_keywords_eval.py b/tests/unit/core/metrics/test_keywords_eval.py
@@ -0,0 +1,189 @@
+"""Tests for keywords eval metric."""
+
+from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
+from lightspeed_evaluation.core.models import TurnData
+
+
+class TestKeywordsEval:
+    """Unit tests for the evaluate_keywords turn-level custom metric."""
+
+    def test_keywords_eval_first_list_all_matched(self):
+        """Return 1.0 citing Option 1 when the first list's keywords all match."""
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            response="This response contains openshift-monitoring and yes it exists",
+            expected_keywords=[
+                ["yes", "openshift-monitoring"],  # Option 1: both keywords present
+                ["confirmed", "monitoring"],  # Option 2: "confirmed" is absent
+            ],
+        )
+
+        score, reason = evaluate_keywords(None, 0, turn_data, False)
+
+        assert score == 1.0
+        assert "Keywords eval successful: Option 1" in reason
+        assert "all keywords matched: 'yes', 'openshift-monitoring'" in reason
+
+    def test_keywords_eval_first_list_fails_second_succeeds(self):
+        """Return 1.0 citing Option 2 when only the second list fully matches."""
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            response="This response contains monitoring and confirmed status",
+            expected_keywords=[
+                [
+                    "yes",
+                    "openshift-monitoring",
+                ],  # Option 1: neither keyword appears in the response
+                ["monitoring", "confirmed"],  # Option 2: both keywords present
+            ],
+        )
+
+        score, reason = evaluate_keywords(None, 0, turn_data, False)
+
+        assert score == 1.0
+        assert "Keywords eval successful: Option 2" in reason
+        assert "all keywords matched: 'monitoring', 'confirmed'" in reason
+
+    def test_keywords_eval_all_lists_fail(self):
+        """Return 0.0 with a per-option breakdown when no list fully matches."""
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            response="This response contains nothing relevant",
+            expected_keywords=[
+                ["yes", "openshift-monitoring"],  # Option 1: both missing
+                ["confirmed", "monitoring"],  # Option 2: both missing
+            ],
+        )
+
+        score, reason = evaluate_keywords(None, 0, turn_data, False)
+
+        assert score == 0.0
+        assert "Keywords eval failed: All options failed" in reason
+        assert (
+            "Option 1: unmatched ['yes', 'openshift-monitoring'], matched [none]"
+            in reason
+        )
+        assert (
+            "Option 2: unmatched ['confirmed', 'monitoring'], matched [none]" in reason
+        )
+
+    def test_keywords_eval_partial_match_in_failed_list(self):
+        """Report matched and unmatched keywords separately for each failed option."""
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            response="This response contains monitoring but no confirmation",
+            expected_keywords=[
+                ["yes", "confirmed"],  # Option 1: both missing
+                [
+                    "monitoring",
+                    "openshift",
+                ],  # Option 2: "monitoring" matches, "openshift" is missing
+            ],
+        )
+
+        score, reason = evaluate_keywords(None, 0, turn_data, False)
+
+        assert score == 0.0
+        assert "Keywords eval failed: All options failed" in reason
+        assert "Option 1: unmatched ['yes', 'confirmed'], matched [none]" in reason
+        assert "Option 2: unmatched ['openshift'], matched ['monitoring']" in reason
+
+    def test_keywords_eval_case_insensitive(self):
+        """Match keywords regardless of letter case in the response."""
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            response="This response contains YES and OPENSHIFT-MONITORING",
+            expected_keywords=[
+                ["yes", "openshift-monitoring"]  # Lowercase vs. uppercase response
+            ],
+        )
+
+        score, reason = evaluate_keywords(None, 0, turn_data, False)
+
+        assert score == 1.0
+        assert "Keywords eval successful: Option 1" in reason
+        assert "all keywords matched: 'yes', 'openshift-monitoring'" in reason
+
+    def test_keywords_eval_substring_matching(self):
+        """Match keywords that occur as substrings of longer response words."""
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            response="The openshift-monitoring-operator is running successfully",
+            expected_keywords=[
+                [
+                    "monitoring",
+                    "success",
+                ]  # Substrings of "openshift-monitoring-operator" and "successfully"
+            ],
+        )
+
+        score, reason = evaluate_keywords(None, 0, turn_data, False)
+
+        assert score == 1.0
+        assert "Keywords eval successful: Option 1" in reason
+        assert "all keywords matched: 'monitoring', 'success'" in reason
+
+    def test_keywords_eval_no_expected_keywords(self):
+        """Return a None score when expected_keywords is None."""
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            response="Some response",
+            expected_keywords=None,
+        )
+
+        score, reason = evaluate_keywords(None, 0, turn_data, False)
+
+        assert score is None
+        assert "No expected keywords provided" in reason
+
+    def test_keywords_eval_no_response(self):
+        """Return 0.0 when the response is None."""
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            response=None,
+            expected_keywords=[["yes"], ["monitoring"]],
+        )
+
+        score, reason = evaluate_keywords(None, 0, turn_data, False)
+
+        assert score == 0.0
+        assert "No response provided" in reason
+
+    def test_keywords_eval_empty_response(self):
+        """Return 0.0 when the response is an empty string."""
+        # Create turn data with valid response first, then modify it
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            response="valid response",
+            expected_keywords=[["yes"], ["monitoring"]],
+        )
+        # Manually set response to empty to bypass validation
+        turn_data.response = ""
+
+        score, reason = evaluate_keywords(None, 0, turn_data, False)
+
+        assert score == 0.0
+        assert "No response provided" in reason
+
+    def test_keywords_eval_conversation_level_error(self):
+        """Return a None score when invoked as a conversation-level evaluation."""
+        score, reason = evaluate_keywords(None, None, None, True)
+
+        assert score is None
+        assert "Keywords eval is a turn-level metric" in reason
+
+    def test_keywords_eval_no_turn_data(self):
+        """Return a None score when the turn data argument is None."""
+        score, reason = evaluate_keywords(None, 0, None, False)
+
+        assert score is None
+        assert "TurnData is required" in reason
diff --git a/tests/unit/core/models/test_data.py b/tests/unit/core/models/test_data.py
@@ -332,3 +332,136 @@ def test_is_single_set_format_detection(self):
         assert expected is not None
         assert len(expected) == 1  # One alternative set
         assert len(expected[0]) == 2  # Two sequences in that set
+
+
+class TestTurnDataKeywordsValidation:
+    """Validation tests for the TurnData.expected_keywords field."""
+
+    def test_valid_expected_keywords_single_group(self):
+        """Accept a single keyword group and store it unchanged."""
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            expected_keywords=[["keyword1", "keyword2"]],
+        )
+
+        assert turn_data.expected_keywords == [["keyword1", "keyword2"]]
+
+    def test_valid_expected_keywords_multiple_groups(self):
+        """Accept several keyword groups and preserve their order and contents."""
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            expected_keywords=[
+                ["yes", "confirmed"],
+                ["monitoring", "namespace"],
+                ["success", "complete"],
+            ],
+        )
+
+        assert len(turn_data.expected_keywords) == 3
+        assert turn_data.expected_keywords[0] == ["yes", "confirmed"]
+        assert turn_data.expected_keywords[1] == ["monitoring", "namespace"]
+        assert turn_data.expected_keywords[2] == ["success", "complete"]
+
+    def test_valid_expected_keywords_none(self):
+        """Accept None as the expected_keywords value."""
+        turn_data = TurnData(
+            turn_id="test_turn", query="Test query", expected_keywords=None
+        )
+
+        assert turn_data.expected_keywords is None
+
+    def test_invalid_expected_keywords_not_list(self):
+        """Reject a plain string given in place of the outer list."""
+        with pytest.raises(ValidationError) as exc_info:
+            TurnData(
+                turn_id="test_turn", query="Test query", expected_keywords="not_a_list"
+            )
+
+        assert "Input should be a valid list" in str(exc_info.value)
+
+    def test_invalid_expected_keywords_inner_not_list(self):
+        """Reject a string where an inner keyword list is expected."""
+        with pytest.raises(ValidationError) as exc_info:
+            TurnData(
+                turn_id="test_turn",
+                query="Test query",
+                expected_keywords=["not_a_list", ["valid_list"]],
+            )
+
+        assert "Input should be a valid list" in str(exc_info.value)
+
+    def test_invalid_expected_keywords_empty_inner_list(self):
+        """Reject an empty inner list, naming its index in the error."""
+        with pytest.raises(ValidationError) as exc_info:
+            TurnData(
+                turn_id="test_turn",
+                query="Test query",
+                expected_keywords=[[], ["valid_list"]],
+            )
+
+        assert "expected_keywords[0] cannot be empty" in str(exc_info.value)
+
+    def test_invalid_expected_keywords_non_string_element(self):
+        """Reject a non-string element inside an inner list."""
+        with pytest.raises(ValidationError) as exc_info:
+            TurnData(
+                turn_id="test_turn",
+                query="Test query",
+                expected_keywords=[["valid_string", 123]],
+            )
+
+        assert "Input should be a valid string" in str(exc_info.value)
+
+    def test_invalid_expected_keywords_empty_string_element(self):
+        """Reject an empty-string keyword, naming its position in the error."""
+        with pytest.raises(ValidationError) as exc_info:
+            TurnData(
+                turn_id="test_turn",
+                query="Test query",
+                expected_keywords=[["valid_string", ""]],
+            )
+
+        assert "expected_keywords[0][1] cannot be empty or whitespace" in str(
+            exc_info.value
+        )
+
+    def test_invalid_expected_keywords_whitespace_only_element(self):
+        """Reject a whitespace-only keyword, naming its position in the error."""
+        with pytest.raises(ValidationError) as exc_info:
+            TurnData(
+                turn_id="test_turn",
+                query="Test query",
+                expected_keywords=[["valid_string", "   "]],
+            )
+
+        assert "expected_keywords[0][1] cannot be empty or whitespace" in str(
+            exc_info.value
+        )
+
+    def test_complex_valid_expected_keywords(self):
+        """Accept groups of differing sizes, including multi-word keywords."""
+        turn_data = TurnData(
+            turn_id="test_turn",
+            query="Test query",
+            expected_keywords=[
+                ["yes", "confirmed", "affirmative"],
+                [
+                    "openshift-monitoring",
+                    "monitoring namespace",
+                ],
+                [
+                    "created successfully",
+                    "creation complete",
+                    "successfully created",
+                ],
+                ["pod", "container", "workload"],
+            ],
+        )
+
+        assert len(turn_data.expected_keywords) == 4
+        assert len(turn_data.expected_keywords[0]) == 3
+        assert len(turn_data.expected_keywords[1]) == 2
+        assert len(turn_data.expected_keywords[2]) == 3
+        assert len(turn_data.expected_keywords[3]) == 3