Skip to content

Commit ab6a85a

Browse files
committed
add keyword eval test case
1 parent 9741e10 commit ab6a85a

File tree

2 files changed

+322
-0
lines changed

2 files changed

+322
-0
lines changed
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
"""Tests for keywords eval metric."""
2+
3+
from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
4+
from lightspeed_evaluation.core.models import TurnData
5+
6+
7+
class TestKeywordsEval:
8+
"""Test cases for keywords eval metric."""
9+
10+
def test_keywords_eval_first_list_all_matched(self):
11+
"""Test successful keywords evaluation when first list has all keywords matched."""
12+
turn_data = TurnData(
13+
turn_id="test_turn",
14+
query="Test query",
15+
response="This response contains openshift-monitoring and yes it exists",
16+
expected_keywords=[
17+
["yes", "openshift-monitoring"], # Option 1: Both keywords should match
18+
["confirmed", "monitoring"], # Option 2: Should not be checked
19+
],
20+
)
21+
22+
score, reason = evaluate_keywords(None, 0, turn_data, False)
23+
24+
assert score == 1.0
25+
assert "Keywords eval successful: Option 1" in reason
26+
assert "all keywords matched: 'yes', 'openshift-monitoring'" in reason
27+
28+
def test_keywords_eval_first_list_fails_second_succeeds(self):
29+
"""Test keywords evaluation when first list fails but second list succeeds."""
30+
turn_data = TurnData(
31+
turn_id="test_turn",
32+
query="Test query",
33+
response="This response contains monitoring and confirmed status",
34+
expected_keywords=[
35+
[
36+
"yes",
37+
"openshift-monitoring",
38+
], # Option 1: "yes" missing, "openshift-monitoring" missing
39+
["monitoring", "confirmed"], # Option 2: Both should match
40+
],
41+
)
42+
43+
score, reason = evaluate_keywords(None, 0, turn_data, False)
44+
45+
assert score == 1.0
46+
assert "Keywords eval successful: Option 2" in reason
47+
assert "all keywords matched: 'monitoring', 'confirmed'" in reason
48+
49+
def test_keywords_eval_all_lists_fail(self):
50+
"""Test keywords evaluation when all lists fail."""
51+
turn_data = TurnData(
52+
turn_id="test_turn",
53+
query="Test query",
54+
response="This response contains nothing relevant",
55+
expected_keywords=[
56+
["yes", "openshift-monitoring"], # Option 1: Both missing
57+
["confirmed", "monitoring"], # Option 2: Both missing
58+
],
59+
)
60+
61+
score, reason = evaluate_keywords(None, 0, turn_data, False)
62+
63+
assert score == 0.0
64+
assert "Keywords eval failed: All options failed" in reason
65+
assert (
66+
"Option 1: unmatched ['yes', 'openshift-monitoring'], matched [none]"
67+
in reason
68+
)
69+
assert (
70+
"Option 2: unmatched ['confirmed', 'monitoring'], matched [none]" in reason
71+
)
72+
73+
def test_keywords_eval_partial_match_in_failed_list(self):
74+
"""Test keywords evaluation with partial matches in failed lists."""
75+
turn_data = TurnData(
76+
turn_id="test_turn",
77+
query="Test query",
78+
response="This response contains monitoring but no confirmation",
79+
expected_keywords=[
80+
["yes", "confirmed"], # Option 1: Both missing
81+
[
82+
"monitoring",
83+
"openshift",
84+
], # Option 2: "monitoring" matches, "openshift" missing
85+
],
86+
)
87+
88+
score, reason = evaluate_keywords(None, 0, turn_data, False)
89+
90+
assert score == 0.0
91+
assert "Keywords eval failed: All options failed" in reason
92+
assert "Option 1: unmatched ['yes', 'confirmed'], matched [none]" in reason
93+
assert "Option 2: unmatched ['openshift'], matched ['monitoring']" in reason
94+
95+
def test_keywords_eval_case_insensitive(self):
96+
"""Test that keywords evaluation is case insensitive."""
97+
turn_data = TurnData(
98+
turn_id="test_turn",
99+
query="Test query",
100+
response="This response contains YES and OPENSHIFT-MONITORING",
101+
expected_keywords=[
102+
["yes", "openshift-monitoring"] # Should match despite case differences
103+
],
104+
)
105+
106+
score, reason = evaluate_keywords(None, 0, turn_data, False)
107+
108+
assert score == 1.0
109+
assert "Keywords eval successful: Option 1" in reason
110+
assert "all keywords matched: 'yes', 'openshift-monitoring'" in reason
111+
112+
def test_keywords_eval_substring_matching(self):
113+
"""Test that keywords evaluation works with substring matching."""
114+
turn_data = TurnData(
115+
turn_id="test_turn",
116+
query="Test query",
117+
response="The openshift-monitoring-operator is running successfully",
118+
expected_keywords=[
119+
[
120+
"monitoring",
121+
"success",
122+
] # Should match "monitoring" in "openshift-monitoring-operator"
123+
],
124+
)
125+
126+
score, reason = evaluate_keywords(None, 0, turn_data, False)
127+
128+
assert score == 1.0
129+
assert "Keywords eval successful: Option 1" in reason
130+
assert "all keywords matched: 'monitoring', 'success'" in reason
131+
132+
def test_keywords_eval_no_expected_keywords(self):
133+
"""Test keywords evaluation when no expected keywords provided."""
134+
turn_data = TurnData(
135+
turn_id="test_turn",
136+
query="Test query",
137+
response="Some response",
138+
expected_keywords=None,
139+
)
140+
141+
score, reason = evaluate_keywords(None, 0, turn_data, False)
142+
143+
assert score is None
144+
assert "No expected keywords provided" in reason
145+
146+
def test_keywords_eval_no_response(self):
147+
"""Test keywords evaluation when no response provided."""
148+
turn_data = TurnData(
149+
turn_id="test_turn",
150+
query="Test query",
151+
response=None,
152+
expected_keywords=[["yes"], ["monitoring"]],
153+
)
154+
155+
score, reason = evaluate_keywords(None, 0, turn_data, False)
156+
157+
assert score == 0.0
158+
assert "No response provided" in reason
159+
160+
def test_keywords_eval_empty_response(self):
161+
"""Test keywords evaluation with empty response."""
162+
# Create turn data with valid response first, then modify it
163+
turn_data = TurnData(
164+
turn_id="test_turn",
165+
query="Test query",
166+
response="valid response",
167+
expected_keywords=[["yes"], ["monitoring"]],
168+
)
169+
# Manually set response to empty to bypass validation
170+
turn_data.response = ""
171+
172+
score, reason = evaluate_keywords(None, 0, turn_data, False)
173+
174+
assert score == 0.0
175+
assert "No response provided" in reason
176+
177+
def test_keywords_eval_conversation_level_error(self):
178+
"""Test that keywords_eval returns error for conversation-level evaluation."""
179+
score, reason = evaluate_keywords(None, None, None, True)
180+
181+
assert score is None
182+
assert "Keywords eval is a turn-level metric" in reason
183+
184+
def test_keywords_eval_no_turn_data(self):
185+
"""Test keywords evaluation when no turn data provided."""
186+
score, reason = evaluate_keywords(None, 0, None, False)
187+
188+
assert score is None
189+
assert "TurnData is required" in reason

tests/unit/core/models/test_data.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,3 +332,136 @@ def test_is_single_set_format_detection(self):
332332
assert expected is not None
333333
assert len(expected) == 1 # One alternative set
334334
assert len(expected[0]) == 2 # Two sequences in that set
335+
336+
337+
class TestTurnDataKeywordsValidation:
    """Validation tests for the ``expected_keywords`` field of ``TurnData``."""

    def test_valid_expected_keywords_single_group(self):
        """A single keyword group is stored unchanged."""
        turn = TurnData(
            turn_id="test_turn",
            query="Test query",
            expected_keywords=[["keyword1", "keyword2"]],
        )

        assert turn.expected_keywords == [["keyword1", "keyword2"]]

    def test_valid_expected_keywords_multiple_groups(self):
        """Several keyword groups are stored in order."""
        groups = [
            ["yes", "confirmed"],
            ["monitoring", "namespace"],
            ["success", "complete"],
        ]
        turn = TurnData(
            turn_id="test_turn",
            query="Test query",
            expected_keywords=groups,
        )

        stored = turn.expected_keywords
        assert len(stored) == 3
        assert stored[0] == ["yes", "confirmed"]
        assert stored[1] == ["monitoring", "namespace"]
        assert stored[2] == ["success", "complete"]

    def test_valid_expected_keywords_none(self):
        """``None`` is accepted for ``expected_keywords``."""
        turn = TurnData(
            turn_id="test_turn",
            query="Test query",
            expected_keywords=None,
        )

        assert turn.expected_keywords is None

    def test_invalid_expected_keywords_not_list(self):
        """A plain string instead of a list is rejected."""
        with pytest.raises(ValidationError) as caught:
            TurnData(
                turn_id="test_turn",
                query="Test query",
                expected_keywords="not_a_list",
            )

        message = str(caught.value)
        assert "Input should be a valid list" in message

    def test_invalid_expected_keywords_inner_not_list(self):
        """A string where an inner list is expected is rejected."""
        with pytest.raises(ValidationError) as caught:
            TurnData(
                turn_id="test_turn",
                query="Test query",
                expected_keywords=["not_a_list", ["valid_list"]],
            )

        message = str(caught.value)
        assert "Input should be a valid list" in message

    def test_invalid_expected_keywords_empty_inner_list(self):
        """An empty inner list is rejected with an index-specific message."""
        with pytest.raises(ValidationError) as caught:
            TurnData(
                turn_id="test_turn",
                query="Test query",
                expected_keywords=[[], ["valid_list"]],
            )

        message = str(caught.value)
        assert "expected_keywords[0] cannot be empty" in message

    def test_invalid_expected_keywords_non_string_element(self):
        """A non-string keyword inside a group is rejected."""
        with pytest.raises(ValidationError) as caught:
            TurnData(
                turn_id="test_turn",
                query="Test query",
                expected_keywords=[["valid_string", 123]],
            )

        message = str(caught.value)
        assert "Input should be a valid string" in message

    def test_invalid_expected_keywords_empty_string_element(self):
        """An empty-string keyword is rejected with an index-specific message."""
        with pytest.raises(ValidationError) as caught:
            TurnData(
                turn_id="test_turn",
                query="Test query",
                expected_keywords=[["valid_string", ""]],
            )

        message = str(caught.value)
        assert "expected_keywords[0][1] cannot be empty or whitespace" in message

    def test_invalid_expected_keywords_whitespace_only_element(self):
        """A whitespace-only keyword is rejected with an index-specific message."""
        with pytest.raises(ValidationError) as caught:
            TurnData(
                turn_id="test_turn",
                query="Test query",
                expected_keywords=[["valid_string", " "]],
            )

        message = str(caught.value)
        assert "expected_keywords[0][1] cannot be empty or whitespace" in message

    def test_complex_valid_expected_keywords(self):
        """A larger structure with multi-word keywords is accepted."""
        groups = [
            ["yes", "confirmed", "affirmative"],
            ["openshift-monitoring", "monitoring namespace"],
            ["created successfully", "creation complete", "successfully created"],
            ["pod", "container", "workload"],
        ]
        turn = TurnData(
            turn_id="test_turn",
            query="Test query",
            expected_keywords=groups,
        )

        stored = turn.expected_keywords
        assert len(stored) == 4
        # Group sizes, in order: 3, 2, 3, 3.
        assert len(stored[0]) == 3
        assert len(stored[1]) == 2
        assert len(stored[2]) == 3
        assert len(stored[3]) == 3

0 commit comments

Comments
 (0)