|
| 1 | +"""Keywords evaluation utilities.""" |
| 2 | + |
| 3 | +from typing import Any, Optional |
| 4 | + |
| 5 | +from lightspeed_evaluation.core.models import TurnData |
| 6 | + |
| 7 | + |
def _validate_inputs(
    is_conversation: bool, turn_data: Optional[TurnData]
) -> Optional[tuple[Optional[float], str]]:
    """Decide whether keywords evaluation can proceed for the given inputs.

    The checks run in a fixed order and the first one that fails determines
    the result.

    Args:
        is_conversation: True when the metric was requested at conversation
            level; keywords evaluation only applies to single turns.
        turn_data: The turn under evaluation, or None if none was supplied.

    Returns:
        None when the inputs are usable. Otherwise a ``(score, reason)`` pair
        to hand straight back to the caller: the score is None when the
        request is conversation-level, the turn data is missing, or there are
        no expected keywords, and 0.0 when the turn has expected keywords but
        no response.
    """
    outcome: Optional[tuple[Optional[float], str]] = None

    if is_conversation:
        outcome = (None, "Keywords eval is a turn-level metric")
    elif turn_data is None:
        outcome = (None, "TurnData is required for keywords eval evaluation")
    elif not turn_data.expected_keywords:
        outcome = (
            None,
            "No expected keywords provided for keywords eval evaluation",
        )
    elif not turn_data.response:
        # A missing/empty response is scored as a failure, not skipped.
        outcome = (0.0, "No response provided for keywords eval evaluation")

    return outcome
| 25 | + |
| 26 | + |
| 27 | +def _check_keyword_list( |
| 28 | + keyword_list: list[str], response_lower: str |
| 29 | +) -> tuple[list[str], bool]: |
| 30 | + """Check if all keywords in a list match the response.""" |
| 31 | + matched_keywords = [] |
| 32 | + all_matched = True |
| 33 | + |
| 34 | + for keyword in keyword_list: |
| 35 | + if keyword.lower() in response_lower: |
| 36 | + matched_keywords.append(keyword) |
| 37 | + else: |
| 38 | + all_matched = False |
| 39 | + |
| 40 | + return matched_keywords, all_matched |
| 41 | + |
| 42 | + |
| 43 | +def _create_success_result( |
| 44 | + list_index: int, matched_keywords: list[str] |
| 45 | +) -> tuple[float, str]: |
| 46 | + """Create success result for keywords evaluation.""" |
| 47 | + matched_str = ", ".join(f"'{kw}'" for kw in matched_keywords) |
| 48 | + reason = ( |
| 49 | + f"Keywords eval successful: Option {list_index + 1} - " |
| 50 | + f"all keywords matched: {matched_str}" |
| 51 | + ) |
| 52 | + return 1.0, reason |
| 53 | + |
| 54 | + |
| 55 | +def _create_failure_result( |
| 56 | + expected_keywords: list[list[str]], response_lower: str |
| 57 | +) -> tuple[float, str]: |
| 58 | + """Create failure result for keywords evaluation.""" |
| 59 | + failed_details = [] |
| 60 | + |
| 61 | + for list_index, keyword_list in enumerate(expected_keywords): |
| 62 | + matched_keywords, _ = _check_keyword_list(keyword_list, response_lower) |
| 63 | + unmatched_keywords = [ |
| 64 | + kw for kw in keyword_list if kw.lower() not in response_lower |
| 65 | + ] |
| 66 | + |
| 67 | + if unmatched_keywords: |
| 68 | + unmatched_str = ", ".join(f"'{kw}'" for kw in unmatched_keywords) |
| 69 | + matched_str = ( |
| 70 | + ", ".join(f"'{kw}'" for kw in matched_keywords) |
| 71 | + if matched_keywords |
| 72 | + else "none" |
| 73 | + ) |
| 74 | + failed_details.append( |
| 75 | + f"Option {list_index + 1}: unmatched [{unmatched_str}], matched [{matched_str}]" |
| 76 | + ) |
| 77 | + |
| 78 | + reason = f"Keywords eval failed: All options failed - {'; '.join(failed_details)}" |
| 79 | + return 0.0, reason |
| 80 | + |
| 81 | + |
def evaluate_keywords(
    _conv_data: Any,
    _turn_idx: Optional[int],
    turn_data: Optional[TurnData],
    is_conversation: bool,
) -> tuple[Optional[float], str]:
    """Score a turn by case-insensitive substring matching of expected keywords.

    ``turn_data.expected_keywords`` is treated as an ordered list of
    alternative options, each a list of keywords. Options are tried in order;
    the first one whose keywords all occur in the response makes the
    evaluation pass. If no option fully matches, the evaluation fails.

    Args:
        _conv_data: Conversation data (unused).
        _turn_idx: Turn index (unused).
        turn_data: Turn carrying the response and the expected keywords.
        is_conversation: Whether this is a conversation-level evaluation.

    Returns:
        tuple: (score, reason)
            - score: 1.0 if some option has all its keywords matched, 0.0 if
              none does; otherwise whatever score input validation reports
              for unusable inputs, which may be None.
            - reason: Explanation of the outcome, including which keywords
              matched or were missing.
    """
    # Input validation yields a ready-made result when evaluation cannot run.
    early_result = _validate_inputs(is_conversation, turn_data)
    if early_result is not None:
        return early_result

    # Defensive re-check of what validation covers, so nothing below
    # dereferences None.
    if turn_data is None:
        return None, "Invalid turn data after validation"
    response = turn_data.response
    options = turn_data.expected_keywords
    if response is None or options is None:
        return None, "Invalid turn data after validation"

    haystack = response.lower()

    # First fully matched option wins; later options are not examined.
    for option_idx, option in enumerate(options):
        found, complete = _check_keyword_list(option, haystack)
        if complete:
            return _create_success_result(option_idx, found)

    # No option matched completely: report the details for each one.
    return _create_failure_result(options, haystack)
0 commit comments