
Commit 2b4fec9

integrate keywords eval
1 parent 36f7bc2 commit 2b4fec9

6 files changed: +54 -0 lines changed


README.md

Lines changed: 9 additions & 0 deletions

@@ -86,6 +86,7 @@ lightspeed-eval --system-config config/system_api_disabled.yaml --eval-data conf
 - Response Evaluation
   - [`answer_correctness`](src/lightspeed_evaluation/core/metrics/custom.py)
   - [`intent_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Evaluates whether the response demonstrates the expected intent or purpose
+  - [`keywords_eval`](src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py) - Keywords evaluation with alternatives (ALL keywords must match, case insensitive)
 - Tool Evaluation
   - [`tool_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Validates tool calls and arguments with regex pattern matching
 - **Script-based**

@@ -149,6 +150,10 @@ metrics_metadata:
 
     "custom:tool_eval":
       description: "Tool call evaluation comparing expected vs actual tool calls (regex for arguments)"
+
+    "custom:keywords_eval":
+      threshold: 1 # Binary evaluation (0 or 1)
+      description: "Keywords evaluation (ALL match) with sequential alternate checking (case insensitive)"
 
   conversation_level:
     "deepeval:conversation_completeness":

@@ -226,12 +231,14 @@ embedding:
       contexts:
         - OpenShift Virtualization is an extension of the OpenShift ...
       attachments: [] # Attachments (Optional)
+      expected_keywords: [["virtualization"], ["openshift"]] # For keywords_eval evaluation
      expected_response: OpenShift Virtualization is an extension of the OpenShift Container Platform that allows running virtual machines alongside containers
       expected_intent: "explain a concept" # Expected intent for intent evaluation
 
       # Per-turn metrics (overrides system defaults)
       turn_metrics:
         - "ragas:faithfulness"
+        - "custom:keywords_eval"
         - "custom:answer_correctness"
         - "custom:intent_eval"
 

@@ -291,6 +298,7 @@ embedding:
 | `attachments` | list[string] | ❌ | Attachments | ❌ |
 | `expected_response` | string | 📋 | Expected response for comparison | ❌ |
 | `expected_intent` | string | 📋 | Expected intent for intent evaluation | ❌ |
+| `expected_keywords` | list[list[string]] | 📋 | Expected keywords for keyword evaluation (list of alternatives) | ❌ |
 | `expected_tool_calls` | list[list[list[dict]]] | 📋 | Expected tool call sequences (multiple alternative sets) | ❌ |
 | `tool_calls` | list[list[dict]] | ❌ | Actual tool calls from API | ✅ (if API enabled) |
 | `verify_script` | string | 📋 | Path to verification script | ❌ |

@@ -303,6 +311,7 @@ Examples
 > - `expected_response`: Required for `custom:answer_correctness`
 > - `expected_intent`: Required for `custom:intent_eval`
 > - `expected_tool_calls`: Required for `custom:tool_eval` (multiple alternative sets format)
+> - `expected_keywords`: Required for `custom:keywords_eval` (case insensitive matching)
 > - `verify_script`: Required for `script:action_eval` (used when API is enabled)
 > - `response`: Required for most metrics (auto-populated if API enabled)
 
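The new keywords_eval.py module itself is not part of this diff. Going by the README wording above ("ALL keywords must match", "sequential alternate checking", alternative sets), a minimal sketch of the matching logic might look like the following; the function name, signature, and the (score, reason) return shape are assumptions for illustration, not the module's confirmed API:

    # Hypothetical sketch -- keywords_eval.py is not shown in this commit.
    # Assumption: each inner list of expected_keywords is one alternative set,
    # sets are checked sequentially, and a set matches only if ALL of its
    # keywords occur in the response (case insensitive substring match).
    def evaluate_keywords_sketch(
        response: str, expected_keywords: list[list[str]]
    ) -> tuple[float, str]:
        haystack = response.lower()
        for i, keyword_group in enumerate(expected_keywords):
            missing = [kw for kw in keyword_group if kw.lower() not in haystack]
            if not missing:  # every keyword in this alternative set matched
                return 1.0, f"All keywords matched in alternative set {i}"
        return 0.0, "No alternative set had all of its keywords in the response"

Under this assumed reading, the example `[["virtualization"], ["openshift"]]` passes as soon as either single-keyword set is found, while a set like `["openshift", "virtualization"]` would require both words in the same response.
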
config/system.yaml

Lines changed: 3 additions & 0 deletions

@@ -76,6 +76,9 @@ metrics_metadata:
       description: "Is what we retrieved actually relevant to user query?"
 
     # Custom metrics
+    "custom:keywords_eval": # boolean eval (either 0 or 1)
+      description: "Keywords (ALL) matching evaluation with alternative sets"
+
     "custom:answer_correctness":
       threshold: 0.75
       description: "Correctness vs expected answer using custom LLM evaluation"

src/lightspeed_evaluation/core/metrics/custom/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -1,6 +1,7 @@
 """Custom metrics components package."""
 
 from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics
+from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
 from lightspeed_evaluation.core.metrics.custom.prompts import (
     ANSWER_CORRECTNESS_PROMPT,
     INTENT_EVALUATION_PROMPT,

@@ -9,6 +10,7 @@
 
 __all__ = [
     "CustomMetrics",
+    "evaluate_keywords",
     "evaluate_tool_calls",
     # Prompts
     "ANSWER_CORRECTNESS_PROMPT",

src/lightspeed_evaluation/core/metrics/custom/custom.py

Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,7 @@
     ANSWER_CORRECTNESS_PROMPT,
     INTENT_EVALUATION_PROMPT,
 )
+from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
 from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
 from lightspeed_evaluation.core.models import EvaluationScope, TurnData
 from lightspeed_evaluation.core.system.exceptions import LLMError

@@ -28,6 +29,7 @@ def __init__(self, llm_manager: LLMManager):
         )
 
         self.supported_metrics = {
+            "keywords_eval": evaluate_keywords,
             "answer_correctness": self._evaluate_answer_correctness,
             "intent_eval": self._evaluate_intent,
             "tool_eval": self._evaluate_tool_calls,

src/lightspeed_evaluation/core/models/data.py

Lines changed: 34 additions & 0 deletions

@@ -62,6 +62,10 @@ class TurnData(BaseModel):
     expected_intent: Optional[str] = Field(
         default=None, min_length=1, description="Expected intent for intent evaluation"
     )
+    expected_keywords: Optional[list[list[str]]] = Field(
+        default=None,
+        description="Expected keywords for keyword evaluation (list of alternatives)",
+    )
     conversation_id: Optional[str] = Field(
         default=None, description="Conversation ID - populated by API if enabled"
     )

@@ -89,6 +93,36 @@ def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]:
         v = _validate_and_deduplicate_metrics(v, "Turn metric")
         return v
 
+    @field_validator("expected_keywords")
+    @classmethod
+    def validate_expected_keywords(
+        cls, v: Optional[list[list[str]]]
+    ) -> Optional[list[list[str]]]:
+        """Validate expected keywords when provided."""
+        if v is None:
+            return None
+
+        if not isinstance(v, list):
+            raise ValueError("expected_keywords must be a list of lists")
+
+        # Validate each alternative group
+        for i, keyword_group in enumerate(v):
+            if not isinstance(keyword_group, list):
+                raise ValueError(f"expected_keywords[{i}] must be a list of strings")
+
+            if not keyword_group:
+                raise ValueError(f"expected_keywords[{i}] cannot be empty")
+
+            for j, keyword in enumerate(keyword_group):
+                if not isinstance(keyword, str):
+                    raise ValueError(f"expected_keywords[{i}][{j}] must be a string")
+                if not keyword.strip():
+                    raise ValueError(
+                        f"expected_keywords[{i}][{j}] cannot be empty or whitespace"
+                    )
+
+        return v
+
     @field_validator("expected_tool_calls", mode="before")
     @classmethod
     def validate_expected_tool_calls(
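
To see how these checks surface at load time, here is a standalone reproduction of the validator on a stripped-down model, since TurnData's other required fields are not visible in this diff. The isinstance branches are omitted because pydantic's type layer already rejects non-list input before an "after" validator runs:

    from typing import Optional

    from pydantic import BaseModel, Field, ValidationError, field_validator


    class KeywordsOnly(BaseModel):
        expected_keywords: Optional[list[list[str]]] = Field(default=None)

        @field_validator("expected_keywords")
        @classmethod
        def validate_expected_keywords(cls, v):
            if v is None:
                return None
            for i, group in enumerate(v):
                if not group:
                    raise ValueError(f"expected_keywords[{i}] cannot be empty")
                for j, kw in enumerate(group):
                    if not kw.strip():
                        raise ValueError(
                            f"expected_keywords[{i}][{j}] cannot be empty or whitespace"
                        )
            return v


    KeywordsOnly(expected_keywords=[["virtualization"], ["openshift"]])  # accepted

    try:
        KeywordsOnly(expected_keywords=[[]])  # empty alternative set is rejected
    except ValidationError as exc:
        print(exc)  # pydantic wraps the ValueError raised inside the validator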

src/lightspeed_evaluation/core/system/validator.py

Lines changed: 4 additions & 0 deletions

@@ -40,6 +40,10 @@
         "required_fields": ["response", "contexts"],
         "description": "requires 'response' and 'contexts' fields",
     },
+    "custom:keywords_eval": {
+        "required_fields": ["response", "expected_keywords"],
+        "description": "requires 'response' and 'expected_keywords' fields",
+    },
     "custom:answer_correctness": {
         "required_fields": ["response", "expected_response"],
         "description": "requires 'response' and 'expected_response' fields",
