49 changes: 33 additions & 16 deletions README.md
@@ -115,16 +115,18 @@ api:
no_tools: null # Whether to bypass tools (optional)
system_prompt: null # Custom system prompt (optional)

# Metrics Configuration with thresholds
# Metrics Configuration with thresholds and defaults
metrics_metadata:
turn_level:
"ragas:faithfulness":
threshold: 0.8
description: "How faithful the response is to the provided context"

"ragas:response_relevancy":
threshold: 0.8
description: "How relevant the response is to the question"
default: true # Used by default when turn_metrics is null

"ragas:faithfulness":
threshold: 0.8
description: "How faithful the response is to the provided context"
default: false # Only used when explicitly specified

"custom:tool_eval":
description: "Tool call evaluation comparing expected vs actual tool calls (regex for arguments)"
@@ -160,16 +162,6 @@ visualization:
- conversation_group_id: "test_conversation"
description: "Sample evaluation"

# Turn-level metrics to evaluate
turn_metrics:
- "ragas:faithfulness"
- "custom:answer_correctness"

# Metric-specific configuration
turn_metrics_metadata:
"ragas:faithfulness":
threshold: 0.8

# Conversation-level metrics
conversation_metrics:
- "deepeval:conversation_completeness"
@@ -186,8 +178,23 @@ visualization:
- OpenShift Virtualization is an extension of the OpenShift ...
attachments: [] # Attachments (Optional)
expected_response: OpenShift Virtualization is an extension of the OpenShift Container Platform that allows running virtual machines alongside containers

# Per-turn metrics (overrides system defaults)
turn_metrics:
- "ragas:faithfulness"
- "custom:answer_correctness"

# Per-turn metric configuration
turn_metrics_metadata:
"ragas:faithfulness":
threshold: 0.9 # Override system default
# turn_metrics: null (omitted) → Use system defaults (metrics with default=true)

- turn_id: id2
query: Skip this turn evaluation
turn_metrics: [] # Skip evaluation for this turn

- turn_id: id2
- turn_id: id3
query: How do I create a virtual machine in OpenShift Virtualization?
response: null # Populated by API if enabled, otherwise provide
contexts:
@@ -223,11 +230,21 @@ visualization:
| `expected_response` | string | 📋 | Expected response for comparison | ❌ |
| `expected_tool_calls` | list[list[dict]] | 📋 | Expected tool call sequences | ❌ |
| `tool_calls` | list[list[dict]] | ❌ | Actual tool calls from API | ✅ (if API enabled) |
| `turn_metrics` | list[string] | ❌ | Turn-specific metrics to evaluate | ❌ |
| `turn_metrics_metadata` | dict | ❌ | Turn-specific metric configuration | ❌ |

Note: Context will be collected automatically in the future.

> 📋 **Required based on metrics**: Some fields are required only when using specific metrics

#### Metrics override behavior

| Override Value | Behavior |
|---------------------|----------|
| `null` (or omitted) | Use system defaults (metrics with `default: true`) |
| `[]` (empty list) | Skip evaluation for this turn |
| `["metric1", ...]` | Use specified metrics only |

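A minimal sketch of this resolution rule, mirroring the `MetricManager.resolve_metrics` helper added in this PR; the `defaults` list below is illustrative:

```python
from typing import Optional

def resolve_turn_metrics(
    turn_metrics: Optional[list[str]], system_defaults: list[str]
) -> list[str]:
    """Resolve per-turn metrics against system defaults."""
    if turn_metrics is None:  # null / omitted -> use system defaults
        return system_defaults
    if turn_metrics == []:  # empty list -> skip evaluation for this turn
        return []
    return turn_metrics  # explicit list -> use exactly these metrics

# Illustrative: only ragas:response_relevancy is flagged default: true above
defaults = ["ragas:response_relevancy"]
assert resolve_turn_metrics(None, defaults) == ["ragas:response_relevancy"]
assert resolve_turn_metrics([], defaults) == []
assert resolve_turn_metrics(["ragas:faithfulness"], defaults) == ["ragas:faithfulness"]
```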
> Examples:
> - `expected_response`: Required for `custom:answer_correctness`
> - `expected_tool_calls`: Required for `custom:tool_eval`
44 changes: 17 additions & 27 deletions config/evaluation_data.yaml
@@ -3,14 +3,6 @@
- conversation_group_id: "conv_group_1"
description: "conversation group description"

turn_metrics:
- "ragas:faithfulness"
- "ragas:response_relevancy"
- "ragas:context_precision_without_reference"

turn_metrics_metadata:
"ragas:faithfulness":
threshold: 0.99
conversation_metrics: []
conversation_metrics_metadata: {}

@@ -23,15 +15,17 @@
- "Context 2"
expected_response: "Expected Response"

- conversation_group_id: "conv_group_2"
description: "conversation group description"
turn_metrics:
- "ragas:faithfulness"
- "ragas:response_relevancy"
- "ragas:context_precision_without_reference"

turn_metrics:
- "ragas:context_recall"
- "ragas:context_relevance"
- "ragas:context_precision_with_reference"
turn_metrics_metadata:
"ragas:faithfulness":
threshold: 0.99

turn_metrics_metadata: {}
- conversation_group_id: "conv_group_2"
description: "conversation group description"
conversation_metrics: []
conversation_metrics_metadata: {}

@@ -43,14 +37,14 @@
- "Context 1"
expected_response: "Expected Response"

turn_metrics:
- "ragas:context_recall"
- "ragas:context_relevance"
- "ragas:context_precision_with_reference"

- conversation_group_id: "conv_group_3"
description: "conversation group description"

turn_metrics:
- "custom:answer_correctness"

turn_metrics_metadata: {}

conversation_metrics:
- "deepeval:conversation_completeness"
- "deepeval:conversation_relevancy"
@@ -60,14 +54,10 @@
turns:
- turn_id: "1"
query: "User Query 1"
response: "API Response 1"
contexts:
- "Context"
expected_response: "Expected Response 1"

turn_metrics: [] # Skip eval for this turn

- turn_id: "2"
query: "User Query 2"
response: "API Response 2"
contexts:
- "Context"
expected_response: "Expected Response 2"
# turn_metrics: null (omitted) → Use system defaults (metrics with default=true)
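The turn-level `turn_metrics_metadata` in `conv_group_1` above raises the faithfulness threshold to 0.99. A minimal sketch of the lookup order, where the 0.8 system default is taken from `config/system.yaml` in this diff:

```python
from typing import Optional

# Threshold lookup order: turn-level metadata first, then system defaults.
turn_metadata = {"ragas:faithfulness": {"threshold": 0.99}}
system_metadata = {"ragas:faithfulness": {"threshold": 0.8}}

def effective_threshold(metric: str) -> Optional[float]:
    threshold = turn_metadata.get(metric, {}).get("threshold")
    if threshold is not None:
        return threshold
    return system_metadata.get(metric, {}).get("threshold")

print(effective_threshold("ragas:faithfulness"))  # 0.99 -- turn override wins
```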
19 changes: 11 additions & 8 deletions config/system.yaml
@@ -31,23 +31,21 @@ metrics_metadata:
# Turn-level metrics metadata
turn_level:
# Ragas Response Evaluation metrics
"ragas:faithfulness":
threshold: 0.8
description: "How faithful the response is to the provided context"

"ragas:response_relevancy":
threshold: 0.8
description: "How relevant the response is to the question"
default: true # Applied by default when no turn_metrics are specified

"ragas:faithfulness":
threshold: 0.8
description: "How faithful the response is to the provided context"
default: false # Only used when explicitly specified (false is the implicit default)

# Ragas Context/Retrieval Evaluation metrics
"ragas:context_recall":
threshold: 0.8
description: "Did we fetch every fact the answer needs?"

"ragas:context_relevance":
threshold: 0.7
description: "Is what we retrieved actually relevant to user query?"

"ragas:context_precision_with_reference":
threshold: 0.7
description: "How precise the retrieved context is (with reference)"
@@ -56,6 +54,10 @@ metrics_metadata:
threshold: 0.7
description: "How precise the retrieved context is (without reference)"

"ragas:context_relevance":
threshold: 0.7
description: "Is what we retrieved actually relevant to user query?"

# Custom metrics
"custom:answer_correctness":
threshold: 0.75
@@ -70,6 +72,7 @@ metrics_metadata:
"deepeval:conversation_completeness":
threshold: 0.8
description: "How completely the conversation addresses user intentions"
default: false

"deepeval:conversation_relevancy":
threshold: 0.7
136 changes: 136 additions & 0 deletions src/lightspeed_evaluation/core/metrics/manager.py
@@ -0,0 +1,136 @@
"""Metrics mapping for evaluation."""

from enum import Enum
from typing import Any, Optional

from ..models.data import EvaluationData, TurnData
from ..models.system import SystemConfig


class MetricLevel(Enum):
"""Metric level enumeration."""

TURN = "turn"
CONVERSATION = "conversation"


class MetricManager:
"""Manager for both turn and conversation metrics."""

def __init__(self, system_config: SystemConfig):
"""Initialize with system configuration."""
self.system_config = system_config

def resolve_metrics(
self, metrics: Optional[list[str]], level: MetricLevel
) -> list[str]:
"""Resolve metrics mapping.

Options:
- None: use system defaults (metrics with default=true)
- []: skip evaluation completely
- [metrics...]: use specified metrics from turn data

Args:
metrics: The metrics configuration (None, [], or list of metrics)
level: Whether this is TURN or CONVERSATION level

Returns:
List of metrics to evaluate
"""
if metrics is None:
# None = use system defaults
return self._extract_default_metrics(level)
if metrics == []:
# [] = explicitly skip evaluation
return []
# Use specified metrics as-is
return metrics

def get_effective_threshold(
self,
metric_identifier: str,
level: MetricLevel,
conv_data: Optional[EvaluationData] = None,
turn_data: Optional[TurnData] = None,
) -> Optional[float]:
"""Get effective threshold with priority hierarchy.

Priority:
1. Level-specific metadata (turn-specific for turns, conversation-specific for convs)
2. System defaults

Args:
metric_identifier: The metric to get threshold for
level: Whether this is TURN or CONVERSATION level
conv_data: Conversation data for conversation-level metadata
turn_data: Turn data for turn-specific metadata

Returns:
Effective threshold or None if not found
"""
# Check level-specific metadata first
level_metadata = self._get_level_metadata(level, conv_data, turn_data)
threshold = level_metadata.get(metric_identifier, {}).get("threshold")
if threshold is not None:
return threshold

# Fall back to system defaults
system_metadata = self._get_system_metadata(level)
return system_metadata.get(metric_identifier, {}).get("threshold")

def _get_level_metadata(
self,
level: MetricLevel,
conv_data: Optional[EvaluationData],
turn_data: Optional[TurnData],
) -> dict[str, dict[str, Any]]:
"""Get level-specific metadata (turn or conversation level)."""
if level == MetricLevel.TURN and turn_data and turn_data.turn_metrics_metadata:
return turn_data.turn_metrics_metadata
if (
level == MetricLevel.CONVERSATION
and conv_data
and conv_data.conversation_metrics_metadata
):
return conv_data.conversation_metrics_metadata
return {}

def _get_system_metadata(self, level: MetricLevel) -> dict[str, dict[str, Any]]:
"""Get system-level metadata for the given level."""
if level == MetricLevel.TURN:
return self.system_config.default_turn_metrics_metadata
return self.system_config.default_conversation_metrics_metadata

def _extract_default_metrics(self, level: MetricLevel) -> list[str]:
"""Extract metrics that have default=true from metadata."""
metrics_metadata = self._get_system_metadata(level)

default_metrics = []
for metric_name, metadata in metrics_metadata.items():
if metadata.get("default", False): # default=false if not specified
default_metrics.append(metric_name)
return default_metrics

def count_metrics_for_conversation(
self, conv_data: EvaluationData
) -> dict[str, int]:
"""Count total metrics that would be evaluated for a conversation."""
# Count turn metrics
total_turn_metrics = 0
for turn_data in conv_data.turns:
turn_metrics = self.resolve_metrics(
turn_data.turn_metrics, MetricLevel.TURN
)
total_turn_metrics += len(turn_metrics)

# Count conversation metrics
conversation_metrics = self.resolve_metrics(
conv_data.conversation_metrics, MetricLevel.CONVERSATION
)

return {
"turn_metrics": total_turn_metrics,
"conversation_metrics": len(conversation_metrics),
"total_turns": len(conv_data.turns),
}
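A hedged usage sketch of the new manager, assuming the package is importable from `src/`. `MetricManager` only reads a handful of attributes from its inputs, so the stand-in classes below are assumptions — the real `SystemConfig` and `TurnData` pydantic models may require more fields:

```python
from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager

# Assumed stand-ins exposing only the attributes MetricManager reads.
class StubSystemConfig:
    default_turn_metrics_metadata = {
        "ragas:response_relevancy": {"threshold": 0.8, "default": True},
        "ragas:faithfulness": {"threshold": 0.8, "default": False},
    }
    default_conversation_metrics_metadata = {
        "deepeval:conversation_completeness": {"threshold": 0.8},
    }

class StubTurn:
    turn_metrics = ["ragas:faithfulness"]
    turn_metrics_metadata = {"ragas:faithfulness": {"threshold": 0.9}}

manager = MetricManager(StubSystemConfig())

# None -> only metrics flagged default: true
print(manager.resolve_metrics(None, MetricLevel.TURN))
# ['ragas:response_relevancy']

# [] -> skip; explicit list -> used as-is
print(manager.resolve_metrics([], MetricLevel.TURN))  # []
print(manager.resolve_metrics(["custom:tool_eval"], MetricLevel.TURN))  # ['custom:tool_eval']

# Turn-level metadata overrides the system default threshold (0.8 -> 0.9)
print(manager.get_effective_threshold(
    "ragas:faithfulness", MetricLevel.TURN, turn_data=StubTurn()
))
# 0.9
```

Duck typing is enough at runtime here; only the attribute names the manager reads need to match.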