49 changes: 33 additions & 16 deletions README.md
@@ -115,16 +115,18 @@ api:
no_tools: null # Whether to bypass tools (optional)
system_prompt: null # Custom system prompt (optional)

# Metrics Configuration with thresholds
# Metrics Configuration with thresholds and defaults
metrics_metadata:
turn_level:
"ragas:faithfulness":
threshold: 0.8
description: "How faithful the response is to the provided context"

"ragas:response_relevancy":
threshold: 0.8
description: "How relevant the response is to the question"
default: true # Used by default when turn_metrics is null

"ragas:faithfulness":
threshold: 0.8
description: "How faithful the response is to the provided context"
default: false # Only used when explicitly specified

"custom:tool_eval":
description: "Tool call evaluation comparing expected vs actual tool calls (regex for arguments)"
@@ -160,16 +162,6 @@ visualization:
- conversation_group_id: "test_conversation"
description: "Sample evaluation"

# Turn-level metrics to evaluate
turn_metrics:
- "ragas:faithfulness"
- "custom:answer_correctness"

# Metric-specific configuration
turn_metrics_metadata:
"ragas:faithfulness":
threshold: 0.8

# Conversation-level metrics
conversation_metrics:
- "deepeval:conversation_completeness"
@@ -186,8 +178,23 @@ visualization:
- OpenShift Virtualization is an extension of the OpenShift ...
attachments: [] # Attachments (Optional)
expected_response: OpenShift Virtualization is an extension of the OpenShift Container Platform that allows running virtual machines alongside containers

# Per-turn metrics (overrides system defaults)
turn_metrics:
- "ragas:faithfulness"
- "custom:answer_correctness"

# Per-turn metric configuration
turn_metrics_metadata:
"ragas:faithfulness":
threshold: 0.9 # Override system default
# turn_metrics: null (omitted) → Use system defaults (metrics with default=true)

- turn_id: id2
query: Skip this turn evaluation
turn_metrics: [] # Skip evaluation for this turn

- turn_id: id2
- turn_id: id3
query: How do I create a virtual machine in OpenShift Virtualization?
response: null # Populated by API if enabled, otherwise provide
contexts:
@@ -223,11 +230,21 @@ visualization:
| `expected_response` | string | 📋 | Expected response for comparison | ❌ |
| `expected_tool_calls` | list[list[dict]] | 📋 | Expected tool call sequences | ❌ |
| `tool_calls` | list[list[dict]] | ❌ | Actual tool calls from API | ✅ (if API enabled) |
| `turn_metrics` | list[string] | ❌ | Turn-specific metrics to evaluate | ❌ |
| `turn_metrics_metadata` | dict | ❌ | Turn-specific metric configuration | ❌ |

Note: Context will be collected automatically in the future.

> 📋 **Required based on metrics**: Some fields are required only when using specific metrics

#### Metrics override behavior

| Override Value | Behavior |
|---------------------|----------|
| `null` (or omitted) | Use system defaults (metrics with `default: true`) |
| `[]` (empty list) | Skip evaluation for this turn |
| `["metric1", ...]` | Use specified metrics only |

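A minimal sketch of this resolution rule, mirroring the `MetricManager.resolve_metrics` helper added in this PR; the `defaults` list below is illustrative:

```python
from typing import Optional

def resolve_turn_metrics(
    turn_metrics: Optional[list[str]], system_defaults: list[str]
) -> list[str]:
    """Resolve per-turn metrics against system defaults."""
    if turn_metrics is None:  # null / omitted -> use system defaults
        return system_defaults
    if turn_metrics == []:  # empty list -> skip evaluation for this turn
        return []
    return turn_metrics  # explicit list -> use exactly these metrics

# Illustrative: only ragas:response_relevancy is flagged default: true above
defaults = ["ragas:response_relevancy"]
assert resolve_turn_metrics(None, defaults) == ["ragas:response_relevancy"]
assert resolve_turn_metrics([], defaults) == []
assert resolve_turn_metrics(["ragas:faithfulness"], defaults) == ["ragas:faithfulness"]
```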
> Examples:
> - `expected_response`: Required for `custom:answer_correctness`
> - `expected_tool_calls`: Required for `custom:tool_eval`
44 changes: 17 additions & 27 deletions config/evaluation_data.yaml
@@ -3,14 +3,6 @@
- conversation_group_id: "conv_group_1"
description: "conversation group description"

turn_metrics:
- "ragas:faithfulness"
- "ragas:response_relevancy"
- "ragas:context_precision_without_reference"

turn_metrics_metadata:
"ragas:faithfulness":
threshold: 0.99
conversation_metrics: []
conversation_metrics_metadata: {}

@@ -23,15 +15,17 @@
- "Context 2"
expected_response: "Expected Response"

- conversation_group_id: "conv_group_2"
description: "conversation group description"
turn_metrics:
- "ragas:faithfulness"
- "ragas:response_relevancy"
- "ragas:context_precision_without_reference"

turn_metrics:
- "ragas:context_recall"
- "ragas:context_relevance"
- "ragas:context_precision_with_reference"
turn_metrics_metadata:
"ragas:faithfulness":
threshold: 0.99

turn_metrics_metadata: {}
- conversation_group_id: "conv_group_2"
description: "conversation group description"
conversation_metrics: []
conversation_metrics_metadata: {}

@@ -43,14 +37,14 @@
- "Context 1"
expected_response: "Expected Response"

turn_metrics:
- "ragas:context_recall"
- "ragas:context_relevance"
- "ragas:context_precision_with_reference"

- conversation_group_id: "conv_group_3"
description: "conversation group description"

turn_metrics:
- "custom:answer_correctness"

turn_metrics_metadata: {}

conversation_metrics:
- "deepeval:conversation_completeness"
- "deepeval:conversation_relevancy"
@@ -60,14 +54,10 @@
turns:
- turn_id: "1"
query: "User Query 1"
response: "API Response 1"
contexts:
- "Context"
expected_response: "Expected Response 1"

turn_metrics: [] # Skip eval for this turn

- turn_id: "2"
query: "User Query 2"
response: "API Response 2"
contexts:
- "Context"
expected_response: "Expected Response 2"
# turn_metrics: null (omitted) → Use system defaults (metrics with default=true)
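The turn-level `turn_metrics_metadata` in `conv_group_1` above raises the faithfulness threshold to 0.99. A minimal sketch of the lookup order, where the 0.8 system default is taken from `config/system.yaml` in this diff:

```python
from typing import Optional

# Threshold lookup order: turn-level metadata first, then system defaults.
turn_metadata = {"ragas:faithfulness": {"threshold": 0.99}}
system_metadata = {"ragas:faithfulness": {"threshold": 0.8}}

def effective_threshold(metric: str) -> Optional[float]:
    threshold = turn_metadata.get(metric, {}).get("threshold")
    if threshold is not None:
        return threshold
    return system_metadata.get(metric, {}).get("threshold")

print(effective_threshold("ragas:faithfulness"))  # 0.99 -- turn override wins
```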
19 changes: 11 additions & 8 deletions config/system.yaml
@@ -31,23 +31,21 @@ metrics_metadata:
# Turn-level metrics metadata
turn_level:
# Ragas Response Evaluation metrics
"ragas:faithfulness":
threshold: 0.8
description: "How faithful the response is to the provided context"

"ragas:response_relevancy":
threshold: 0.8
description: "How relevant the response is to the question"
default: true # Applied by default when no turn_metrics are specified

"ragas:faithfulness":
threshold: 0.8
description: "How faithful the response is to the provided context"
default: false # Only used when explicitly specified (false is the implicit default)

# Ragas Context/Retrieval Evaluation metrics
"ragas:context_recall":
threshold: 0.8
description: "Did we fetch every fact the answer needs?"

"ragas:context_relevance":
threshold: 0.7
description: "Is what we retrieved actually relevant to user query?"

"ragas:context_precision_with_reference":
threshold: 0.7
description: "How precise the retrieved context is (with reference)"
@@ -56,6 +54,10 @@ metrics_metadata:
threshold: 0.7
description: "How precise the retrieved context is (without reference)"

"ragas:context_relevance":
threshold: 0.7
description: "Is what we retrieved actually relevant to user query?"

# Custom metrics
"custom:answer_correctness":
threshold: 0.75
@@ -70,6 +72,7 @@ metrics_metadata:
"deepeval:conversation_completeness":
threshold: 0.8
description: "How completely the conversation addresses user intentions"
default: false

"deepeval:conversation_relevancy":
threshold: 0.7
136 changes: 136 additions & 0 deletions src/lightspeed_evaluation/core/metrics/manager.py
@@ -0,0 +1,136 @@
"""Metrics mapping for evaluation."""

from enum import Enum
from typing import Any, Optional

from ..models.data import EvaluationData, TurnData
from ..models.system import SystemConfig


class MetricLevel(Enum):
"""Metric level enumeration."""

TURN = "turn"
CONVERSATION = "conversation"


class MetricManager:
"""Manager for both turn and conversation metrics."""

def __init__(self, system_config: SystemConfig):
"""Initialize with system configuration."""
self.system_config = system_config

def resolve_metrics(
self, metrics: Optional[list[str]], level: MetricLevel
) -> list[str]:
"""Resolve metrics mapping.

Options:
- None: use system defaults (metrics with default=true)
- []: skip evaluation completely
- [metrics...]: use specified metrics from turn data

Args:
metrics: The metrics configuration (None, [], or list of metrics)
level: Whether this is TURN or CONVERSATION level

Returns:
List of metrics to evaluate
"""
if metrics is None:
# None = use system defaults
return self._extract_default_metrics(level)
if metrics == []:
# [] = explicitly skip evaluation
return []
# Use specified metrics as-is
return metrics

def get_effective_threshold(
self,
metric_identifier: str,
level: MetricLevel,
conv_data: Optional[EvaluationData] = None,
turn_data: Optional[TurnData] = None,
) -> Optional[float]:
"""Get effective threshold with priority hierarchy.

Priority:
1. Level-specific metadata (turn-specific for turns, conversation-specific for convs)
2. System defaults

Args:
metric_identifier: The metric to get threshold for
level: Whether this is TURN or CONVERSATION level
conv_data: Conversation data for conversation-level metadata
turn_data: Turn data for turn-specific metadata

Returns:
Effective threshold or None if not found
"""
# Check level-specific metadata first
level_metadata = self._get_level_metadata(level, conv_data, turn_data)
threshold = level_metadata.get(metric_identifier, {}).get("threshold")
if threshold is not None:
return threshold

# Fall back to system defaults
system_metadata = self._get_system_metadata(level)
return system_metadata.get(metric_identifier, {}).get("threshold")

def _get_level_metadata(
self,
level: MetricLevel,
conv_data: Optional[EvaluationData],
turn_data: Optional[TurnData],
) -> dict[str, dict[str, Any]]:
"""Get level-specific metadata (turn or conversation level)."""
if level == MetricLevel.TURN and turn_data and turn_data.turn_metrics_metadata:
return turn_data.turn_metrics_metadata
if (
level == MetricLevel.CONVERSATION
and conv_data
and conv_data.conversation_metrics_metadata
):
return conv_data.conversation_metrics_metadata
return {}

def _get_system_metadata(self, level: MetricLevel) -> dict[str, dict[str, Any]]:
"""Get system-level metadata for the given level."""
if level == MetricLevel.TURN:
return self.system_config.default_turn_metrics_metadata
return self.system_config.default_conversation_metrics_metadata

def _extract_default_metrics(self, level: MetricLevel) -> list[str]:
"""Extract metrics that have default=true from metadata."""
metrics_metadata = self._get_system_metadata(level)

default_metrics = []
for metric_name, metadata in metrics_metadata.items():
if metadata.get("default", False): # default=false if not specified
default_metrics.append(metric_name)
return default_metrics

def count_metrics_for_conversation(
self, conv_data: EvaluationData
) -> dict[str, int]:
"""Count total metrics that would be evaluated for a conversation."""
# Count turn metrics
total_turn_metrics = 0
for turn_data in conv_data.turns:
turn_metrics = self.resolve_metrics(
turn_data.turn_metrics, MetricLevel.TURN
)
total_turn_metrics += len(turn_metrics)

# Count conversation metrics
conversation_metrics = self.resolve_metrics(
conv_data.conversation_metrics, MetricLevel.CONVERSATION
)

return {
"turn_metrics": total_turn_metrics,
"conversation_metrics": len(conversation_metrics),
"total_turns": len(conv_data.turns),
}
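A hedged usage sketch of the new manager, assuming the package is importable from `src/`. `MetricManager` only reads a handful of attributes from its inputs, so the stand-in classes below are assumptions — the real `SystemConfig` and `TurnData` pydantic models may require more fields:

```python
from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager

# Assumed stand-ins exposing only the attributes MetricManager reads.
class StubSystemConfig:
    default_turn_metrics_metadata = {
        "ragas:response_relevancy": {"threshold": 0.8, "default": True},
        "ragas:faithfulness": {"threshold": 0.8, "default": False},
    }
    default_conversation_metrics_metadata = {
        "deepeval:conversation_completeness": {"threshold": 0.8},
    }

class StubTurn:
    turn_metrics = ["ragas:faithfulness"]
    turn_metrics_metadata = {"ragas:faithfulness": {"threshold": 0.9}}

manager = MetricManager(StubSystemConfig())

# None -> only metrics flagged default: true
print(manager.resolve_metrics(None, MetricLevel.TURN))
# ['ragas:response_relevancy']

# [] -> skip; explicit list -> used as-is
print(manager.resolve_metrics([], MetricLevel.TURN))  # []
print(manager.resolve_metrics(["custom:tool_eval"], MetricLevel.TURN))  # ['custom:tool_eval']

# Turn-level metadata overrides the system default threshold (0.8 -> 0.9)
print(manager.get_effective_threshold(
    "ragas:faithfulness", MetricLevel.TURN, turn_data=StubTurn()
))
# 0.9
```

Duck typing is enough at runtime here; only the attribute names the manager reads need to match.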