Commit e25dec0

Merge pull request #55 from asamal4/turn-metric-override
Turn metric override
2 parents 0232ecd + ad15be3 commit e25dec0

File tree

12 files changed: +434 -213 lines changed

README.md

Lines changed: 33 additions & 16 deletions

```diff
@@ -115,16 +115,18 @@ api:
   no_tools: null          # Whether to bypass tools (optional)
   system_prompt: null     # Custom system prompt (optional)

-# Metrics Configuration with thresholds
+# Metrics Configuration with thresholds and defaults
 metrics_metadata:
   turn_level:
-    "ragas:faithfulness":
-      threshold: 0.8
-      description: "How faithful the response is to the provided context"
-
     "ragas:response_relevancy":
       threshold: 0.8
       description: "How relevant the response is to the question"
+      default: true   # Used by default when turn_metrics is null
+
+    "ragas:faithfulness":
+      threshold: 0.8
+      description: "How faithful the response is to the provided context"
+      default: false  # Only used when explicitly specified

     "custom:tool_eval":
       description: "Tool call evaluation comparing expected vs actual tool calls (regex for arguments)"
@@ -160,16 +162,6 @@ visualization:
 - conversation_group_id: "test_conversation"
   description: "Sample evaluation"

-  # Turn-level metrics to evaluate
-  turn_metrics:
-    - "ragas:faithfulness"
-    - "custom:answer_correctness"
-
-  # Metric-specific configuration
-  turn_metrics_metadata:
-    "ragas:faithfulness":
-      threshold: 0.8
-
   # Conversation-level metrics
   conversation_metrics:
     - "deepeval:conversation_completeness"
@@ -186,8 +178,23 @@ visualization:
       - OpenShift Virtualization is an extension of the OpenShift ...
     attachments: []              # Attachments (Optional)
     expected_response: OpenShift Virtualization is an extension of the OpenShift Container Platform that allows running virtual machines alongside containers
+
+    # Per-turn metrics (overrides system defaults)
+    turn_metrics:
+      - "ragas:faithfulness"
+      - "custom:answer_correctness"
+
+    # Per-turn metric configuration
+    turn_metrics_metadata:
+      "ragas:faithfulness":
+        threshold: 0.9           # Override system default
+    # turn_metrics: null (omitted) → use system defaults (metrics with default=true)
+
+  - turn_id: id2
+    query: Skip this turn evaluation
+    turn_metrics: []             # Skip evaluation for this turn

-  - turn_id: id2
+  - turn_id: id3
     query: How do I create a virtual machine in OpenShift Virtualization?
     response: null               # Populated by API if enabled, otherwise provide
     contexts:
@@ -223,11 +230,21 @@ visualization:
 | `expected_response` | string | 📋 | Expected response for comparison | ❌ |
 | `expected_tool_calls` | list[list[dict]] | 📋 | Expected tool call sequences | ❌ |
 | `tool_calls` | list[list[dict]] | ❌ | Actual tool calls from API | ✅ (if API enabled) |
+| `turn_metrics` | list[string] | ❌ | Turn-specific metrics to evaluate | ❌ |
+| `turn_metrics_metadata` | dict | ❌ | Turn-specific metric configuration | ❌ |

 Note: Context will be collected automatically in the future.

 > 📋 **Required based on metrics**: Some fields are required only when using specific metrics

+#### Metrics override behavior
+
+| Override Value      | Behavior |
+|---------------------|----------|
+| `null` (or omitted) | Use system defaults (metrics with `default: true`) |
+| `[]` (empty list)   | Skip evaluation for this turn |
+| `["metric1", ...]`  | Use specified metrics only |
+
 Examples
 > - `expected_response`: Required for `custom:answer_correctness`
 > - `expected_tool_calls`: Required for `custom:tool_eval`
```
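To make the override table concrete, here is a minimal, self-contained sketch of the same resolution rule. `resolve_turn_metrics` and `SYSTEM_TURN_DEFAULTS` are names invented for this illustration; the actual implementation is the `MetricManager.resolve_metrics` method added later in this commit.

```python
from typing import Optional

# Illustrative system defaults, shaped like the README's metrics_metadata
# (only entries with default: true apply when turn_metrics is null).
SYSTEM_TURN_DEFAULTS = {
    "ragas:response_relevancy": {"threshold": 0.8, "default": True},
    "ragas:faithfulness": {"threshold": 0.8, "default": False},
}


def resolve_turn_metrics(turn_metrics: Optional[list[str]]) -> list[str]:
    """Mirror the override table: null -> defaults, [] -> skip, list -> as-is."""
    if turn_metrics is None:  # null / omitted -> system defaults
        return [m for m, meta in SYSTEM_TURN_DEFAULTS.items()
                if meta.get("default", False)]
    return turn_metrics  # [] skips evaluation; a non-empty list is used verbatim


print(resolve_turn_metrics(None))                    # ['ragas:response_relevancy']
print(resolve_turn_metrics([]))                      # []
print(resolve_turn_metrics(["ragas:faithfulness"]))  # ['ragas:faithfulness']
```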

config/evaluation_data.yaml

Lines changed: 17 additions & 27 deletions

```diff
@@ -3,14 +3,6 @@
 - conversation_group_id: "conv_group_1"
   description: "conversation group description"

-  turn_metrics:
-    - "ragas:faithfulness"
-    - "ragas:response_relevancy"
-    - "ragas:context_precision_without_reference"
-
-  turn_metrics_metadata:
-    "ragas:faithfulness":
-      threshold: 0.99
   conversation_metrics: []
   conversation_metrics_metadata: {}

@@ -23,15 +15,17 @@
         - "Context 2"
       expected_response: "Expected Response"

-- conversation_group_id: "conv_group_2"
-  description: "conversation group description"
+      turn_metrics:
+        - "ragas:faithfulness"
+        - "ragas:response_relevancy"
+        - "ragas:context_precision_without_reference"

-  turn_metrics:
-    - "ragas:context_recall"
-    - "ragas:context_relevance"
-    - "ragas:context_precision_with_reference"
+      turn_metrics_metadata:
+        "ragas:faithfulness":
+          threshold: 0.99

-  turn_metrics_metadata: {}
+- conversation_group_id: "conv_group_2"
+  description: "conversation group description"
   conversation_metrics: []
   conversation_metrics_metadata: {}

@@ -43,14 +37,14 @@
         - "Context 1"
       expected_response: "Expected Response"

+      turn_metrics:
+        - "ragas:context_recall"
+        - "ragas:context_relevance"
+        - "ragas:context_precision_with_reference"
+
 - conversation_group_id: "conv_group_3"
   description: "conversation group description"

-  turn_metrics:
-    - "custom:answer_correctness"
-
-  turn_metrics_metadata: {}
-
   conversation_metrics:
     - "deepeval:conversation_completeness"
     - "deepeval:conversation_relevancy"
@@ -60,14 +54,10 @@
   turns:
     - turn_id: "1"
       query: "User Query 1"
-      response: "API Response 1"
-      contexts:
-        - "Context"
-      expected_response: "Expected Response 1"
+
+      turn_metrics: []  # Skip eval for this turn

     - turn_id: "2"
       query: "User Query 2"
       response: "API Response 2"
-      contexts:
-        - "Context"
-      expected_response: "Expected Response 2"
+      # turn_metrics: null (omitted) → use system defaults (metrics with default=true)
```

config/system.yaml

Lines changed: 11 additions & 8 deletions

```diff
@@ -31,23 +31,21 @@ metrics_metadata:
   # Turn-level metrics metadata
   turn_level:
     # Ragas Response Evaluation metrics
-    "ragas:faithfulness":
-      threshold: 0.8
-      description: "How faithful the response is to the provided context"
-
     "ragas:response_relevancy":
       threshold: 0.8
       description: "How relevant the response is to the question"
+      default: true   # Applied by default when turn_metrics is not specified
+
+    "ragas:faithfulness":
+      threshold: 0.8
+      description: "How faithful the response is to the provided context"
+      default: false  # Optional key; metrics default to false when it is omitted

     # Ragas Context/Retrieval Evaluation metrics
     "ragas:context_recall":
       threshold: 0.8
       description: "Did we fetch every fact the answer needs?"

-    "ragas:context_relevance":
-      threshold: 0.7
-      description: "Is what we retrieved actually relevant to user query?"
-
     "ragas:context_precision_with_reference":
       threshold: 0.7
       description: "How precise the retrieved context is (with reference)"
@@ -56,6 +54,10 @@ metrics_metadata:
       threshold: 0.7
       description: "How precise the retrieved context is (without reference)"

+    "ragas:context_relevance":
+      threshold: 0.7
+      description: "Is what we retrieved actually relevant to user query?"
+
     # Custom metrics
     "custom:answer_correctness":
       threshold: 0.75
@@ -70,6 +72,7 @@ metrics_metadata:
     "deepeval:conversation_completeness":
       threshold: 0.8
       description: "How completely the conversation addresses user intentions"
+      default: false

     "deepeval:conversation_relevancy":
       threshold: 0.7
```
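As a sanity check on the `default:` flag, here is a small sketch of how the default metric list falls out of this config. It assumes PyYAML is available and mirrors the `_extract_default_metrics` helper added in this commit: a missing `default` key counts as false.

```python
import yaml  # assumes PyYAML is installed

SYSTEM_YAML = """
metrics_metadata:
  turn_level:
    "ragas:response_relevancy":
      threshold: 0.8
      default: true
    "ragas:faithfulness":
      threshold: 0.8
      default: false
    "ragas:context_recall":
      threshold: 0.8   # no default key -> treated as false
"""

turn_level = yaml.safe_load(SYSTEM_YAML)["metrics_metadata"]["turn_level"]

# Keep only metrics that explicitly opt in with default: true.
defaults = [name for name, meta in turn_level.items() if meta.get("default", False)]
print(defaults)  # ['ragas:response_relevancy']
```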
Lines changed: 136 additions & 0 deletions

```python
"""Metrics mapping for evaluation."""

from enum import Enum
from typing import Any, Optional

from ..models.data import EvaluationData, TurnData
from ..models.system import SystemConfig


class MetricLevel(Enum):
    """Metric level enumeration."""

    TURN = "turn"
    CONVERSATION = "conversation"


class MetricManager:
    """Manager for both turn and conversation metrics."""

    def __init__(self, system_config: SystemConfig):
        """Initialize with system configuration."""
        self.system_config = system_config

    def resolve_metrics(
        self, metrics: Optional[list[str]], level: MetricLevel
    ) -> list[str]:
        """Resolve metrics mapping.

        Options:
        - None: use system defaults (metrics with default=true)
        - []: skip evaluation completely
        - [metrics...]: use specified metrics from turn data

        Args:
            metrics: The metrics configuration (None, [], or list of metrics)
            level: Whether this is TURN or CONVERSATION level

        Returns:
            List of metrics to evaluate
        """
        if metrics is None:
            # None = use system defaults
            return self._extract_default_metrics(level)
        if metrics == []:
            # [] = explicitly skip evaluation
            return []
        # Use specified metrics as-is
        return metrics

    def get_effective_threshold(
        self,
        metric_identifier: str,
        level: MetricLevel,
        conv_data: Optional[EvaluationData] = None,
        turn_data: Optional[TurnData] = None,
    ) -> Optional[float]:
        """Get effective threshold with priority hierarchy.

        Priority:
        1. Level-specific metadata (turn-specific for turns, conversation-specific for convs)
        2. System defaults

        Args:
            metric_identifier: The metric to get threshold for
            level: Whether this is TURN or CONVERSATION level
            conv_data: Conversation data for conversation-level metadata
            turn_data: Turn data for turn-specific metadata

        Returns:
            Effective threshold or None if not found
        """
        # Check level-specific metadata first
        level_metadata = self._get_level_metadata(level, conv_data, turn_data)
        threshold = level_metadata.get(metric_identifier, {}).get("threshold")
        if threshold is not None:
            return threshold

        # Fall back to system defaults
        system_metadata = self._get_system_metadata(level)
        return system_metadata.get(metric_identifier, {}).get("threshold")

    def _get_level_metadata(
        self,
        level: MetricLevel,
        conv_data: Optional[EvaluationData],
        turn_data: Optional[TurnData],
    ) -> dict[str, dict[str, Any]]:
        """Get level-specific metadata (turn or conversation level)."""
        if level == MetricLevel.TURN and turn_data and turn_data.turn_metrics_metadata:
            return turn_data.turn_metrics_metadata
        if (
            level == MetricLevel.CONVERSATION
            and conv_data
            and conv_data.conversation_metrics_metadata
        ):
            return conv_data.conversation_metrics_metadata
        return {}

    def _get_system_metadata(self, level: MetricLevel) -> dict[str, dict[str, Any]]:
        """Get system-level metadata for the given level."""
        if level == MetricLevel.TURN:
            return self.system_config.default_turn_metrics_metadata
        return self.system_config.default_conversation_metrics_metadata

    def _extract_default_metrics(self, level: MetricLevel) -> list[str]:
        """Extract metrics that have default=true from metadata."""
        metrics_metadata = self._get_system_metadata(level)

        default_metrics = []
        for metric_name, metadata in metrics_metadata.items():
            if metadata.get("default", False):  # default=false if not specified
                default_metrics.append(metric_name)
        return default_metrics

    def count_metrics_for_conversation(
        self, conv_data: EvaluationData
    ) -> dict[str, int]:
        """Count total metrics that would be evaluated for a conversation."""
        # Count turn metrics
        total_turn_metrics = 0
        for turn_data in conv_data.turns:
            turn_metrics = self.resolve_metrics(
                turn_data.turn_metrics, MetricLevel.TURN
            )
            total_turn_metrics += len(turn_metrics)

        # Count conversation metrics
        conversation_metrics = self.resolve_metrics(
            conv_data.conversation_metrics, MetricLevel.CONVERSATION
        )

        return {
            "turn_metrics": total_turn_metrics,
            "conversation_metrics": len(conversation_metrics),
            "total_turns": len(conv_data.turns),
        }
```
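`MetricManager` depends on the repository's `SystemConfig` and data models, so as a standalone illustration of the threshold priority (per-turn metadata first, then system defaults), here is a hedged sketch that substitutes plain dicts for those models; `SYSTEM_TURN_METADATA`, `TURN_METADATA`, and `effective_threshold` are names invented for this example.

```python
from typing import Any, Optional

# Stand-ins for the real metadata objects, shaped like the configs above.
SYSTEM_TURN_METADATA: dict[str, dict[str, Any]] = {
    "ragas:faithfulness": {"threshold": 0.8, "default": False},
}
TURN_METADATA: dict[str, dict[str, Any]] = {
    "ragas:faithfulness": {"threshold": 0.9},  # per-turn override, as in the README example
}


def effective_threshold(metric: str) -> Optional[float]:
    """Mirror get_effective_threshold: turn-level metadata wins over system defaults."""
    threshold = TURN_METADATA.get(metric, {}).get("threshold")
    if threshold is not None:
        return threshold
    return SYSTEM_TURN_METADATA.get(metric, {}).get("threshold")


print(effective_threshold("ragas:faithfulness"))        # 0.9 (turn-level override)
print(effective_threshold("ragas:response_relevancy"))  # None (not configured anywhere)
```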
