
Commit 222f347

process default & turn-level override

1 parent 59cda52

File tree

7 files changed: +282 -111 lines changed
Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
"""Metrics mapping for evaluation."""

from enum import Enum
from typing import Any, Optional

from ..models.data import EvaluationData, TurnData
from ..models.system import SystemConfig


class MetricLevel(Enum):
    """Metric level enumeration."""

    TURN = "turn"
    CONVERSATION = "conversation"


class MetricManager:
    """Manager for both turn and conversation metrics."""

    def __init__(self, system_config: SystemConfig):
        """Initialize with system configuration."""
        self.system_config = system_config

    def resolve_metrics(
        self, metrics: Optional[list[str]], level: MetricLevel
    ) -> list[str]:
        """Resolve metrics mapping.

        Options:
        - None: use system defaults (metrics with default=true)
        - []: skip evaluation completely
        - [metrics...]: use specified metrics from turn data

        Args:
            metrics: The metrics configuration (None, [], or list of metrics)
            level: Whether this is TURN or CONVERSATION level

        Returns:
            List of metrics to evaluate
        """
        if metrics is None:
            # None = use system defaults
            return self._extract_default_metrics(level)
        if metrics == []:
            # [] = explicitly skip evaluation
            return []
        # Use specified metrics as-is
        return metrics

    def get_effective_threshold(
        self,
        metric_identifier: str,
        level: MetricLevel,
        conv_data: Optional[EvaluationData] = None,
        turn_data: Optional[TurnData] = None,
    ) -> Optional[float]:
        """Get effective threshold with priority hierarchy.

        Priority:
        1. Level-specific metadata (turn-specific for turns, conversation-specific for convs)
        2. System defaults

        Args:
            metric_identifier: The metric to get threshold for
            level: Whether this is TURN or CONVERSATION level
            conv_data: Conversation data for conversation-level metadata
            turn_data: Turn data for turn-specific metadata

        Returns:
            Effective threshold or None if not found
        """
        # Check level-specific metadata first
        level_metadata = self._get_level_metadata(level, conv_data, turn_data)
        threshold = level_metadata.get(metric_identifier, {}).get("threshold")
        if threshold is not None:
            return threshold

        # Fall back to system defaults
        system_metadata = self._get_system_metadata(level)
        return system_metadata.get(metric_identifier, {}).get("threshold")

    def _get_level_metadata(
        self,
        level: MetricLevel,
        conv_data: Optional[EvaluationData],
        turn_data: Optional[TurnData],
    ) -> dict[str, dict[str, Any]]:
        """Get level-specific metadata (turn or conversation level)."""
        if level == MetricLevel.TURN and turn_data and turn_data.turn_metrics_metadata:
            return turn_data.turn_metrics_metadata
        if (
            level == MetricLevel.CONVERSATION
            and conv_data
            and conv_data.conversation_metrics_metadata
        ):
            return conv_data.conversation_metrics_metadata
        return {}

    def _get_system_metadata(self, level: MetricLevel) -> dict[str, dict[str, Any]]:
        """Get system-level metadata for the given level."""
        if level == MetricLevel.TURN:
            return self.system_config.default_turn_metrics_metadata
        return self.system_config.default_conversation_metrics_metadata

    def _extract_default_metrics(self, level: MetricLevel) -> list[str]:
        """Extract metrics that have default=true from metadata."""
        metrics_metadata = self._get_system_metadata(level)

        default_metrics = []
        for metric_name, metadata in metrics_metadata.items():
            if metadata.get("default", False):  # default=false if not specified
                default_metrics.append(metric_name)
        return default_metrics

    def count_metrics_for_conversation(
        self, conv_data: EvaluationData
    ) -> dict[str, int]:
        """Count total metrics that would be evaluated for a conversation."""
        # Count turn metrics
        total_turn_metrics = 0
        for turn_data in conv_data.turns:
            turn_metrics = self.resolve_metrics(
                turn_data.turn_metrics, MetricLevel.TURN
            )
            total_turn_metrics += len(turn_metrics)

        # Count conversation metrics
        conversation_metrics = self.resolve_metrics(
            conv_data.conversation_metrics, MetricLevel.CONVERSATION
        )

        return {
            "turn_metrics": total_turn_metrics,
            "conversation_metrics": len(conversation_metrics),
            "total_turns": len(conv_data.turns),
        }
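
The three-way contract of resolve_metrics (None vs. [] vs. an explicit list) is what enables the per-turn override. A minimal usage sketch, assuming the module lives at lightspeed_evaluation.core.metrics.manager and using a SimpleNamespace stand-in for SystemConfig; the metric names and thresholds below are illustrative, not taken from this repository:

    from types import SimpleNamespace

    from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager

    # Hypothetical SystemConfig stand-in: only the metadata dicts the manager reads.
    system_config = SimpleNamespace(
        default_turn_metrics_metadata={
            "ragas:faithfulness": {"default": True, "threshold": 0.8},
            "deepeval:answer_relevancy": {"default": False, "threshold": 0.7},
        },
        default_conversation_metrics_metadata={},
    )
    manager = MetricManager(system_config)

    # None -> fall back to system defaults (metrics flagged default=true)
    print(manager.resolve_metrics(None, MetricLevel.TURN))   # ['ragas:faithfulness']

    # [] -> explicitly skip evaluation for this turn
    print(manager.resolve_metrics([], MetricLevel.TURN))     # []

    # Explicit list -> used as-is (the turn-level override)
    print(manager.resolve_metrics(["deepeval:answer_relevancy"], MetricLevel.TURN))
    # ['deepeval:answer_relevancy']

Because an empty list is distinct from None, a data file can switch off evaluation for a single turn without touching the system defaults.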

src/lightspeed_evaluation/pipeline/evaluation/errors.py

Lines changed: 3 additions & 3 deletions
@@ -30,9 +30,9 @@ def mark_all_metrics_as_error(
     error_results = []

     # Mark all turn-level metrics as ERROR
-    if conv_data.turn_metrics:
-        for turn_data in conv_data.turns:
-            for metric_identifier in conv_data.turn_metrics:
+    for turn_data in conv_data.turns:
+        if turn_data.turn_metrics:
+            for metric_identifier in turn_data.turn_metrics:
                 error_result = EvaluationResult(
                     conversation_group_id=conv_data.conversation_group_id,
                     turn_id=turn_data.turn_id,

src/lightspeed_evaluation/pipeline/evaluation/evaluator.py

Lines changed: 19 additions & 42 deletions
@@ -7,13 +7,9 @@
 from ...core.llm.manager import LLMManager
 from ...core.metrics.custom import CustomMetrics
 from ...core.metrics.deepeval import DeepEvalMetrics
+from ...core.metrics.manager import MetricLevel, MetricManager
 from ...core.metrics.ragas import RagasMetrics
-from ...core.models import (
-    EvaluationData,
-    EvaluationRequest,
-    EvaluationResult,
-    EvaluationScope,
-)
+from ...core.models import EvaluationRequest, EvaluationResult, EvaluationScope
 from ...core.system import ConfigLoader

 logger = logging.getLogger(__name__)
@@ -22,8 +18,13 @@
 class MetricsEvaluator:
     """Handles individual metric evaluation with proper scoring and status determination."""

-    def __init__(self, llm_manager: LLMManager, config_loader: ConfigLoader) -> None:
-        """Initialize with LLM manager and config."""
+    def __init__(
+        self,
+        llm_manager: LLMManager,
+        config_loader: ConfigLoader,
+        metric_manager: MetricManager,
+    ) -> None:
+        """Initialize with LLM manager, config, and metric manager."""
         self.config_loader = config_loader
         self.config = config_loader.system_config

@@ -39,6 +40,8 @@ def __init__(self, llm_manager: LLMManager, config_loader: ConfigLoader) -> None
             "custom": self.custom_metrics,
         }

+        self.metric_manager = metric_manager
+
     def evaluate_metric(self, request: EvaluationRequest) -> Optional[EvaluationResult]:
         """Evaluate a single metric and return result."""
         start_time = time.time()
@@ -81,9 +84,14 @@ def evaluate_metric(self, request: EvaluationRequest) -> Optional[EvaluationResult]
         if score is None:
             return self._create_error_result(request, reason, execution_time)

-        # Get threshold and determine status
-        threshold = self._get_effective_threshold(
-            request.conv_data, request.metric_identifier, request.is_conversation
+        # Get threshold
+        level = (
+            MetricLevel.CONVERSATION
+            if request.is_conversation
+            else MetricLevel.TURN
+        )
+        threshold = self.metric_manager.get_effective_threshold(
+            request.metric_identifier, level, request.conv_data, request.turn_data
         )
         status = self._determine_status(score, threshold)

@@ -124,37 +132,6 @@ def _create_error_result(
             execution_time=execution_time,
         )

-    def _get_effective_threshold(
-        self, conv_data: EvaluationData, metric_identifier: str, is_conversation: bool
-    ) -> Optional[float]:
-        """Get effective threshold for metric (conversation-specific or system default)."""
-        # Check conversation-specific metadata first
-        if is_conversation:
-            metadata = (conv_data.conversation_metrics_metadata or {}).get(
-                metric_identifier, {}
-            )
-        else:
-            metadata = (conv_data.turn_metrics_metadata or {}).get(
-                metric_identifier, {}
-            )
-
-        if "threshold" in metadata:
-            return metadata["threshold"]
-
-        # Fall back to system defaults
-        if self.config is None:
-            raise ValueError("SystemConfig must be loaded")
-        if is_conversation:
-            default_metadata = (
-                self.config.default_conversation_metrics_metadata or {}
-            ).get(metric_identifier, {})
-        else:
-            default_metadata = (self.config.default_turn_metrics_metadata or {}).get(
-                metric_identifier, {}
-            )
-
-        return default_metadata.get("threshold")
-
     def _determine_status(self, score: float, threshold: Optional[float]) -> str:
         """Determine evaluation status based on score and threshold."""
         if threshold is None:
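
The evaluator now delegates threshold lookup to MetricManager.get_effective_threshold, which checks level-specific metadata before falling back to system defaults. A short sketch of that priority, again with hypothetical metric names and SimpleNamespace stand-ins for the config and turn objects (only the attributes the manager actually reads are provided):

    from types import SimpleNamespace

    from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager

    # Hypothetical stand-ins: only the attributes the manager reads are provided.
    system_config = SimpleNamespace(
        default_turn_metrics_metadata={"ragas:faithfulness": {"threshold": 0.8}},
        default_conversation_metrics_metadata={},
    )
    turn_data = SimpleNamespace(
        turn_metrics_metadata={"ragas:faithfulness": {"threshold": 0.9}}
    )
    manager = MetricManager(system_config)

    # Turn-specific metadata takes priority over the system default ...
    print(
        manager.get_effective_threshold(
            "ragas:faithfulness", MetricLevel.TURN, turn_data=turn_data
        )
    )  # 0.9

    # ... and without it, the system default applies.
    print(manager.get_effective_threshold("ragas:faithfulness", MetricLevel.TURN))  # 0.8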

src/lightspeed_evaluation/pipeline/evaluation/pipeline.py

Lines changed: 21 additions & 17 deletions
@@ -5,13 +5,14 @@

 from ...core.api import APIClient
 from ...core.llm.manager import LLMManager
+from ...core.metrics.manager import MetricManager
 from ...core.models import EvaluationData, EvaluationResult
 from ...core.output.data_persistence import save_evaluation_data
 from ...core.system import ConfigLoader, DataValidator
 from .amender import APIDataAmender
 from .errors import EvaluationErrorHandler
 from .evaluator import MetricsEvaluator
-from .processor import ConversationProcessor
+from .processor import ConversationProcessor, ProcessorComponents

 logger = logging.getLogger(__name__)

@@ -52,25 +53,29 @@ def _initialize_components(self) -> None:
         # LLM Manager
         llm_manager = LLMManager.from_llm_config(self.config.llm)

+        # Metric manager
+        metric_manager = MetricManager(self.config)
+
         # Create pipeline components
-        api_client = self._create_api_client()
-        api_amender = APIDataAmender(api_client)
+        self.api_client = self._create_api_client()
+        api_amender = APIDataAmender(self.api_client)
         error_handler = EvaluationErrorHandler()
-        metrics_evaluator = MetricsEvaluator(llm_manager, self.config_loader)
-        # Group components for easier access
-        self.components = {
-            "api_client": api_client,
-            "api_amender": api_amender,
-            "error_handler": error_handler,
-            "metrics_evaluator": metrics_evaluator,
-        }
+        metrics_evaluator = MetricsEvaluator(
+            llm_manager, self.config_loader, metric_manager
+        )
+
+        # Create processor components
+        processor_components = ProcessorComponents(
+            metrics_evaluator=metrics_evaluator,
+            api_amender=api_amender,
+            error_handler=error_handler,
+            metric_manager=metric_manager,
+        )

         # Conversation processor
         self.conversation_processor = ConversationProcessor(
             self.config_loader,
-            self.components["metrics_evaluator"],
-            self.components["api_amender"],
-            self.components["error_handler"],
+            processor_components,
         )

     def _create_api_client(self) -> Optional[APIClient]:
@@ -163,6 +168,5 @@ def _save_updated_data(self, evaluation_data: list[EvaluationData]) -> None:

     def close(self) -> None:
         """Clean up resources."""
-        api_client = self.components.get("api_client")
-        if api_client:
-            api_client.close()
+        if self.api_client:
+            self.api_client.close()
