 import time
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-from .core import ConfigLoader, DataValidator, EvaluationData, EvaluationResult, TurnData
+from .core import (
+    ConfigLoader,
+    DataValidator,
+    EvaluationData,
+    EvaluationResult,
+    TurnData,
+)
 from .llm_managers.llm_manager import LLMManager
 from .metrics.custom_metrics import CustomMetrics
 from .metrics.deepeval_metrics import DeepEvalMetrics
@@ -29,7 +35,11 @@ def __init__(self, conv_data: EvaluationData, metric_identifier: str):
 
     @classmethod
     def for_turn(
-        cls, conv_data: EvaluationData, metric_identifier: str, turn_idx: int, turn_data: TurnData
+        cls,
+        conv_data: EvaluationData,
+        metric_identifier: str,
+        turn_idx: int,
+        turn_data: TurnData,
     ) -> "EvaluationRequest":
         """Create request for turn-level evaluation."""
         request = cls(conv_data, metric_identifier)
@@ -74,7 +84,9 @@ def __init__(self, llm_manager: LLMManager):
         self.custom_metrics = CustomMetrics(llm_manager)
 
         # Metric routing map
-        self.handlers: Dict[str, Union[RagasMetrics, DeepEvalMetrics, CustomMetrics]] = {
+        self.handlers: Dict[
+            str, Union[RagasMetrics, DeepEvalMetrics, CustomMetrics]
+        ] = {
             "ragas": self.ragas_metrics,
             "deepeval": self.deepeval_metrics,
             "custom": self.custom_metrics,
@@ -131,7 +143,9 @@ def validate_data(self, evaluation_data: List[EvaluationData]) -> bool:
131143 """Validate evaluation data using data validator."""
132144 return self .data_validator .validate_evaluation_data (evaluation_data )
133145
134- def run_evaluation (self , evaluation_data : List [EvaluationData ]) -> List [EvaluationResult ]:
146+ def run_evaluation (
147+ self , evaluation_data : List [EvaluationData ]
148+ ) -> List [EvaluationResult ]:
135149 """
136150 Run complete evaluation pipeline.
137151
@@ -177,10 +191,14 @@ def _process_conversation(self, conv_data: EvaluationData) -> None:
         print(f"🗣️ Conversation-level metrics: {conv_data.conversation_metrics}")
         self._evaluate_conversation(conv_data)
 
-    def _evaluate_turn(self, conv_data: EvaluationData, turn_idx: int, turn_data: TurnData) -> None:
+    def _evaluate_turn(
+        self, conv_data: EvaluationData, turn_idx: int, turn_data: TurnData
+    ) -> None:
         """Evaluate single turn with specified turn metrics."""
         for metric_identifier in conv_data.turn_metrics:
-            request = EvaluationRequest.for_turn(conv_data, metric_identifier, turn_idx, turn_data)
+            request = EvaluationRequest.for_turn(
+                conv_data, metric_identifier, turn_idx, turn_data
+            )
             result = self._evaluate_metric(request)
             if result:
                 self.results.append(result)
@@ -193,7 +211,9 @@ def _evaluate_conversation(self, conv_data: EvaluationData) -> None:
             if result:
                 self.results.append(result)
 
-    def _evaluate_metric(self, request: EvaluationRequest) -> Optional[EvaluationResult]:
+    def _evaluate_metric(
+        self, request: EvaluationRequest
+    ) -> Optional[EvaluationResult]:
         """
         Evaluate single metric using context.
 
@@ -213,7 +233,9 @@ def _evaluate_metric(self, request: EvaluationRequest) -> Optional[EvaluationRes
         print(f"  {request.metric_identifier} (threshold: {threshold})")
 
         # Route to metrics manager
-        score, reason = self.metrics_manager.evaluate_metric(framework, metric_name, request)
+        score, reason = self.metrics_manager.evaluate_metric(
+            framework, metric_name, request
+        )
 
         # Determine result status
         if score is None:
@@ -222,7 +244,9 @@ def _evaluate_metric(self, request: EvaluationRequest) -> Optional[EvaluationRes
         else:
             result_status = self._determine_status(score, threshold)
             status_emoji = (
-                "✅" if result_status == "PASS" else "❌" if result_status == "FAIL" else "⚠️"
+                "✅"
+                if result_status == "PASS"
+                else "❌" if result_status == "FAIL" else "⚠️"
             )
             print(f"  {status_emoji} {result_status}: {score:.3f}")
 
@@ -266,7 +290,9 @@ def _get_effective_threshold(
266290 """Get effective threshold for metric (conversation-specific or system default)."""
267291 # Check conversation-specific metadata first
268292 if is_conversation :
269- metadata = conv_data .conversation_metrics_metadata .get (metric_identifier , {})
293+ metadata = conv_data .conversation_metrics_metadata .get (
294+ metric_identifier , {}
295+ )
270296 else :
271297 metadata = conv_data .turn_metrics_metadata .get (metric_identifier , {})
272298
@@ -279,9 +305,9 @@ def _get_effective_threshold(
             return None
 
         if is_conversation:
-            default_metadata = (system_config.default_conversation_metrics_metadata or {}).get(
-                metric_identifier, {}
-            )
+            default_metadata = (
+                system_config.default_conversation_metrics_metadata or {}
+            ).get(metric_identifier, {})
         else:
             default_metadata = (system_config.default_turn_metrics_metadata or {}).get(
                 metric_identifier, {}
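
For context on how the reformatted routing map gets used: a minimal sketch, assuming metric identifiers follow a "framework:metric" naming that matches the handler keys above. The route_metric helper and the handlers' evaluate method are hypothetical stand-ins for the dispatch done inside the real MetricsManager.evaluate_metric, which this diff only reformats.

from typing import Any, Dict, Optional, Tuple

def route_metric(
    handlers: Dict[str, Any], metric_identifier: str, request: Any
) -> Tuple[Optional[float], str]:
    """Hypothetical dispatch: split 'framework:metric' and look up the handler."""
    framework, metric_name = metric_identifier.split(":", 1)
    handler = handlers.get(framework)  # e.g. "ragas" -> RagasMetrics instance
    if handler is None:
        return None, f"Unknown metrics framework: {framework}"
    # Assumed handler API; the actual method name is not shown in this diff.
    return handler.evaluate(metric_name, request)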