 import re
 from typing import Any, Optional
 
-import litellm
 from pydantic import BaseModel, Field
 
+from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
 from lightspeed_evaluation.core.llm.manager import LLMManager
 from lightspeed_evaluation.core.metrics.tool_eval import evaluate_tool_calls
 from lightspeed_evaluation.core.models import EvaluationScope, TurnData
+from lightspeed_evaluation.core.system.exceptions import LLMError
 
 
 class EvaluationPromptParams(BaseModel):
@@ -35,15 +36,16 @@ def __init__(self, llm_manager: LLMManager):
         Args:
             llm_manager: Pre-configured LLMManager with validated parameters
         """
-        self.model_name = llm_manager.get_model_name()
-        self.litellm_params = llm_manager.get_litellm_params()
+        self.llm = BaseCustomLLM(
+            llm_manager.get_model_name(), llm_manager.get_litellm_params()
+        )
 
         self.supported_metrics = {
             "answer_correctness": self._evaluate_answer_correctness,
             "tool_eval": self._evaluate_tool_calls,
         }
 
-        print(f"✅ Custom Metrics initialized: {self.model_name}")
+        print(f"✅ Custom Metrics initialized: {self.llm.model_name}")
 
     def evaluate(
         self,
@@ -62,31 +64,12 @@ def evaluate(
         except (ValueError, AttributeError, KeyError) as e:
             return None, f"Custom {metric_name} evaluation failed: {str(e)}"
 
-    def _call_llm(self, prompt: str, system_prompt: Optional[str] = None) -> str:
-        """Make a LiteLLM call with the configured parameters."""
-        # Prepare messages
-        messages = []
-        if system_prompt:
-            messages.append({"role": "system", "content": system_prompt})
-        messages.append({"role": "user", "content": prompt})
-
-        try:
-            response = litellm.completion(
-                model=self.model_name,
-                messages=messages,
-                temperature=self.litellm_params.get("temperature", 0.0),
-                max_tokens=self.litellm_params.get("max_tokens"),
-                timeout=self.litellm_params.get("timeout"),
-                num_retries=self.litellm_params.get("num_retries", 3),
-            )
-
-            content = response.choices[0].message.content  # type: ignore
-            if content is None:
-                raise RuntimeError("LLM returned empty response")
-            return content.strip()
-
-        except Exception as e:
-            raise RuntimeError(f"LiteLLM call failed: {str(e)}") from e
+    def _call_llm(self, prompt: str) -> str:
+        """Make an LLM call with the configured parameters."""
+        result = self.llm.call(prompt, return_single=True)
+        if isinstance(result, list):
+            return result[0] if result else ""
+        return result
 
     def _parse_score_response(self, response: str) -> tuple[Optional[float], str]:
         r"""Parse LLM response to extract score and reason.
@@ -232,16 +215,19 @@ def _evaluate_answer_correctness(
         prompt += "- Absence of contradictory information"
 
         # Make LLM call and parse response
-        llm_response = self._call_llm(prompt)
-        score, reason = self._parse_score_response(llm_response)
-
-        if score is None:
-            return (
-                None,
-                f"Could not parse score from LLM response: {llm_response[:100]}...",
-            )
-
-        return score, f"Custom answer correctness: {score:.2f} - {reason}"
+        try:
+            llm_response = self._call_llm(prompt)
+            score, reason = self._parse_score_response(llm_response)
+
+            if score is None:
+                return (
+                    None,
+                    f"Could not parse score from LLM response: {llm_response[:100]}...",
+                )
+
+            return score, f"Custom answer correctness: {score:.2f} - {reason}"
+        except LLMError as e:
+            return None, f"Answer correctness evaluation failed: {str(e)}"
 
     def _evaluate_tool_calls(
         self,
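The new call path can be exercised on its own roughly as sketched below. This assumes only what the diff shows: `BaseCustomLLM(model_name, litellm_params)` as the constructor, a `call(prompt, return_single=True)` method returning either a string or a list of strings, a `model_name` attribute, and `LLMError` raised on failure. The helper name and the pre-configured `llm_manager` are placeholders, not part of the change.

```python
from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
from lightspeed_evaluation.core.llm.manager import LLMManager
from lightspeed_evaluation.core.system.exceptions import LLMError


def call_judge(llm_manager: LLMManager, prompt: str) -> str:
    """Hypothetical helper mirroring the refactored _call_llm flow."""
    # Wrap the pre-validated model name and LiteLLM params, as __init__ now does.
    llm = BaseCustomLLM(llm_manager.get_model_name(), llm_manager.get_litellm_params())
    try:
        result = llm.call(prompt, return_single=True)
    except LLMError as exc:
        # Callers such as _evaluate_answer_correctness convert this into a
        # (None, reason) result instead of letting it propagate.
        return f"LLM call failed: {exc}"
    # call() may still hand back a list; normalize to a single string.
    if isinstance(result, list):
        return result[0] if result else ""
    return result
```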