class BaseCustomLLM:  # pylint: disable=too-few-public-methods
    """Base LLM class with core calling functionality.

    Holds a model name plus a dictionary of default LLM parameters and
    exposes a single ``call`` method that forwards a prompt to
    ``litellm.completion`` and returns the stripped text of each choice.
    """

    def __init__(self, model_name: str, llm_params: dict[str, Any]):
        """Initialize with model configuration.

        Args:
            model_name: Model identifier passed as ``model`` to the completion call.
            llm_params: Default parameters; ``temperature``, ``max_tokens``,
                ``timeout`` and ``num_retries`` are read from it.
        """
        self.model_name = model_name
        self.llm_params = llm_params

    def _build_call_params(
        self,
        prompt: str,
        n: int,
        temperature: Optional[float],
        overrides: dict[str, Any],
    ) -> dict[str, Any]:
        """Assemble the keyword arguments for the completion call.

        Args:
            prompt: Text sent as a single ``user`` message.
            n: Number of responses requested.
            temperature: Explicit temperature, or None to use the configured
                default (``0.0`` when none is configured).
            overrides: Extra keyword arguments supplied by the caller.

        Returns:
            Dictionary of keyword arguments for the completion call.
        """
        # ``is None`` (not truthiness) so an explicit 0.0 override is honoured.
        if temperature is None:
            temperature = self.llm_params.get("temperature", 0.0)

        return {
            "model": self.model_name,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": temperature,
            "n": n,
            "max_tokens": self.llm_params.get("max_tokens"),
            "timeout": self.llm_params.get("timeout"),
            "num_retries": self.llm_params.get("num_retries", 3),
            # Unpacked last on purpose: caller-supplied values win over the
            # defaults built above.
            **overrides,
        }

    def call(
        self,
        prompt: str,
        n: int = 1,
        temperature: Optional[float] = None,
        return_single: bool = True,
        **kwargs: Any,
    ) -> Union[str, list[str]]:
        """Make LLM call and return response(s).

        Args:
            prompt: Text prompt to send
            n: Number of responses to generate (default 1)
            temperature: Override temperature (uses config default if None)
            return_single: If True and n=1, return single string.
                If False, always return list.
            **kwargs: Additional LLM parameters

        Returns:
            Single string if return_single=True and n=1, otherwise list of strings

        Raises:
            LLMError: If the completion call or response extraction fails
                ("LLM call failed: ..."), or if a single response was
                requested but no choices came back
                ("LLM returned empty response").
        """
        call_params = self._build_call_params(prompt, n, temperature, kwargs)

        # Only the provider call and response extraction are guarded, so the
        # empty-response LLMError raised below is not caught and re-wrapped as
        # "LLM call failed: ...".
        try:
            response = litellm.completion(**call_params)

            # Extract content from all choices; a missing content becomes "".
            results = []
            for choice in response.choices:  # type: ignore
                content = choice.message.content  # type: ignore
                results.append("" if content is None else content.strip())
        except Exception as e:
            raise LLMError(f"LLM call failed: {str(e)}") from e

        # Return format based on parameters
        if return_single and n == 1:
            if not results:
                raise LLMError("LLM returned empty response")
            return results[0]

        return results
""" - def __init__(self, model_name: str, litellm_params: dict[str, Any]): + def __init__(self, model_name: str, llm_params: dict[str, Any]): """Initialize with LLM parameters from LLMManager.""" self.model_name = model_name - self.litellm_params = litellm_params + self.llm_params = llm_params - # Create DeepEval's LiteLLMModel with provided parameters + # Create DeepEval's LLM model with provided parameters self.llm_model = LiteLLMModel( model=self.model_name, - temperature=litellm_params.get("temperature", 0.0), - max_tokens=litellm_params.get("max_tokens"), - timeout=litellm_params.get("timeout"), - num_retries=litellm_params.get("num_retries", 3), + temperature=llm_params.get("temperature", 0.0), + max_tokens=llm_params.get("max_tokens"), + timeout=llm_params.get("timeout"), + num_retries=llm_params.get("num_retries", 3), ) print(f"✅ DeepEval LLM Manager: {self.model_name}") def get_llm(self) -> LiteLLMModel: - """Get the configured DeepEval LiteLLM model.""" + """Get the configured DeepEval LLM model.""" return self.llm_model def get_model_info(self) -> dict[str, Any]: """Get information about the configured model.""" return { "model_name": self.model_name, - "temperature": self.litellm_params.get("temperature", 0.0), - "max_tokens": self.litellm_params.get("max_tokens"), - "timeout": self.litellm_params.get("timeout"), - "num_retries": self.litellm_params.get("num_retries", 3), + "temperature": self.llm_params.get("temperature", 0.0), + "max_tokens": self.llm_params.get("max_tokens"), + "timeout": self.llm_params.get("timeout"), + "num_retries": self.llm_params.get("num_retries", 3), } diff --git a/src/lightspeed_evaluation/core/llm/manager.py b/src/lightspeed_evaluation/core/llm/manager.py index fa000676..24643ae8 100644 --- a/src/lightspeed_evaluation/core/llm/manager.py +++ b/src/lightspeed_evaluation/core/llm/manager.py @@ -13,7 +13,7 @@ class LLMManager: Responsibilities: - Environment validation for multiple providers - Model name construction - - 
Provides LiteLLM parameters for consumption by framework-specific managers + - Provides LLM parameters for consumption by framework-specific managers """ def __init__(self, config: LLMConfig): @@ -25,7 +25,7 @@ def __init__(self, config: LLMConfig): ) def _construct_model_name_and_validate(self) -> str: - """Construct model name for LiteLLM and validate required environment variables.""" + """Construct model name and validate required environment variables.""" provider = self.config.provider.lower() # Provider-specific validation and model name construction @@ -89,11 +89,11 @@ def _handle_ollama_provider(self) -> str: return f"ollama/{self.config.model}" def get_model_name(self) -> str: - """Get the constructed LiteLLM model name.""" + """Get the constructed model name.""" return self.model_name - def get_litellm_params(self) -> dict[str, Any]: - """Get parameters for LiteLLM completion calls.""" + def get_llm_params(self) -> dict[str, Any]: + """Get parameters for LLM completion calls.""" return { "model": self.model_name, "temperature": self.config.temperature, diff --git a/src/lightspeed_evaluation/core/llm/ragas.py b/src/lightspeed_evaluation/core/llm/ragas.py index 419e456e..f1f55948 100644 --- a/src/lightspeed_evaluation/core/llm/ragas.py +++ b/src/lightspeed_evaluation/core/llm/ragas.py @@ -1,20 +1,21 @@ -"""Ragas LLM Manager - Ragas-specific LLM wrapper that takes LiteLLM parameters.""" +"""Ragas LLM Manager - Ragas-specific LLM wrapper.""" from typing import Any, Optional -import litellm from ragas.llms.base import BaseRagasLLM, Generation, LLMResult from ragas.metrics import answer_relevancy, faithfulness +from lightspeed_evaluation.core.llm.custom import BaseCustomLLM +from lightspeed_evaluation.core.system.exceptions import LLMError -class RagasCustomLLM(BaseRagasLLM): - """Custom LLM for Ragas using LiteLLM parameters.""" - def __init__(self, model_name: str, litellm_params: dict[str, Any]): - """Initialize Ragas custom LLM with model name and LiteLLM 
parameters.""" - super().__init__() - self.model_name = model_name - self.litellm_params = litellm_params +class RagasCustomLLM(BaseRagasLLM, BaseCustomLLM): + """Custom LLM for Ragas.""" + + def __init__(self, model_name: str, llm_params: dict[str, Any]): + """Initialize Ragas custom LLM with model name and LLM parameters.""" + BaseRagasLLM.__init__(self) + BaseCustomLLM.__init__(self, model_name, llm_params) print(f"✅ Ragas Custom LLM: {self.model_name}") def generate_text( # pylint: disable=too-many-arguments,too-many-positional-arguments @@ -25,34 +26,30 @@ def generate_text( # pylint: disable=too-many-arguments,too-many-positional-arg stop: Optional[list[str]] = None, callbacks: Optional[Any] = None, ) -> LLMResult: - """Generate text using LiteLLM with provided parameters.""" + """Generate text using LLM with provided parameters.""" prompt_text = str(prompt) # Use temperature from params unless explicitly overridden temp = ( temperature if temperature != 1e-08 - else self.litellm_params.get("temperature", 0.0) + else self.llm_params.get("temperature", 0.0) ) try: - response = litellm.completion( - model=self.model_name, - messages=[{"role": "user", "content": prompt_text}], - n=n, - temperature=temp, - max_tokens=self.litellm_params.get("max_tokens"), - timeout=self.litellm_params.get("timeout"), - num_retries=self.litellm_params.get("num_retries"), + # Use inherited BaseCustomLLM functionality + call_kwargs = {} + if stop is not None: + call_kwargs["stop"] = stop + + responses = self.call( + prompt_text, n=n, temperature=temp, return_single=False, **call_kwargs ) # Convert to Ragas format generations = [] - for choice in response.choices: # type: ignore - content = choice.message.content # type: ignore - if content is None: - content = "" - gen = Generation(text=content.strip()) + for response_text in responses: + gen = Generation(text=response_text) generations.append(gen) result = LLMResult(generations=[generations]) @@ -60,7 +57,7 @@ def generate_text( # 
pylint: disable=too-many-arguments,too-many-positional-arg except Exception as e: print(f"❌ Ragas LLM failed: {e}") - raise RuntimeError(f"Ragas LLM evaluation failed: {str(e)}") from e + raise LLMError(f"Ragas LLM evaluation failed: {str(e)}") from e async def agenerate_text( # pylint: disable=too-many-arguments,too-many-positional-arguments self, @@ -87,11 +84,11 @@ class RagasLLMManager: This manager focuses solely on Ragas-specific LLM integration. """ - def __init__(self, model_name: str, litellm_params: dict[str, Any]): + def __init__(self, model_name: str, llm_params: dict[str, Any]): """Initialize with LLM parameters from LLMManager.""" self.model_name = model_name - self.litellm_params = litellm_params - self.custom_llm = RagasCustomLLM(model_name, litellm_params) + self.llm_params = llm_params + self.custom_llm = RagasCustomLLM(model_name, llm_params) # Configure Ragas metrics to use our custom LLM answer_relevancy.llm = self.custom_llm @@ -107,5 +104,5 @@ def get_model_info(self) -> dict[str, Any]: """Get information about the configured model.""" return { "model_name": self.model_name, - "temperature": self.litellm_params.get("temperature", 0.0), + "temperature": self.llm_params.get("temperature", 0.0), } diff --git a/src/lightspeed_evaluation/core/metrics/custom.py b/src/lightspeed_evaluation/core/metrics/custom.py index 3a3363d0..c37a547c 100644 --- a/src/lightspeed_evaluation/core/metrics/custom.py +++ b/src/lightspeed_evaluation/core/metrics/custom.py @@ -3,12 +3,13 @@ import re from typing import Any, Optional -import litellm from pydantic import BaseModel, Field +from lightspeed_evaluation.core.llm.custom import BaseCustomLLM from lightspeed_evaluation.core.llm.manager import LLMManager from lightspeed_evaluation.core.metrics.tool_eval import evaluate_tool_calls from lightspeed_evaluation.core.models import EvaluationScope, TurnData +from lightspeed_evaluation.core.system.exceptions import LLMError class EvaluationPromptParams(BaseModel): @@ -27,7 
+28,7 @@ class EvaluationPromptParams(BaseModel): class CustomMetrics: # pylint: disable=too-few-public-methods - """Handles custom metrics using LLMManager for direct LiteLLM calls.""" + """Handles custom metrics using LLMManager for direct LLM calls.""" def __init__(self, llm_manager: LLMManager): """Initialize with LLM Manager. @@ -35,15 +36,16 @@ def __init__(self, llm_manager: LLMManager): Args: llm_manager: Pre-configured LLMManager with validated parameters """ - self.model_name = llm_manager.get_model_name() - self.litellm_params = llm_manager.get_litellm_params() + self.llm = BaseCustomLLM( + llm_manager.get_model_name(), llm_manager.get_llm_params() + ) self.supported_metrics = { "answer_correctness": self._evaluate_answer_correctness, "tool_eval": self._evaluate_tool_calls, } - print(f"✅ Custom Metrics initialized: {self.model_name}") + print(f"✅ Custom Metrics initialized: {self.llm.model_name}") def evaluate( self, @@ -62,31 +64,12 @@ def evaluate( except (ValueError, AttributeError, KeyError) as e: return None, f"Custom {metric_name} evaluation failed: {str(e)}" - def _call_llm(self, prompt: str, system_prompt: Optional[str] = None) -> str: - """Make a LiteLLM call with the configured parameters.""" - # Prepare messages - messages = [] - if system_prompt: - messages.append({"role": "system", "content": system_prompt}) - messages.append({"role": "user", "content": prompt}) - - try: - response = litellm.completion( - model=self.model_name, - messages=messages, - temperature=self.litellm_params.get("temperature", 0.0), - max_tokens=self.litellm_params.get("max_tokens"), - timeout=self.litellm_params.get("timeout"), - num_retries=self.litellm_params.get("num_retries", 3), - ) - - content = response.choices[0].message.content # type: ignore - if content is None: - raise RuntimeError("LLM returned empty response") - return content.strip() - - except Exception as e: - raise RuntimeError(f"LiteLLM call failed: {str(e)}") from e + def _call_llm(self, prompt: 
str) -> str: + """Make an LLM call with the configured parameters.""" + result = self.llm.call(prompt, return_single=True) + if isinstance(result, list): + return result[0] if result else "" + return result def _parse_score_response(self, response: str) -> tuple[Optional[float], str]: r"""Parse LLM response to extract score and reason. @@ -232,16 +215,19 @@ def _evaluate_answer_correctness( prompt += "- Absence of contradictory information" # Make LLM call and parse response - llm_response = self._call_llm(prompt) - score, reason = self._parse_score_response(llm_response) - - if score is None: - return ( - None, - f"Could not parse score from LLM response: {llm_response[:100]}...", - ) - - return score, f"Custom answer correctness: {score:.2f} - {reason}" + try: + llm_response = self._call_llm(prompt) + score, reason = self._parse_score_response(llm_response) + + if score is None: + return ( + None, + f"Could not parse score from LLM response: {llm_response[:100]}...", + ) + + return score, f"Custom answer correctness: {score:.2f} - {reason}" + except LLMError as e: + return None, f"Answer correctness evaluation failed: {str(e)}" def _evaluate_tool_calls( self, diff --git a/src/lightspeed_evaluation/core/metrics/deepeval.py b/src/lightspeed_evaluation/core/metrics/deepeval.py index 5ce97c71..fef2d0e9 100644 --- a/src/lightspeed_evaluation/core/metrics/deepeval.py +++ b/src/lightspeed_evaluation/core/metrics/deepeval.py @@ -26,7 +26,7 @@ def __init__(self, llm_manager: LLMManager): """ # Create LLM Manager for DeepEval metrics self.llm_manager = DeepEvalLLMManager( - llm_manager.get_model_name(), llm_manager.get_litellm_params() + llm_manager.get_model_name(), llm_manager.get_llm_params() ) self.supported_metrics = { diff --git a/src/lightspeed_evaluation/core/metrics/ragas.py b/src/lightspeed_evaluation/core/metrics/ragas.py index f78b9dd5..ac54248e 100644 --- a/src/lightspeed_evaluation/core/metrics/ragas.py +++ b/src/lightspeed_evaluation/core/metrics/ragas.py @@ 
-36,7 +36,7 @@ def __init__(self, llm_manager: LLMManager, embedding_manager: EmbeddingManager) # Note, it's not actually used, it modifies # global ragas.metrics settings during instance init self.llm_manager = RagasLLMManager( - llm_manager.get_model_name(), llm_manager.get_litellm_params() + llm_manager.get_model_name(), llm_manager.get_llm_params() ) self.embedding_manager = RagasEmbeddingManager(embedding_manager) diff --git a/tests/unit/core/llm/test_manager.py b/tests/unit/core/llm/test_manager.py index 0bfdf8e5..884a01c2 100644 --- a/tests/unit/core/llm/test_manager.py +++ b/tests/unit/core/llm/test_manager.py @@ -66,8 +66,8 @@ def test_get_model_name(self): manager = LLMManager(config) assert manager.get_model_name() == "gpt-4" - def test_get_litellm_params(self): - """Test get_litellm_params method.""" + def test_get_llm_params(self): + """Test get_llm_params method.""" config = LLMConfig( provider="openai", model="gpt-4", @@ -79,7 +79,7 @@ def test_get_litellm_params(self): with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): manager = LLMManager(config) - params = manager.get_litellm_params() + params = manager.get_llm_params() expected = { "model": "gpt-4",