
Commit 2ededad

Merge pull request #70 from asamal4/common-llm-call
add common custom llm
2 parents 0c80be6 + 0871015 commit 2ededad

9 files changed: +152 -92 lines changed

src/lightspeed_evaluation/core/llm/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,6 @@
 """LLM management for Evaluation Framework."""
 
+from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
 from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
 from lightspeed_evaluation.core.llm.manager import LLMManager
 from lightspeed_evaluation.core.llm.ragas import RagasLLMManager
@@ -11,6 +12,7 @@
     "LLMConfig",
     "LLMError",
     "LLMManager",
+    "BaseCustomLLM",
     "DeepEvalLLMManager",
     "RagasLLMManager",
     "validate_provider_env",
src/lightspeed_evaluation/core/llm/custom.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+"""Base Custom LLM class for evaluation framework."""
+
+from typing import Any, Optional, Union
+
+import litellm
+
+from lightspeed_evaluation.core.system.exceptions import LLMError
+
+
+class BaseCustomLLM:  # pylint: disable=too-few-public-methods
+    """Base LLM class with core calling functionality."""
+
+    def __init__(self, model_name: str, llm_params: dict[str, Any]):
+        """Initialize with model configuration."""
+        self.model_name = model_name
+        self.llm_params = llm_params
+
+    def call(
+        self,
+        prompt: str,
+        n: int = 1,
+        temperature: Optional[float] = None,
+        return_single: bool = True,
+        **kwargs: Any,
+    ) -> Union[str, list[str]]:
+        """Make LLM call and return response(s).
+
+        Args:
+            prompt: Text prompt to send
+            n: Number of responses to generate (default 1)
+            temperature: Override temperature (uses config default if None)
+            return_single: If True and n=1, return single string. If False, always return list.
+            **kwargs: Additional LLM parameters
+
+        Returns:
+            Single string if return_single=True and n=1, otherwise list of strings
+        """
+        temp = (
+            temperature
+            if temperature is not None
+            else self.llm_params.get("temperature", 0.0)
+        )
+
+        call_params = {
+            "model": self.model_name,
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": temp,
+            "n": n,
+            "max_tokens": self.llm_params.get("max_tokens"),
+            "timeout": self.llm_params.get("timeout"),
+            "num_retries": self.llm_params.get("num_retries", 3),
+            **kwargs,
+        }
+
+        try:
+            response = litellm.completion(**call_params)
+
+            # Extract content from all choices
+            results = []
+            for choice in response.choices:  # type: ignore
+                content = choice.message.content  # type: ignore
+                if content is None:
+                    content = ""
+                results.append(content.strip())
+
+            # Return format based on parameters
+            if return_single and n == 1:
+                if not results:
+                    raise LLMError("LLM returned empty response")
+                return results[0]
+
+            return results
+
+        except Exception as e:
+            raise LLMError(f"LLM call failed: {str(e)}") from e
src/lightspeed_evaluation/core/llm/deepeval.py

Lines changed: 13 additions & 13 deletions
@@ -1,4 +1,4 @@
-"""DeepEval LLM Manager - DeepEval-specific LLM wrapper that takes LiteLLM parameters."""
+"""DeepEval LLM Manager - DeepEval-specific LLM wrapper."""
 
 from typing import Any
 
@@ -11,32 +11,32 @@ class DeepEvalLLMManager:
     This manager focuses solely on DeepEval-specific LLM integration.
     """
 
-    def __init__(self, model_name: str, litellm_params: dict[str, Any]):
+    def __init__(self, model_name: str, llm_params: dict[str, Any]):
         """Initialize with LLM parameters from LLMManager."""
         self.model_name = model_name
-        self.litellm_params = litellm_params
+        self.llm_params = llm_params
 
-        # Create DeepEval's LiteLLMModel with provided parameters
+        # Create DeepEval's LLM model with provided parameters
         self.llm_model = LiteLLMModel(
             model=self.model_name,
-            temperature=litellm_params.get("temperature", 0.0),
-            max_tokens=litellm_params.get("max_tokens"),
-            timeout=litellm_params.get("timeout"),
-            num_retries=litellm_params.get("num_retries", 3),
+            temperature=llm_params.get("temperature", 0.0),
+            max_tokens=llm_params.get("max_tokens"),
+            timeout=llm_params.get("timeout"),
+            num_retries=llm_params.get("num_retries", 3),
         )
 
         print(f"✅ DeepEval LLM Manager: {self.model_name}")
 
     def get_llm(self) -> LiteLLMModel:
-        """Get the configured DeepEval LiteLLM model."""
+        """Get the configured DeepEval LLM model."""
        return self.llm_model
 
     def get_model_info(self) -> dict[str, Any]:
         """Get information about the configured model."""
         return {
             "model_name": self.model_name,
-            "temperature": self.litellm_params.get("temperature", 0.0),
-            "max_tokens": self.litellm_params.get("max_tokens"),
-            "timeout": self.litellm_params.get("timeout"),
-            "num_retries": self.litellm_params.get("num_retries", 3),
+            "temperature": self.llm_params.get("temperature", 0.0),
+            "max_tokens": self.llm_params.get("max_tokens"),
+            "timeout": self.llm_params.get("timeout"),
+            "num_retries": self.llm_params.get("num_retries", 3),
         }
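
A construction sketch for the renamed parameter; the model id and parameter dict are illustrative assumptions:

# Hypothetical sketch -- values are assumptions.
manager = DeepEvalLLMManager(
    "openai/gpt-4o-mini",
    {"temperature": 0.0, "max_tokens": 512, "timeout": 60, "num_retries": 3},
)
eval_model = manager.get_llm()  # LiteLLMModel instance, ready for DeepEval metrics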

src/lightspeed_evaluation/core/llm/manager.py

Lines changed: 5 additions & 5 deletions
@@ -13,7 +13,7 @@ class LLMManager:
     Responsibilities:
     - Environment validation for multiple providers
     - Model name construction
-    - Provides LiteLLM parameters for consumption by framework-specific managers
+    - Provides LLM parameters for consumption by framework-specific managers
     """
 
     def __init__(self, config: LLMConfig):
@@ -25,7 +25,7 @@ def __init__(self, config: LLMConfig):
         )
 
     def _construct_model_name_and_validate(self) -> str:
-        """Construct model name for LiteLLM and validate required environment variables."""
+        """Construct model name and validate required environment variables."""
         provider = self.config.provider.lower()
 
         # Provider-specific validation and model name construction
@@ -89,11 +89,11 @@ def _handle_ollama_provider(self) -> str:
         return f"ollama/{self.config.model}"
 
     def get_model_name(self) -> str:
-        """Get the constructed LiteLLM model name."""
+        """Get the constructed model name."""
         return self.model_name
 
-    def get_litellm_params(self) -> dict[str, Any]:
-        """Get parameters for LiteLLM completion calls."""
+    def get_llm_params(self) -> dict[str, Any]:
+        """Get parameters for LLM completion calls."""
         return {
             "model": self.model_name,
             "temperature": self.config.temperature,

src/lightspeed_evaluation/core/llm/ragas.py

Lines changed: 26 additions & 29 deletions
@@ -1,20 +1,21 @@
-"""Ragas LLM Manager - Ragas-specific LLM wrapper that takes LiteLLM parameters."""
+"""Ragas LLM Manager - Ragas-specific LLM wrapper."""
 
 from typing import Any, Optional
 
-import litellm
 from ragas.llms.base import BaseRagasLLM, Generation, LLMResult
 from ragas.metrics import answer_relevancy, faithfulness
 
+from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
+from lightspeed_evaluation.core.system.exceptions import LLMError
+
 
-class RagasCustomLLM(BaseRagasLLM):
-    """Custom LLM for Ragas using LiteLLM parameters."""
+class RagasCustomLLM(BaseRagasLLM, BaseCustomLLM):
+    """Custom LLM for Ragas."""
 
-    def __init__(self, model_name: str, litellm_params: dict[str, Any]):
-        """Initialize Ragas custom LLM with model name and LiteLLM parameters."""
-        super().__init__()
-        self.model_name = model_name
-        self.litellm_params = litellm_params
+    def __init__(self, model_name: str, llm_params: dict[str, Any]):
+        """Initialize Ragas custom LLM with model name and LLM parameters."""
+        BaseRagasLLM.__init__(self)
+        BaseCustomLLM.__init__(self, model_name, llm_params)
         print(f"✅ Ragas Custom LLM: {self.model_name}")
 
     def generate_text(  # pylint: disable=too-many-arguments,too-many-positional-arguments
@@ -25,42 +26,38 @@ def generate_text(  # pylint: disable=too-many-arguments,too-many-positional-arguments
         stop: Optional[list[str]] = None,
         callbacks: Optional[Any] = None,
     ) -> LLMResult:
-        """Generate text using LiteLLM with provided parameters."""
+        """Generate text using LLM with provided parameters."""
         prompt_text = str(prompt)
 
         # Use temperature from params unless explicitly overridden
         temp = (
             temperature
             if temperature != 1e-08
-            else self.litellm_params.get("temperature", 0.0)
+            else self.llm_params.get("temperature", 0.0)
         )
 
         try:
-            response = litellm.completion(
-                model=self.model_name,
-                messages=[{"role": "user", "content": prompt_text}],
-                n=n,
-                temperature=temp,
-                max_tokens=self.litellm_params.get("max_tokens"),
-                timeout=self.litellm_params.get("timeout"),
-                num_retries=self.litellm_params.get("num_retries"),
+            # Use inherited BaseCustomLLM functionality
+            call_kwargs = {}
+            if stop is not None:
+                call_kwargs["stop"] = stop
+
+            responses = self.call(
+                prompt_text, n=n, temperature=temp, return_single=False, **call_kwargs
             )
 
             # Convert to Ragas format
             generations = []
-            for choice in response.choices:  # type: ignore
-                content = choice.message.content  # type: ignore
-                if content is None:
-                    content = ""
-                gen = Generation(text=content.strip())
+            for response_text in responses:
+                gen = Generation(text=response_text)
                 generations.append(gen)
 
             result = LLMResult(generations=[generations])
             return result
 
         except Exception as e:
             print(f"❌ Ragas LLM failed: {e}")
-            raise RuntimeError(f"Ragas LLM evaluation failed: {str(e)}") from e
+            raise LLMError(f"Ragas LLM evaluation failed: {str(e)}") from e
 
     async def agenerate_text(  # pylint: disable=too-many-arguments,too-many-positional-arguments
         self,
@@ -87,11 +84,11 @@ class RagasLLMManager:
     This manager focuses solely on Ragas-specific LLM integration.
     """
 
-    def __init__(self, model_name: str, litellm_params: dict[str, Any]):
+    def __init__(self, model_name: str, llm_params: dict[str, Any]):
         """Initialize with LLM parameters from LLMManager."""
         self.model_name = model_name
-        self.litellm_params = litellm_params
-        self.custom_llm = RagasCustomLLM(model_name, litellm_params)
+        self.llm_params = llm_params
+        self.custom_llm = RagasCustomLLM(model_name, llm_params)
 
         # Configure Ragas metrics to use our custom LLM
         answer_relevancy.llm = self.custom_llm
@@ -107,5 +104,5 @@ def get_model_info(self) -> dict[str, Any]:
         """Get information about the configured model."""
         return {
             "model_name": self.model_name,
-            "temperature": self.litellm_params.get("temperature", 0.0),
+            "temperature": self.llm_params.get("temperature", 0.0),
         }
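
One design note on the RagasCustomLLM change: with two bases whose constructors take different arguments, the PR calls each base __init__ explicitly rather than relying on cooperative super(). A generic sketch of that pattern (illustrative names, not repo code):

# Generic sketch of the explicit-init pattern -- names are illustrative.
class NoArgBase:
    def __init__(self) -> None:
        self.ready = True

class ParamBase:
    def __init__(self, name: str) -> None:
        self.name = name

class Child(NoArgBase, ParamBase):
    def __init__(self, name: str) -> None:
        NoArgBase.__init__(self)        # each base initialized explicitly,
        ParamBase.__init__(self, name)  # so mismatched signatures never collide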

src/lightspeed_evaluation/core/metrics/custom.py

Lines changed: 26 additions & 40 deletions
@@ -3,12 +3,13 @@
 import re
 from typing import Any, Optional
 
-import litellm
 from pydantic import BaseModel, Field
 
+from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
 from lightspeed_evaluation.core.llm.manager import LLMManager
 from lightspeed_evaluation.core.metrics.tool_eval import evaluate_tool_calls
 from lightspeed_evaluation.core.models import EvaluationScope, TurnData
+from lightspeed_evaluation.core.system.exceptions import LLMError
 
 
 class EvaluationPromptParams(BaseModel):
@@ -27,23 +28,24 @@
 
 
 class CustomMetrics:  # pylint: disable=too-few-public-methods
-    """Handles custom metrics using LLMManager for direct LiteLLM calls."""
+    """Handles custom metrics using LLMManager for direct LLM calls."""
 
     def __init__(self, llm_manager: LLMManager):
         """Initialize with LLM Manager.
 
         Args:
             llm_manager: Pre-configured LLMManager with validated parameters
         """
-        self.model_name = llm_manager.get_model_name()
-        self.litellm_params = llm_manager.get_litellm_params()
+        self.llm = BaseCustomLLM(
+            llm_manager.get_model_name(), llm_manager.get_llm_params()
+        )
 
         self.supported_metrics = {
             "answer_correctness": self._evaluate_answer_correctness,
             "tool_eval": self._evaluate_tool_calls,
         }
 
-        print(f"✅ Custom Metrics initialized: {self.model_name}")
+        print(f"✅ Custom Metrics initialized: {self.llm.model_name}")
 
     def evaluate(
         self,
@@ -62,31 +64,12 @@ def evaluate(
         except (ValueError, AttributeError, KeyError) as e:
             return None, f"Custom {metric_name} evaluation failed: {str(e)}"
 
-    def _call_llm(self, prompt: str, system_prompt: Optional[str] = None) -> str:
-        """Make a LiteLLM call with the configured parameters."""
-        # Prepare messages
-        messages = []
-        if system_prompt:
-            messages.append({"role": "system", "content": system_prompt})
-        messages.append({"role": "user", "content": prompt})
-
-        try:
-            response = litellm.completion(
-                model=self.model_name,
-                messages=messages,
-                temperature=self.litellm_params.get("temperature", 0.0),
-                max_tokens=self.litellm_params.get("max_tokens"),
-                timeout=self.litellm_params.get("timeout"),
-                num_retries=self.litellm_params.get("num_retries", 3),
-            )
-
-            content = response.choices[0].message.content  # type: ignore
-            if content is None:
-                raise RuntimeError("LLM returned empty response")
-            return content.strip()
-
-        except Exception as e:
-            raise RuntimeError(f"LiteLLM call failed: {str(e)}") from e
+    def _call_llm(self, prompt: str) -> str:
+        """Make an LLM call with the configured parameters."""
+        result = self.llm.call(prompt, return_single=True)
+        if isinstance(result, list):
+            return result[0] if result else ""
+        return result
 
 
     def _parse_score_response(self, response: str) -> tuple[Optional[float], str]:
@@ -232,16 +215,19 @@ def _evaluate_answer_correctness(
         prompt += "- Absence of contradictory information"
 
         # Make LLM call and parse response
-        llm_response = self._call_llm(prompt)
-        score, reason = self._parse_score_response(llm_response)
-
-        if score is None:
-            return (
-                None,
-                f"Could not parse score from LLM response: {llm_response[:100]}...",
-            )
-
-        return score, f"Custom answer correctness: {score:.2f} - {reason}"
+        try:
+            llm_response = self._call_llm(prompt)
+            score, reason = self._parse_score_response(llm_response)
+
+            if score is None:
+                return (
+                    None,
+                    f"Could not parse score from LLM response: {llm_response[:100]}...",
+                )
+
+            return score, f"Custom answer correctness: {score:.2f} - {reason}"
+        except LLMError as e:
+            return None, f"Answer correctness evaluation failed: {str(e)}"
 
     def _evaluate_tool_calls(
         self,
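
A closing note on _call_llm: BaseCustomLLM.call is annotated Union[str, list[str]], so the isinstance branch exists to narrow the type for static checkers, even though return_single=True with the default n=1 yields a plain string at runtime. A minimal sketch of the same narrowing:

from typing import Union

def first_text(result: Union[str, list[str]]) -> str:
    # Narrow the Union for type checkers; mirrors _call_llm above.
    if isinstance(result, list):
        return result[0] if result else ""
    return result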
