7 changes: 6 additions & 1 deletion src/lightspeed_evaluation/core/metrics/__init__.py
@@ -5,4 +5,9 @@
from lightspeed_evaluation.core.metrics.ragas import RagasMetrics
from lightspeed_evaluation.core.metrics.script import ScriptEvalMetrics

__all__ = ["RagasMetrics", "DeepEvalMetrics", "CustomMetrics", "ScriptEvalMetrics"]
__all__ = [
"RagasMetrics",
"DeepEvalMetrics",
"CustomMetrics",
"ScriptEvalMetrics",
]
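
With ScriptEvalMetrics added to __all__, all four evaluators are part of the package's public interface, so downstream code can import them directly (a usage sketch; the import path follows from this file's location):

from lightspeed_evaluation.core.metrics import (
    CustomMetrics,
    DeepEvalMetrics,
    RagasMetrics,
    ScriptEvalMetrics,
)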
73 changes: 59 additions & 14 deletions src/lightspeed_evaluation/core/metrics/deepeval.py
@@ -1,5 +1,11 @@
"""DeepEval metrics evaluation using LLM Manager."""
"""DeepEval metrics evaluation using LLM Manager.

This module provides integration with DeepEval metrics including:
1. Standard DeepEval metrics (conversation completeness, relevancy, knowledge retention)
2. GEval integration for configurable custom evaluation criteria
"""

import logging
from typing import Any, Optional

import litellm
@@ -16,28 +22,45 @@
from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
from lightspeed_evaluation.core.llm.manager import LLMManager
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
from lightspeed_evaluation.core.metrics.geval import GEvalHandler

logger = logging.getLogger(__name__)


class DeepEvalMetrics: # pylint: disable=too-few-public-methods
"""Handles DeepEval metrics evaluation using LLM Manager."""
"""Handles DeepEval metrics evaluation using LLM Manager.

This class provides a unified interface for both standard DeepEval metrics
and GEval (configurable custom metrics). It shares LLM resources between
both evaluation types for efficiency.
"""

def __init__(self, llm_manager: LLMManager):
def __init__(self, llm_manager: LLMManager, registry_path: str | None = None):
"""Initialize with LLM Manager.

Args:
llm_manager: Pre-configured LLMManager with validated parameters
registry_path: Optional path to GEval metrics registry YAML
"""
# Setup cache if enabled (shared across all DeepEval operations)
if llm_manager.get_config().cache_enabled and litellm.cache is None:
cache_dir = llm_manager.get_config().cache_dir
# Modify the global litellm cache, as there is no clear way to do it per model
# Check that litellm.cache is not already set, as there is a potential conflict with Ragas code
litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=cache_dir)

# Create LLM Manager for DeepEval metrics
# Create shared LLM Manager for all DeepEval metrics (standard + GEval)
self.llm_manager = DeepEvalLLMManager(
llm_manager.get_model_name(), llm_manager.get_llm_params()
)

# Initialize GEval handler with shared LLM manager
self.geval_handler = GEvalHandler(
deepeval_llm_manager=self.llm_manager,
registry_path=registry_path,
)

# Standard DeepEval metrics routing
self.supported_metrics = {
"conversation_completeness": self._evaluate_conversation_completeness,
"conversation_relevancy": self._evaluate_conversation_relevancy,
@@ -72,16 +95,38 @@ def evaluate(
conv_data: Any,
scope: EvaluationScope,
) -> tuple[Optional[float], str]:
"""Evaluate a DeepEval metric."""
if metric_name not in self.supported_metrics:
return None, f"Unsupported DeepEval metric: {metric_name}"

try:
return self.supported_metrics[metric_name](
conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation
)
except (ValueError, AttributeError, KeyError) as e:
return None, f"DeepEval {metric_name} evaluation failed: {str(e)}"
"""Evaluate a DeepEval metric (standard or GEval).

This method routes evaluation to either:
- Standard DeepEval metrics (hardcoded implementations)
- GEval metrics (configuration-driven custom metrics)

Args:
metric_name: Name of metric (for GEval, this should NOT include the "geval:" prefix)
conv_data: Conversation data object
scope: EvaluationScope containing turn info and conversation flag

Returns:
Tuple of (score, reason)
"""
# Route to standard DeepEval metrics
if metric_name in self.supported_metrics:
try:
return self.supported_metrics[metric_name](
conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation
)
except (ValueError, AttributeError, KeyError) as e:
return None, f"DeepEval {metric_name} evaluation failed: {str(e)}"

# Otherwise, assume it's a GEval metric
# Note: metric_name should NOT have a "geval:" prefix here
return self.geval_handler.evaluate(
metric_name=metric_name,
conv_data=conv_data,
_turn_idx=scope.turn_idx,
turn_data=scope.turn_data,
is_conversation=scope.is_conversation,
)

def _evaluate_conversation_completeness(
self,
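
The routing introduced above can be summarised with a usage sketch (conv_data, scope, and the "answer_tone" name below are placeholders for illustration; only the evaluate() signature, the "conversation_completeness" metric, and the fallthrough behaviour come from this diff):

# Known name -> hardcoded standard DeepEval implementation
score, reason = metrics.evaluate("conversation_completeness", conv_data, scope)

# Unknown name -> assumed to be a GEval metric and routed to GEvalHandler.evaluate()
# (the name is passed without any "geval:" prefix; "answer_tone" is hypothetical)
score, reason = metrics.evaluate("answer_tone", conv_data, scope)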