Skip to content
Merged
42 changes: 40 additions & 2 deletions config/system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ core:

# LLM as a judge configuration
llm:
provider: "openai" # LLM Provider (openai, watsonx, gemini, hosted_vllm etc..)
provider: "openai" # LLM Provider (openai, watsonx, gemini, hosted_vllm, etc.)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: only spaces, I'd keep the original indent

model: "gpt-4o-mini" # Model name for the provider
temperature: 0.0 # Generation temperature
max_tokens: 512 # Maximum tokens in response
Expand All @@ -28,7 +28,7 @@ embedding:
# To get real time data. Currently it supports lightspeed-stack API.
# But can be easily integrated with other APIs with minimal change.
api:
enabled: true # Enable API calls instead of using pre-filled data
enabled: true # Enable API calls instead of using pre-filled data
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: only spaces, I'd keep the original indent

api_base: http://localhost:8080 # Base API URL
endpoint_type: streaming # Use "streaming" or "query" endpoint
timeout: 300 # API request timeout in seconds
Expand Down Expand Up @@ -91,6 +91,26 @@ metrics_metadata:
"script:action_eval":
description: "Script-based evaluation for infrastructure/environment validation"

# GEval turn-level metrics
"geval:technical_accuracy":
criteria: |
Assess whether the response provides technically accurate information,
commands, code, syntax, and follows relevant industry or
domain-specific best practices. The response should
contain valid syntax and use appropriate functions, modules, or tools.
evaluation_params:
- query
- response
- expected_response
evaluation_steps:
- "Verify that the provided syntax (e.g., code, commands, configuration) is valid and follows the language/tool's formatting rules."
- "Check if the response uses appropriate modules, functions, libraries, or parameters for the given task."
- "Assess whether the solution aligns with relevant official documentation or established best practices for the specific domain."
- "Verify the response directly and accurately addresses the user's specific query or task."
- "Check for potential security issues, significant inefficiencies, or anti-patterns."
threshold: 0.7
description: "General technical accuracy of provided commands, code, or technical information"

# Conversation-level metrics metadata
conversation_level:
# DeepEval metrics
Expand All @@ -107,6 +127,24 @@ metrics_metadata:
threshold: 0.7
description: "How well the model retains information from previous turns"

# GEval conversation-level metrics
"geval:conversation_coherence":
criteria: |
Evaluate whether the conversation maintains context and provides coherent
responses across multiple turns. The assistant should reference previous
exchanges and build upon earlier context.
evaluation_params:
- query
- response
evaluation_steps:
- "Check if the assistant remembers information from previous turns"
- "Verify responses build logically on previous context"
- "Assess whether the conversation flows naturally"
- "Check for contradictions with earlier statements"
threshold: 0.6
description: "Context maintenance and coherence across conversation turns"


# Output Configuration
output:
output_dir: "./eval_output"
Expand Down
7 changes: 6 additions & 1 deletion src/lightspeed_evaluation/core/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,9 @@
from lightspeed_evaluation.core.metrics.ragas import RagasMetrics
from lightspeed_evaluation.core.metrics.script import ScriptEvalMetrics

__all__ = ["RagasMetrics", "DeepEvalMetrics", "CustomMetrics", "ScriptEvalMetrics"]
# Public API of the metrics package: evaluator classes re-exported from the
# submodules imported above, controlling what `import *` exposes.
__all__ = [
    "RagasMetrics",
    "DeepEvalMetrics",
    "CustomMetrics",
    "ScriptEvalMetrics",
]
82 changes: 68 additions & 14 deletions src/lightspeed_evaluation/core/metrics/deepeval.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
"""DeepEval metrics evaluation using LLM Manager."""
"""DeepEval metrics evaluation using LLM Manager.

This module provides integration with DeepEval metrics including:
1. Standard DeepEval metrics (conversation completeness, relevancy, knowledge retention)
2. GEval integration for configurable custom evaluation criteria
"""

import logging
from typing import Any, Optional

import litellm
Expand All @@ -15,29 +21,51 @@

from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
from lightspeed_evaluation.core.llm.manager import LLMManager
from lightspeed_evaluation.core.metrics.geval import GEvalHandler
from lightspeed_evaluation.core.metrics.manager import MetricManager
from lightspeed_evaluation.core.models import EvaluationScope, TurnData

logger = logging.getLogger(__name__)


class DeepEvalMetrics: # pylint: disable=too-few-public-methods
"""Handles DeepEval metrics evaluation using LLM Manager."""
"""Handles DeepEval metrics evaluation using LLM Manager.

def __init__(self, llm_manager: LLMManager):
This class provides a unified interface for both standard DeepEval metrics
and GEval (configurable custom metrics). It shares LLM resources between
both evaluation types for efficiency.
"""

def __init__(
self,
llm_manager: LLMManager,
metric_manager: MetricManager,
):
"""Initialize with LLM Manager.

Args:
llm_manager: Pre-configured LLMManager with validated parameters
metric_manager: MetricManager for accessing metric metadata
"""
# Setup cache if enabled (shared across all DeepEval operations)
if llm_manager.get_config().cache_enabled and litellm.cache is None:
cache_dir = llm_manager.get_config().cache_dir
# Modifying global litellm cache as there is no clear way how to do it per model
# Checking if the litellm.cache as there is potential conflict with Ragas code
litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=cache_dir)

# Create LLM Manager for DeepEval metrics
# Create shared LLM Manager for all DeepEval metrics (standard + GEval)
self.llm_manager = DeepEvalLLMManager(
llm_manager.get_model_name(), llm_manager.get_llm_params()
)

# Initialize GEval handler with shared LLM manager and metric manager
self.geval_handler = GEvalHandler(
deepeval_llm_manager=self.llm_manager,
metric_manager=metric_manager,
)

# Standard DeepEval metrics routing
self.supported_metrics = {
"conversation_completeness": self._evaluate_conversation_completeness,
"conversation_relevancy": self._evaluate_conversation_relevancy,
Expand Down Expand Up @@ -72,16 +100,42 @@ def evaluate(
conv_data: Any,
scope: EvaluationScope,
) -> tuple[Optional[float], str]:
"""Evaluate a DeepEval metric."""
if metric_name not in self.supported_metrics:
return None, f"Unsupported DeepEval metric: {metric_name}"

try:
return self.supported_metrics[metric_name](
conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation
)
except (ValueError, AttributeError, KeyError) as e:
return None, f"DeepEval {metric_name} evaluation failed: {str(e)}"
"""Evaluate a DeepEval metric (standard or GEval).

This method routes evaluation to either:
- Standard DeepEval metrics (hardcoded implementations)
- GEval metrics (configuration-driven custom metrics)

Args:
metric_name: Name of metric (for GEval, this should NOT include "geval:" prefix)
conv_data: Conversation data object
scope: EvaluationScope containing turn info and conversation flag

Returns:
Tuple of (score, reason)
"""
# Route to standard DeepEval metrics
if metric_name in self.supported_metrics:
try:
return self.supported_metrics[metric_name](
conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation
)
except (ValueError, AttributeError, KeyError) as e:
return None, f"DeepEval {metric_name} evaluation failed: {str(e)}"

# Otherwise, assume it's a GEval metric
normalized_metric_name = (
metric_name.split(":", 1)[1]
if metric_name.startswith("geval:")
else metric_name
)
return self.geval_handler.evaluate(
metric_name=normalized_metric_name,
conv_data=conv_data,
_turn_idx=scope.turn_idx,
turn_data=scope.turn_data,
is_conversation=scope.is_conversation,
)

def _evaluate_conversation_completeness(
self,
Expand Down
Loading
Loading