Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions config/registry/geval_metrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# GEval Metric Registry
# Define reusable GEval metrics here to avoid repetition in evaluation scenarios.
# These metrics can be referenced in evaluation YAMLs using "geval:<metric_name>"
# without needing to repeat the full configuration.
# Caveat: These metrics were auto-generated and should be verified before
# being used in production.

# ==============================================================================
# TURN-LEVEL METRICS
# ==============================================================================

technical_accuracy:
criteria: |
Assess whether the response provides technically accurate Ansible commands,
playbook syntax, and follows Red Hat best practices. The response should
contain valid YAML syntax and appropriate Ansible modules.
evaluation_params:
- input
- actual_output
- expected_output
evaluation_steps:
- "Verify that the Ansible syntax is valid and follows YAML formatting rules"
- "Check if the response uses appropriate Ansible modules and parameters"
- "Assess whether the solution aligns with Red Hat Ansible documentation"
- "Verify the response addresses the user's specific query or task"
- "Check for potential security issues or anti-patterns"
threshold: 0.7

command_validity:
criteria: |
Evaluate whether the generated commands or playbook tasks are syntactically
correct and would execute successfully in a real Ansible environment.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Verify proper YAML indentation and structure"
- "Check that module names are valid and correctly spelled"
- "Ensure required parameters for modules are present"
- "Validate that variables and facts are properly referenced"
threshold: 0.8

ansible_best_practices:
criteria: |
Determine whether the response follows Ansible best practices including
idempotency, task naming, handler usage, and role organization.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Check if tasks are idempotent (can be run multiple times safely)"
- "Verify tasks have descriptive names"
- "Assess proper use of handlers for service restarts"
- "Check for proper variable naming conventions"
- "Verify appropriate use of when conditions and loops"
threshold: 0.7

security_awareness:
criteria: |
Evaluate whether the response demonstrates security awareness and avoids
common security pitfalls in Ansible automation.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Check for hardcoded credentials or sensitive data"
- "Verify proper use of Ansible Vault references where needed"
- "Assess appropriate file permissions in file/template modules"
- "Check for secure defaults in configurations"
- "Verify no_log usage for sensitive tasks"
threshold: 0.8

# ==============================================================================
# CONVERSATION-LEVEL METRICS
# ==============================================================================

conversation_coherence:
criteria: |
Evaluate whether the conversation maintains context and provides coherent
responses across multiple turns. The assistant should reference previous
exchanges and build upon earlier context.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Check if the assistant remembers information from previous turns"
- "Verify responses build logically on previous context"
- "Assess whether the conversation flows naturally"
- "Check for contradictions with earlier statements"
threshold: 0.6

task_completion:
criteria: |
Assess whether the conversation successfully helps the user complete their
intended Ansible automation task from start to finish.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Determine if the user's original goal was identified"
- "Check if all necessary steps were provided"
- "Verify the solution is complete and actionable"
- "Assess if follow-up questions were addressed"
threshold: 0.7

progressive_refinement:
criteria: |
Evaluate whether the conversation demonstrates progressive improvement and
refinement of the Ansible solution based on user feedback and clarifications.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Check if responses incorporate user feedback"
- "Verify solutions become more specific over turns"
- "Assess whether earlier mistakes are corrected"
- "Check if the assistant adapts to user skill level"
threshold: 0.6

# ==============================================================================
# EXAMPLE USAGE
# ==============================================================================
#
# In your evaluation_data.yaml, reference these metrics as:
#
# turn_metrics:
# - "geval:technical_accuracy"
# - "geval:command_validity"
# - "geval:ansible_best_practices"
#
# conversation_metrics:
# - "geval:conversation_coherence"
# - "geval:task_completion"
#
# You can also override these definitions at runtime using turn_metrics_metadata
# or conversation_metrics_metadata in your evaluation data.
8 changes: 8 additions & 0 deletions config/system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ api:
cache_dir: ".caches/api_cache" # Directory with lightspeed-stack cache
cache_enabled: true # Is lightspeed-stack cache enabled?
# Authentication via API_KEY environment variable only for MCP server

# GEval Configuration
# Configurable custom metrics using DeepEval's GEval framework
geval:
enabled: true # Enable GEval metrics evaluation
registry_path: "config/registry/geval_metrics.yaml" # Path to GEval metrics registry
default_turn_metrics: [] # Optional: auto-apply turn-level GEval metrics
default_conversation_metrics: [] # Optional: auto-apply conversation-level GEval metrics

# Default metrics metadata
metrics_metadata:
Expand Down
7 changes: 6 additions & 1 deletion src/lightspeed_evaluation/core/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,9 @@
from lightspeed_evaluation.core.metrics.ragas import RagasMetrics
from lightspeed_evaluation.core.metrics.script import ScriptEvalMetrics

__all__ = ["RagasMetrics", "DeepEvalMetrics", "CustomMetrics", "ScriptEvalMetrics"]
__all__ = [
"RagasMetrics",
"DeepEvalMetrics",
"CustomMetrics",
"ScriptEvalMetrics",
]
73 changes: 59 additions & 14 deletions src/lightspeed_evaluation/core/metrics/deepeval.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
"""DeepEval metrics evaluation using LLM Manager."""
"""DeepEval metrics evaluation using LLM Manager.

This module provides integration with DeepEval metrics including:
1. Standard DeepEval metrics (conversation completeness, relevancy, knowledge retention)
2. GEval integration for configurable custom evaluation criteria
"""

import logging
from typing import Any, Optional

import litellm
Expand All @@ -16,28 +22,45 @@
from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
from lightspeed_evaluation.core.llm.manager import LLMManager
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
from lightspeed_evaluation.core.metrics.geval import GEvalHandler

logger = logging.getLogger(__name__)


class DeepEvalMetrics: # pylint: disable=too-few-public-methods
"""Handles DeepEval metrics evaluation using LLM Manager."""
"""Handles DeepEval metrics evaluation using LLM Manager.

This class provides a unified interface for both standard DeepEval metrics
and GEval (configurable custom metrics). It shares LLM resources between
both evaluation types for efficiency.
"""

def __init__(self, llm_manager: LLMManager):
def __init__(self, llm_manager: LLMManager, registry_path: str | None = None):
"""Initialize with LLM Manager.

Args:
llm_manager: Pre-configured LLMManager with validated parameters
registry_path: Optional path to GEval metrics registry YAML
"""
# Setup cache if enabled (shared across all DeepEval operations)
if llm_manager.get_config().cache_enabled and litellm.cache is None:
cache_dir = llm_manager.get_config().cache_dir
# Modifying the global litellm cache, as there is no clear way to configure it per model
# Checking whether litellm.cache is already set, as there is a potential conflict with the Ragas code
litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=cache_dir)

# Create LLM Manager for DeepEval metrics
# Create shared LLM Manager for all DeepEval metrics (standard + GEval)
self.llm_manager = DeepEvalLLMManager(
llm_manager.get_model_name(), llm_manager.get_llm_params()
)

# Initialize GEval handler with shared LLM manager
self.geval_handler = GEvalHandler(
deepeval_llm_manager=self.llm_manager,
registry_path=registry_path,
)

# Standard DeepEval metrics routing
self.supported_metrics = {
"conversation_completeness": self._evaluate_conversation_completeness,
"conversation_relevancy": self._evaluate_conversation_relevancy,
Expand Down Expand Up @@ -72,16 +95,38 @@ def evaluate(
conv_data: Any,
scope: EvaluationScope,
) -> tuple[Optional[float], str]:
"""Evaluate a DeepEval metric."""
if metric_name not in self.supported_metrics:
return None, f"Unsupported DeepEval metric: {metric_name}"

try:
return self.supported_metrics[metric_name](
conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation
)
except (ValueError, AttributeError, KeyError) as e:
return None, f"DeepEval {metric_name} evaluation failed: {str(e)}"
"""Evaluate a DeepEval metric (standard or GEval).

This method routes evaluation to either:
- Standard DeepEval metrics (hardcoded implementations)
- GEval metrics (configuration-driven custom metrics)

Args:
metric_name: Name of metric (for GEval, this should NOT include "geval:" prefix)
conv_data: Conversation data object
scope: EvaluationScope containing turn info and conversation flag

Returns:
Tuple of (score, reason)
"""
# Route to standard DeepEval metrics
if metric_name in self.supported_metrics:
try:
return self.supported_metrics[metric_name](
conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation
)
except (ValueError, AttributeError, KeyError) as e:
return None, f"DeepEval {metric_name} evaluation failed: {str(e)}"

# Otherwise, assume it's a GEval metric
# Note: metric_name should NOT have "geval:" prefix here
return self.geval_handler.evaluate(
metric_name=metric_name,
conv_data=conv_data,
turn_idx=scope.turn_idx,
turn_data=scope.turn_data,
is_conversation=scope.is_conversation,
)

def _evaluate_conversation_completeness(
self,
Expand Down
Loading
Loading