Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions config/registry/geval_metrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# GEval Metric Registry
# Define reusable GEval metrics here to avoid repetition in evaluation scenarios.
# These metrics can be referenced in evaluation YAMLs using "geval:<metric_name>"
# without needing to repeat the full configuration.
# Caveat: These metrics were auto-generated and should be verified before
# being used in production.

# ==============================================================================
# TURN-LEVEL METRICS
# ==============================================================================

technical_accuracy:
criteria: |
Assess whether the response provides technically accurate Ansible commands,
playbook syntax, and follows Red Hat best practices. The response should
contain valid YAML syntax and appropriate Ansible modules.
evaluation_params:
- input
- actual_output
- expected_output
evaluation_steps:
- "Verify that the Ansible syntax is valid and follows YAML formatting rules"
- "Check if the response uses appropriate Ansible modules and parameters"
- "Assess whether the solution aligns with Red Hat Ansible documentation"
- "Verify the response addresses the user's specific query or task"
- "Check for potential security issues or anti-patterns"
threshold: 0.7

command_validity:
criteria: |
Evaluate whether the generated commands or playbook tasks are syntactically
correct and would execute successfully in a real Ansible environment.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Verify proper YAML indentation and structure"
- "Check that module names are valid and correctly spelled"
- "Ensure required parameters for modules are present"
- "Validate that variables and facts are properly referenced"
threshold: 0.8

ansible_best_practices:
criteria: |
Determine whether the response follows Ansible best practices including
idempotency, task naming, handler usage, and role organization.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Check if tasks are idempotent (can be run multiple times safely)"
- "Verify tasks have descriptive names"
- "Assess proper use of handlers for service restarts"
- "Check for proper variable naming conventions"
- "Verify appropriate use of when conditions and loops"
threshold: 0.7

security_awareness:
criteria: |
Evaluate whether the response demonstrates security awareness and avoids
common security pitfalls in Ansible automation.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Check for hardcoded credentials or sensitive data"
- "Verify proper use of Ansible Vault references where needed"
- "Assess appropriate file permissions in file/template modules"
- "Check for secure defaults in configurations"
- "Verify no_log usage for sensitive tasks"
threshold: 0.8

# ==============================================================================
# CONVERSATION-LEVEL METRICS
# ==============================================================================

conversation_coherence:
criteria: |
Evaluate whether the conversation maintains context and provides coherent
responses across multiple turns. The assistant should reference previous
exchanges and build upon earlier context.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Check if the assistant remembers information from previous turns"
- "Verify responses build logically on previous context"
- "Assess whether the conversation flows naturally"
- "Check for contradictions with earlier statements"
threshold: 0.6

task_completion:
criteria: |
Assess whether the conversation successfully helps the user complete their
intended Ansible automation task from start to finish.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Determine if the user's original goal was identified"
- "Check if all necessary steps were provided"
- "Verify the solution is complete and actionable"
- "Assess if follow-up questions were addressed"
threshold: 0.7

progressive_refinement:
criteria: |
Evaluate whether the conversation demonstrates progressive improvement and
refinement of the Ansible solution based on user feedback and clarifications.
evaluation_params:
- input
- actual_output
evaluation_steps:
- "Check if responses incorporate user feedback"
- "Verify solutions become more specific over turns"
- "Assess whether earlier mistakes are corrected"
- "Check if the assistant adapts to user skill level"
threshold: 0.6

# ==============================================================================
# EXAMPLE USAGE
# ==============================================================================
#
# In your evaluation_data.yaml, reference these metrics as:
#
# turn_metrics:
# - "geval:technical_accuracy"
# - "geval:command_validity"
# - "geval:ansible_best_practices"
#
# conversation_metrics:
# - "geval:conversation_coherence"
# - "geval:task_completion"
#
# You can also override these definitions at runtime using turn_metrics_metadata
# or conversation_metrics_metadata in your evaluation data.
8 changes: 8 additions & 0 deletions config/system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ api:
cache_dir: ".caches/api_cache" # Directory with lightspeed-stack cache
cache_enabled: true # Is lightspeed-stack cache enabled?
# Authentication via API_KEY environment variable only for MCP server

# GEval Configuration
# Configurable custom metrics using DeepEval's GEval framework
geval:
enabled: true # Enable GEval metrics evaluation
registry_path: "config/registry/geval_metrics.yaml" # Path to GEval metrics registry
default_turn_metrics: [] # Optional: auto-apply turn-level GEval metrics
default_conversation_metrics: [] # Optional: auto-apply conversation-level GEval metrics

# Default metrics metadata
metrics_metadata:
Expand Down
7 changes: 6 additions & 1 deletion src/lightspeed_evaluation/core/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,9 @@
from lightspeed_evaluation.core.metrics.ragas import RagasMetrics
from lightspeed_evaluation.core.metrics.script import ScriptEvalMetrics

__all__ = ["RagasMetrics", "DeepEvalMetrics", "CustomMetrics", "ScriptEvalMetrics"]
__all__ = [
"RagasMetrics",
"DeepEvalMetrics",
"CustomMetrics",
"ScriptEvalMetrics",
]
73 changes: 59 additions & 14 deletions src/lightspeed_evaluation/core/metrics/deepeval.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
"""DeepEval metrics evaluation using LLM Manager."""
"""DeepEval metrics evaluation using LLM Manager.

This module provides integration with DeepEval metrics including:
1. Standard DeepEval metrics (conversation completeness, relevancy, knowledge retention)
2. GEval integration for configurable custom evaluation criteria
"""

import logging
from typing import Any, Optional

import litellm
Expand All @@ -16,28 +22,45 @@
from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
from lightspeed_evaluation.core.llm.manager import LLMManager
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
from lightspeed_evaluation.core.metrics.geval import GEvalHandler

logger = logging.getLogger(__name__)


class DeepEvalMetrics: # pylint: disable=too-few-public-methods
"""Handles DeepEval metrics evaluation using LLM Manager."""
"""Handles DeepEval metrics evaluation using LLM Manager.

This class provides a unified interface for both standard DeepEval metrics
and GEval (configurable custom metrics). It shares LLM resources between
both evaluation types for efficiency.
"""

def __init__(self, llm_manager: LLMManager):
def __init__(self, llm_manager: LLMManager, registry_path: str | None = None):
"""Initialize with LLM Manager.

Args:
llm_manager: Pre-configured LLMManager with validated parameters
registry_path: Optional path to GEval metrics registry YAML
"""
# Setup cache if enabled (shared across all DeepEval operations)
if llm_manager.get_config().cache_enabled and litellm.cache is None:
cache_dir = llm_manager.get_config().cache_dir
# Modifying the global litellm cache, as there is no clear way to configure it per model
# Checking whether litellm.cache is already set, as there is a potential conflict with the Ragas code
litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=cache_dir)

# Create LLM Manager for DeepEval metrics
# Create shared LLM Manager for all DeepEval metrics (standard + GEval)
self.llm_manager = DeepEvalLLMManager(
llm_manager.get_model_name(), llm_manager.get_llm_params()
)

# Initialize GEval handler with shared LLM manager
self.geval_handler = GEvalHandler(
deepeval_llm_manager=self.llm_manager,
registry_path=registry_path,
)

# Standard DeepEval metrics routing
self.supported_metrics = {
"conversation_completeness": self._evaluate_conversation_completeness,
"conversation_relevancy": self._evaluate_conversation_relevancy,
Expand Down Expand Up @@ -72,16 +95,38 @@ def evaluate(
conv_data: Any,
scope: EvaluationScope,
) -> tuple[Optional[float], str]:
"""Evaluate a DeepEval metric."""
if metric_name not in self.supported_metrics:
return None, f"Unsupported DeepEval metric: {metric_name}"

try:
return self.supported_metrics[metric_name](
conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation
)
except (ValueError, AttributeError, KeyError) as e:
return None, f"DeepEval {metric_name} evaluation failed: {str(e)}"
"""Evaluate a DeepEval metric (standard or GEval).

This method routes evaluation to either:
- Standard DeepEval metrics (hardcoded implementations)
- GEval metrics (configuration-driven custom metrics)

Args:
metric_name: Name of metric (for GEval, this should NOT include "geval:" prefix)
conv_data: Conversation data object
scope: EvaluationScope containing turn info and conversation flag

Returns:
Tuple of (score, reason)
"""
# Route to standard DeepEval metrics
if metric_name in self.supported_metrics:
try:
return self.supported_metrics[metric_name](
conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation
)
except (ValueError, AttributeError, KeyError) as e:
return None, f"DeepEval {metric_name} evaluation failed: {str(e)}"

# Otherwise, assume it's a GEval metric
# Note: metric_name should NOT have "geval:" prefix here
return self.geval_handler.evaluate(
metric_name=metric_name,
conv_data=conv_data,
turn_idx=scope.turn_idx,
turn_data=scope.turn_data,
is_conversation=scope.is_conversation,
)

def _evaluate_conversation_completeness(
self,
Expand Down
Loading
Loading