Skip to content
Merged
42 changes: 40 additions & 2 deletions config/system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ core:

# LLM as a judge configuration
llm:
provider: "openai" # LLM Provider (openai, watsonx, gemini, hosted_vllm etc..)
provider: "openai" # LLM Provider (openai, watsonx, gemini, hosted_vllm, etc.)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: only spaces, I'd keep the original indent

model: "gpt-4o-mini" # Model name for the provider
temperature: 0.0 # Generation temperature
max_tokens: 512 # Maximum tokens in response
Expand All @@ -28,7 +28,7 @@ embedding:
# To get real time data. Currently it supports lightspeed-stack API.
# But can be easily integrated with other APIs with minimal change.
api:
enabled: true # Enable API calls instead of using pre-filled data
enabled: true # Enable API calls instead of using pre-filled data
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: only spaces, I'd keep the original indent

api_base: http://localhost:8080 # Base API URL
endpoint_type: streaming # Use "streaming" or "query" endpoint
timeout: 300 # API request timeout in seconds
Expand Down Expand Up @@ -91,6 +91,26 @@ metrics_metadata:
"script:action_eval":
description: "Script-based evaluation for infrastructure/environment validation"

# GEval turn-level metrics
"geval:technical_accuracy":
criteria: |
Assess whether the response provides technically accurate information,
commands, code, syntax, and follows relevant industry or
domain-specific best practices. The response should
contain valid syntax and use appropriate functions, modules, or tools.
evaluation_params:
- query
- response
- expected_response
evaluation_steps:
- "Verify that the provided syntax (e.g., code, commands, configuration) is valid and follows the language/tool's formatting rules."
- "Check if the response uses appropriate modules, functions, libraries, or parameters for the given task."
- "Assess whether the solution aligns with relevant official documentation or established best practices for the specific domain."
- "Verify the response directly and accurately addresses the user's specific query or task."
- "Check for potential security issues, significant inefficiencies, or anti-patterns."
threshold: 0.7
description: "General technical accuracy of provided commands, code, or technical information"

# Conversation-level metrics metadata
conversation_level:
# DeepEval metrics
Expand All @@ -107,6 +127,24 @@ metrics_metadata:
threshold: 0.7
description: "How well the model retains information from previous turns"

# GEval conversation-level metrics
"geval:conversation_coherence":
criteria: |
Evaluate whether the conversation maintains context and provides coherent
responses across multiple turns. The assistant should reference previous
exchanges and build upon earlier context.
evaluation_params:
- query
- response
evaluation_steps:
- "Check if the assistant remembers information from previous turns"
- "Verify responses build logically on previous context"
- "Assess whether the conversation flows naturally"
- "Check for contradictions with earlier statements"
threshold: 0.6
description: "Context maintenance and coherence across conversation turns"


# Output Configuration
output:
output_dir: "./eval_output"
Expand Down
7 changes: 6 additions & 1 deletion src/lightspeed_evaluation/core/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,9 @@
from lightspeed_evaluation.core.metrics.ragas import RagasMetrics
from lightspeed_evaluation.core.metrics.script import ScriptEvalMetrics

__all__ = ["RagasMetrics", "DeepEvalMetrics", "CustomMetrics", "ScriptEvalMetrics"]
# Public API of the metrics package: evaluator classes re-exported from the
# submodules imported above, controlling what `import *` exposes.
__all__ = [
    "RagasMetrics",
    "DeepEvalMetrics",
    "CustomMetrics",
    "ScriptEvalMetrics",
]
82 changes: 68 additions & 14 deletions src/lightspeed_evaluation/core/metrics/deepeval.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
"""DeepEval metrics evaluation using LLM Manager."""
"""DeepEval metrics evaluation using LLM Manager.

This module provides integration with DeepEval metrics including:
1. Standard DeepEval metrics (conversation completeness, relevancy, knowledge retention)
2. GEval integration for configurable custom evaluation criteria
"""

import logging
from typing import Any, Optional

import litellm
Expand All @@ -15,29 +21,51 @@

from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
from lightspeed_evaluation.core.llm.manager import LLMManager
from lightspeed_evaluation.core.metrics.geval import GEvalHandler
from lightspeed_evaluation.core.metrics.manager import MetricManager
from lightspeed_evaluation.core.models import EvaluationScope, TurnData

logger = logging.getLogger(__name__)


class DeepEvalMetrics: # pylint: disable=too-few-public-methods
"""Handles DeepEval metrics evaluation using LLM Manager."""
"""Handles DeepEval metrics evaluation using LLM Manager.

def __init__(self, llm_manager: LLMManager):
This class provides a unified interface for both standard DeepEval metrics
and GEval (configurable custom metrics). It shares LLM resources between
both evaluation types for efficiency.
"""

def __init__(
self,
llm_manager: LLMManager,
metric_manager: MetricManager,
):
"""Initialize with LLM Manager.

Args:
llm_manager: Pre-configured LLMManager with validated parameters
metric_manager: MetricManager for accessing metric metadata
"""
# Setup cache if enabled (shared across all DeepEval operations)
if llm_manager.get_config().cache_enabled and litellm.cache is None:
cache_dir = llm_manager.get_config().cache_dir
# Modifying global litellm cache as there is no clear way how to do it per model
# Checking if the litellm.cache as there is potential conflict with Ragas code
litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=cache_dir)

# Create LLM Manager for DeepEval metrics
# Create shared LLM Manager for all DeepEval metrics (standard + GEval)
self.llm_manager = DeepEvalLLMManager(
llm_manager.get_model_name(), llm_manager.get_llm_params()
)

# Initialize GEval handler with shared LLM manager and metric manager
self.geval_handler = GEvalHandler(
deepeval_llm_manager=self.llm_manager,
metric_manager=metric_manager,
)

# Standard DeepEval metrics routing
self.supported_metrics = {
"conversation_completeness": self._evaluate_conversation_completeness,
"conversation_relevancy": self._evaluate_conversation_relevancy,
Expand Down Expand Up @@ -72,16 +100,42 @@ def evaluate(
conv_data: Any,
scope: EvaluationScope,
) -> tuple[Optional[float], str]:
"""Evaluate a DeepEval metric."""
if metric_name not in self.supported_metrics:
return None, f"Unsupported DeepEval metric: {metric_name}"

try:
return self.supported_metrics[metric_name](
conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation
)
except (ValueError, AttributeError, KeyError) as e:
return None, f"DeepEval {metric_name} evaluation failed: {str(e)}"
"""Evaluate a DeepEval metric (standard or GEval).

This method routes evaluation to either:
- Standard DeepEval metrics (hardcoded implementations)
- GEval metrics (configuration-driven custom metrics)

Args:
metric_name: Name of metric (for GEval, this should NOT include "geval:" prefix)
conv_data: Conversation data object
scope: EvaluationScope containing turn info and conversation flag

Returns:
Tuple of (score, reason)
"""
# Route to standard DeepEval metrics
if metric_name in self.supported_metrics:
try:
return self.supported_metrics[metric_name](
conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation
)
except (ValueError, AttributeError, KeyError) as e:
return None, f"DeepEval {metric_name} evaluation failed: {str(e)}"

# Otherwise, assume it's a GEval metric
normalized_metric_name = (
metric_name.split(":", 1)[1]
if metric_name.startswith("geval:")
else metric_name
)
return self.geval_handler.evaluate(
metric_name=normalized_metric_name,
conv_data=conv_data,
_turn_idx=scope.turn_idx,
turn_data=scope.turn_data,
is_conversation=scope.is_conversation,
)

def _evaluate_conversation_completeness(
self,
Expand Down
Loading
Loading