From 9a179404e22b1dfba822da92674fda95a609ad77 Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Fri, 7 Nov 2025 15:15:00 -0700 Subject: [PATCH 1/5] feat: added GEval handler; integrated component into DeepEvalMetrics --- .../core/metrics/__init__.py | 7 +- .../core/metrics/deepeval.py | 73 ++- .../core/metrics/geval.py | 499 ++++++++++++++++++ 3 files changed, 564 insertions(+), 15 deletions(-) create mode 100644 src/lightspeed_evaluation/core/metrics/geval.py diff --git a/src/lightspeed_evaluation/core/metrics/__init__.py b/src/lightspeed_evaluation/core/metrics/__init__.py index 8f670d01..048d977e 100644 --- a/src/lightspeed_evaluation/core/metrics/__init__.py +++ b/src/lightspeed_evaluation/core/metrics/__init__.py @@ -5,4 +5,9 @@ from lightspeed_evaluation.core.metrics.ragas import RagasMetrics from lightspeed_evaluation.core.metrics.script import ScriptEvalMetrics -__all__ = ["RagasMetrics", "DeepEvalMetrics", "CustomMetrics", "ScriptEvalMetrics"] +__all__ = [ + "RagasMetrics", + "DeepEvalMetrics", + "CustomMetrics", + "ScriptEvalMetrics", +] diff --git a/src/lightspeed_evaluation/core/metrics/deepeval.py b/src/lightspeed_evaluation/core/metrics/deepeval.py index e8eccc2a..0585a581 100644 --- a/src/lightspeed_evaluation/core/metrics/deepeval.py +++ b/src/lightspeed_evaluation/core/metrics/deepeval.py @@ -1,5 +1,11 @@ -"""DeepEval metrics evaluation using LLM Manager.""" +"""DeepEval metrics evaluation using LLM Manager. +This module provides integration with DeepEval metrics including: +1. Standard DeepEval metrics (conversation completeness, relevancy, knowledge retention) +2. GEval integration for configurable custom evaluation criteria +""" + +import logging from typing import Any, Optional import litellm @@ -16,28 +22,45 @@ from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager from lightspeed_evaluation.core.llm.manager import LLMManager from lightspeed_evaluation.core.models import EvaluationScope, TurnData +from lightspeed_evaluation.core.metrics.geval import GEvalHandler + +logger = logging.getLogger(__name__) class DeepEvalMetrics: # pylint: disable=too-few-public-methods - """Handles DeepEval metrics evaluation using LLM Manager.""" + """Handles DeepEval metrics evaluation using LLM Manager. + + This class provides a unified interface for both standard DeepEval metrics + and GEval (configurable custom metrics). It shares LLM resources between + both evaluation types for efficiency. + """ - def __init__(self, llm_manager: LLMManager): + def __init__(self, llm_manager: LLMManager, registry_path: str | None = None): """Initialize with LLM Manager. 
Args: llm_manager: Pre-configured LLMManager with validated parameters + registry_path: Optional path to GEval metrics registry YAML """ + # Setup cache if enabled (shared across all DeepEval operations) if llm_manager.get_config().cache_enabled and litellm.cache is None: cache_dir = llm_manager.get_config().cache_dir # Modifying global litellm cache as there is no clear way how to do it per model # Checking if the litellm.cache as there is potential conflict with Ragas code litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=cache_dir) - # Create LLM Manager for DeepEval metrics + # Create shared LLM Manager for all DeepEval metrics (standard + GEval) self.llm_manager = DeepEvalLLMManager( llm_manager.get_model_name(), llm_manager.get_llm_params() ) + # Initialize GEval handler with shared LLM manager + self.geval_handler = GEvalHandler( + deepeval_llm_manager=self.llm_manager, + registry_path=registry_path, + ) + + # Standard DeepEval metrics routing self.supported_metrics = { "conversation_completeness": self._evaluate_conversation_completeness, "conversation_relevancy": self._evaluate_conversation_relevancy, @@ -72,16 +95,38 @@ def evaluate( conv_data: Any, scope: EvaluationScope, ) -> tuple[Optional[float], str]: - """Evaluate a DeepEval metric.""" - if metric_name not in self.supported_metrics: - return None, f"Unsupported DeepEval metric: {metric_name}" - - try: - return self.supported_metrics[metric_name]( - conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation - ) - except (ValueError, AttributeError, KeyError) as e: - return None, f"DeepEval {metric_name} evaluation failed: {str(e)}" + """Evaluate a DeepEval metric (standard or GEval). + + This method routes evaluation to either: + - Standard DeepEval metrics (hardcoded implementations) + - GEval metrics (configuration-driven custom metrics) + + Args: + metric_name: Name of metric (for GEval, this should NOT include "geval:" prefix) + conv_data: Conversation data object + scope: EvaluationScope containing turn info and conversation flag + + Returns: + Tuple of (score, reason) + """ + # Route to standard DeepEval metrics + if metric_name in self.supported_metrics: + try: + return self.supported_metrics[metric_name]( + conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation + ) + except (ValueError, AttributeError, KeyError) as e: + return None, f"DeepEval {metric_name} evaluation failed: {str(e)}" + + # Otherwise, assume it's a GEval metric + # Note: metric_name should NOT have "geval:" prefix here + return self.geval_handler.evaluate( + metric_name=metric_name, + conv_data=conv_data, + turn_idx=scope.turn_idx, + turn_data=scope.turn_data, + is_conversation=scope.is_conversation, + ) def _evaluate_conversation_completeness( self, diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py new file mode 100644 index 00000000..df0fafb2 --- /dev/null +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -0,0 +1,499 @@ +"""GEval metrics handler using LLM Manager. + +This module provides integration with DeepEval's GEval for configurable custom evaluation criteria. +GEval allows runtime-defined evaluation metrics through YAML configuration. 
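+
+A registry entry maps a metric name to its evaluation settings. An illustrative
+sketch (the metric name, criteria text and threshold below are examples, not
+shipped defaults; the field names match what this handler reads):
+
+    technical_accuracy:
+      criteria: "Assess whether commands, API names and facts in the response are correct."
+      evaluation_params: ["input", "actual_output"]
+      evaluation_steps:
+        - "Compare the response against the provided context."
+        - "Penalize invented commands, flags or APIs."
+      threshold: 0.7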
+""" + +import logging +from pathlib import Path +from typing import Any + +import yaml +from deepeval.metrics import GEval +from deepeval.test_case import LLMTestCase, LLMTestCaseParams + +from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager + +logger = logging.getLogger(__name__) + + +class GEvalHandler: + """Handler for configurable GEval metrics. + + This class integrates with the lightspeed-evaluation framework + to provide GEval evaluation with criteria defined either in: + 1. A centralized metric registry (config/registry/geval_metrics.yaml) + 2. Runtime YAML configuration (turn_metrics_metadata) + + Priority: Runtime metadata overrides registry definitions. + """ + + # Class-level registry cache (shared across instances) + _registry: dict[str, Any] | None = None + _registry_path: Path | None = None + + def __init__( + self, + deepeval_llm_manager: DeepEvalLLMManager, + registry_path: str | None = None, + ) -> None: + """Initialize GEval handler. + + Args: + deepeval_llm_manager: Shared DeepEvalLLMManager instance + registry_path: Optional path to metric registry YAML. + If not provided, looks for config/registry/geval_metrics.yaml + relative to project root. + """ + self.deepeval_llm_manager = deepeval_llm_manager + self._load_registry(registry_path) + + def _load_registry(self, registry_path: str | None = None) -> None: + """ + Load the GEval metric registry from a YAML configuration file. + + This method initializes the class-level `_registry`. + It supports both user-specified and auto-discovered paths, searching common + locations relative to the current working directory and the package root. + + If no valid registry file is found, it logs a warning and initializes an + empty registry (meaning GEval will rely solely on runtime metadata). + + Args: + registry_path (str | None): Optional explicit path to a registry YAML file. + + Behavior: + - If the registry has already been loaded, the function returns immediately. + - If `registry_path` is provided, it is used directly. + - Otherwise, common fallback paths are checked for existence. + - If a registry is found, it is parsed with `yaml.safe_load`. + - Any exceptions during file access or parsing are logged, and an empty + registry is used as a fallback. + """ + # Only load once per class + if GEvalHandler._registry is not None: + return + + # Determine registry path + if registry_path: + path = Path(registry_path) + else: + # Look for config/registry/geval_metrics.yaml relative to project root + # Try multiple locations + possible_paths = [ + Path.cwd() / "config" / "registry" / "geval_metrics.yaml", + Path(__file__).parent.parent.parent.parent + / "config" + / "registry" + / "geval_metrics.yaml", + ] + path = None + for p in possible_paths: + if p.exists(): + path = p + break + # Handle missing or invalid registry + if path is None or not path.exists(): + logger.warning( + f"GEval metric registry not found at expected locations. " + f"Tried: {[str(p) for p in possible_paths]}. " + f"Will fall back to runtime metadata only." 
+ ) + GEvalHandler._registry = {} + return + + # Load registry file + try: + with open(path) as f: + GEvalHandler._registry = ( + yaml.safe_load(f) or {} + ) # Default to empty dict if file is empty + GEvalHandler._registry_path = path + num_metrics = ( + len(GEvalHandler._registry) if GEvalHandler._registry else 0 + ) + logger.info(f"Loaded {num_metrics} GEval metrics from {path}") + except Exception as e: + logger.error(f"Failed to load GEval registry from {path}: {e}") + GEvalHandler._registry = {} + + def evaluate( + self, + metric_name: str, + conv_data: Any, + turn_idx: int | None, # noqa: ARG002 + turn_data: Any | None, + is_conversation: bool, + ) -> tuple[float | None, str]: + """ + Evaluate using GEval with runtime configuration. + + This method is the central entry point for running GEval evaluations. + It retrieves the appropriate metric configuration (from registry or runtime + metadata), extracts evaluation parameters, and delegates the actual scoring + to either conversation-level or turn-level evaluators. + + Args: + metric_name (str): + The name of the metric to evaluate (e.g., "technical_accuracy"). + conv_data (Any): + The conversation data object containing context, messages, and + associated metadata. + turn_idx (int | None): + The index of the current turn in the conversation. + (Currently unused but kept for interface compatibility.) + turn_data (Any | None): + The turn-level data object, required when evaluating turn-level metrics. + is_conversation (bool): + Indicates whether the evaluation should run on the entire + conversation (`True`) or on an individual turn (`False`). + + Returns: + tuple[float | None, str]: + A tuple containing: + - **score** (float | None): The computed metric score, or None if evaluation failed. + - **reason** (str): A descriptive reason or error message. + + Behavior: + 1. Fetch GEval configuration from metadata using `_get_geval_config()`. + 2. Validate that required fields (e.g., "criteria") are present. + 3. Extract key parameters such as criteria, evaluation steps, and threshold. + 4. Delegate to `_evaluate_conversation()` or `_evaluate_turn()` depending + on the `is_conversation` flag. + """ + # Extract GEval configuration from metadata + # May come from runtime metadata or a preloaded registry + geval_config = self._get_geval_config( + metric_name, conv_data, turn_data, is_conversation + ) + + # If no configuration is available, return early with an informative message. + if not geval_config: + return None, f"GEval configuration not found for metric '{metric_name}'" + + # Extract configuration parameters + criteria = geval_config.get("criteria") + evaluation_params = geval_config.get("evaluation_params", []) + evaluation_steps = geval_config.get("evaluation_steps") + threshold = geval_config.get("threshold", 0.5) + + # The criteria field defines what the model is being judged on. + # Without it, we cannot perform evaluation. Evaluation steps can be generated + if not criteria: + return None, "GEval requires 'criteria' in configuration" + + # Perform evaluation based on level (turn or conversation) + if is_conversation: + return self._evaluate_conversation( + conv_data, criteria, evaluation_params, evaluation_steps, threshold + ) + else: + return self._evaluate_turn( + turn_data, criteria, evaluation_params, evaluation_steps, threshold + ) + + def _convert_evaluation_params( + self, params: list[str] + ) -> list[LLMTestCaseParams] | None: + """ + Convert a list of string parameter names into `LLMTestCaseParams` enum values. 
+ + This helper ensures that the evaluation parameters passed into GEval are properly + typed as `LLMTestCaseParams` (used by DeepEval's test-case schema). If any parameter is not a + valid enum member, the function treats the entire parameter list as "custom" and returns `None`. + This allows GEval to automatically infer the required fields at runtime rather than forcing + strict schema compliance. + + Args: + params (list[str]): + A list of string identifiers (e.g., ["input", "actual_output"]). + These typically come from a YAML or runtime configuration and + may not always match enum names exactly. + Returns: + List of LLMTestCaseParams enum values, or None if params are custom strings + """ + # Return early if no parameters were supplied + if not params: + return None + + # Try to convert strings to enum values + converted: list[LLMTestCaseParams] = [] + + # Attempt to convert each string into a valid enum value + for param in params: + try: + # Try to match as enum value (e.g., "INPUT", "ACTUAL_OUTPUT") + enum_value = LLMTestCaseParams[param.upper().replace(" ", "_")] + converted.append(enum_value) + except (KeyError, AttributeError): + # Not a valid enum - these are custom params, skip them + logger.debug( + f"Skipping custom evaluation_param '{param}' - " + f"not a valid LLMTestCaseParams enum. " + f"GEval will auto-detect required fields." + ) + return None + + # Return the successfully converted list, or None if it ended up empty + return converted if converted else None + + def _evaluate_turn( + self, + turn_data: Any, + criteria: str, + evaluation_params: list[str], + evaluation_steps: list[str] | None, + threshold: float, + ) -> tuple[float | None, str]: + """ + Evaluate a single turn using GEval. + + Args: + turn_data (Any): + The turn-level data object containing fields like query, response, + expected_response, and context. + criteria (str): + Natural-language description of what the evaluation should judge. + Example: "Assess factual correctness and command validity." + evaluation_params (list[str]): + A list of string parameters defining which fields to include + (e.g., ["input", "actual_output"]). + evaluation_steps (list[str] | None): + Optional step-by-step evaluation guidance for the model. + threshold (float): + Minimum score threshold that determines pass/fail behavior. + + Returns: + tuple[float | None, str]: + A tuple of (score, reason). If evaluation fails, score will be None + and the reason will contain an error message. 
+ """ + # Validate that we actually have turn data + if not turn_data: + return None, "Turn data required for turn-level GEval" + + # Convert evaluation_params to enum values if valid, otherwise use defaults + converted_params = self._convert_evaluation_params(evaluation_params) + if not converted_params: + # If no valid params, use sensible defaults for turn evaluation + converted_params = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ] + + # Create GEval metric with runtime configuration + metric_kwargs: dict[str, Any] = { + "name": "GEval Turn Metric", + "criteria": criteria, + "evaluation_params": converted_params, + "model": self.deepeval_llm_manager.get_llm(), + "threshold": threshold, + "top_logprobs": 5, + } + + # Add evaluation steps if provided + if evaluation_steps: + metric_kwargs["evaluation_steps"] = evaluation_steps + + # Instantiate the GEval metric object + metric = GEval(**metric_kwargs) + + # Prepare test case arguments, only including non-None optional fields + test_case_kwargs = { + "input": turn_data.query, + "actual_output": turn_data.response or "", + } + + # Add optional fields only if they have values + if turn_data.expected_response: + test_case_kwargs["expected_output"] = turn_data.expected_response + + if turn_data.contexts: + # Normalize contexts: handle both dict and string formats + normalized_contexts = [ + ctx.get("content", str(ctx)) if isinstance(ctx, dict) else str(ctx) + for ctx in turn_data.contexts + ] + test_case_kwargs["context"] = normalized_contexts + + # Create test case for a single turn + test_case = LLMTestCase(**test_case_kwargs) + + # Evaluate + try: + metric.measure(test_case) + score = metric.score if metric.score is not None else 0.0 + reason = ( + str(metric.reason) + if hasattr(metric, "reason") and metric.reason + else "No reason provided" + ) + return score, reason + except Exception as e: + logger.error( + f"GEval turn-level evaluation failed: {type(e).__name__}: {str(e)}" + ) + logger.debug( + f"Test case input: {test_case.input[:100] if test_case.input else 'None'}..." + ) + logger.debug( + f"Test case output: {test_case.actual_output[:100] if test_case.actual_output else 'None'}..." + ) + return None, f"GEval evaluation error: {str(e)}" + + def _evaluate_conversation( + self, + conv_data: Any, + criteria: str, + evaluation_params: list[str], + evaluation_steps: list[str] | None, + threshold: float, + ) -> tuple[float | None, str]: + """ + Evaluate a conversation using GEval. + + This method aggregates all conversation turns into a single LLMTestCase + and evaluates the conversation against the provided criteria. + + Args: + conv_data (Any): + Conversation data object containing all turns. + criteria (str): + Description of the overall evaluation goal. + evaluation_params (list[str]): + List of field names to include (same semantics as turn-level). + evaluation_steps (list[str] | None): + Optional instructions guiding how the evaluation should proceed. + threshold (float): + Minimum acceptable score before the metric is considered failed. + + Returns: + tuple[float | None, str]: + Tuple containing (score, reason). Returns None on error. 
+ """ + # Convert evaluation_params to enum values if valid, otherwise use defaults + converted_params = self._convert_evaluation_params(evaluation_params) + if not converted_params: + # If no valid params, use sensible defaults for conversation evaluation + converted_params = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ] + + # Configure the GEval metric for conversation-level evaluation + metric_kwargs: dict[str, Any] = { + "name": "GEval Conversation Metric", + "criteria": criteria, + "evaluation_params": converted_params, + "model": self.deepeval_llm_manager.get_llm(), + "threshold": threshold, + "top_logprobs": 5, # Vertex/Gemini throws an error if over 20. + } + + # Add evaluation steps if provided + if evaluation_steps: + metric_kwargs["evaluation_steps"] = evaluation_steps + + # Instantiate the GEval metric object + metric = GEval(**metric_kwargs) + + # GEval only accepts LLMTestCase, not ConversationalTestCase + # Aggregate conversation turns into a single test case + conversation_input = [] + conversation_output = [] + + for i, turn in enumerate(conv_data.turns, 1): + conversation_input.append(f"Turn {i} - User: {turn.query}") + conversation_output.append(f"Turn {i} - Assistant: {turn.response or ''}") + + # Create aggregated test case for conversation evaluation + test_case = LLMTestCase( + input="\n".join(conversation_input), + actual_output="\n".join(conversation_output), + ) + + # Evaluate + try: + metric.measure(test_case) + score = metric.score if metric.score is not None else 0.0 + reason = ( + str(metric.reason) + if hasattr(metric, "reason") and metric.reason + else "No reason provided" + ) + return score, reason + except Exception as e: + logger.error( + f"GEval conversation-level evaluation failed: {type(e).__name__}: {str(e)}" + ) + logger.debug(f"Conversation turns: {len(conv_data.turns)}") + logger.debug( + f"Test case input preview: {test_case.input[:200] if test_case.input else 'None'}..." + ) + return None, f"GEval evaluation error: {str(e)}" + + def _get_geval_config( + self, + metric_name: str, + conv_data: Any, + turn_data: Any | None, + is_conversation: bool, + ) -> dict[str, Any] | None: + """Extract GEval configuration from metadata or registry. + + The method checks multiple sources in priority order: + 1. Turn-level metadata (runtime override) + 2. Conversation-level metadata (runtime override) + 3. Metric registry (shared, persistent YAML definitions) + + Args: + metric_name (str): + Name of the metric to retrieve (e.g., "completeness"). + conv_data (Any): + The full conversation data object, which may contain + conversation-level metadata. + turn_data (Any | None): + Optional turn-level data object, for per-turn metrics. + is_conversation (bool): + True if evaluating a conversation-level metric, False for turn-level. + + Returns: + dict[str, Any] | None: + The GEval configuration dictionary if found, otherwise None. 
+ """ + metric_key = f"geval:{metric_name}" + + # Turn level metadata override + # Used when individual turns define custom GEval settings + if ( + not is_conversation + and turn_data + and hasattr(turn_data, "turn_metrics_metadata") + and turn_data.turn_metrics_metadata + and metric_key in turn_data.turn_metrics_metadata + ): + logger.debug(f"Using runtime metadata for metric '{metric_name}'") + return turn_data.turn_metrics_metadata[metric_key] + + # Conversation-level metadata override + # Used when the conversation defines shared GEval settings + if ( + hasattr(conv_data, "conversation_metrics_metadata") + and conv_data.conversation_metrics_metadata + and metric_key in conv_data.conversation_metrics_metadata + ): + logger.debug(f"Using runtime metadata for metric '{metric_name}'") + return conv_data.conversation_metrics_metadata[metric_key] + + # Registry definition + # Fallback to shared YAML registry if no runtime metadata is found + if GEvalHandler._registry and metric_name in GEvalHandler._registry: + logger.debug(f"Using registry definition for metric '{metric_name}'") + return GEvalHandler._registry[metric_name] + + # Config not found anywhere + logger.warning( + f"Metric '{metric_name}' not found in runtime metadata or registry. " + f"Available registry metrics: {list(GEvalHandler._registry.keys()) if GEvalHandler._registry else []}" + ) + return None From a4370b332ef063381920f981741011fcbce5eeb4 Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Fri, 7 Nov 2025 15:37:17 -0700 Subject: [PATCH 2/5] bug:small fix for undefined registry paths --- src/lightspeed_evaluation/core/metrics/geval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py index df0fafb2..ca992c3f 100644 --- a/src/lightspeed_evaluation/core/metrics/geval.py +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -75,8 +75,10 @@ def _load_registry(self, registry_path: str | None = None) -> None: return # Determine registry path + possible_paths = [] if registry_path: path = Path(registry_path) + possible_paths = [path] else: # Look for config/registry/geval_metrics.yaml relative to project root # Try multiple locations From 4e37984cb1b0ef9ee392f167d0cfccddd33816a8 Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Mon, 10 Nov 2025 09:29:01 -0700 Subject: [PATCH 3/5] fix: resolving pylint and pydocstyle conflicts --- .../core/metrics/deepeval.py | 2 +- .../core/metrics/geval.py | 107 ++++++++++-------- 2 files changed, 60 insertions(+), 49 deletions(-) diff --git a/src/lightspeed_evaluation/core/metrics/deepeval.py b/src/lightspeed_evaluation/core/metrics/deepeval.py index 0585a581..cb194fee 100644 --- a/src/lightspeed_evaluation/core/metrics/deepeval.py +++ b/src/lightspeed_evaluation/core/metrics/deepeval.py @@ -123,7 +123,7 @@ def evaluate( return self.geval_handler.evaluate( metric_name=metric_name, conv_data=conv_data, - turn_idx=scope.turn_idx, + _turn_idx=scope.turn_idx, turn_data=scope.turn_data, is_conversation=scope.is_conversation, ) diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py index ca992c3f..f2cf0752 100644 --- a/src/lightspeed_evaluation/core/metrics/geval.py +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) -class GEvalHandler: +class GEvalHandler: # pylint: disable=R0903 """Handler for configurable GEval metrics. 
This class integrates with the lightspeed-evaluation framework @@ -49,8 +49,7 @@ def __init__( self._load_registry(registry_path) def _load_registry(self, registry_path: str | None = None) -> None: - """ - Load the GEval metric registry from a YAML configuration file. + """Load the GEval metric registry from a YAML configuration file. This method initializes the class-level `_registry`. It supports both user-specified and auto-discovered paths, searching common @@ -97,16 +96,16 @@ def _load_registry(self, registry_path: str | None = None) -> None: # Handle missing or invalid registry if path is None or not path.exists(): logger.warning( - f"GEval metric registry not found at expected locations. " - f"Tried: {[str(p) for p in possible_paths]}. " - f"Will fall back to runtime metadata only." + "GEval metric registry not found at expected locations. " + "Tried: %s. Will fall back to runtime metadata only.", + [str(p) for p in possible_paths], ) GEvalHandler._registry = {} return # Load registry file try: - with open(path) as f: + with open(path, encoding="utf-8") as f: GEvalHandler._registry = ( yaml.safe_load(f) or {} ) # Default to empty dict if file is empty @@ -114,21 +113,20 @@ def _load_registry(self, registry_path: str | None = None) -> None: num_metrics = ( len(GEvalHandler._registry) if GEvalHandler._registry else 0 ) - logger.info(f"Loaded {num_metrics} GEval metrics from {path}") - except Exception as e: - logger.error(f"Failed to load GEval registry from {path}: {e}") + logger.info("Loaded %d GEval metrics from %s", num_metrics, path) + except Exception as e: # pylint: disable=W0718 + logger.error("Failed to load GEval registry from %s: %s", path, e) GEvalHandler._registry = {} - def evaluate( + def evaluate( # pylint: disable=R0913,R0917 self, metric_name: str, conv_data: Any, - turn_idx: int | None, # noqa: ARG002 + _turn_idx: int | None, turn_data: Any | None, is_conversation: bool, ) -> tuple[float | None, str]: - """ - Evaluate using GEval with runtime configuration. + """Evaluate using GEval with runtime configuration. This method is the central entry point for running GEval evaluations. It retrieves the appropriate metric configuration (from registry or runtime @@ -189,28 +187,27 @@ def evaluate( return self._evaluate_conversation( conv_data, criteria, evaluation_params, evaluation_steps, threshold ) - else: - return self._evaluate_turn( - turn_data, criteria, evaluation_params, evaluation_steps, threshold - ) + return self._evaluate_turn( + turn_data, criteria, evaluation_params, evaluation_steps, threshold + ) def _convert_evaluation_params( self, params: list[str] ) -> list[LLMTestCaseParams] | None: - """ - Convert a list of string parameter names into `LLMTestCaseParams` enum values. + """Convert a list of string parameter names into `LLMTestCaseParams` enum values. This helper ensures that the evaluation parameters passed into GEval are properly - typed as `LLMTestCaseParams` (used by DeepEval's test-case schema). If any parameter is not a - valid enum member, the function treats the entire parameter list as "custom" and returns `None`. - This allows GEval to automatically infer the required fields at runtime rather than forcing - strict schema compliance. + typed as `LLMTestCaseParams` (used by DeepEval's test-case schema). If any + parameter is not a valid enum member, the function treats the entire parameter + list as "custom" and returns `None`. This allows GEval to automatically infer + the required fields at runtime rather than forcing strict schema compliance. 
Args: params (list[str]): A list of string identifiers (e.g., ["input", "actual_output"]). These typically come from a YAML or runtime configuration and may not always match enum names exactly. + Returns: List of LLMTestCaseParams enum values, or None if params are custom strings """ @@ -230,16 +227,17 @@ def _convert_evaluation_params( except (KeyError, AttributeError): # Not a valid enum - these are custom params, skip them logger.debug( - f"Skipping custom evaluation_param '{param}' - " - f"not a valid LLMTestCaseParams enum. " - f"GEval will auto-detect required fields." + "Skipping custom evaluation_param '%s' - " + "not a valid LLMTestCaseParams enum. " + "GEval will auto-detect required fields.", + param, ) return None # Return the successfully converted list, or None if it ended up empty return converted if converted else None - def _evaluate_turn( + def _evaluate_turn( # pylint: disable=R0913,R0917 self, turn_data: Any, criteria: str, @@ -247,8 +245,7 @@ def _evaluate_turn( evaluation_steps: list[str] | None, threshold: float, ) -> tuple[float | None, str]: - """ - Evaluate a single turn using GEval. + """Evaluate a single turn using GEval. Args: turn_data (Any): @@ -331,19 +328,21 @@ def _evaluate_turn( else "No reason provided" ) return score, reason - except Exception as e: + except Exception as e: # pylint: disable=W0718 logger.error( - f"GEval turn-level evaluation failed: {type(e).__name__}: {str(e)}" + "GEval turn-level evaluation failed: %s: %s", type(e).__name__, str(e) ) logger.debug( - f"Test case input: {test_case.input[:100] if test_case.input else 'None'}..." + "Test case input: %s...", + test_case.input[:100] if test_case.input else "None", ) logger.debug( - f"Test case output: {test_case.actual_output[:100] if test_case.actual_output else 'None'}..." + "Test case output: %s...", + test_case.actual_output[:100] if test_case.actual_output else "None", ) return None, f"GEval evaluation error: {str(e)}" - def _evaluate_conversation( + def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914 self, conv_data: Any, criteria: str, @@ -351,8 +350,7 @@ def _evaluate_conversation( evaluation_steps: list[str] | None, threshold: float, ) -> tuple[float | None, str]: - """ - Evaluate a conversation using GEval. + """Evaluate a conversation using GEval. This method aggregates all conversation turns into a single LLMTestCase and evaluates the conversation against the provided criteria. @@ -424,13 +422,16 @@ def _evaluate_conversation( else "No reason provided" ) return score, reason - except Exception as e: + except Exception as e: # pylint: disable=W0718 logger.error( - f"GEval conversation-level evaluation failed: {type(e).__name__}: {str(e)}" + "GEval conversation-level evaluation failed: %s: %s", + type(e).__name__, + str(e), ) - logger.debug(f"Conversation turns: {len(conv_data.turns)}") + logger.debug("Conversation turns: %d", len(conv_data.turns)) logger.debug( - f"Test case input preview: {test_case.input[:200] if test_case.input else 'None'}..." 
+ "Test case input preview: %s...", + test_case.input[:200] if test_case.input else "None", ) return None, f"GEval evaluation error: {str(e)}" @@ -474,7 +475,7 @@ def _get_geval_config( and turn_data.turn_metrics_metadata and metric_key in turn_data.turn_metrics_metadata ): - logger.debug(f"Using runtime metadata for metric '{metric_name}'") + logger.debug("Using runtime metadata for metric '%s'", metric_name) return turn_data.turn_metrics_metadata[metric_key] # Conversation-level metadata override @@ -484,18 +485,28 @@ def _get_geval_config( and conv_data.conversation_metrics_metadata and metric_key in conv_data.conversation_metrics_metadata ): - logger.debug(f"Using runtime metadata for metric '{metric_name}'") + logger.debug("Using runtime metadata for metric '%s'", metric_name) return conv_data.conversation_metrics_metadata[metric_key] # Registry definition # Fallback to shared YAML registry if no runtime metadata is found - if GEvalHandler._registry and metric_name in GEvalHandler._registry: - logger.debug(f"Using registry definition for metric '{metric_name}'") - return GEvalHandler._registry[metric_name] + if ( + GEvalHandler._registry + and metric_name in GEvalHandler._registry # pylint: disable=E1135 + ): # pylint: disable=E1135 + logger.debug("Using registry definition for metric '%s'", metric_name) + return GEvalHandler._registry[metric_name] # pylint: disable=E1136 # Config not found anywhere + available_metrics = ( + list(GEvalHandler._registry.keys()) # pylint: disable=E1136 + if GEvalHandler._registry + else [] + ) logger.warning( - f"Metric '{metric_name}' not found in runtime metadata or registry. " - f"Available registry metrics: {list(GEvalHandler._registry.keys()) if GEvalHandler._registry else []}" + "Metric '%s' not found in runtime metadata or registry. " + "Available registry metrics: %s", + metric_name, + available_metrics, ) return None From 9fd462dfea36f6dbc7f0507e8fbd7ac71e1080bb Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Mon, 10 Nov 2025 14:44:30 -0700 Subject: [PATCH 4/5] fix: unbound errors with registry loading function --- .../core/metrics/geval.py | 68 +++++++++++-------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py index f2cf0752..3b691e91 100644 --- a/src/lightspeed_evaluation/core/metrics/geval.py +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -6,7 +6,7 @@ import logging from pathlib import Path -from typing import Any +from typing import Any, Optional import yaml from deepeval.metrics import GEval @@ -73,50 +73,62 @@ def _load_registry(self, registry_path: str | None = None) -> None: if GEvalHandler._registry is not None: return - # Determine registry path - possible_paths = [] - if registry_path: - path = Path(registry_path) - possible_paths = [path] + # Ensure variables are always bound for static analysis - + path: Optional[Path] = None + possible_paths: list[Path] = [] + + # Normalize user-specified path vs. 
auto-discovery + if registry_path is not None: + try: + path = Path(registry_path) + except TypeError: + # Bad type passed in; treat as no path provided + path = None + if path is not None: + possible_paths = [path] else: - # Look for config/registry/geval_metrics.yaml relative to project root - # Try multiple locations + package_root = Path(__file__).resolve().parents[3] possible_paths = [ Path.cwd() / "config" / "registry" / "geval_metrics.yaml", - Path(__file__).parent.parent.parent.parent - / "config" - / "registry" - / "geval_metrics.yaml", + package_root / "config" / "registry" / "geval_metrics.yaml", ] - path = None - for p in possible_paths: - if p.exists(): - path = p + + # If no explicit file exists yet, search candidates + if path is None or not path.exists(): + for candidate in possible_paths: + if candidate.exists(): + path = candidate break + # Handle missing or invalid registry if path is None or not path.exists(): + GEvalHandler._registry = {} + GEvalHandler._registry_path = None logger.warning( - "GEval metric registry not found at expected locations. " - "Tried: %s. Will fall back to runtime metadata only.", + "GEval metric registry not found at expected locations. Tried: %s. " + "Will fall back to runtime metadata only.", [str(p) for p in possible_paths], ) - GEvalHandler._registry = {} return # Load registry file try: - with open(path, encoding="utf-8") as f: - GEvalHandler._registry = ( - yaml.safe_load(f) or {} - ) # Default to empty dict if file is empty + with path.open(encoding="utf-8") as f: + loaded = yaml.safe_load(f) or {} + # Guard against non-dict YAML (e.g., list/null) + if not isinstance(loaded, dict): + logger.warning( + "GEval registry file %s did not contain a mapping; using empty registry.", + path, + ) + loaded = {} + GEvalHandler._registry = loaded GEvalHandler._registry_path = path - num_metrics = ( - len(GEvalHandler._registry) if GEvalHandler._registry else 0 - ) - logger.info("Loaded %d GEval metrics from %s", num_metrics, path) - except Exception as e: # pylint: disable=W0718 + logger.info("Loaded %d GEval metrics from %s", len(loaded), path) + except Exception as e: # noqa: BLE001 # pylint: disable=broad-exception-caught logger.error("Failed to load GEval registry from %s: %s", path, e) GEvalHandler._registry = {} + GEvalHandler._registry_path = None def evaluate( # pylint: disable=R0913,R0917 self, From 96e39ba1137c67853d448d4f0673f47abb232ba2 Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Tue, 11 Nov 2025 14:25:15 -0700 Subject: [PATCH 5/5] removing backward compatibility for turn data + coderabbit fixes --- .../core/metrics/deepeval.py | 8 +++- .../core/metrics/geval.py | 40 ++++++++++--------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/lightspeed_evaluation/core/metrics/deepeval.py b/src/lightspeed_evaluation/core/metrics/deepeval.py index cb194fee..0f316233 100644 --- a/src/lightspeed_evaluation/core/metrics/deepeval.py +++ b/src/lightspeed_evaluation/core/metrics/deepeval.py @@ -119,9 +119,13 @@ def evaluate( return None, f"DeepEval {metric_name} evaluation failed: {str(e)}" # Otherwise, assume it's a GEval metric - # Note: metric_name should NOT have "geval:" prefix here + normalized_metric_name = ( + metric_name.split(":", 1)[1] + if metric_name.startswith("geval:") + else metric_name + ) return self.geval_handler.evaluate( - metric_name=metric_name, + metric_name=normalized_metric_name, conv_data=conv_data, _turn_idx=scope.turn_idx, turn_data=scope.turn_data, diff --git 
a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py index 3b691e91..2ca53395 100644 --- a/src/lightspeed_evaluation/core/metrics/geval.py +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -285,12 +285,6 @@ def _evaluate_turn( # pylint: disable=R0913,R0917 # Convert evaluation_params to enum values if valid, otherwise use defaults converted_params = self._convert_evaluation_params(evaluation_params) - if not converted_params: - # If no valid params, use sensible defaults for turn evaluation - converted_params = [ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ] # Create GEval metric with runtime configuration metric_kwargs: dict[str, Any] = { @@ -302,6 +296,18 @@ def _evaluate_turn( # pylint: disable=R0913,R0917 "top_logprobs": 5, } + # Only set evaluation_params if we have valid enum conversions + # or if no params were provided at all (then use defaults) + if converted_params is None: + if not evaluation_params: + metric_kwargs["evaluation_params"] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ] + # else: leave unset so GEval can auto-detect from custom strings + else: + metric_kwargs["evaluation_params"] = converted_params + # Add evaluation steps if provided if evaluation_steps: metric_kwargs["evaluation_steps"] = evaluation_steps @@ -320,12 +326,7 @@ def _evaluate_turn( # pylint: disable=R0913,R0917 test_case_kwargs["expected_output"] = turn_data.expected_response if turn_data.contexts: - # Normalize contexts: handle both dict and string formats - normalized_contexts = [ - ctx.get("content", str(ctx)) if isinstance(ctx, dict) else str(ctx) - for ctx in turn_data.contexts - ] - test_case_kwargs["context"] = normalized_contexts + test_case_kwargs["context"] = turn_data.contexts # Create test case for a single turn test_case = LLMTestCase(**test_case_kwargs) @@ -385,12 +386,6 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914 """ # Convert evaluation_params to enum values if valid, otherwise use defaults converted_params = self._convert_evaluation_params(evaluation_params) - if not converted_params: - # If no valid params, use sensible defaults for conversation evaluation - converted_params = [ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ] # Configure the GEval metric for conversation-level evaluation metric_kwargs: dict[str, Any] = { @@ -402,6 +397,15 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914 "top_logprobs": 5, # Vertex/Gemini throws an error if over 20. } + if converted_params is None: + if not evaluation_params: + metric_kwargs["evaluation_params"] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ] + else: + metric_kwargs["evaluation_params"] = converted_params + # Add evaluation steps if provided if evaluation_steps: metric_kwargs["evaluation_steps"] = evaluation_steps
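
Usage sketch, assuming the series is applied: the LLM manager and conversation data
come from elsewhere in the framework, the metric name "technical_accuracy" is a
placeholder, and EvaluationScope's keyword arguments are assumed to mirror the
fields that evaluate() reads (turn_idx, turn_data, is_conversation).

    from lightspeed_evaluation.core.metrics import DeepEvalMetrics
    from lightspeed_evaluation.core.models import EvaluationScope

    # llm_manager: a pre-configured LLMManager; conv_data: loaded evaluation data
    metrics = DeepEvalMetrics(llm_manager, registry_path="config/registry/geval_metrics.yaml")

    # Names in supported_metrics use the hardcoded DeepEval implementations;
    # "geval:"-prefixed names are normalized and delegated to GEvalHandler,
    # which resolves criteria from runtime metadata or the YAML registry.
    score, reason = metrics.evaluate(
        "geval:technical_accuracy",
        conv_data,
        EvaluationScope(turn_idx=0, turn_data=conv_data.turns[0], is_conversation=False),
    )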