From 9a179404e22b1dfba822da92674fda95a609ad77 Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Fri, 7 Nov 2025 15:15:00 -0700 Subject: [PATCH 1/5] feat: added GEval handler; integrated component into DeepEvalMetrics --- .../core/metrics/__init__.py | 7 +- .../core/metrics/deepeval.py | 73 ++- .../core/metrics/geval.py | 499 ++++++++++++++++++ 3 files changed, 564 insertions(+), 15 deletions(-) create mode 100644 src/lightspeed_evaluation/core/metrics/geval.py diff --git a/src/lightspeed_evaluation/core/metrics/__init__.py b/src/lightspeed_evaluation/core/metrics/__init__.py index 8f670d01..048d977e 100644 --- a/src/lightspeed_evaluation/core/metrics/__init__.py +++ b/src/lightspeed_evaluation/core/metrics/__init__.py @@ -5,4 +5,9 @@ from lightspeed_evaluation.core.metrics.ragas import RagasMetrics from lightspeed_evaluation.core.metrics.script import ScriptEvalMetrics -__all__ = ["RagasMetrics", "DeepEvalMetrics", "CustomMetrics", "ScriptEvalMetrics"] +__all__ = [ + "RagasMetrics", + "DeepEvalMetrics", + "CustomMetrics", + "ScriptEvalMetrics", +] diff --git a/src/lightspeed_evaluation/core/metrics/deepeval.py b/src/lightspeed_evaluation/core/metrics/deepeval.py index e8eccc2a..0585a581 100644 --- a/src/lightspeed_evaluation/core/metrics/deepeval.py +++ b/src/lightspeed_evaluation/core/metrics/deepeval.py @@ -1,5 +1,11 @@ -"""DeepEval metrics evaluation using LLM Manager.""" +"""DeepEval metrics evaluation using LLM Manager. +This module provides integration with DeepEval metrics including: +1. Standard DeepEval metrics (conversation completeness, relevancy, knowledge retention) +2. GEval integration for configurable custom evaluation criteria +""" + +import logging from typing import Any, Optional import litellm @@ -16,28 +22,45 @@ from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager from lightspeed_evaluation.core.llm.manager import LLMManager from lightspeed_evaluation.core.models import EvaluationScope, TurnData +from lightspeed_evaluation.core.metrics.geval import GEvalHandler + +logger = logging.getLogger(__name__) class DeepEvalMetrics: # pylint: disable=too-few-public-methods - """Handles DeepEval metrics evaluation using LLM Manager.""" + """Handles DeepEval metrics evaluation using LLM Manager. + + This class provides a unified interface for both standard DeepEval metrics + and GEval (configurable custom metrics). It shares LLM resources between + both evaluation types for efficiency. + """ - def __init__(self, llm_manager: LLMManager): + def __init__(self, llm_manager: LLMManager, registry_path: str | None = None): """Initialize with LLM Manager. 
Args: llm_manager: Pre-configured LLMManager with validated parameters + registry_path: Optional path to GEval metrics registry YAML """ + # Setup cache if enabled (shared across all DeepEval operations) if llm_manager.get_config().cache_enabled and litellm.cache is None: cache_dir = llm_manager.get_config().cache_dir # Modifying global litellm cache as there is no clear way how to do it per model # Checking if the litellm.cache as there is potential conflict with Ragas code litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=cache_dir) - # Create LLM Manager for DeepEval metrics + # Create shared LLM Manager for all DeepEval metrics (standard + GEval) self.llm_manager = DeepEvalLLMManager( llm_manager.get_model_name(), llm_manager.get_llm_params() ) + # Initialize GEval handler with shared LLM manager + self.geval_handler = GEvalHandler( + deepeval_llm_manager=self.llm_manager, + registry_path=registry_path, + ) + + # Standard DeepEval metrics routing self.supported_metrics = { "conversation_completeness": self._evaluate_conversation_completeness, "conversation_relevancy": self._evaluate_conversation_relevancy, @@ -72,16 +95,38 @@ def evaluate( conv_data: Any, scope: EvaluationScope, ) -> tuple[Optional[float], str]: - """Evaluate a DeepEval metric.""" - if metric_name not in self.supported_metrics: - return None, f"Unsupported DeepEval metric: {metric_name}" - - try: - return self.supported_metrics[metric_name]( - conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation - ) - except (ValueError, AttributeError, KeyError) as e: - return None, f"DeepEval {metric_name} evaluation failed: {str(e)}" + """Evaluate a DeepEval metric (standard or GEval). + + This method routes evaluation to either: + - Standard DeepEval metrics (hardcoded implementations) + - GEval metrics (configuration-driven custom metrics) + + Args: + metric_name: Name of metric (for GEval, this should NOT include "geval:" prefix) + conv_data: Conversation data object + scope: EvaluationScope containing turn info and conversation flag + + Returns: + Tuple of (score, reason) + """ + # Route to standard DeepEval metrics + if metric_name in self.supported_metrics: + try: + return self.supported_metrics[metric_name]( + conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation + ) + except (ValueError, AttributeError, KeyError) as e: + return None, f"DeepEval {metric_name} evaluation failed: {str(e)}" + + # Otherwise, assume it's a GEval metric + # Note: metric_name should NOT have "geval:" prefix here + return self.geval_handler.evaluate( + metric_name=metric_name, + conv_data=conv_data, + turn_idx=scope.turn_idx, + turn_data=scope.turn_data, + is_conversation=scope.is_conversation, + ) def _evaluate_conversation_completeness( self, diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py new file mode 100644 index 00000000..df0fafb2 --- /dev/null +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -0,0 +1,499 @@ +"""GEval metrics handler using LLM Manager. + +This module provides integration with DeepEval's GEval for configurable custom evaluation criteria. +GEval allows runtime-defined evaluation metrics through YAML configuration. 
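+
+A registry entry maps a metric name to its evaluation settings. An illustrative
+sketch (the metric name, criteria text and threshold below are examples, not
+shipped defaults; the field names match what this handler reads):
+
+    technical_accuracy:
+      criteria: "Assess whether commands, API names and facts in the response are correct."
+      evaluation_params: ["input", "actual_output"]
+      evaluation_steps:
+        - "Compare the response against the provided context."
+        - "Penalize invented commands, flags or APIs."
+      threshold: 0.7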
+""" + +import logging +from pathlib import Path +from typing import Any + +import yaml +from deepeval.metrics import GEval +from deepeval.test_case import LLMTestCase, LLMTestCaseParams + +from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager + +logger = logging.getLogger(__name__) + + +class GEvalHandler: + """Handler for configurable GEval metrics. + + This class integrates with the lightspeed-evaluation framework + to provide GEval evaluation with criteria defined either in: + 1. A centralized metric registry (config/registry/geval_metrics.yaml) + 2. Runtime YAML configuration (turn_metrics_metadata) + + Priority: Runtime metadata overrides registry definitions. + """ + + # Class-level registry cache (shared across instances) + _registry: dict[str, Any] | None = None + _registry_path: Path | None = None + + def __init__( + self, + deepeval_llm_manager: DeepEvalLLMManager, + registry_path: str | None = None, + ) -> None: + """Initialize GEval handler. + + Args: + deepeval_llm_manager: Shared DeepEvalLLMManager instance + registry_path: Optional path to metric registry YAML. + If not provided, looks for config/registry/geval_metrics.yaml + relative to project root. + """ + self.deepeval_llm_manager = deepeval_llm_manager + self._load_registry(registry_path) + + def _load_registry(self, registry_path: str | None = None) -> None: + """ + Load the GEval metric registry from a YAML configuration file. + + This method initializes the class-level `_registry`. + It supports both user-specified and auto-discovered paths, searching common + locations relative to the current working directory and the package root. + + If no valid registry file is found, it logs a warning and initializes an + empty registry (meaning GEval will rely solely on runtime metadata). + + Args: + registry_path (str | None): Optional explicit path to a registry YAML file. + + Behavior: + - If the registry has already been loaded, the function returns immediately. + - If `registry_path` is provided, it is used directly. + - Otherwise, common fallback paths are checked for existence. + - If a registry is found, it is parsed with `yaml.safe_load`. + - Any exceptions during file access or parsing are logged, and an empty + registry is used as a fallback. + """ + # Only load once per class + if GEvalHandler._registry is not None: + return + + # Determine registry path + if registry_path: + path = Path(registry_path) + else: + # Look for config/registry/geval_metrics.yaml relative to project root + # Try multiple locations + possible_paths = [ + Path.cwd() / "config" / "registry" / "geval_metrics.yaml", + Path(__file__).parent.parent.parent.parent + / "config" + / "registry" + / "geval_metrics.yaml", + ] + path = None + for p in possible_paths: + if p.exists(): + path = p + break + # Handle missing or invalid registry + if path is None or not path.exists(): + logger.warning( + f"GEval metric registry not found at expected locations. " + f"Tried: {[str(p) for p in possible_paths]}. " + f"Will fall back to runtime metadata only." 
+ ) + GEvalHandler._registry = {} + return + + # Load registry file + try: + with open(path) as f: + GEvalHandler._registry = ( + yaml.safe_load(f) or {} + ) # Default to empty dict if file is empty + GEvalHandler._registry_path = path + num_metrics = ( + len(GEvalHandler._registry) if GEvalHandler._registry else 0 + ) + logger.info(f"Loaded {num_metrics} GEval metrics from {path}") + except Exception as e: + logger.error(f"Failed to load GEval registry from {path}: {e}") + GEvalHandler._registry = {} + + def evaluate( + self, + metric_name: str, + conv_data: Any, + turn_idx: int | None, # noqa: ARG002 + turn_data: Any | None, + is_conversation: bool, + ) -> tuple[float | None, str]: + """ + Evaluate using GEval with runtime configuration. + + This method is the central entry point for running GEval evaluations. + It retrieves the appropriate metric configuration (from registry or runtime + metadata), extracts evaluation parameters, and delegates the actual scoring + to either conversation-level or turn-level evaluators. + + Args: + metric_name (str): + The name of the metric to evaluate (e.g., "technical_accuracy"). + conv_data (Any): + The conversation data object containing context, messages, and + associated metadata. + turn_idx (int | None): + The index of the current turn in the conversation. + (Currently unused but kept for interface compatibility.) + turn_data (Any | None): + The turn-level data object, required when evaluating turn-level metrics. + is_conversation (bool): + Indicates whether the evaluation should run on the entire + conversation (`True`) or on an individual turn (`False`). + + Returns: + tuple[float | None, str]: + A tuple containing: + - **score** (float | None): The computed metric score, or None if evaluation failed. + - **reason** (str): A descriptive reason or error message. + + Behavior: + 1. Fetch GEval configuration from metadata using `_get_geval_config()`. + 2. Validate that required fields (e.g., "criteria") are present. + 3. Extract key parameters such as criteria, evaluation steps, and threshold. + 4. Delegate to `_evaluate_conversation()` or `_evaluate_turn()` depending + on the `is_conversation` flag. + """ + # Extract GEval configuration from metadata + # May come from runtime metadata or a preloaded registry + geval_config = self._get_geval_config( + metric_name, conv_data, turn_data, is_conversation + ) + + # If no configuration is available, return early with an informative message. + if not geval_config: + return None, f"GEval configuration not found for metric '{metric_name}'" + + # Extract configuration parameters + criteria = geval_config.get("criteria") + evaluation_params = geval_config.get("evaluation_params", []) + evaluation_steps = geval_config.get("evaluation_steps") + threshold = geval_config.get("threshold", 0.5) + + # The criteria field defines what the model is being judged on. + # Without it, we cannot perform evaluation. Evaluation steps can be generated + if not criteria: + return None, "GEval requires 'criteria' in configuration" + + # Perform evaluation based on level (turn or conversation) + if is_conversation: + return self._evaluate_conversation( + conv_data, criteria, evaluation_params, evaluation_steps, threshold + ) + else: + return self._evaluate_turn( + turn_data, criteria, evaluation_params, evaluation_steps, threshold + ) + + def _convert_evaluation_params( + self, params: list[str] + ) -> list[LLMTestCaseParams] | None: + """ + Convert a list of string parameter names into `LLMTestCaseParams` enum values. 
+ + This helper ensures that the evaluation parameters passed into GEval are properly + typed as `LLMTestCaseParams` (used by DeepEval's test-case schema). If any parameter is not a + valid enum member, the function treats the entire parameter list as "custom" and returns `None`. + This allows GEval to automatically infer the required fields at runtime rather than forcing + strict schema compliance. + + Args: + params (list[str]): + A list of string identifiers (e.g., ["input", "actual_output"]). + These typically come from a YAML or runtime configuration and + may not always match enum names exactly. + Returns: + List of LLMTestCaseParams enum values, or None if params are custom strings + """ + # Return early if no parameters were supplied + if not params: + return None + + # Try to convert strings to enum values + converted: list[LLMTestCaseParams] = [] + + # Attempt to convert each string into a valid enum value + for param in params: + try: + # Try to match as enum value (e.g., "INPUT", "ACTUAL_OUTPUT") + enum_value = LLMTestCaseParams[param.upper().replace(" ", "_")] + converted.append(enum_value) + except (KeyError, AttributeError): + # Not a valid enum - these are custom params, skip them + logger.debug( + f"Skipping custom evaluation_param '{param}' - " + f"not a valid LLMTestCaseParams enum. " + f"GEval will auto-detect required fields." + ) + return None + + # Return the successfully converted list, or None if it ended up empty + return converted if converted else None + + def _evaluate_turn( + self, + turn_data: Any, + criteria: str, + evaluation_params: list[str], + evaluation_steps: list[str] | None, + threshold: float, + ) -> tuple[float | None, str]: + """ + Evaluate a single turn using GEval. + + Args: + turn_data (Any): + The turn-level data object containing fields like query, response, + expected_response, and context. + criteria (str): + Natural-language description of what the evaluation should judge. + Example: "Assess factual correctness and command validity." + evaluation_params (list[str]): + A list of string parameters defining which fields to include + (e.g., ["input", "actual_output"]). + evaluation_steps (list[str] | None): + Optional step-by-step evaluation guidance for the model. + threshold (float): + Minimum score threshold that determines pass/fail behavior. + + Returns: + tuple[float | None, str]: + A tuple of (score, reason). If evaluation fails, score will be None + and the reason will contain an error message. 
+ """ + # Validate that we actually have turn data + if not turn_data: + return None, "Turn data required for turn-level GEval" + + # Convert evaluation_params to enum values if valid, otherwise use defaults + converted_params = self._convert_evaluation_params(evaluation_params) + if not converted_params: + # If no valid params, use sensible defaults for turn evaluation + converted_params = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ] + + # Create GEval metric with runtime configuration + metric_kwargs: dict[str, Any] = { + "name": "GEval Turn Metric", + "criteria": criteria, + "evaluation_params": converted_params, + "model": self.deepeval_llm_manager.get_llm(), + "threshold": threshold, + "top_logprobs": 5, + } + + # Add evaluation steps if provided + if evaluation_steps: + metric_kwargs["evaluation_steps"] = evaluation_steps + + # Instantiate the GEval metric object + metric = GEval(**metric_kwargs) + + # Prepare test case arguments, only including non-None optional fields + test_case_kwargs = { + "input": turn_data.query, + "actual_output": turn_data.response or "", + } + + # Add optional fields only if they have values + if turn_data.expected_response: + test_case_kwargs["expected_output"] = turn_data.expected_response + + if turn_data.contexts: + # Normalize contexts: handle both dict and string formats + normalized_contexts = [ + ctx.get("content", str(ctx)) if isinstance(ctx, dict) else str(ctx) + for ctx in turn_data.contexts + ] + test_case_kwargs["context"] = normalized_contexts + + # Create test case for a single turn + test_case = LLMTestCase(**test_case_kwargs) + + # Evaluate + try: + metric.measure(test_case) + score = metric.score if metric.score is not None else 0.0 + reason = ( + str(metric.reason) + if hasattr(metric, "reason") and metric.reason + else "No reason provided" + ) + return score, reason + except Exception as e: + logger.error( + f"GEval turn-level evaluation failed: {type(e).__name__}: {str(e)}" + ) + logger.debug( + f"Test case input: {test_case.input[:100] if test_case.input else 'None'}..." + ) + logger.debug( + f"Test case output: {test_case.actual_output[:100] if test_case.actual_output else 'None'}..." + ) + return None, f"GEval evaluation error: {str(e)}" + + def _evaluate_conversation( + self, + conv_data: Any, + criteria: str, + evaluation_params: list[str], + evaluation_steps: list[str] | None, + threshold: float, + ) -> tuple[float | None, str]: + """ + Evaluate a conversation using GEval. + + This method aggregates all conversation turns into a single LLMTestCase + and evaluates the conversation against the provided criteria. + + Args: + conv_data (Any): + Conversation data object containing all turns. + criteria (str): + Description of the overall evaluation goal. + evaluation_params (list[str]): + List of field names to include (same semantics as turn-level). + evaluation_steps (list[str] | None): + Optional instructions guiding how the evaluation should proceed. + threshold (float): + Minimum acceptable score before the metric is considered failed. + + Returns: + tuple[float | None, str]: + Tuple containing (score, reason). Returns None on error. 
+ """ + # Convert evaluation_params to enum values if valid, otherwise use defaults + converted_params = self._convert_evaluation_params(evaluation_params) + if not converted_params: + # If no valid params, use sensible defaults for conversation evaluation + converted_params = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ] + + # Configure the GEval metric for conversation-level evaluation + metric_kwargs: dict[str, Any] = { + "name": "GEval Conversation Metric", + "criteria": criteria, + "evaluation_params": converted_params, + "model": self.deepeval_llm_manager.get_llm(), + "threshold": threshold, + "top_logprobs": 5, # Vertex/Gemini throws an error if over 20. + } + + # Add evaluation steps if provided + if evaluation_steps: + metric_kwargs["evaluation_steps"] = evaluation_steps + + # Instantiate the GEval metric object + metric = GEval(**metric_kwargs) + + # GEval only accepts LLMTestCase, not ConversationalTestCase + # Aggregate conversation turns into a single test case + conversation_input = [] + conversation_output = [] + + for i, turn in enumerate(conv_data.turns, 1): + conversation_input.append(f"Turn {i} - User: {turn.query}") + conversation_output.append(f"Turn {i} - Assistant: {turn.response or ''}") + + # Create aggregated test case for conversation evaluation + test_case = LLMTestCase( + input="\n".join(conversation_input), + actual_output="\n".join(conversation_output), + ) + + # Evaluate + try: + metric.measure(test_case) + score = metric.score if metric.score is not None else 0.0 + reason = ( + str(metric.reason) + if hasattr(metric, "reason") and metric.reason + else "No reason provided" + ) + return score, reason + except Exception as e: + logger.error( + f"GEval conversation-level evaluation failed: {type(e).__name__}: {str(e)}" + ) + logger.debug(f"Conversation turns: {len(conv_data.turns)}") + logger.debug( + f"Test case input preview: {test_case.input[:200] if test_case.input else 'None'}..." + ) + return None, f"GEval evaluation error: {str(e)}" + + def _get_geval_config( + self, + metric_name: str, + conv_data: Any, + turn_data: Any | None, + is_conversation: bool, + ) -> dict[str, Any] | None: + """Extract GEval configuration from metadata or registry. + + The method checks multiple sources in priority order: + 1. Turn-level metadata (runtime override) + 2. Conversation-level metadata (runtime override) + 3. Metric registry (shared, persistent YAML definitions) + + Args: + metric_name (str): + Name of the metric to retrieve (e.g., "completeness"). + conv_data (Any): + The full conversation data object, which may contain + conversation-level metadata. + turn_data (Any | None): + Optional turn-level data object, for per-turn metrics. + is_conversation (bool): + True if evaluating a conversation-level metric, False for turn-level. + + Returns: + dict[str, Any] | None: + The GEval configuration dictionary if found, otherwise None. 
+ """ + metric_key = f"geval:{metric_name}" + + # Turn level metadata override + # Used when individual turns define custom GEval settings + if ( + not is_conversation + and turn_data + and hasattr(turn_data, "turn_metrics_metadata") + and turn_data.turn_metrics_metadata + and metric_key in turn_data.turn_metrics_metadata + ): + logger.debug(f"Using runtime metadata for metric '{metric_name}'") + return turn_data.turn_metrics_metadata[metric_key] + + # Conversation-level metadata override + # Used when the conversation defines shared GEval settings + if ( + hasattr(conv_data, "conversation_metrics_metadata") + and conv_data.conversation_metrics_metadata + and metric_key in conv_data.conversation_metrics_metadata + ): + logger.debug(f"Using runtime metadata for metric '{metric_name}'") + return conv_data.conversation_metrics_metadata[metric_key] + + # Registry definition + # Fallback to shared YAML registry if no runtime metadata is found + if GEvalHandler._registry and metric_name in GEvalHandler._registry: + logger.debug(f"Using registry definition for metric '{metric_name}'") + return GEvalHandler._registry[metric_name] + + # Config not found anywhere + logger.warning( + f"Metric '{metric_name}' not found in runtime metadata or registry. " + f"Available registry metrics: {list(GEvalHandler._registry.keys()) if GEvalHandler._registry else []}" + ) + return None From a4370b332ef063381920f981741011fcbce5eeb4 Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Fri, 7 Nov 2025 15:37:17 -0700 Subject: [PATCH 2/5] bug:small fix for undefined registry paths --- src/lightspeed_evaluation/core/metrics/geval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py index df0fafb2..ca992c3f 100644 --- a/src/lightspeed_evaluation/core/metrics/geval.py +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -75,8 +75,10 @@ def _load_registry(self, registry_path: str | None = None) -> None: return # Determine registry path + possible_paths = [] if registry_path: path = Path(registry_path) + possible_paths = [path] else: # Look for config/registry/geval_metrics.yaml relative to project root # Try multiple locations From 4e37984cb1b0ef9ee392f167d0cfccddd33816a8 Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Mon, 10 Nov 2025 09:29:01 -0700 Subject: [PATCH 3/5] fix: resolving pylint and pydocstyle conflicts --- .../core/metrics/deepeval.py | 2 +- .../core/metrics/geval.py | 107 ++++++++++-------- 2 files changed, 60 insertions(+), 49 deletions(-) diff --git a/src/lightspeed_evaluation/core/metrics/deepeval.py b/src/lightspeed_evaluation/core/metrics/deepeval.py index 0585a581..cb194fee 100644 --- a/src/lightspeed_evaluation/core/metrics/deepeval.py +++ b/src/lightspeed_evaluation/core/metrics/deepeval.py @@ -123,7 +123,7 @@ def evaluate( return self.geval_handler.evaluate( metric_name=metric_name, conv_data=conv_data, - turn_idx=scope.turn_idx, + _turn_idx=scope.turn_idx, turn_data=scope.turn_data, is_conversation=scope.is_conversation, ) diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py index ca992c3f..f2cf0752 100644 --- a/src/lightspeed_evaluation/core/metrics/geval.py +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) -class GEvalHandler: +class GEvalHandler: # pylint: disable=R0903 """Handler for configurable GEval metrics. 
This class integrates with the lightspeed-evaluation framework @@ -49,8 +49,7 @@ def __init__( self._load_registry(registry_path) def _load_registry(self, registry_path: str | None = None) -> None: - """ - Load the GEval metric registry from a YAML configuration file. + """Load the GEval metric registry from a YAML configuration file. This method initializes the class-level `_registry`. It supports both user-specified and auto-discovered paths, searching common @@ -97,16 +96,16 @@ def _load_registry(self, registry_path: str | None = None) -> None: # Handle missing or invalid registry if path is None or not path.exists(): logger.warning( - f"GEval metric registry not found at expected locations. " - f"Tried: {[str(p) for p in possible_paths]}. " - f"Will fall back to runtime metadata only." + "GEval metric registry not found at expected locations. " + "Tried: %s. Will fall back to runtime metadata only.", + [str(p) for p in possible_paths], ) GEvalHandler._registry = {} return # Load registry file try: - with open(path) as f: + with open(path, encoding="utf-8") as f: GEvalHandler._registry = ( yaml.safe_load(f) or {} ) # Default to empty dict if file is empty @@ -114,21 +113,20 @@ def _load_registry(self, registry_path: str | None = None) -> None: num_metrics = ( len(GEvalHandler._registry) if GEvalHandler._registry else 0 ) - logger.info(f"Loaded {num_metrics} GEval metrics from {path}") - except Exception as e: - logger.error(f"Failed to load GEval registry from {path}: {e}") + logger.info("Loaded %d GEval metrics from %s", num_metrics, path) + except Exception as e: # pylint: disable=W0718 + logger.error("Failed to load GEval registry from %s: %s", path, e) GEvalHandler._registry = {} - def evaluate( + def evaluate( # pylint: disable=R0913,R0917 self, metric_name: str, conv_data: Any, - turn_idx: int | None, # noqa: ARG002 + _turn_idx: int | None, turn_data: Any | None, is_conversation: bool, ) -> tuple[float | None, str]: - """ - Evaluate using GEval with runtime configuration. + """Evaluate using GEval with runtime configuration. This method is the central entry point for running GEval evaluations. It retrieves the appropriate metric configuration (from registry or runtime @@ -189,28 +187,27 @@ def evaluate( return self._evaluate_conversation( conv_data, criteria, evaluation_params, evaluation_steps, threshold ) - else: - return self._evaluate_turn( - turn_data, criteria, evaluation_params, evaluation_steps, threshold - ) + return self._evaluate_turn( + turn_data, criteria, evaluation_params, evaluation_steps, threshold + ) def _convert_evaluation_params( self, params: list[str] ) -> list[LLMTestCaseParams] | None: - """ - Convert a list of string parameter names into `LLMTestCaseParams` enum values. + """Convert a list of string parameter names into `LLMTestCaseParams` enum values. This helper ensures that the evaluation parameters passed into GEval are properly - typed as `LLMTestCaseParams` (used by DeepEval's test-case schema). If any parameter is not a - valid enum member, the function treats the entire parameter list as "custom" and returns `None`. - This allows GEval to automatically infer the required fields at runtime rather than forcing - strict schema compliance. + typed as `LLMTestCaseParams` (used by DeepEval's test-case schema). If any + parameter is not a valid enum member, the function treats the entire parameter + list as "custom" and returns `None`. This allows GEval to automatically infer + the required fields at runtime rather than forcing strict schema compliance. 
Args: params (list[str]): A list of string identifiers (e.g., ["input", "actual_output"]). These typically come from a YAML or runtime configuration and may not always match enum names exactly. + Returns: List of LLMTestCaseParams enum values, or None if params are custom strings """ @@ -230,16 +227,17 @@ def _convert_evaluation_params( except (KeyError, AttributeError): # Not a valid enum - these are custom params, skip them logger.debug( - f"Skipping custom evaluation_param '{param}' - " - f"not a valid LLMTestCaseParams enum. " - f"GEval will auto-detect required fields." + "Skipping custom evaluation_param '%s' - " + "not a valid LLMTestCaseParams enum. " + "GEval will auto-detect required fields.", + param, ) return None # Return the successfully converted list, or None if it ended up empty return converted if converted else None - def _evaluate_turn( + def _evaluate_turn( # pylint: disable=R0913,R0917 self, turn_data: Any, criteria: str, @@ -247,8 +245,7 @@ def _evaluate_turn( evaluation_steps: list[str] | None, threshold: float, ) -> tuple[float | None, str]: - """ - Evaluate a single turn using GEval. + """Evaluate a single turn using GEval. Args: turn_data (Any): @@ -331,19 +328,21 @@ def _evaluate_turn( else "No reason provided" ) return score, reason - except Exception as e: + except Exception as e: # pylint: disable=W0718 logger.error( - f"GEval turn-level evaluation failed: {type(e).__name__}: {str(e)}" + "GEval turn-level evaluation failed: %s: %s", type(e).__name__, str(e) ) logger.debug( - f"Test case input: {test_case.input[:100] if test_case.input else 'None'}..." + "Test case input: %s...", + test_case.input[:100] if test_case.input else "None", ) logger.debug( - f"Test case output: {test_case.actual_output[:100] if test_case.actual_output else 'None'}..." + "Test case output: %s...", + test_case.actual_output[:100] if test_case.actual_output else "None", ) return None, f"GEval evaluation error: {str(e)}" - def _evaluate_conversation( + def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914 self, conv_data: Any, criteria: str, @@ -351,8 +350,7 @@ def _evaluate_conversation( evaluation_steps: list[str] | None, threshold: float, ) -> tuple[float | None, str]: - """ - Evaluate a conversation using GEval. + """Evaluate a conversation using GEval. This method aggregates all conversation turns into a single LLMTestCase and evaluates the conversation against the provided criteria. @@ -424,13 +422,16 @@ def _evaluate_conversation( else "No reason provided" ) return score, reason - except Exception as e: + except Exception as e: # pylint: disable=W0718 logger.error( - f"GEval conversation-level evaluation failed: {type(e).__name__}: {str(e)}" + "GEval conversation-level evaluation failed: %s: %s", + type(e).__name__, + str(e), ) - logger.debug(f"Conversation turns: {len(conv_data.turns)}") + logger.debug("Conversation turns: %d", len(conv_data.turns)) logger.debug( - f"Test case input preview: {test_case.input[:200] if test_case.input else 'None'}..." 
+ "Test case input preview: %s...", + test_case.input[:200] if test_case.input else "None", ) return None, f"GEval evaluation error: {str(e)}" @@ -474,7 +475,7 @@ def _get_geval_config( and turn_data.turn_metrics_metadata and metric_key in turn_data.turn_metrics_metadata ): - logger.debug(f"Using runtime metadata for metric '{metric_name}'") + logger.debug("Using runtime metadata for metric '%s'", metric_name) return turn_data.turn_metrics_metadata[metric_key] # Conversation-level metadata override @@ -484,18 +485,28 @@ def _get_geval_config( and conv_data.conversation_metrics_metadata and metric_key in conv_data.conversation_metrics_metadata ): - logger.debug(f"Using runtime metadata for metric '{metric_name}'") + logger.debug("Using runtime metadata for metric '%s'", metric_name) return conv_data.conversation_metrics_metadata[metric_key] # Registry definition # Fallback to shared YAML registry if no runtime metadata is found - if GEvalHandler._registry and metric_name in GEvalHandler._registry: - logger.debug(f"Using registry definition for metric '{metric_name}'") - return GEvalHandler._registry[metric_name] + if ( + GEvalHandler._registry + and metric_name in GEvalHandler._registry # pylint: disable=E1135 + ): # pylint: disable=E1135 + logger.debug("Using registry definition for metric '%s'", metric_name) + return GEvalHandler._registry[metric_name] # pylint: disable=E1136 # Config not found anywhere + available_metrics = ( + list(GEvalHandler._registry.keys()) # pylint: disable=E1136 + if GEvalHandler._registry + else [] + ) logger.warning( - f"Metric '{metric_name}' not found in runtime metadata or registry. " - f"Available registry metrics: {list(GEvalHandler._registry.keys()) if GEvalHandler._registry else []}" + "Metric '%s' not found in runtime metadata or registry. " + "Available registry metrics: %s", + metric_name, + available_metrics, ) return None From 9fd462dfea36f6dbc7f0507e8fbd7ac71e1080bb Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Mon, 10 Nov 2025 14:44:30 -0700 Subject: [PATCH 4/5] fix: unbound errors with registry loading function --- .../core/metrics/geval.py | 68 +++++++++++-------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py index f2cf0752..3b691e91 100644 --- a/src/lightspeed_evaluation/core/metrics/geval.py +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -6,7 +6,7 @@ import logging from pathlib import Path -from typing import Any +from typing import Any, Optional import yaml from deepeval.metrics import GEval @@ -73,50 +73,62 @@ def _load_registry(self, registry_path: str | None = None) -> None: if GEvalHandler._registry is not None: return - # Determine registry path - possible_paths = [] - if registry_path: - path = Path(registry_path) - possible_paths = [path] + # Ensure variables are always bound for static analysis - + path: Optional[Path] = None + possible_paths: list[Path] = [] + + # Normalize user-specified path vs. 
auto-discovery + if registry_path is not None: + try: + path = Path(registry_path) + except TypeError: + # Bad type passed in; treat as no path provided + path = None + if path is not None: + possible_paths = [path] else: - # Look for config/registry/geval_metrics.yaml relative to project root - # Try multiple locations + package_root = Path(__file__).resolve().parents[3] possible_paths = [ Path.cwd() / "config" / "registry" / "geval_metrics.yaml", - Path(__file__).parent.parent.parent.parent - / "config" - / "registry" - / "geval_metrics.yaml", + package_root / "config" / "registry" / "geval_metrics.yaml", ] - path = None - for p in possible_paths: - if p.exists(): - path = p + + # If no explicit file exists yet, search candidates + if path is None or not path.exists(): + for candidate in possible_paths: + if candidate.exists(): + path = candidate break + # Handle missing or invalid registry if path is None or not path.exists(): + GEvalHandler._registry = {} + GEvalHandler._registry_path = None logger.warning( - "GEval metric registry not found at expected locations. " - "Tried: %s. Will fall back to runtime metadata only.", + "GEval metric registry not found at expected locations. Tried: %s. " + "Will fall back to runtime metadata only.", [str(p) for p in possible_paths], ) - GEvalHandler._registry = {} return # Load registry file try: - with open(path, encoding="utf-8") as f: - GEvalHandler._registry = ( - yaml.safe_load(f) or {} - ) # Default to empty dict if file is empty + with path.open(encoding="utf-8") as f: + loaded = yaml.safe_load(f) or {} + # Guard against non-dict YAML (e.g., list/null) + if not isinstance(loaded, dict): + logger.warning( + "GEval registry file %s did not contain a mapping; using empty registry.", + path, + ) + loaded = {} + GEvalHandler._registry = loaded GEvalHandler._registry_path = path - num_metrics = ( - len(GEvalHandler._registry) if GEvalHandler._registry else 0 - ) - logger.info("Loaded %d GEval metrics from %s", num_metrics, path) - except Exception as e: # pylint: disable=W0718 + logger.info("Loaded %d GEval metrics from %s", len(loaded), path) + except Exception as e: # noqa: BLE001 # pylint: disable=broad-exception-caught logger.error("Failed to load GEval registry from %s: %s", path, e) GEvalHandler._registry = {} + GEvalHandler._registry_path = None def evaluate( # pylint: disable=R0913,R0917 self, From 96e39ba1137c67853d448d4f0673f47abb232ba2 Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Tue, 11 Nov 2025 14:25:15 -0700 Subject: [PATCH 5/5] removing backward compatibility for turn data + coderabbit fixes --- .../core/metrics/deepeval.py | 8 +++- .../core/metrics/geval.py | 40 ++++++++++--------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/lightspeed_evaluation/core/metrics/deepeval.py b/src/lightspeed_evaluation/core/metrics/deepeval.py index cb194fee..0f316233 100644 --- a/src/lightspeed_evaluation/core/metrics/deepeval.py +++ b/src/lightspeed_evaluation/core/metrics/deepeval.py @@ -119,9 +119,13 @@ def evaluate( return None, f"DeepEval {metric_name} evaluation failed: {str(e)}" # Otherwise, assume it's a GEval metric - # Note: metric_name should NOT have "geval:" prefix here + normalized_metric_name = ( + metric_name.split(":", 1)[1] + if metric_name.startswith("geval:") + else metric_name + ) return self.geval_handler.evaluate( - metric_name=metric_name, + metric_name=normalized_metric_name, conv_data=conv_data, _turn_idx=scope.turn_idx, turn_data=scope.turn_data, diff --git 
a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py index 3b691e91..2ca53395 100644 --- a/src/lightspeed_evaluation/core/metrics/geval.py +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -285,12 +285,6 @@ def _evaluate_turn( # pylint: disable=R0913,R0917 # Convert evaluation_params to enum values if valid, otherwise use defaults converted_params = self._convert_evaluation_params(evaluation_params) - if not converted_params: - # If no valid params, use sensible defaults for turn evaluation - converted_params = [ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ] # Create GEval metric with runtime configuration metric_kwargs: dict[str, Any] = { @@ -302,6 +296,18 @@ def _evaluate_turn( # pylint: disable=R0913,R0917 "top_logprobs": 5, } + # Only set evaluation_params if we have valid enum conversions + # or if no params were provided at all (then use defaults) + if converted_params is None: + if not evaluation_params: + metric_kwargs["evaluation_params"] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ] + # else: leave unset so GEval can auto-detect from custom strings + else: + metric_kwargs["evaluation_params"] = converted_params + # Add evaluation steps if provided if evaluation_steps: metric_kwargs["evaluation_steps"] = evaluation_steps @@ -320,12 +326,7 @@ def _evaluate_turn( # pylint: disable=R0913,R0917 test_case_kwargs["expected_output"] = turn_data.expected_response if turn_data.contexts: - # Normalize contexts: handle both dict and string formats - normalized_contexts = [ - ctx.get("content", str(ctx)) if isinstance(ctx, dict) else str(ctx) - for ctx in turn_data.contexts - ] - test_case_kwargs["context"] = normalized_contexts + test_case_kwargs["context"] = turn_data.contexts # Create test case for a single turn test_case = LLMTestCase(**test_case_kwargs) @@ -385,12 +386,6 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914 """ # Convert evaluation_params to enum values if valid, otherwise use defaults converted_params = self._convert_evaluation_params(evaluation_params) - if not converted_params: - # If no valid params, use sensible defaults for conversation evaluation - converted_params = [ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ] # Configure the GEval metric for conversation-level evaluation metric_kwargs: dict[str, Any] = { @@ -402,6 +397,15 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914 "top_logprobs": 5, # Vertex/Gemini throws an error if over 20. } + if converted_params is None: + if not evaluation_params: + metric_kwargs["evaluation_params"] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ] + else: + metric_kwargs["evaluation_params"] = converted_params + # Add evaluation steps if provided if evaluation_steps: metric_kwargs["evaluation_steps"] = evaluation_steps
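
Usage sketch, assuming the series is applied: the LLM manager and conversation data
come from elsewhere in the framework, the metric name "technical_accuracy" is a
placeholder, and EvaluationScope's keyword arguments are assumed to mirror the
fields that evaluate() reads (turn_idx, turn_data, is_conversation).

    from lightspeed_evaluation.core.metrics import DeepEvalMetrics
    from lightspeed_evaluation.core.models import EvaluationScope

    # llm_manager: a pre-configured LLMManager; conv_data: loaded evaluation data
    metrics = DeepEvalMetrics(llm_manager, registry_path="config/registry/geval_metrics.yaml")

    # Names in supported_metrics use the hardcoded DeepEval implementations;
    # "geval:"-prefixed names are normalized and delegated to GEvalHandler,
    # which resolves criteria from runtime metadata or the YAML registry.
    score, reason = metrics.evaluate(
        "geval:technical_accuracy",
        conv_data,
        EvaluationScope(turn_idx=0, turn_data=conv_data.turns[0], is_conversation=False),
    )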