diff --git a/README.md b/README.md index 5abc2cc4..6609faf0 100644 --- a/README.md +++ b/README.md @@ -115,16 +115,18 @@ api: no_tools: null # Whether to bypass tools (optional) system_prompt: null # Custom system prompt (optional) -# Metrics Configuration with thresholds +# Metrics Configuration with thresholds and defaults metrics_metadata: turn_level: - "ragas:faithfulness": - threshold: 0.8 - description: "How faithful the response is to the provided context" - "ragas:response_relevancy": threshold: 0.8 description: "How relevant the response is to the question" + default: true # Used by default when turn_metrics is null + + "ragas:faithfulness": + threshold: 0.8 + description: "How faithful the response is to the provided context" + default: false # Only used when explicitly specified "custom:tool_eval": description: "Tool call evaluation comparing expected vs actual tool calls (regex for arguments)" @@ -160,16 +162,6 @@ visualization: - conversation_group_id: "test_conversation" description: "Sample evaluation" - # Turn-level metrics to evaluate - turn_metrics: - - "ragas:faithfulness" - - "custom:answer_correctness" - - # Metric-specific configuration - turn_metrics_metadata: - "ragas:faithfulness": - threshold: 0.8 - # Conversation-level metrics conversation_metrics: - "deepeval:conversation_completeness" @@ -186,8 +178,23 @@ visualization: - OpenShift Virtualization is an extension of the OpenShift ... attachments: [] # Attachments (Optional) expected_response: OpenShift Virtualization is an extension of the OpenShift Container Platform that allows running virtual machines alongside containers + + # Per-turn metrics (overrides system defaults) + turn_metrics: + - "ragas:faithfulness" + - "custom:answer_correctness" + + # Per-turn metric configuration + turn_metrics_metadata: + "ragas:faithfulness": + threshold: 0.9 # Override system default + # turn_metrics: null (omitted) → Use system defaults (metrics with default=true) + + - turn_id: id2 + query: Skip this turn evaluation + turn_metrics: [] # Skip evaluation for this turn - - turn_id: id2 + - turn_id: id3 query: How do I create a virtual machine in OpenShift Virtualization? response: null # Populated by API if enabled, otherwise provide contexts: @@ -223,11 +230,21 @@ visualization: | `expected_response` | string | 📋 | Expected response for comparison | ❌ | | `expected_tool_calls` | list[list[dict]] | 📋 | Expected tool call sequences | ❌ | | `tool_calls` | list[list[dict]] | ❌ | Actual tool calls from API | ✅ (if API enabled) | +| `turn_metrics` | list[string] | ❌ | Turn-specific metrics to evaluate | ❌ | +| `turn_metrics_metadata` | dict | ❌ | Turn-specific metric configuration | ❌ | Note: Context will be collected automatically in the future. > 📋 **Required based on metrics**: Some fields are required only when using specific metrics +#### Metrics override behavior + +| Override Value | Behavior | +|---------------------|----------| +| `null` (or omitted) | Use system defaults (metrics with `default: true`) | +| `[]` (empty list) | Skip evaluation for this turn | +| `["metric1", ...]` | Use specified metrics only | + Examples > - `expected_response`: Required for `custom:answer_correctness` > - `expected_tool_calls`: Required for `custom:tool_eval` diff --git a/config/evaluation_data.yaml b/config/evaluation_data.yaml index c9ad898f..e3f9b6ea 100644 --- a/config/evaluation_data.yaml +++ b/config/evaluation_data.yaml @@ -3,14 +3,6 @@ - conversation_group_id: "conv_group_1" description: "conversation group description" - turn_metrics: - - "ragas:faithfulness" - - "ragas:response_relevancy" - - "ragas:context_precision_without_reference" - - turn_metrics_metadata: - "ragas:faithfulness": - threshold: 0.99 conversation_metrics: [] conversation_metrics_metadata: {} @@ -23,15 +15,17 @@ - "Context 2" expected_response: "Expected Response" -- conversation_group_id: "conv_group_2" - description: "conversation group description" + turn_metrics: + - "ragas:faithfulness" + - "ragas:response_relevancy" + - "ragas:context_precision_without_reference" - turn_metrics: - - "ragas:context_recall" - - "ragas:context_relevance" - - "ragas:context_precision_with_reference" + turn_metrics_metadata: + "ragas:faithfulness": + threshold: 0.99 - turn_metrics_metadata: {} +- conversation_group_id: "conv_group_2" + description: "conversation group description" conversation_metrics: [] conversation_metrics_metadata: {} @@ -43,14 +37,14 @@ - "Context 1" expected_response: "Expected Response" + turn_metrics: + - "ragas:context_recall" + - "ragas:context_relevance" + - "ragas:context_precision_with_reference" + - conversation_group_id: "conv_group_3" description: "conversation group description" - turn_metrics: - - "custom:answer_correctness" - - turn_metrics_metadata: {} - conversation_metrics: - "deepeval:conversation_completeness" - "deepeval:conversation_relevancy" @@ -60,14 +54,10 @@ turns: - turn_id: "1" query: "User Query 1" - response: "API Response 1" - contexts: - - "Context" - expected_response: "Expected Response 1" + + turn_metrics: [] # Skip eval for this turn - turn_id: "2" query: "User Query 2" response: "API Response 2" - contexts: - - "Context" - expected_response: "Expected Response 2" + # turn_metrics: null (omitted) → Use system defaults (metrics with default=true) diff --git a/config/system.yaml b/config/system.yaml index b7e95120..fb48ddb9 100644 --- a/config/system.yaml +++ b/config/system.yaml @@ -31,23 +31,21 @@ metrics_metadata: # Turn-level metrics metadata turn_level: # Ragas Response Evaluation metrics - "ragas:faithfulness": - threshold: 0.8 - description: "How faithful the response is to the provided context" - "ragas:response_relevancy": threshold: 0.8 description: "How relevant the response is to the question" + default: true # This metric is applied by default when no turn_metrics specified + + "ragas:faithfulness": + threshold: 0.8 + description: "How faithful the response is to the provided context" + default: false # By default the value is false # Ragas Context/Retrieval Evaluation metrics "ragas:context_recall": threshold: 0.8 description: "Did we fetch every fact the answer needs?" - "ragas:context_relevance": - threshold: 0.7 - description: "Is what we retrieved actually relevant to user query?" - "ragas:context_precision_with_reference": threshold: 0.7 description: "How precise the retrieved context is (with reference)" @@ -56,6 +54,10 @@ metrics_metadata: threshold: 0.7 description: "How precise the retrieved context is (without reference)" + "ragas:context_relevance": + threshold: 0.7 + description: "Is what we retrieved actually relevant to user query?" + # Custom metrics "custom:answer_correctness": threshold: 0.75 @@ -70,6 +72,7 @@ metrics_metadata: "deepeval:conversation_completeness": threshold: 0.8 description: "How completely the conversation addresses user intentions" + default: false "deepeval:conversation_relevancy": threshold: 0.7 diff --git a/src/lightspeed_evaluation/core/metrics/manager.py b/src/lightspeed_evaluation/core/metrics/manager.py new file mode 100644 index 00000000..c06b07b7 --- /dev/null +++ b/src/lightspeed_evaluation/core/metrics/manager.py @@ -0,0 +1,136 @@ +"""Metrics mapping for evaluation.""" + +from enum import Enum +from typing import Any, Optional + +from ..models.data import EvaluationData, TurnData +from ..models.system import SystemConfig + + +class MetricLevel(Enum): + """Metric level enumeration.""" + + TURN = "turn" + CONVERSATION = "conversation" + + +class MetricManager: + """Manager for both turn and conversation metrics.""" + + def __init__(self, system_config: SystemConfig): + """Initialize with system configuration.""" + self.system_config = system_config + + def resolve_metrics( + self, metrics: Optional[list[str]], level: MetricLevel + ) -> list[str]: + """Resolve metrics mapping. + + Options: + - None: use system defaults (metrics with default=true) + - []: skip evaluation completely + - [metrics...]: use specified metrics from turn data + + Args: + metrics: The metrics configuration (None, [], or list of metrics) + level: Whether this is TURN or CONVERSATION level + + Returns: + List of metrics to evaluate + """ + if metrics is None: + # None = use system defaults + return self._extract_default_metrics(level) + if metrics == []: + # [] = explicitly skip evaluation + return [] + # Use specified metrics as-is + return metrics + + def get_effective_threshold( + self, + metric_identifier: str, + level: MetricLevel, + conv_data: Optional[EvaluationData] = None, + turn_data: Optional[TurnData] = None, + ) -> Optional[float]: + """Get effective threshold with priority hierarchy. + + Priority: + 1. Level-specific metadata (turn-specific for turns, conversation-specific for convs) + 2. System defaults + + Args: + metric_identifier: The metric to get threshold for + level: Whether this is TURN or CONVERSATION level + conv_data: Conversation data for conversation-level metadata + turn_data: Turn data for turn-specific metadata + + Returns: + Effective threshold or None if not found + """ + # Check level-specific metadata first + level_metadata = self._get_level_metadata(level, conv_data, turn_data) + threshold = level_metadata.get(metric_identifier, {}).get("threshold") + if threshold is not None: + return threshold + + # Fall back to system defaults + system_metadata = self._get_system_metadata(level) + return system_metadata.get(metric_identifier, {}).get("threshold") + + def _get_level_metadata( + self, + level: MetricLevel, + conv_data: Optional[EvaluationData], + turn_data: Optional[TurnData], + ) -> dict[str, dict[str, Any]]: + """Get level-specific metadata (turn or conversation level).""" + if level == MetricLevel.TURN and turn_data and turn_data.turn_metrics_metadata: + return turn_data.turn_metrics_metadata + if ( + level == MetricLevel.CONVERSATION + and conv_data + and conv_data.conversation_metrics_metadata + ): + return conv_data.conversation_metrics_metadata + return {} + + def _get_system_metadata(self, level: MetricLevel) -> dict[str, dict[str, Any]]: + """Get system-level metadata for the given level.""" + if level == MetricLevel.TURN: + return self.system_config.default_turn_metrics_metadata + return self.system_config.default_conversation_metrics_metadata + + def _extract_default_metrics(self, level: MetricLevel) -> list[str]: + """Extract metrics that have default=true from metadata.""" + metrics_metadata = self._get_system_metadata(level) + + default_metrics = [] + for metric_name, metadata in metrics_metadata.items(): + if metadata.get("default", False): # default=false if not specified + default_metrics.append(metric_name) + return default_metrics + + def count_metrics_for_conversation( + self, conv_data: EvaluationData + ) -> dict[str, int]: + """Count total metrics that would be evaluated for a conversation.""" + # Count turn metrics + total_turn_metrics = 0 + for turn_data in conv_data.turns: + turn_metrics = self.resolve_metrics( + turn_data.turn_metrics, MetricLevel.TURN + ) + total_turn_metrics += len(turn_metrics) + + # Count conversation metrics + conversation_metrics = self.resolve_metrics( + conv_data.conversation_metrics, MetricLevel.CONVERSATION + ) + + return { + "turn_metrics": total_turn_metrics, + "conversation_metrics": len(conversation_metrics), + "total_turns": len(conv_data.turns), + } diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py index b0b56346..8bd4ebbd 100644 --- a/src/lightspeed_evaluation/core/models/data.py +++ b/src/lightspeed_evaluation/core/models/data.py @@ -10,6 +10,27 @@ logger = logging.getLogger(__name__) +def _validate_and_deduplicate_metrics( + metrics: list[str], metric_type: str = "metric" +) -> list[str]: + """Validate format and deduplicate metrics while preserving order.""" + # Validate format first + for metric in metrics: + if not metric or ":" not in metric: + raise ValueError( + f'{metric_type} "{metric}" must be in format "framework:metric_name"' + ) + + # Deduplicate while preserving order + seen = set() + deduplicated = [] + for metric in metrics: + if metric not in seen: + deduplicated.append(metric) + seen.add(metric) + return deduplicated + + class TurnData(BaseModel): """Individual turn data within a conversation.""" @@ -41,6 +62,24 @@ class TurnData(BaseModel): default=None, description="Conversation ID - populated by API if enabled" ) + # Per-turn metrics support + turn_metrics: Optional[list[str]] = Field( + default=None, + description="Turn-specific metrics to evaluate (overrides system defaults)", + ) + turn_metrics_metadata: Optional[dict[str, dict[str, Any]]] = Field( + default=None, + description="Turn-specific metric configuration (overrides system defaults)", + ) + + @field_validator("turn_metrics") + @classmethod + def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]: + """Validate and deduplicate turn-specific metrics.""" + if v is not None: + v = _validate_and_deduplicate_metrics(v, "Turn metric") + return v + @field_validator("expected_tool_calls", mode="before") @classmethod def validate_expected_tool_calls( @@ -101,18 +140,12 @@ class EvaluationData(BaseModel): description="Optional description of the conversation group", ) - # Metrics to run (None = skip that level of evaluation) - turn_metrics: Optional[list[str]] = Field( - default=None, description="Turn-level metrics to evaluate" - ) + # Conversation-level metrics conversation_metrics: Optional[list[str]] = Field( default=None, description="Conversation-level metrics to evaluate" ) - # Metric-specific configuration (threshold, weights, etc.) - turn_metrics_metadata: Optional[dict[str, dict[str, Any]]] = Field( - default=None, description="Turn-level metric configuration" - ) + # Conversation-level metric configuration conversation_metrics_metadata: Optional[dict[str, dict[str, Any]]] = Field( default=None, description="Conversation-level metric configuration" ) @@ -122,16 +155,14 @@ class EvaluationData(BaseModel): ..., min_length=1, description="Conversation turns - must have at least one" ) - @field_validator("turn_metrics", "conversation_metrics") + @field_validator("conversation_metrics") @classmethod - def validate_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]: - """Validate metrics are properly formatted.""" + def validate_conversation_metrics( + cls, v: Optional[list[str]] + ) -> Optional[list[str]]: + """Validate and deduplicate conversation metrics.""" if v is not None: - for metric in v: - if not metric or ":" not in metric: - raise ValueError( - f'Metric "{metric}" must be in format "framework:metric_name"' - ) + v = _validate_and_deduplicate_metrics(v, "Conversation metric") return v diff --git a/src/lightspeed_evaluation/core/system/validator.py b/src/lightspeed_evaluation/core/system/validator.py index b573253d..1235d31f 100644 --- a/src/lightspeed_evaluation/core/system/validator.py +++ b/src/lightspeed_evaluation/core/system/validator.py @@ -143,13 +143,15 @@ def _validate_metrics_availability(self, data: EvaluationData) -> None: """Validate that specified metrics are available/supported.""" conversation_id = data.conversation_group_id - # Validate turn metrics - if data.turn_metrics: - for metric in data.turn_metrics: - if metric not in TURN_LEVEL_METRICS: - self.validation_errors.append( - f"Conversation {conversation_id}: Unknown turn metric '{metric}'" - ) + # Validate per-turn metrics + for turn_data in data.turns: + if turn_data.turn_metrics: + for metric in turn_data.turn_metrics: + if metric not in TURN_LEVEL_METRICS: + self.validation_errors.append( + f"Conversation {conversation_id}, Turn {turn_data.turn_id}: " + f"Unknown turn metric '{metric}'" + ) # Validate conversation metrics if data.conversation_metrics: @@ -180,10 +182,10 @@ def _check_metric_requirements( # Check each turn against metric requirements for turn_data in data.turns: # Skip validation if no turn metrics specified - if not data.turn_metrics: + if not turn_data.turn_metrics: continue - for metric in data.turn_metrics: + for metric in turn_data.turn_metrics: if metric not in METRIC_REQUIREMENTS: continue # Unknown metrics are handled separately diff --git a/src/lightspeed_evaluation/pipeline/evaluation/errors.py b/src/lightspeed_evaluation/pipeline/evaluation/errors.py index cea9702d..a799f8ca 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/errors.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/errors.py @@ -15,10 +15,20 @@ def __init__(self) -> None: self.results: list[EvaluationResult] = [] def mark_all_metrics_as_error( - self, conv_data: EvaluationData, error_reason: str + self, + conv_data: EvaluationData, + error_reason: str, + resolved_turn_metrics: list[list[str]], + resolved_conversation_metrics: list[str], ) -> list[EvaluationResult]: """Mark all turn and conversation metrics as ERROR when there is an error. + Args: + conv_data: Conversation data + error_reason: Reason for error + resolved_turn_metrics: Pre-resolved turn metrics + resolved_conversation_metrics: Pre-resolved conversation metrics + Returns: list[EvaluationResult]: ERROR results for all metrics """ @@ -30,40 +40,38 @@ def mark_all_metrics_as_error( error_results = [] # Mark all turn-level metrics as ERROR - if conv_data.turn_metrics: - for turn_data in conv_data.turns: - for metric_identifier in conv_data.turn_metrics: - error_result = EvaluationResult( - conversation_group_id=conv_data.conversation_group_id, - turn_id=turn_data.turn_id, - metric_identifier=metric_identifier, - result="ERROR", - score=None, - threshold=None, - reason=error_reason, - query=turn_data.query, - response="", - execution_time=0.0, - ) - error_results.append(error_result) - - # Mark all conversation-level metrics as ERROR - if conv_data.conversation_metrics: - for metric_identifier in conv_data.conversation_metrics: + for turn_data, turn_metrics in zip(conv_data.turns, resolved_turn_metrics): + for metric_identifier in turn_metrics: error_result = EvaluationResult( conversation_group_id=conv_data.conversation_group_id, - turn_id=None, # Conversation-level metric + turn_id=turn_data.turn_id, metric_identifier=metric_identifier, result="ERROR", score=None, threshold=None, reason=error_reason, - query="", + query=turn_data.query, response="", execution_time=0.0, ) error_results.append(error_result) + # Mark all conversation-level metrics as ERROR + for metric_identifier in resolved_conversation_metrics: + error_result = EvaluationResult( + conversation_group_id=conv_data.conversation_group_id, + turn_id=None, # Conversation-level metric + metric_identifier=metric_identifier, + result="ERROR", + score=None, + threshold=None, + reason=error_reason, + query="", + response="", + execution_time=0.0, + ) + error_results.append(error_result) + # Store results internally for summary tracking self.results.extend(error_results) return error_results diff --git a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py index 7b2740fa..229cb81f 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py @@ -7,13 +7,9 @@ from ...core.llm.manager import LLMManager from ...core.metrics.custom import CustomMetrics from ...core.metrics.deepeval import DeepEvalMetrics +from ...core.metrics.manager import MetricLevel, MetricManager from ...core.metrics.ragas import RagasMetrics -from ...core.models import ( - EvaluationData, - EvaluationRequest, - EvaluationResult, - EvaluationScope, -) +from ...core.models import EvaluationRequest, EvaluationResult, EvaluationScope from ...core.system import ConfigLoader logger = logging.getLogger(__name__) @@ -22,8 +18,13 @@ class MetricsEvaluator: """Handles individual metric evaluation with proper scoring and status determination.""" - def __init__(self, llm_manager: LLMManager, config_loader: ConfigLoader) -> None: - """Initialize with LLM manager and config.""" + def __init__( + self, + llm_manager: LLMManager, + config_loader: ConfigLoader, + metric_manager: MetricManager, + ) -> None: + """Initialize with LLM manager, config, and metric manager.""" self.config_loader = config_loader self.config = config_loader.system_config @@ -39,6 +40,8 @@ def __init__(self, llm_manager: LLMManager, config_loader: ConfigLoader) -> None "custom": self.custom_metrics, } + self.metric_manager = metric_manager + def evaluate_metric(self, request: EvaluationRequest) -> Optional[EvaluationResult]: """Evaluate a single metric and return result.""" start_time = time.time() @@ -81,9 +84,14 @@ def evaluate_metric(self, request: EvaluationRequest) -> Optional[EvaluationResu if score is None: return self._create_error_result(request, reason, execution_time) - # Get threshold and determine status - threshold = self._get_effective_threshold( - request.conv_data, request.metric_identifier, request.is_conversation + # Get threshold + level = ( + MetricLevel.CONVERSATION + if request.is_conversation + else MetricLevel.TURN + ) + threshold = self.metric_manager.get_effective_threshold( + request.metric_identifier, level, request.conv_data, request.turn_data ) status = self._determine_status(score, threshold) @@ -124,42 +132,11 @@ def _create_error_result( execution_time=execution_time, ) - def _get_effective_threshold( - self, conv_data: EvaluationData, metric_identifier: str, is_conversation: bool - ) -> Optional[float]: - """Get effective threshold for metric (conversation-specific or system default).""" - # Check conversation-specific metadata first - if is_conversation: - metadata = (conv_data.conversation_metrics_metadata or {}).get( - metric_identifier, {} - ) - else: - metadata = (conv_data.turn_metrics_metadata or {}).get( - metric_identifier, {} - ) - - if "threshold" in metadata: - return metadata["threshold"] - - # Fall back to system defaults - if self.config is None: - raise ValueError("SystemConfig must be loaded") - if is_conversation: - default_metadata = ( - self.config.default_conversation_metrics_metadata or {} - ).get(metric_identifier, {}) - else: - default_metadata = (self.config.default_turn_metrics_metadata or {}).get( - metric_identifier, {} - ) - - return default_metadata.get("threshold") - def _determine_status(self, score: float, threshold: Optional[float]) -> str: """Determine evaluation status based on score and threshold.""" if threshold is None: threshold = 0.5 # This will also handle binary metrics - return "PASS" if score >= threshold else "FAIL" + return "PASS" if score >= float(threshold) else "FAIL" def get_supported_frameworks(self) -> list[str]: """Get list of supported evaluation frameworks.""" diff --git a/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py b/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py index f63a42a2..9b9b6a8f 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py @@ -5,13 +5,14 @@ from ...core.api import APIClient from ...core.llm.manager import LLMManager +from ...core.metrics.manager import MetricManager from ...core.models import EvaluationData, EvaluationResult from ...core.output.data_persistence import save_evaluation_data from ...core.system import ConfigLoader, DataValidator from .amender import APIDataAmender from .errors import EvaluationErrorHandler from .evaluator import MetricsEvaluator -from .processor import ConversationProcessor +from .processor import ConversationProcessor, ProcessorComponents logger = logging.getLogger(__name__) @@ -52,25 +53,29 @@ def _initialize_components(self) -> None: # LLM Manager llm_manager = LLMManager.from_llm_config(config.llm) + # Metric manager + metric_manager = MetricManager(config) + # Create pipeline components - api_client = self._create_api_client() - api_amender = APIDataAmender(api_client) + self.api_client = self._create_api_client() + api_amender = APIDataAmender(self.api_client) error_handler = EvaluationErrorHandler() - metrics_evaluator = MetricsEvaluator(llm_manager, self.config_loader) - # Group components for easier access - self.components = { - "api_client": api_client, - "api_amender": api_amender, - "error_handler": error_handler, - "metrics_evaluator": metrics_evaluator, - } + metrics_evaluator = MetricsEvaluator( + llm_manager, self.config_loader, metric_manager + ) + + # Create processor components + processor_components = ProcessorComponents( + metrics_evaluator=metrics_evaluator, + api_amender=api_amender, + error_handler=error_handler, + metric_manager=metric_manager, + ) # Conversation processor self.conversation_processor = ConversationProcessor( self.config_loader, - self.components["metrics_evaluator"], - self.components["api_amender"], - self.components["error_handler"], + processor_components, ) def _create_api_client(self) -> Optional[APIClient]: @@ -165,6 +170,5 @@ def _save_updated_data(self, evaluation_data: list[EvaluationData]) -> None: def close(self) -> None: """Clean up resources.""" - api_client = self.components.get("api_client") - if api_client: - api_client.close() + if self.api_client: + self.api_client.close() diff --git a/src/lightspeed_evaluation/pipeline/evaluation/processor.py b/src/lightspeed_evaluation/pipeline/evaluation/processor.py index 43fcc99a..72cd79a2 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/processor.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/processor.py @@ -1,7 +1,9 @@ """Conversation processing module - handles conversation and turn processing.""" import logging +from dataclasses import dataclass +from ...core.metrics.manager import MetricLevel, MetricManager from ...core.models import EvaluationData, EvaluationRequest, EvaluationResult, TurnData from ...core.system import ConfigLoader from .amender import APIDataAmender @@ -11,22 +13,30 @@ logger = logging.getLogger(__name__) +@dataclass +class ProcessorComponents: + """Components required for conversation processing.""" + + metrics_evaluator: MetricsEvaluator + api_amender: APIDataAmender + error_handler: EvaluationErrorHandler + metric_manager: MetricManager + + class ConversationProcessor: """Processes individual conversations - handles both turn and conversation metrics.""" - def __init__( - self, - config_loader: ConfigLoader, - metrics_evaluator: MetricsEvaluator, - api_amender: APIDataAmender, - error_handler: EvaluationErrorHandler, - ): - """Initialize with required components.""" + def __init__(self, config_loader: ConfigLoader, components: ProcessorComponents): + """Initialize with config loader and components.""" self.config_loader = config_loader self.config = config_loader.system_config - self.metrics_evaluator = metrics_evaluator - self.api_amender = api_amender - self.error_handler = error_handler + self.components = components + + # Keep direct references for easier access + self.metrics_evaluator = components.metrics_evaluator + self.api_amender = components.api_amender + self.error_handler = components.error_handler + self.metric_manager = components.metric_manager def process_conversation(self, conv_data: EvaluationData) -> list[EvaluationResult]: """Process single conversation - handle turn and conversation level metrics. @@ -37,9 +47,24 @@ def process_conversation(self, conv_data: EvaluationData) -> list[EvaluationResu logger.info("Evaluating conversation: %s", conv_data.conversation_group_id) results: list[EvaluationResult] = [] + resolved_turn_metrics = [ + self.metric_manager.resolve_metrics( + turn_data.turn_metrics, MetricLevel.TURN + ) + for turn_data in conv_data.turns + ] + resolved_conversation_metrics = self.metric_manager.resolve_metrics( + conv_data.conversation_metrics, MetricLevel.CONVERSATION + ) + # Skip if no metrics specified at any level - if not conv_data.turn_metrics and not conv_data.conversation_metrics: - logger.debug("No metrics specified, skipping") + has_turn_metrics = any(bool(metrics) for metrics in resolved_turn_metrics) + has_conversation_metrics = bool(resolved_conversation_metrics) + + if not has_turn_metrics and not has_conversation_metrics: + logger.debug( + "No metrics to evaluate (no defaults or explicit metrics), skipping" + ) return results # Amend with API data if enabled @@ -54,34 +79,48 @@ def process_conversation(self, conv_data: EvaluationData) -> list[EvaluationResu if api_error_occurred: logger.error("API error detected - marking all metrics as ERROR") error_results = self.error_handler.mark_all_metrics_as_error( - conv_data, "API error during data amendment" + conv_data, + "API error during data amendment", + resolved_turn_metrics=resolved_turn_metrics, + resolved_conversation_metrics=resolved_conversation_metrics, ) return error_results - # Process turn-level metrics - if conv_data.turn_metrics: - logger.debug("Processing turn-level metrics: %s", conv_data.turn_metrics) - for turn_idx, turn_data in enumerate(conv_data.turns): - turn_results = self._evaluate_turn(conv_data, turn_idx, turn_data) + # Process turn-level metrics for each turn + for turn_idx, (turn_data, turn_metrics) in enumerate( + zip(conv_data.turns, resolved_turn_metrics) + ): + if turn_metrics: + logger.debug("Processing turn %d metrics: %s", turn_idx, turn_metrics) + turn_results = self._evaluate_turn( + conv_data, turn_idx, turn_data, turn_metrics + ) results.extend(turn_results) # Process conversation-level metrics - if conv_data.conversation_metrics: + if resolved_conversation_metrics: logger.debug( "Processing conversation-level metrics: %s", - conv_data.conversation_metrics, + resolved_conversation_metrics, + ) + conv_results = self._evaluate_conversation( + conv_data, resolved_conversation_metrics ) - conv_results = self._evaluate_conversation(conv_data) results.extend(conv_results) return results def _evaluate_turn( - self, conv_data: EvaluationData, turn_idx: int, turn_data: TurnData + self, + conv_data: EvaluationData, + turn_idx: int, + turn_data: TurnData, + turn_metrics: list[str], ) -> list[EvaluationResult]: """Evaluate single turn with specified turn metrics.""" results = [] - for metric_identifier in conv_data.turn_metrics or []: + + for metric_identifier in turn_metrics: request = EvaluationRequest.for_turn( conv_data, metric_identifier, turn_idx, turn_data ) @@ -91,11 +130,12 @@ def _evaluate_turn( return results def _evaluate_conversation( - self, conv_data: EvaluationData + self, conv_data: EvaluationData, conversation_metrics: list[str] ) -> list[EvaluationResult]: """Evaluate conversation-level metrics.""" results = [] - for metric_identifier in conv_data.conversation_metrics or []: + + for metric_identifier in conversation_metrics: request = EvaluationRequest.for_conversation(conv_data, metric_identifier) result = self.metrics_evaluator.evaluate_metric(request) if result: @@ -104,9 +144,4 @@ def _evaluate_conversation( def get_metrics_summary(self, conv_data: EvaluationData) -> dict[str, int]: """Get summary of metrics to be evaluated for a conversation.""" - summary = { - "turn_metrics": len(conv_data.turn_metrics or []), - "conversation_metrics": len(conv_data.conversation_metrics or []), - "total_turns": len(conv_data.turns), - } - return summary + return self.metric_manager.count_metrics_for_conversation(conv_data) diff --git a/tests/conftest.py b/tests/conftest.py index d86aef41..1d27fb2f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -69,10 +69,12 @@ def sample_turn_data(): @pytest.fixture def sample_evaluation_data(sample_turn_data): """Provide sample EvaluationData for testing.""" + # Add turn_metrics to the turn data + sample_turn_data.turn_metrics = ["ragas:faithfulness", "ragas:response_relevancy"] + return EvaluationData( conversation_group_id="test_conversation", description="Test conversation for evaluation", - turn_metrics=["ragas:faithfulness", "ragas:response_relevancy"], conversation_metrics=["deepeval:conversation_completeness"], turns=[sample_turn_data], ) @@ -122,15 +124,21 @@ def temp_config_files(): "turn_level": { "ragas:faithfulness": { "threshold": 0.8, - "type": "turn", - "framework": "ragas", - } + "description": "How faithful the response is to the provided context", + "default": False, + }, + "ragas:response_relevancy": { + "threshold": 0.8, + }, + "custom:answer_correctness": { + "threshold": 0.7, + }, }, "conversation_level": { "deepeval:conversation_completeness": { "threshold": 0.7, - "type": "conversation", - "framework": "deepeval", + "description": "How completely the conversation addresses user intentions", + "default": False, } }, }, @@ -147,9 +155,7 @@ def temp_config_files(): { "conversation_group_id": "test_conv_1", "description": "Test conversation 1", - "turn_metrics": ["ragas:faithfulness", "ragas:response_relevancy"], "conversation_metrics": [], - "turn_metrics_metadata": {}, "conversation_metrics_metadata": {}, "turns": [ { @@ -158,15 +164,15 @@ def temp_config_files(): "response": "Machine learning is a subset of AI.", "contexts": ["Machine learning is a method of data analysis."], "expected_response": "Machine learning is a subset of artificial intelligence.", + "turn_metrics": ["ragas:faithfulness", "ragas:response_relevancy"], + "turn_metrics_metadata": {}, } ], }, { "conversation_group_id": "test_conv_2", "description": "Test conversation 2", - "turn_metrics": ["custom:answer_correctness"], "conversation_metrics": ["deepeval:conversation_completeness"], - "turn_metrics_metadata": {}, "conversation_metrics_metadata": {}, "turns": [ { @@ -175,6 +181,8 @@ def temp_config_files(): "response": "Neural networks are computing systems inspired by biological neural networks.", "contexts": ["Neural networks consist of interconnected nodes."], "expected_response": "Neural networks are computational models inspired by the human brain.", + "turn_metrics": ["custom:answer_correctness"], + "turn_metrics_metadata": {}, }, { "turn_id": "2", @@ -184,6 +192,8 @@ def temp_config_files(): "Applications include computer vision and natural language processing." ], "expected_response": "Applications include computer vision, NLP, and pattern recognition.", + "turn_metrics": None, + "turn_metrics_metadata": {}, }, ], }, diff --git a/tests/unit/core/config/test_models.py b/tests/unit/core/config/test_models.py index c424a64e..e5a39720 100644 --- a/tests/unit/core/config/test_models.py +++ b/tests/unit/core/config/test_models.py @@ -72,20 +72,24 @@ class TestEvaluationData: def test_valid_evaluation_data_creation(self): """Test creating valid EvaluationData instance.""" - turn = TurnData(turn_id="1", query="Test query", response="Test response") + turn = TurnData( + turn_id="1", + query="Test query", + response="Test response", + turn_metrics=["ragas:faithfulness"], + ) eval_data = EvaluationData( conversation_group_id="test_conv", description="Test conversation", - turn_metrics=["ragas:faithfulness"], conversation_metrics=["deepeval:completeness"], turns=[turn], ) assert eval_data.conversation_group_id == "test_conv" assert eval_data.description == "Test conversation" - assert eval_data.turn_metrics == ["ragas:faithfulness"] assert eval_data.conversation_metrics == ["deepeval:completeness"] assert len(eval_data.turns) == 1 + assert eval_data.turns[0].turn_metrics == ["ragas:faithfulness"] def test_evaluation_data_with_minimal_fields(self): """Test EvaluationData with only required fields.""" @@ -94,9 +98,9 @@ def test_evaluation_data_with_minimal_fields(self): assert eval_data.conversation_group_id == "test_conv" assert eval_data.description is None - assert eval_data.turn_metrics is None assert eval_data.conversation_metrics is None assert len(eval_data.turns) == 1 + assert eval_data.turns[0].turn_metrics is None def test_evaluation_data_invalid_empty_conversation_id(self): """Test validation error for empty conversation_group_id.""" @@ -108,29 +112,33 @@ def test_evaluation_data_invalid_empty_conversation_id(self): def test_evaluation_data_invalid_metric_format_missing_colon(self): """Test validation error for metric without colon.""" - turn = TurnData(turn_id="1", query="Test query", response="Test response") with pytest.raises( ValidationError, match='must be in format "framework:metric_name"' ): - EvaluationData( - conversation_group_id="test_conv", + TurnData( + turn_id="1", + query="Test query", + response="Test response", turn_metrics=["invalid_metric"], - turns=[turn], ) def test_evaluation_data_with_metadata(self): """Test EvaluationData with metadata fields.""" - turn = TurnData(turn_id="1", query="Test query", response="Test response") + turn = TurnData( + turn_id="1", + query="Test query", + response="Test response", + turn_metrics=["ragas:faithfulness"], + turn_metrics_metadata={"ragas:faithfulness": {"threshold": 0.8}}, + ) eval_data = EvaluationData( conversation_group_id="test_conv", - turn_metrics=["ragas:faithfulness"], conversation_metrics=["deepeval:completeness"], - turn_metrics_metadata={"ragas:faithfulness": {"threshold": 0.8}}, conversation_metrics_metadata={"deepeval:completeness": {"threshold": 0.9}}, turns=[turn], ) - assert eval_data.turn_metrics_metadata == { + assert eval_data.turns[0].turn_metrics_metadata == { "ragas:faithfulness": {"threshold": 0.8} } assert eval_data.conversation_metrics_metadata == {