diff --git a/src/lightspeed_evaluation/pipeline/evaluation/amender.py b/src/lightspeed_evaluation/pipeline/evaluation/amender.py index 070b97bb..e200a49d 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/amender.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/amender.py @@ -4,7 +4,7 @@ from typing import Any, Optional from lightspeed_evaluation.core.api import APIClient -from lightspeed_evaluation.core.models import EvaluationData +from lightspeed_evaluation.core.models import EvaluationData, TurnData from lightspeed_evaluation.core.system.exceptions import APIError logger = logging.getLogger(__name__) @@ -17,54 +17,56 @@ def __init__(self, api_client: Optional[APIClient]): """Initialize with API client.""" self.api_client = api_client - def amend_conversation_data(self, conv_data: EvaluationData) -> Optional[str]: - """Amend conversation data with API responses. + def amend_single_turn( + self, turn_data: TurnData, conversation_id: Optional[str] = None + ) -> tuple[Optional[str], Optional[str]]: + """Amend single turn data with API response. + + Args: + turn_data: The turn data to amend + conversation_id: Optional conversation ID from previous turns Returns: - Optional[str]: Error message if any API error occurred, None if successful + tuple: (error_message, updated_conversation_id) + - error_message: None if successful, error string if failed + - updated_conversation_id: The conversation ID for next turns """ if not self.api_client: - return None - - # Track conversation_id across turns - conversation_id: Optional[str] = None - - for turn_data in conv_data.turns: - logger.debug("Amending turn %s with API data", turn_data.turn_id) - - try: - api_response = self.api_client.query( - query=turn_data.query, - conversation_id=conversation_id, - attachments=turn_data.attachments, + return None, conversation_id + + logger.debug("Amending turn %s with API data", turn_data.turn_id) + + try: + api_response = self.api_client.query( + query=turn_data.query, + conversation_id=conversation_id, + attachments=turn_data.attachments, + ) + + # AMEND EVALUATION DATA: This modifies the loaded TurnData object in-place + # Update response from API + turn_data.response = api_response.response + turn_data.conversation_id = api_response.conversation_id + + # Update contexts from API output + if api_response.contexts: + turn_data.contexts = api_response.contexts + + # Update tool calls from API output + if api_response.tool_calls: + logger.debug( + "Tool calls provided: %d sequences", + len(api_response.tool_calls), ) - conversation_id = api_response.conversation_id # Track for next turns - - # AMEND EVALUATION DATA: This modifies the loaded TurnData object in-place - # Update response from API - turn_data.response = api_response.response - turn_data.conversation_id = api_response.conversation_id - - # Update contexts from API output - if api_response.contexts: - turn_data.contexts = api_response.contexts - - # Update tool calls from API output - if api_response.tool_calls: - logger.debug( - "Tool calls provided: %d sequences", - len(api_response.tool_calls), - ) - turn_data.tool_calls = api_response.tool_calls - - logger.debug("Data amended for turn %s", turn_data.turn_id) + turn_data.tool_calls = api_response.tool_calls - except APIError as e: - error_msg = f"API Error for turn {turn_data.turn_id}: {e}" - logger.error(error_msg) - return error_msg + logger.debug("Data amended for turn %s", turn_data.turn_id) + return None, api_response.conversation_id - return None # No errors occurred + except APIError as e: + error_msg = f"API Error for turn {turn_data.turn_id}: {e}" + logger.error(error_msg) + return error_msg, conversation_id def get_amendment_summary(self, conv_data: EvaluationData) -> dict[str, Any]: """Get summary of what would be amended for a conversation.""" diff --git a/src/lightspeed_evaluation/pipeline/evaluation/errors.py b/src/lightspeed_evaluation/pipeline/evaluation/errors.py index ddf54240..3fe24eaf 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/errors.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/errors.py @@ -2,7 +2,7 @@ import logging -from lightspeed_evaluation.core.models import EvaluationData, EvaluationResult +from lightspeed_evaluation.core.models import EvaluationData, EvaluationResult, TurnData logger = logging.getLogger(__name__) @@ -76,6 +76,122 @@ def mark_all_metrics_as_error( self.results.extend(error_results) return error_results + def mark_turn_metrics_as_error( # pylint: disable=too-many-arguments,too-many-positional-arguments + self, + conv_data: EvaluationData, + turn_idx: int, + turn_data: TurnData, + turn_metrics: list[str], + error_reason: str, + ) -> list[EvaluationResult]: + """Mark all metrics for a single turn as ERROR. + + Args: + conv_data: Conversation data + turn_idx: Index of the turn + turn_data: Turn data + turn_metrics: Metrics for this turn + error_reason: Reason for error + + Returns: + list[EvaluationResult]: ERROR results for this turn's metrics + """ + logger.warning( + "Marking turn %d metrics as ERROR for conversation %s: %s", + turn_idx, + conv_data.conversation_group_id, + error_reason, + ) + error_results = [] + + # Mark all turn-level metrics as ERROR + for metric_identifier in turn_metrics: + error_result = EvaluationResult( + conversation_group_id=conv_data.conversation_group_id, + turn_id=turn_data.turn_id, + metric_identifier=metric_identifier, + result="ERROR", + score=None, + threshold=None, + reason=error_reason, + query=turn_data.query, + response="", + execution_time=0.0, + ) + error_results.append(error_result) + + # Store results internally for summary tracking + self.results.extend(error_results) + return error_results + + def mark_cascade_failure( # pylint: disable=too-many-arguments,too-many-positional-arguments + self, + conv_data: EvaluationData, + failed_turn_idx: int, + resolved_turn_metrics: list[list[str]], + resolved_conversation_metrics: list[str], + error_reason: str, + ) -> list[EvaluationResult]: + """Mark remaining turns and conversation metrics as ERROR (cascade failure). + + Args: + conv_data: Conversation data + failed_turn_idx: Index of the turn that failed + resolved_turn_metrics: Resolved metrics for all turns + resolved_conversation_metrics: Resolved conversation metrics + error_reason: Reason for error + + Returns: + list[EvaluationResult]: ERROR results for remaining turns and conversation + """ + logger.warning( + "Marking remaining turns (%d onwards) and conversation metrics as ERROR for %s: %s", + failed_turn_idx + 1, + conv_data.conversation_group_id, + error_reason, + ) + error_results = [] + + # Mark remaining turns as ERROR (from failed_turn_idx + 1 onwards) + for turn_idx in range(failed_turn_idx + 1, len(conv_data.turns)): + turn_data = conv_data.turns[turn_idx] + turn_metrics = resolved_turn_metrics[turn_idx] + + for metric_identifier in turn_metrics: + error_result = EvaluationResult( + conversation_group_id=conv_data.conversation_group_id, + turn_id=turn_data.turn_id, + metric_identifier=metric_identifier, + result="ERROR", + score=None, + threshold=None, + reason=error_reason, + query=turn_data.query, + response="", + execution_time=0.0, + ) + error_results.append(error_result) + + # Mark conversation-level metrics as ERROR + for metric_identifier in resolved_conversation_metrics: + error_result = EvaluationResult( + conversation_group_id=conv_data.conversation_group_id, + turn_id=None, # Conversation-level + metric_identifier=metric_identifier, + result="ERROR", + score=None, + threshold=None, + reason=error_reason, + query="", + response="", + execution_time=0.0, + ) + error_results.append(error_result) + + # Store results internally for summary tracking + self.results.extend(error_results) + return error_results + def get_error_summary(self) -> dict[str, int]: """Get summary of error results collected.""" return { diff --git a/src/lightspeed_evaluation/pipeline/evaluation/processor.py b/src/lightspeed_evaluation/pipeline/evaluation/processor.py index 03422f80..d1ac49cf 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/processor.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/processor.py @@ -43,7 +43,9 @@ def __init__(self, config_loader: ConfigLoader, components: ProcessorComponents) self.config = config_loader.system_config self.components = components - def process_conversation(self, conv_data: EvaluationData) -> list[EvaluationResult]: + def process_conversation( # pylint: disable=too-many-locals + self, conv_data: EvaluationData + ) -> list[EvaluationResult]: """Process single conversation - handle turn and conversation level metrics. Returns: @@ -88,31 +90,68 @@ def process_conversation(self, conv_data: EvaluationData) -> list[EvaluationResu return error_results try: - # Step 2: Amend with API data if enabled if self.config is None: raise ValueError("SystemConfig must be loaded") - api_error_message = None - if self.config.api.enabled: - logger.debug("Amending data via API") - api_error_message = self.components.api_amender.amend_conversation_data( - conv_data - ) - # If API error occurred, mark all metrics as ERROR and skip evaluation - if api_error_message: - logger.error("API error detected - marking all metrics as ERROR") - error_results = self.components.error_handler.mark_all_metrics_as_error( - conv_data, - api_error_message, - resolved_turn_metrics=resolved_turn_metrics, - resolved_conversation_metrics=resolved_conversation_metrics, - ) - return error_results + # Step 2: Process each turn individually (API call + evaluation) + conversation_id: Optional[str] = None - # Step 3: Process turn-level metrics for each turn for turn_idx, (turn_data, turn_metrics) in enumerate( zip(conv_data.turns, resolved_turn_metrics) ): + # Step 2a: Amend with API data if enabled (per turn) + if self.config.api.enabled: + logger.debug("Processing turn %d: %s", turn_idx, turn_data.turn_id) + api_error_message, conversation_id = ( + self.components.api_amender.amend_single_turn( + turn_data, conversation_id + ) + ) + logger.debug( + "✅ API Call completed for turn %d: %s", + turn_idx, + turn_data.turn_id, + ) + + # If API error occurred, mark current turn + remaining + conversation as ERROR + if api_error_message: + logger.error( + "API error for turn %d - marking current turn, " + "remaining turns, and conversation as ERROR", + turn_idx, + ) + # Mark current turn as ERROR + current_turn_errors = ( + self.components.error_handler.mark_turn_metrics_as_error( + conv_data, + turn_idx, + turn_data, + turn_metrics, + api_error_message, + ) + ) + results.extend(current_turn_errors) + + # Mark remaining turns and conversation metrics as ERROR + cascade_error_reason = ( + f"Cascade failure from turn {turn_idx + 1} API error: " + f"{api_error_message}" + ) + remaining_errors = ( + self.components.error_handler.mark_cascade_failure( + conv_data, + turn_idx, + resolved_turn_metrics, + resolved_conversation_metrics, + cascade_error_reason, + ) + ) + results.extend(remaining_errors) + + # Stop processing - API failure cascades to all remaining + return results + + # Step 2b: Process turn-level metrics for this turn if turn_metrics: logger.debug( "Processing turn %d metrics: %s", turn_idx, turn_metrics @@ -122,7 +161,7 @@ def process_conversation(self, conv_data: EvaluationData) -> list[EvaluationResu ) results.extend(turn_results) - # Step 4: Process conversation-level metrics + # Step 3: Process conversation-level metrics if resolved_conversation_metrics: logger.debug( "Processing conversation-level metrics: %s", @@ -136,7 +175,7 @@ def process_conversation(self, conv_data: EvaluationData) -> list[EvaluationResu return results finally: - # Step 5: Always run cleanup script (if provided) regardless of results + # Step 4: Always run cleanup script (if provided) regardless of results self._run_cleanup_script(conv_data) def _evaluate_turn( diff --git a/tests/unit/pipeline/evaluation/test_amender.py b/tests/unit/pipeline/evaluation/test_amender.py index e259acf3..39bbd77d 100644 --- a/tests/unit/pipeline/evaluation/test_amender.py +++ b/tests/unit/pipeline/evaluation/test_amender.py @@ -1,6 +1,6 @@ """Unit tests for pipeline evaluation amender module.""" -from lightspeed_evaluation.core.models import APIResponse, EvaluationData, TurnData +from lightspeed_evaluation.core.models import APIResponse, TurnData from lightspeed_evaluation.core.system.exceptions import APIError from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender @@ -8,20 +8,20 @@ class TestAPIDataAmender: """Unit tests for APIDataAmender.""" - def test_amend_conversation_data_no_client(self): + def test_amend_single_turn_no_client(self): """Test amendment returns None when no API client is available.""" amender = APIDataAmender(None) turn = TurnData(turn_id="1", query="Test query", response=None) - conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = amender.amend_conversation_data(conv_data) + error_msg, conversation_id = amender.amend_single_turn(turn) - assert result is None + assert error_msg is None + assert conversation_id is None assert turn.response is None # Not modified - def test_amend_conversation_data_single_turn(self, mocker): - """Test amending conversation data with single turn.""" + def test_amend_single_turn_success(self, mocker): + """Test amending single turn data successfully.""" mock_client = mocker.Mock() api_response = APIResponse( response="Generated response", @@ -34,12 +34,12 @@ def test_amend_conversation_data_single_turn(self, mocker): amender = APIDataAmender(mock_client) turn = TurnData(turn_id="1", query="Test query", response=None) - conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = amender.amend_conversation_data(conv_data) + error_msg, conversation_id = amender.amend_single_turn(turn) # No error should be returned - assert result is None + assert error_msg is None + assert conversation_id == "conv_123" # API client should be called once mock_client.query.assert_called_once_with( @@ -51,93 +51,69 @@ def test_amend_conversation_data_single_turn(self, mocker): assert turn.conversation_id == "conv_123" assert turn.contexts == ["Context 1", "Context 2"] - def test_amend_conversation_data_multiple_turns(self, mocker): - """Test amending conversation with multiple turns maintains conversation_id.""" + def test_amend_single_turn_with_conversation_id(self, mocker): + """Test amending turn with existing conversation ID.""" mock_client = mocker.Mock() - - # First turn response - response1 = APIResponse( - response="Response 1", - conversation_id="conv_123", - contexts=["Context 1"], - tool_calls=[], - ) - - # Second turn response (same conversation) - response2 = APIResponse( - response="Response 2", + api_response = APIResponse( + response="Follow-up response", conversation_id="conv_123", - contexts=["Context 2"], + contexts=["Context 3"], tool_calls=[], ) - - mock_client.query.side_effect = [response1, response2] + mock_client.query.return_value = api_response amender = APIDataAmender(mock_client) - turn1 = TurnData(turn_id="1", query="Query 1", response=None) - turn2 = TurnData(turn_id="2", query="Query 2", response=None) - conv_data = EvaluationData( - conversation_group_id="test_conv", turns=[turn1, turn2] - ) + turn = TurnData(turn_id="2", query="Follow-up query", response=None) - result = amender.amend_conversation_data(conv_data) + error_msg, conversation_id = amender.amend_single_turn(turn, "conv_123") - assert result is None - - # Should be called twice - assert mock_client.query.call_count == 2 - - # First call without conversation_id - mock_client.query.assert_any_call( - query="Query 1", conversation_id=None, attachments=None - ) + # No error should be returned + assert error_msg is None + assert conversation_id == "conv_123" - # Second call with conversation_id from first response - mock_client.query.assert_any_call( - query="Query 2", conversation_id="conv_123", attachments=None + # API client should be called with existing conversation ID + mock_client.query.assert_called_once_with( + query="Follow-up query", conversation_id="conv_123", attachments=None ) - # Both turns should be amended - assert turn1.response == "Response 1" - assert turn1.conversation_id == "conv_123" - assert turn2.response == "Response 2" - assert turn2.conversation_id == "conv_123" + # Turn data should be amended + assert turn.response == "Follow-up response" + assert turn.conversation_id == "conv_123" + assert turn.contexts == ["Context 3"] - def test_amend_conversation_data_with_tool_calls(self, mocker): + def test_amend_single_turn_with_tool_calls(self, mocker): """Test amending turn data with tool calls.""" mock_client = mocker.Mock() - - tool_calls = [ - [{"tool_name": "search", "arguments": {"query": "test"}}], - [{"tool_name": "calculator", "arguments": {"expr": "2+2"}}], - ] - api_response = APIResponse( - response="Used tools", - conversation_id="conv_123", - contexts=["Context"], - tool_calls=tool_calls, + response="Tool response", + conversation_id="conv_456", + contexts=[], + tool_calls=[[{"tool": "test_tool", "args": {"param": "value"}}]], ) mock_client.query.return_value = api_response amender = APIDataAmender(mock_client) - turn = TurnData(turn_id="1", query="Query with tools", response=None) - conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) + turn = TurnData(turn_id="3", query="Tool query", response=None) - result = amender.amend_conversation_data(conv_data) + error_msg, conversation_id = amender.amend_single_turn(turn) - assert result is None - assert turn.tool_calls == tool_calls + # No error should be returned + assert error_msg is None + assert conversation_id == "conv_456" + + # Turn data should be amended with tool calls + assert turn.response == "Tool response" + assert turn.tool_calls == [[{"tool": "test_tool", "args": {"param": "value"}}]] - def test_amend_conversation_data_with_attachments(self, mocker): - """Test amending turn with attachments.""" + def test_amend_single_turn_with_attachments(self, mocker): + """Test amending turn data with attachments.""" mock_client = mocker.Mock() api_response = APIResponse( - response="Response with attachments", - conversation_id="conv_123", - contexts=["Context"], + response="Attachment response", + conversation_id="conv_789", + contexts=["Attachment context"], tool_calls=[], ) mock_client.query.return_value = api_response @@ -145,147 +121,94 @@ def test_amend_conversation_data_with_attachments(self, mocker): amender = APIDataAmender(mock_client) turn = TurnData( - turn_id="1", - query="Query", + turn_id="4", + query="Attachment query", response=None, attachments=["file1.txt", "file2.pdf"], ) - conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = amender.amend_conversation_data(conv_data) + error_msg, conversation_id = amender.amend_single_turn(turn) - assert result is None + # No error should be returned + assert error_msg is None + assert conversation_id == "conv_789" - # Should pass attachments to API + # API client should be called with attachments mock_client.query.assert_called_once_with( - query="Query", conversation_id=None, attachments=["file1.txt", "file2.pdf"] + query="Attachment query", + conversation_id=None, + attachments=["file1.txt", "file2.pdf"], ) - def test_amend_conversation_data_api_error_first_turn(self, mocker): - """Test API error on first turn returns error message.""" + # Turn data should be amended + assert turn.response == "Attachment response" + assert turn.contexts == ["Attachment context"] + + def test_amend_single_turn_api_error(self, mocker): + """Test handling API error during turn amendment.""" mock_client = mocker.Mock() mock_client.query.side_effect = APIError("Connection failed") amender = APIDataAmender(mock_client) - turn = TurnData(turn_id="1", query="Query", response=None) - conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) + turn = TurnData(turn_id="5", query="Error query", response=None) - result = amender.amend_conversation_data(conv_data) + error_msg, conversation_id = amender.amend_single_turn(turn) - # Should return error message - assert result is not None - assert "API Error for turn 1" in result - assert "Connection failed" in result + # Error should be returned + assert error_msg == "API Error for turn 5: Connection failed" + assert conversation_id is None - def test_amend_conversation_data_api_error_second_turn(self, mocker): - """Test API error on second turn after first succeeds.""" - mock_client = mocker.Mock() + # Turn data should not be modified + assert turn.response is None + assert turn.conversation_id is None - # First turn succeeds - response1 = APIResponse( - response="Response 1", - conversation_id="conv_123", - contexts=["Context"], + def test_amend_single_turn_no_contexts_in_response(self, mocker): + """Test amending turn when API response has no contexts.""" + mock_client = mocker.Mock() + api_response = APIResponse( + response="No context response", + conversation_id="conv_no_ctx", + contexts=[], # Empty contexts tool_calls=[], ) - - # Second turn fails - mock_client.query.side_effect = [response1, APIError("Rate limit exceeded")] + mock_client.query.return_value = api_response amender = APIDataAmender(mock_client) - turn1 = TurnData(turn_id="1", query="Query 1", response=None) - turn2 = TurnData(turn_id="2", query="Query 2", response=None) - conv_data = EvaluationData( - conversation_group_id="test_conv", turns=[turn1, turn2] - ) + turn = TurnData(turn_id="6", query="No context query", response=None) - result = amender.amend_conversation_data(conv_data) + error_msg, conversation_id = amender.amend_single_turn(turn) - # Should return error message for turn 2 - assert result is not None - assert "API Error for turn 2" in result - assert "Rate limit exceeded" in result + # No error should be returned + assert error_msg is None + assert conversation_id == "conv_no_ctx" - # First turn should still be amended - assert turn1.response == "Response 1" - # Second turn should not be amended - assert turn2.response is None + # Turn data should be amended (contexts should remain None since API response has empty contexts) + assert turn.response == "No context response" + assert turn.contexts is None - def test_amend_conversation_data_no_contexts_in_response(self, mocker): - """Test amending when API response has no contexts.""" + def test_amend_single_turn_no_tool_calls_in_response(self, mocker): + """Test amending turn when API response has no tool calls.""" mock_client = mocker.Mock() api_response = APIResponse( - response="Response without contexts", - conversation_id="conv_123", - contexts=[], - tool_calls=[], + response="No tools response", + conversation_id="conv_no_tools", + contexts=["Context"], + tool_calls=[], # Empty tool calls ) mock_client.query.return_value = api_response amender = APIDataAmender(mock_client) - turn = TurnData( - turn_id="1", query="Query", response=None, contexts=["Original context"] - ) - conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - - result = amender.amend_conversation_data(conv_data) - - assert result is None - assert turn.response == "Response without contexts" - # Contexts should remain unchanged when API returns empty list - assert turn.contexts == ["Original context"] - - def test_get_amendment_summary_with_client(self, mocker): - """Test getting amendment summary with API client.""" - mock_client = mocker.Mock() - amender = APIDataAmender(mock_client) - - turn1 = TurnData(turn_id="1", query="Q1", response="R1") - turn2 = TurnData(turn_id="2", query="Q2", response=None) - conv_data = EvaluationData( - conversation_group_id="test_conv", turns=[turn1, turn2] - ) - - summary = amender.get_amendment_summary(conv_data) - - assert summary["conversation_group_id"] == "test_conv" - assert summary["total_turns"] == 2 - assert summary["api_enabled"] is True - assert summary["turns_with_existing_data"] == 1 - - def test_get_amendment_summary_without_client(self): - """Test getting amendment summary without API client.""" - amender = APIDataAmender(None) - - turn = TurnData(turn_id="1", query="Query", response=None) - conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - - summary = amender.get_amendment_summary(conv_data) + turn = TurnData(turn_id="7", query="No tools query", response=None) - assert summary["conversation_group_id"] == "test_conv" - assert summary["total_turns"] == 1 - assert summary["api_enabled"] is False - assert summary["turns_with_existing_data"] == 0 + error_msg, conversation_id = amender.amend_single_turn(turn) - def test_get_amendment_summary_with_tool_calls(self, mocker): - """Test summary counts turns with tool calls as having existing data.""" - mock_client = mocker.Mock() - amender = APIDataAmender(mock_client) - - turn1 = TurnData( - turn_id="1", - query="Q1", - response=None, - tool_calls=[[{"tool_name": "search", "arguments": {}}]], - ) - turn2 = TurnData(turn_id="2", query="Q2", response=None) - conv_data = EvaluationData( - conversation_group_id="test_conv", turns=[turn1, turn2] - ) - - summary = amender.get_amendment_summary(conv_data) + # No error should be returned + assert error_msg is None + assert conversation_id == "conv_no_tools" - assert summary["turns_with_existing_data"] == 1 # turn1 has tool_calls + # Turn data should be amended (tool_calls should remain None since API response has empty tool_calls) + assert turn.response == "No tools response" + assert turn.tool_calls is None diff --git a/tests/unit/pipeline/evaluation/test_errors.py b/tests/unit/pipeline/evaluation/test_errors.py index 98b42c05..2822fff3 100644 --- a/tests/unit/pipeline/evaluation/test_errors.py +++ b/tests/unit/pipeline/evaluation/test_errors.py @@ -178,3 +178,117 @@ def test_multiple_error_batches(self): assert summary["total_errors"] == 3 assert summary["turn_errors"] == 2 assert summary["conversation_errors"] == 1 + + def test_mark_turn_metrics_as_error(self): + """Test marking metrics for a single turn as error.""" + handler = EvaluationErrorHandler() + + turn_data = TurnData( + turn_id="turn1", query="Test query", response="Test response" + ) + conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data]) + + turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"] + error_reason = "API Error: Connection timeout" + + results = handler.mark_turn_metrics_as_error( + conv_data, 0, turn_data, turn_metrics, error_reason + ) + + # Should have 2 error results (one for each metric) + assert len(results) == 2 + + # Check first error result + assert results[0].conversation_group_id == "test_conv" + assert results[0].turn_id == "turn1" + assert results[0].metric_identifier == "ragas:faithfulness" + assert results[0].result == "ERROR" + assert results[0].score is None + assert results[0].threshold is None + assert results[0].reason == error_reason + assert results[0].query == "Test query" + assert results[0].response == "" + assert results[0].execution_time == 0.0 + + # Check second error result + assert results[1].conversation_group_id == "test_conv" + assert results[1].turn_id == "turn1" + assert results[1].metric_identifier == "custom:answer_correctness" + assert results[1].result == "ERROR" + assert results[1].reason == error_reason + + # Verify results are stored internally + summary = handler.get_error_summary() + assert summary["total_errors"] == 2 + assert summary["turn_errors"] == 2 + assert summary["conversation_errors"] == 0 + + def test_mark_cascade_failure(self): + """Test marking remaining turns and conversation metrics as error after API failure.""" + handler = EvaluationErrorHandler() + + # Setup conversation with 3 turns + turn1 = TurnData(turn_id="turn1", query="Query 1", response="Response 1") + turn2 = TurnData(turn_id="turn2", query="Query 2", response="Response 2") + turn3 = TurnData(turn_id="turn3", query="Query 3", response="Response 3") + conv_data = EvaluationData( + conversation_group_id="test_conv", turns=[turn1, turn2, turn3] + ) + + # Resolved metrics for all turns + resolved_turn_metrics = [ + ["ragas:faithfulness"], # turn1 + ["custom:answer_correctness"], # turn2 + ["ragas:response_relevancy"], # turn3 + ] + resolved_conversation_metrics = [ + "deepeval:conversation_completeness", + "deepeval:conversation_relevancy", + ] + + # API failure happens at turn 0 (first turn) + failed_turn_idx = 0 + error_reason = "Cascade failure from turn 1 API error: Connection timeout" + + results = handler.mark_cascade_failure( + conv_data, + failed_turn_idx, + resolved_turn_metrics, + resolved_conversation_metrics, + error_reason, + ) + + # Should have errors for: + # - Turn 2 (1 metric) + Turn 3 (1 metric) + Conversation (2 metrics) = 4 total + assert len(results) == 4 + + # Check turn 2 error + turn2_result = results[0] + assert turn2_result.conversation_group_id == "test_conv" + assert turn2_result.turn_id == "turn2" + assert turn2_result.metric_identifier == "custom:answer_correctness" + assert turn2_result.result == "ERROR" + assert turn2_result.reason == error_reason + + # Check turn 3 error + turn3_result = results[1] + assert turn3_result.turn_id == "turn3" + assert turn3_result.metric_identifier == "ragas:response_relevancy" + assert turn3_result.result == "ERROR" + + # Check conversation-level errors + conv_result1 = results[2] + assert conv_result1.turn_id is None # Conversation-level + assert conv_result1.metric_identifier == "deepeval:conversation_completeness" + assert conv_result1.result == "ERROR" + + conv_result2 = results[3] + assert conv_result2.turn_id is None # Conversation-level + assert conv_result2.metric_identifier == "deepeval:conversation_relevancy" + assert conv_result2.result == "ERROR" + + # Verify summary + summary = handler.get_error_summary() + assert summary["total_errors"] == 4 + assert summary["turn_errors"] == 2 # turn2 + turn3 + assert summary["conversation_errors"] == 2