From 79cf74ebcfb193d27b7a70ed88a79969688896dd Mon Sep 17 00:00:00 2001 From: Asutosh Samal Date: Sun, 27 Jul 2025 19:06:37 +0530 Subject: [PATCH] agent eval: multi-turn & refactoring fix conv id handling --- lsc_agent_eval/README.md | 225 ++++--- .../sample_data/agent_goal_eval_example.yaml | 88 ++- .../script/{eval3 => conv3}/cleanup.sh | 0 .../script/{eval3 => conv3}/setup.sh | 0 .../script/{eval4 => conv4}/cleanup.sh | 0 .../script/{eval4 => conv4/eval1}/verify.sh | 0 .../script/{eval4 => conv4}/setup.sh | 0 lsc_agent_eval/src/lsc_agent_eval/__init__.py | 35 +- .../core/agent_goal_eval/__init__.py | 16 + .../core/agent_goal_eval/agent_goal_eval.py | 191 ++++-- .../core/agent_goal_eval/eval_data.py | 192 ++++-- .../core/agent_goal_eval/evaluator.py | 136 ++--- .../core/agent_goal_eval/models.py | 304 +++++++++- .../core/agent_goal_eval/results.py | 83 ++- .../core/agent_goal_eval/script_runner.py | 30 +- .../core/agent_goal_eval/utils.py | 42 ++ .../lsc_agent_eval/core/utils/api_client.py | 14 +- .../lsc_agent_eval/core/utils/exceptions.py | 4 +- .../agent_goal_eval/test_agent_goal_eval.py | 308 +++------- .../core/agent_goal_eval/test_eval_data.py | 569 ++++++++++-------- .../core/agent_goal_eval/test_evaluator.py | 506 ++++++++-------- .../tests/core/agent_goal_eval/test_models.py | 306 ++++++++-- .../core/agent_goal_eval/test_results.py | 559 +++++++---------- .../agent_goal_eval/test_script_runner.py | 50 +- .../tests/core/utils/test_api_client.py | 35 +- .../tests/core/utils/test_exceptions.py | 34 +- 26 files changed, 2162 insertions(+), 1565 deletions(-) rename lsc_agent_eval/sample_data/script/{eval3 => conv3}/cleanup.sh (100%) rename lsc_agent_eval/sample_data/script/{eval3 => conv3}/setup.sh (100%) rename lsc_agent_eval/sample_data/script/{eval4 => conv4}/cleanup.sh (100%) rename lsc_agent_eval/sample_data/script/{eval4 => conv4/eval1}/verify.sh (100%) rename lsc_agent_eval/sample_data/script/{eval4 => conv4}/setup.sh (100%) create mode 100644 lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/utils.py diff --git a/lsc_agent_eval/README.md b/lsc_agent_eval/README.md index 593118c8..76d0de4d 100644 --- a/lsc_agent_eval/README.md +++ b/lsc_agent_eval/README.md @@ -1,16 +1,17 @@ # Lightspeed Agent Evaluation -A standalone package for evaluating agent-based systems, specifically designed for evaluating agent goal achievement. +A framework for evaluating AI agent performance. ## Features - **Agent Goal Evaluation**: Evaluate whether an agent successfully achieves specified goals +- **Multi-turn Evaluation**: Organize evaluations into conversation groups for multi-turn testing - **Multi-type Evaluation**: Support for different evaluation types: - `judge-llm`: LLM-based evaluation using a judge model - `script`: Script-based evaluation using verification scripts (similar to [k8s-bench](https://github.com/GoogleCloudPlatform/kubectl-ai/tree/main/k8s-bench)) - - `sub-string`: Simple substring matching evaluation + - `sub-string`: Simple substring matching evaluation (ALL keywords must be present in response) - **Setup/Cleanup Scripts**: Support for running setup and cleanup scripts before/after evaluation -- **Result Tracking**: Result tracking and CSV output +- **Result Tracking**: Result tracking with CSV output and JSON statistics - **Standalone Package**: Can be installed and used independently of the main lightspeed-core-evaluation package - **LiteLLM Integration**: Unified interface for Judge LLM @@ -45,13 +46,102 @@ pip install -e . 
pdm install
```
 
-## Usage
+## Data Configuration
+
+The evaluation is configured using a YAML file that defines conversations. Each conversation contains one or more evaluations and includes:
+
+- `conversation_group`: Identifier for grouping related evaluations into a conversation
+- `description`: Description of the conversation (Optional)
+- `setup_script`: Setup script to run before the conversation (Optional)
+- `cleanup_script`: Cleanup script to run after the conversation (Optional)
+- `conversation`: List of evaluations in this conversation
+
+Each evaluation within a conversation can include:
+- `eval_id`: Unique identifier for the evaluation
+- `eval_query`: The query/task to send to the agent
+- `eval_type`: Type of evaluation (judge-llm, script, sub-string)
+- `expected_response`: Expected response (for judge-llm evaluation)
+- `expected_keywords`: Keywords to look for (for sub-string evaluation)
+- `eval_verify_script`: Verification script (for script evaluation)
+- `description`: Description of the evaluation (Optional)
+
+Note: `eval_id` values must be unique within a conversation group. Duplicates across different conversation groups are allowed, although a warning is logged for awareness.
+
+### Example Data Configuration
+
+```yaml
+# Multi-turn Conversations
+- conversation_group: conv1
+  description: Basic conversation flow testing cluster operations
+  conversation:
+    - eval_id: eval1
+      eval_query: Hi!
+      eval_type: judge-llm
+      expected_response: Hello! I'm an AI assistant for the Installer.
+      description: Initial greeting to start conversation
+    - eval_id: eval2
+      eval_query: Get me active clusters
+      eval_type: judge-llm
+      expected_response: Active clusters are x1, x2.
+      description: Request for cluster information
+
+- conversation_group: conv2
+  description: Multi-turn conversation with setup/cleanup
+  setup_script: sample_data/script/setup_environment.sh
+  cleanup_script: sample_data/script/cleanup_environment.sh
+  conversation:
+    - eval_id: eval1
+      eval_query: Hi! Can you help me manage pods?
+      eval_type: judge-llm
+      expected_response: Hello! I can help you manage pods.
+      description: Initial greeting
+    - eval_id: eval2
+      eval_query: Create a pod named test-pod
+      eval_type: script
+      eval_verify_script: sample_data/script/verify_pod.sh
+      description: Create pod and verify
+    - eval_id: eval3
+      eval_query: List all pods
+      eval_type: sub-string
+      expected_keywords: ['test-pod']
+      description: Verify pod is listed
+
+# Single-turn Conversations
+- conversation_group: conv3
+  description: Test namespace creation and detection with scripts
+  setup_script: sample_data/script/conv3/setup.sh
+  cleanup_script: sample_data/script/conv3/cleanup.sh
+  conversation:
+    - eval_id: eval1
+      eval_query: is there a openshift-lightspeed namespace ?
+      eval_type: sub-string
+      expected_keywords:
+        - 'yes'
+        - 'lightspeed'
+      description: Check for openshift-lightspeed namespace after setup
+```
+
+The `sample_data/` directory contains example configurations:
+- `agent_goal_eval_example.yaml`: Examples with various evaluation types
+- `script/`: Example setup, cleanup, and verify scripts
+
+## Judge LLM
+
+Judge-llm evaluations currently use LiteLLM.
+
+### Judge LLM - Setup
+The framework expects that access to a third-party inference provider, or a locally served model, is already available; the eval framework does not set this up.
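+
+For example, a minimal shell setup for two of the providers listed below might look like this (the key and URL values are placeholders; adjust them for your environment):
+
+```bash
+# Use OpenAI as the judge provider
+export OPENAI_API_KEY="sk-..."
+
+# Or point LiteLLM at a local Ollama server
+export OLLAMA_API_BASE="http://localhost:11434"
+```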
+ +- **OpenAI**: Set `OPENAI_API_KEY` environment variable +- **Azure OpenAI**: Set `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT` +- **IBM Watsonx**: Set `WATSONX_API_KEY`, `WATSONX_API_BASE`, `WATSONX_PROJECT_ID` +- **Ollama**: Set `OLLAMA_API_BASE` (for local models) +- **Any Other Provider**: Check [LiteLLM documentation](https://docs.litellm.ai/docs/providers) -### Command Line Interface +## Usage ```bash -# Run agent evaluation with basic configuration -lsc-agent-eval \ +lsc_agent_eval \ --eval_data_yaml agent_goal_eval.yaml \ --agent_endpoint http://localhost:8080 \ --agent_provider watsonx \ @@ -61,8 +151,6 @@ lsc-agent-eval \ --result_dir ./eval_output ``` -### Python API - ```python from lsc_agent_eval import AgentGoalEval @@ -84,44 +172,7 @@ evaluator = AgentGoalEval(args) evaluator.run_evaluation() ``` -## Configuration - -The evaluation is configured using a YAML file that defines test cases. Each test case can include: - -- `eval_id`: Unique identifier for the evaluation -- `eval_query`: The query/task to send to the agent -- `eval_type`: Type of evaluation (judge-llm, script, sub-string) -- `expected_response`: Expected response (for judge-llm evaluation) -- `expected_keywords`: Keywords to look for (for sub-string evaluation) -- `eval_verify_script`: Verification script (for script evaluation) -- `eval_setup_script`: Optional setup script to run before evaluation -- `eval_cleanup_script`: Optional cleanup script to run after evaluation - -### Example YAML Configuration - -```yaml -# data/example_eval.yaml -- eval_id: eval1 - eval_query: "is there a openshift-monitoring namespace?" - eval_type: sub-string - expected_keywords: - - 'yes' - - openshift-monitoring - -- eval_id: eval2 - eval_query: "is there a openshift-monitoring namespace?" - eval_type: judge-llm - expected_response: "there is a openshift-monitoring namespace." - -- eval_id: eval3 - eval_query: "create a namespace called openshift-lightspeed" - eval_setup_script: script/eval3/setup.sh - eval_type: script - eval_verify_script: script/eval3/verify.sh - eval_cleanup_script: script/eval3/cleanup.sh -``` - -## Command Line Arguments +### Key Arguments - `--eval_data_yaml`: Path to the YAML file containing evaluation data - `--agent_endpoint`: Endpoint URL for the agent API (default: ) @@ -133,33 +184,60 @@ The evaluation is configured using a YAML file that defines test cases. Each tes - `--result_dir`: Directory to save evaluation results (default: eval_output/) - `--kubeconfig`: Path to kubeconfig file (if needed for scripts) -## Output +## Evaluation Flow -The evaluation results are saved to a CSV file containing: -- `eval_id`: Evaluation identifier -- `query`: The query sent to the agent -- `response`: The agent's response -- `eval_type`: Type of evaluation performed -- `result`: Result (pass/fail) +### Conversation Processing Order -## Dependencies +1. **Load Configuration**: Parse and validate YAML configuration +2. **Process Conversations**: For each conversation group: + - Run setup script (if provided) + - Run all evaluations sequentially: + - For the first evaluation: Send query without conversation ID, receive new conversation ID from API + - For subsequent evaluations: Use the conversation ID from the first evaluation to maintain context + - Execute evaluation based on eval_type (either sub-string, judge-llm or script) + - Run cleanup script (if provided) +3. 
**Save Results**: Export to CSV and JSON with statistics -This package depends on: -- `pandas`: Data manipulation and analysis -- `httpx`: HTTP client for API calls -- `tqdm`: Progress bars -- `pyyaml`: YAML file processing -- `litellm`: Unified interface to 100+ LLM providers +### Script Execution -## LiteLLM Integration (Judge LLM) +- **Setup Scripts**: Run once before all evaluations in a conversation + - If setup fails, all evaluations in the conversation are marked as ERROR +- **Cleanup Scripts**: Run once after all evaluations in a conversation + - Cleanup failures are logged as warnings (non-critical) + - Always executed regardless of evaluation results +- **Verify Scripts**: Run per individual evaluation for script type evaluations + - Used to verify the agent's action is successful -For judge-llm evaluations, you can use any of the 100+ supported providers: +### Error Handling -- **OpenAI**: Set `OPENAI_API_KEY` environment variable -- **Azure OpenAI**: Set `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT` -- **IBM Watsonx**: Set `WATSONX_API_KEY`, `WATSONX_API_BASE`, `WATSONX_PROJECT_ID` -- **Ollama**: Set `OLLAMA_API_BASE` (for local models) -- **And many more**: See [LiteLLM documentation](https://docs.litellm.ai/docs/providers) +- **Setup Failure**: Marks all evaluations in conversation as ERROR +- **Cleanup Failure**: Logged as warning, does not affect evaluation results +- **API Errors**: Evaluation marked as Error +- **Evaluation Failure**: Individual evaluation marked as ERROR or FAIL +- **Configuration Errors**: Detailed validation message + +## Output + +The framework generates two types of output: + +### CSV Results (`agent_goal_eval_results_YYYYMMDD_HHMMSS.csv`) + +Contains detailed results with columns: +- `conversation_group`: The conversation group identifier +- `conversation_id`: The conversation ID returned by the Agent API +- `eval_id`: Individual evaluation identifier +- `result`: PASS, FAIL, or ERROR +- `eval_type`: Type of evaluation performed +- `query`: The question/task sent to the agent +- `response`: The agent's response +- `error`: Error message (if any) + +### JSON Statistics (`agent_goal_eval_summary_YYYYMMDD_HHMMSS.json`) + +Result statistics: +- **Overall Summary**: Total evaluations, pass/fail/error counts, success rate +- **By Conversation**: Breakdown of results for each conversation group +- **By Evaluation Type**: Performance metrics for each evaluation type (judge-llm, script, sub-string) ## Development @@ -174,10 +252,15 @@ cd lightspeed-evaluation/lsc_agent_eval pdm install --dev # Run tests -pdm run pytest +pdm run pytest tests --cov=src # Run linting pdm run ruff check +pdm run isort src tests +pdm run black src tests +pdm run mypy src +pdm run pyright src +pdm run pylint src ``` ### Contributing @@ -186,7 +269,7 @@ pdm run ruff check 2. Create a feature branch 3. Make your changes 4. Add tests for new functionality -5. Run the test suite +5. Run tests and lint checks 6. Submit a pull request ## License @@ -195,4 +278,4 @@ This project is licensed under the Apache License 2.0. See the LICENSE file for ## Support -For issues and questions, please use the [GitHub Issues](https://github.com/lightspeed-core/lightspeed-evaluation/issues) tracker. \ No newline at end of file +For issues and questions, please use the [GitHub Issues](https://github.com/lightspeed-core/lightspeed-evaluation/issues) tracker. 
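+
+## Appendix: Example JSON Summary
+
+A sketch of the JSON summary shape described in the Output section above. Field names follow the statistics generated by this package; the counts shown are illustrative values only, not real output:
+
+```json
+{
+  "summary": {
+    "total_evaluations": 4,
+    "total_conversations": 2,
+    "passed": 3,
+    "failed": 1,
+    "errored": 0,
+    "success_rate": 75.0
+  },
+  "by_conversation": {
+    "conv1": {"passed": 2, "failed": 0, "errored": 0, "total": 2, "success_rate": 100.0},
+    "conv2": {"passed": 1, "failed": 1, "errored": 0, "total": 2, "success_rate": 50.0}
+  },
+  "by_eval_type": {
+    "judge-llm": {"passed": 2, "failed": 1, "errored": 0, "total": 3, "success_rate": 66.67},
+    "sub-string": {"passed": 1, "failed": 0, "errored": 0, "total": 1, "success_rate": 100.0}
+  }
+}
+```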
diff --git a/lsc_agent_eval/sample_data/agent_goal_eval_example.yaml b/lsc_agent_eval/sample_data/agent_goal_eval_example.yaml index efed30aa..d346057e 100644 --- a/lsc_agent_eval/sample_data/agent_goal_eval_example.yaml +++ b/lsc_agent_eval/sample_data/agent_goal_eval_example.yaml @@ -1,26 +1,68 @@ -- eval_id: eval1 - eval_query: is there a openshift-monitoring namespace ? - eval_type: sub-string - expected_keywords: - - 'yes' - - openshift-monitoring +- conversation_group: conv1 + description: Test namespace detection using substring matching + conversation: + - eval_id: eval1 + eval_query: is there a openshift-monitoring namespace ? + eval_type: sub-string + expected_keywords: + - 'yes' + - openshift-monitoring + description: Check for openshift-monitoring namespace existence -- eval_id: eval2 - eval_query: is there a openshift-monitoring namespace ? - eval_type: judge-llm - expected_response: there is a openshift-monitoring namespace. +- conversation_group: conv2 + description: Test namespace detection using LLM judge + conversation: + - eval_id: eval1 + eval_query: is there a openshift-monitoring namespace ? + eval_type: judge-llm + expected_response: there is a openshift-monitoring namespace. + description: Verify openshift-monitoring namespace with LLM evaluation -- eval_id: eval3 - eval_query: is there a openshift-lightspeed namespace ? - eval_setup_script: sample_data/script/eval3/setup.sh - eval_type: sub-string - expected_keywords: - - 'yes' - eval_cleanup_script: sample_data/script/eval3/cleanup.sh +- conversation_group: conv3 + description: Test namespace creation and detection with scripts + setup_script: sample_data/script/conv3/setup.sh + cleanup_script: sample_data/script/conv3/cleanup.sh + conversation: + - eval_id: eval1 + eval_query: is there a openshift-lightspeed namespace ? + eval_type: sub-string + expected_keywords: + - 'yes' + description: Check for openshift-lightspeed namespace after setup -- eval_id: eval4 - eval_query: create a namespace called openshift-lightspeed - eval_setup_script: sample_data/script/eval4/setup.sh - eval_type: script - eval_verify_script: sample_data/script/eval4/verify.sh - eval_cleanup_script: sample_data/script/eval4/cleanup.sh +- conversation_group: conv4 + description: Test namespace creation with full script validation + setup_script: sample_data/script/conv4/setup.sh + cleanup_script: sample_data/script/conv4/cleanup.sh + conversation: + - eval_id: eval1 + eval_query: create a namespace called openshift-lightspeed + eval_type: script + eval_verify_script: sample_data/script/conv4/eval1/verify.sh + description: Create namespace and verify with script + +- conversation_group: conv5 + description: Test conversation retention - multi turn success + conversation: + - eval_id: eval1 + eval_query: what is openshift virtualization ? + eval_type: sub-string + expected_keywords: + - virtualization + description: Test first conversation + - eval_id: eval2 + eval_query: what was my previous query ? + eval_type: sub-string + expected_keywords: + - virtualization + description: Test second conversation + +- conversation_group: conv6 + description: Test conversation retention - new conversation + conversation: + - eval_id: eval1 + eval_query: what was my previous query ? 
+ eval_type: sub-string + expected_keywords: + - virtualization + description: new conversation (failure) diff --git a/lsc_agent_eval/sample_data/script/eval3/cleanup.sh b/lsc_agent_eval/sample_data/script/conv3/cleanup.sh similarity index 100% rename from lsc_agent_eval/sample_data/script/eval3/cleanup.sh rename to lsc_agent_eval/sample_data/script/conv3/cleanup.sh diff --git a/lsc_agent_eval/sample_data/script/eval3/setup.sh b/lsc_agent_eval/sample_data/script/conv3/setup.sh similarity index 100% rename from lsc_agent_eval/sample_data/script/eval3/setup.sh rename to lsc_agent_eval/sample_data/script/conv3/setup.sh diff --git a/lsc_agent_eval/sample_data/script/eval4/cleanup.sh b/lsc_agent_eval/sample_data/script/conv4/cleanup.sh similarity index 100% rename from lsc_agent_eval/sample_data/script/eval4/cleanup.sh rename to lsc_agent_eval/sample_data/script/conv4/cleanup.sh diff --git a/lsc_agent_eval/sample_data/script/eval4/verify.sh b/lsc_agent_eval/sample_data/script/conv4/eval1/verify.sh similarity index 100% rename from lsc_agent_eval/sample_data/script/eval4/verify.sh rename to lsc_agent_eval/sample_data/script/conv4/eval1/verify.sh diff --git a/lsc_agent_eval/sample_data/script/eval4/setup.sh b/lsc_agent_eval/sample_data/script/conv4/setup.sh similarity index 100% rename from lsc_agent_eval/sample_data/script/eval4/setup.sh rename to lsc_agent_eval/sample_data/script/conv4/setup.sh diff --git a/lsc_agent_eval/src/lsc_agent_eval/__init__.py b/lsc_agent_eval/src/lsc_agent_eval/__init__.py index a9c203f9..ce9304d0 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/__init__.py +++ b/lsc_agent_eval/src/lsc_agent_eval/__init__.py @@ -1,38 +1,27 @@ """Agent evaluation modules.""" -from .core.agent_goal_eval.agent_goal_eval import AgentGoalEval -from .core.agent_goal_eval.eval_data import AgentGoalEvalDataManager -from .core.agent_goal_eval.evaluator import EvaluationRunner -from .core.agent_goal_eval.models import EvaluationDataConfig, EvaluationResult -from .core.agent_goal_eval.results import ResultsManager -from .core.agent_goal_eval.script_runner import ScriptRunner -from .core.utils.api_client import AgentHttpClient +from .core.agent_goal_eval import AgentGoalEval +from .core.agent_goal_eval.models import ( + ConversationDataConfig, + EvaluationDataConfig, + EvaluationResult, +) from .core.utils.exceptions import ( AgentAPIError, AgentEvaluationError, - ConfigurationError, + EvaluationDataError, JudgeModelError, ScriptExecutionError, ) -from .core.utils.judge import JudgeModelManager __all__ = [ - # Exceptions + "AgentGoalEval", + "EvaluationDataConfig", + "EvaluationResult", + "ConversationDataConfig", "AgentEvaluationError", - "ConfigurationError", + "EvaluationDataError", "AgentAPIError", "ScriptExecutionError", "JudgeModelError", - # Models - "EvaluationResult", - "EvaluationDataConfig", - # Components - "AgentGoalEvalDataManager", - "AgentHttpClient", - "ScriptRunner", - "JudgeModelManager", - "EvaluationRunner", - "ResultsManager", - # Main class - "AgentGoalEval", ] diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/__init__.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/__init__.py index 1218ccd4..36028c4d 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/__init__.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/__init__.py @@ -1 +1,17 @@ """Agent goal evaluation modules.""" + +from .agent_goal_eval import AgentGoalEval +from .eval_data import AgentGoalEvalDataManager +from .evaluator import EvaluationRunner +from 
.models import ConversationDataConfig, EvaluationDataConfig, EvaluationResult +from .results import ResultsManager + +__all__ = [ + "AgentGoalEval", + "AgentGoalEvalDataManager", + "EvaluationRunner", + "EvaluationDataConfig", + "EvaluationResult", + "ConversationDataConfig", + "ResultsManager", +] diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/agent_goal_eval.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/agent_goal_eval.py index f6d01725..4aae8428 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/agent_goal_eval.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/agent_goal_eval.py @@ -2,15 +2,22 @@ import argparse import logging +from pathlib import Path +from typing import TYPE_CHECKING, Optional from tqdm import tqdm from ..utils.api_client import AgentHttpClient +from ..utils.exceptions import AgentEvaluationError, ScriptExecutionError from ..utils.judge import JudgeModelManager from .eval_data import AgentGoalEvalDataManager from .evaluator import EvaluationRunner -from .models import EvaluationDataConfig, EvaluationResult from .results import ResultsManager +from .script_runner import ScriptRunner +from .utils import create_error_result + +if TYPE_CHECKING: + from .models import ConversationDataConfig, EvaluationDataConfig, EvaluationResult logger = logging.getLogger(__name__) @@ -29,6 +36,9 @@ def _setup_components(self) -> None: # Eval data manager self.data_manager = AgentGoalEvalDataManager(self.eval_args.eval_data_yaml) + # Script runner + self.script_runner = ScriptRunner(getattr(self.eval_args, "kubeconfig", None)) + # Agent HTTP client self.agent_client = AgentHttpClient( self.eval_args.agent_endpoint, self.eval_args.agent_auth_token_file @@ -43,37 +53,38 @@ def _setup_components(self) -> None: # Evaluation runner self.evaluation_runner = EvaluationRunner( - self.agent_client, - self.judge_manager, - kubeconfig=getattr(self.eval_args, "kubeconfig", None), + self.agent_client, self.script_runner, self.judge_manager ) - # Results manager - self.results_manager = ResultsManager(self.eval_args.result_dir) - def run_evaluation(self) -> None: """Run all evaluations and save results.""" try: - eval_data = self.data_manager.get_eval_data() - logger.info("Running %d evaluations", len(eval_data)) + conversations = self.data_manager.get_conversations() + + logger.info( + "Starting Agent Goal Evaluation\n" + "Total: %d evaluations across %d conversations", + self.data_manager.get_eval_count(), + len(conversations), + ) results = [] - pbar = tqdm(eval_data) - for data_config in pbar: - pbar.set_description(f"Running evaluation for {data_config.eval_id}") - result = self.evaluation_runner.run_evaluation( - data_config, - self.eval_args.agent_provider, - self.eval_args.agent_model, + + # Process each conversation for evaluation + for conv_idx, conversation in enumerate(conversations, 1): + print( + f"\nšŸ“‹ Conversation {conv_idx}/{len(conversations)}: " + f"{conversation.conversation_group}" ) - self._print_individual_result(data_config, result, pbar) - results.append(result) + conversation_results = self._process_conversation(conversation) + results.extend(conversation_results) # Save results - self.results_manager.save_results(results) + results_manager = ResultsManager(results) + results_manager.save_results(self.eval_args.result_dir) # Print summary - self._print_summary(results) + self._print_summary(results_manager) except Exception as e: logger.error("Evaluation failed: %s", e) @@ -82,9 +93,104 @@ def 
run_evaluation(self) -> None: # Clean up resources self._cleanup() + def _process_conversation( + self, conversation: "ConversationDataConfig" + ) -> list["EvaluationResult"]: + """Process single conversation group.""" + conversation_group = conversation.conversation_group + evaluations = conversation.conversation + print(f" Evaluations count: {len(evaluations)}") + + # Always start with None - conversation_id will be obtained from first API call + conversation_id = None + + results = [] + + # Run setup script for the conversation + if conversation.setup_script: + try: + self._run_setup_script(conversation.setup_script, conversation_group) + except ScriptExecutionError as e: + # If setup fails, mark all evaluations as ERROR + for eval_data in evaluations: + error_result = create_error_result( + eval_data, f"Setup script failed: {str(e)}", conversation_id + ) + results.append(error_result) + print(f"āŒ Setup script failed for {conversation_group}: {e}") + return results + + # Run evaluations + print(f" Running {len(evaluations)} evaluations...") + evaluation_results = self._run_conversation_evaluations( + evaluations, conversation_group, conversation_id + ) + results.extend(evaluation_results) + + # Run cleanup script for the conversation + if conversation.cleanup_script: + self._run_cleanup_script(conversation.cleanup_script, conversation_group) + + return results + + def _run_setup_script(self, setup_script: Path, conversation_group: str) -> None: + """Run setup script for a conversation.""" + setup_success = self.script_runner.run_script(setup_script) + if not setup_success: + raise ScriptExecutionError("Setup script returned non-zero exit code") + logger.debug("Setup script executed successfully for %s", conversation_group) + + def _run_cleanup_script( + self, cleanup_script: Path, conversation_group: str + ) -> None: + """Run cleanup script for a conversation.""" + try: + cleanup_success = self.script_runner.run_script(cleanup_script) + if cleanup_success: + logger.debug("Cleanup completed successfully") + else: + logger.warning("Cleanup script failed (non-critical)") + except ScriptExecutionError as e: + logger.warning("Cleanup script failed for %s: %s", conversation_group, e) + + def _run_conversation_evaluations( + self, + evaluations: list["EvaluationDataConfig"], + conversation_group: str, + conversation_id: Optional[str], + ) -> list["EvaluationResult"]: + """Run all evaluations for a conversation.""" + results = [] + + with tqdm( + total=len(evaluations), + desc=f"Evaluating {conversation_group}", + ) as pbar: + for eval_data in evaluations: + result = self.evaluation_runner.run_evaluation( + eval_data, + self.eval_args.agent_provider, + self.eval_args.agent_model, + conversation_id, + ) + + # Update conversation_id from API response for subsequent evaluations + if conversation_id is None: + conversation_id = result.conversation_id + print( + f" Received conversation ID from API: {result.conversation_id}" + ) + + self._print_individual_result(eval_data, result, pbar) + results.append(result) + + pbar.update(1) + + return results + @staticmethod def _print_individual_result( - data_config: EvaluationDataConfig, result: EvaluationResult, pbar: tqdm + data_config: "EvaluationDataConfig", result: "EvaluationResult", pbar: tqdm ) -> None: """Print individual result.""" match result.result: @@ -94,7 +200,10 @@ def _print_individual_result( marker = "āŒ" case _: marker = "āš ļø " - pbar.write(f"{marker} {result.eval_id}: {result.result}") + pbar.write( + f"{marker} 
{result.conversation_group}/{result.eval_id} " + f"{result.conversation_id}: {result.result}" + ) if result.result != "PASS": pbar.write(f" Query: {result.query}") @@ -111,25 +220,36 @@ def _print_individual_result( if result.result == "ERROR": pbar.write(f" Error message: {result.error}") - def _print_summary(self, results: list[EvaluationResult]) -> None: + def _print_summary(self, results_manager: ResultsManager) -> None: """Print evaluation summary.""" - total = len(results) - passed = sum(1 for r in results if r.result == "PASS") - failed = sum(1 for r in results if r.result == "FAIL") - errored = sum(1 for r in results if r.result == "ERROR") - success_rate = (passed / total * 100) if total > 0 else 0 + stats = results_manager.get_results_stats() print(f"\n{'='*25}") print("EVALUATION SUMMARY") print(f"{'='*25}") - print(f"Total Evaluations: {total}") - print(f"āœ… Passed: {passed}") - print(f"āŒ Failed: {failed}") - print(f"āš ļø Errored: {errored}") - print(f"Success Rate: {success_rate:.1f}%") + print(f"Total Evaluations: {stats.total_evaluations}") + print(f"āœ… Passed: {stats.passed}") + print(f"āŒ Failed: {stats.failed}") + print(f"āš ļø Errored: {stats.errored}") + print(f"Success Rate: {stats.success_rate:.1f}%") + + # Show conversation breakdown if multiple conversations + if len(stats.by_conversation) > 1: + print("\nSummary by Conversation:") + for conv_group, counts in stats.by_conversation.items(): + print( + f"{conv_group}: {counts['passed']}/{counts['total']} " + f"({counts['success_rate']:.1f}%)" + ) + print(f"{'='*25}\n") - self.result_summary = {"PASS": passed, "FAIL": failed, "ERROR": errored} + self.result_summary = { + "TOTAL": stats.total_evaluations, + "PASS": stats.passed, + "FAIL": stats.failed, + "ERROR": stats.errored, + } def _cleanup(self) -> None: """Clean up resources.""" @@ -141,4 +261,7 @@ def _cleanup(self) -> None: def get_result_summary(self) -> dict[str, int]: """Get result summary.""" + if not self.result_summary: + raise AgentEvaluationError("No results available. 
Run evaluation first.") + return self.result_summary diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/eval_data.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/eval_data.py index e664912c..615f3adf 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/eval_data.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/eval_data.py @@ -1,88 +1,170 @@ """Agent Goal Eval data management.""" +import logging from pathlib import Path from typing import Any import yaml +from pydantic import ValidationError -from ..utils.exceptions import ConfigurationError -from .models import EvaluationDataConfig +from ..utils.exceptions import EvaluationDataError +from .models import ConversationDataConfig + +logger = logging.getLogger(__name__) class AgentGoalEvalDataManager: """Processes agent eval data and validation.""" - def __init__(self, eval_data_file: str): - """Initialize configuration manager.""" - self.eval_data_file = Path(eval_data_file) - self.eval_data: list[EvaluationDataConfig] = [] - self._validate_eval_data_file() - self._load_eval_data() - - def _validate_eval_data_file(self) -> None: - """Validate eval data file exists and is readable.""" - if not self.eval_data_file.exists(): - raise ConfigurationError(f"Eval data file not found: {self.eval_data_file}") + def __init__(self, eval_data_file: str) -> None: + """Initialize eval data manager.""" + self.eval_data_file = eval_data_file + self.conversations: list[ConversationDataConfig] = [] - if not self.eval_data_file.is_file(): - raise ConfigurationError( - f"Eval data file path is not a file: {self.eval_data_file}" - ) + self._load_eval_data() + self._log_loaded_data_stats() def _load_eval_data(self) -> None: """Load evaluation data from YAML file.""" try: - with open(self.eval_data_file, "r", encoding="utf-8") as file: - eval_data = yaml.safe_load(file) + eval_data_path = Path(self.eval_data_file).resolve() + logger.info("Loading evaluation data from: %s", str(eval_data_path)) - if not isinstance(eval_data, list): - raise ConfigurationError( - "Eval data file must contain a list of evaluations" + with open(eval_data_path, "r", encoding="utf-8") as file: + raw_data = yaml.safe_load(file) + + if raw_data is None: + raise EvaluationDataError("Eval data file is empty") + if not isinstance(raw_data, list): + raise EvaluationDataError( + f"Eval data file must contain a list of conversations, got {type(raw_data)}" + ) + if not raw_data: + raise EvaluationDataError( + "Eval data file must contain at least one conversation" ) - self.eval_data = [] - for data in eval_data: - self._validate_eval_data(data) - self.eval_data.append(EvaluationDataConfig(**data)) + logger.info("Found %d conversation(s) in YAML file", len(raw_data)) + + # Process each conversation + self._load_conversation_data(raw_data) except yaml.YAMLError as e: - raise ConfigurationError(f"Invalid YAML in eval data file: {e}") from e + raise EvaluationDataError(f"Invalid YAML in eval data file: {e}") from e + except FileNotFoundError as e: + raise EvaluationDataError(f"Eval data file not found: {e}") from e + except EvaluationDataError: + raise except Exception as e: - raise ConfigurationError(f"Error loading eval data file: {e}") from e - - def _validate_eval_data(self, eval_data: dict[str, Any]) -> None: - """Validate a single evaluation data point.""" - required_fields = ["eval_id", "eval_query"] - for field in required_fields: - if field not in eval_data: - raise ConfigurationError( - f"Missing required field '{field}' in evaluation 
data" + raise EvaluationDataError(f"Error loading eval data file: {e}") from e + + def _load_conversation_data(self, raw_data: list[dict[str, Any]]) -> None: + """Load conversation data.""" + logger.info("Processing conversation data...") + + self.conversations = [] + processed_groups = set() + + for idx, conversation_data in enumerate(raw_data, 1): + logger.debug("Processing conversation %d", idx) + + try: + conversation_config = ConversationDataConfig(**conversation_data) + + # Check for duplicate conversation groups + if conversation_config.conversation_group in processed_groups: + raise EvaluationDataError( + "Duplicate conversation_group " + f"'{conversation_config.conversation_group}' found" + ) + processed_groups.add(conversation_config.conversation_group) + + # Store the conversation + self.conversations.append(conversation_config) + + logger.info( + "Loaded conversation '%s' with %d evaluations", + conversation_config.conversation_group, + len(conversation_config.conversation), ) - eval_type = eval_data.get("eval_type", "judge-llm") - if eval_type not in ["judge-llm", "script", "sub-string"]: - raise ConfigurationError(f"Invalid eval_type: {eval_type}") + except ValidationError as e: + error_details = self._format_pydantic_error(e) + conversation_group = conversation_data.get( + "conversation_group", f"conversation_{idx}" + ) + raise EvaluationDataError( + f"Validation error in conversation '{conversation_group}': {error_details}" + ) from e + except EvaluationDataError: + raise + except Exception as e: + raise EvaluationDataError( + f"Error processing conversation {idx}: {e}" + ) from e + + def _format_pydantic_error(self, error: ValidationError) -> str: + """Format Pydantic validation error.""" + errors = [] + for err in error.errors(): + field = " -> ".join(str(loc) for loc in err["loc"]) + message = err["msg"] + errors.append(f"{field}: {message}") + return "; ".join(errors) + + def _log_loaded_data_stats(self) -> None: + """Log statistics about loaded data.""" + if not self.conversations: + raise EvaluationDataError("No valid conversations found in eval data file") + + # Calculate statistics from conversations + eval_types: dict[str, int] = {} + conversation_stats = {} + total_evaluations = 0 + + for conversation in self.conversations: + conv_group = conversation.conversation_group + conversation_stats[conv_group] = len(conversation.conversation) + total_evaluations += len(conversation.conversation) + + for eval_config in conversation.conversation: + eval_types[eval_config.eval_type] = ( + eval_types.get(eval_config.eval_type, 0) + 1 + ) - # Validate type-specific requirements - if eval_type == "judge-llm" and "expected_response" not in eval_data: - raise ConfigurationError( - "eval_type 'judge-llm' requires 'expected_response' field" - ) + if total_evaluations == 0: + raise EvaluationDataError("No valid evaluations found in eval data file") - if eval_type == "sub-string" and "expected_keywords" not in eval_data: - raise ConfigurationError( - "eval_type 'sub-string' requires 'expected_keywords' field" + # Check for duplicate eval_ids across all conversations + all_eval_ids = [] + for conversation in self.conversations: + all_eval_ids.extend( + [eval_config.eval_id for eval_config in conversation.conversation] ) - if eval_type == "script" and "eval_verify_script" not in eval_data: - raise ConfigurationError( - "eval_type 'script' requires 'eval_verify_script' field" + duplicate_ids = [ + eval_id for eval_id in all_eval_ids if all_eval_ids.count(eval_id) > 1 + ] + if 
duplicate_ids: + logger.warning( + "Duplicate eval_id(s) found across conversations: %s", + set(duplicate_ids), ) - def get_eval_data(self) -> list[EvaluationDataConfig]: - """Get all evaluation configurations.""" - return self.eval_data + logger.info("āœ… Data validation complete:") + logger.info(" %d conversations", len(self.conversations)) + logger.info(" %d total evaluations", total_evaluations) + logger.info(" Evaluation types: %s", dict(eval_types)) + + for conv_group, count in conversation_stats.items(): + logger.debug(" %s: %d evaluations", conv_group, count) + + def get_conversations(self) -> list[ConversationDataConfig]: + """Get all conversation configurations.""" + return self.conversations def get_eval_count(self) -> int: - """Get the number of evaluation configurations.""" - return len(self.eval_data) + """Get the total number of evaluation configurations.""" + return sum( + len(conversation.conversation) for conversation in self.conversations + ) diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py index ad51b92e..428b1fff 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py @@ -1,14 +1,17 @@ """Evaluation runner that orchestrates different evaluation types.""" import logging -from typing import Optional +from typing import TYPE_CHECKING, Optional -from ..utils.api_client import AgentHttpClient from ..utils.exceptions import AgentAPIError, JudgeModelError, ScriptExecutionError -from ..utils.judge import JudgeModelManager from ..utils.prompt import ANSWER_CORRECTNESS_PROMPT -from .models import EvaluationDataConfig, EvaluationResult -from .script_runner import ScriptRunner +from .utils import create_error_result, create_success_result + +if TYPE_CHECKING: + from ..utils.api_client import AgentHttpClient + from ..utils.judge import JudgeModelManager + from .models import EvaluationDataConfig, EvaluationResult + from .script_runner import ScriptRunner logger = logging.getLogger(__name__) @@ -18,96 +21,49 @@ class EvaluationRunner: def __init__( self, - agent_client: AgentHttpClient, - judge_manager: Optional[JudgeModelManager] = None, - kubeconfig: Optional[str] = None, + agent_client: "AgentHttpClient", + script_runner: "ScriptRunner", + judge_manager: Optional["JudgeModelManager"] = None, ): """Initialize evaluation runner.""" self.agent_client = agent_client self.judge_manager = judge_manager - self.kubeconfig = kubeconfig + self.script_runner = script_runner def run_evaluation( - self, data_config: EvaluationDataConfig, agent_provider: str, agent_model: str - ) -> EvaluationResult: + self, + data_config: "EvaluationDataConfig", + agent_provider: str, + agent_model: str, + conversation_id: Optional[str] = None, + ) -> "EvaluationResult": """Run a single evaluation based on configuration.""" try: - # Execute setup script if provided - if data_config.eval_setup_script: - try: - script_runner = ScriptRunner(kubeconfig=self.kubeconfig) - success = script_runner.run_script(data_config.eval_setup_script) - if not success: - raise ScriptExecutionError( - "Setup script returned non-zero exit code" - ) - logger.debug( - "Setup script executed successfully for %s", data_config.eval_id - ) - except ScriptExecutionError as e: - logger.error( - "Setup script failed for %s: %s", data_config.eval_id, e - ) - return EvaluationResult( - eval_id=data_config.eval_id, - 
query=data_config.eval_query, - response="", - eval_type=data_config.eval_type, - result="ERROR", - error=f"Setup script failed: {e}", - ) - - response = self.agent_client.query_agent( - data_config.eval_query, agent_provider, agent_model - ) + # Query the agent + api_input = { + "query": data_config.eval_query, + "provider": agent_provider, + "model": agent_model, + "conversation_id": conversation_id, + } + + response, conversation_id = self.agent_client.query_agent(api_input) - # Evaluate response based on type - success = self._evaluate_response(data_config, response) - - # Execute cleanup script if provided - if data_config.eval_cleanup_script: - try: - cleanup_runner = ScriptRunner(kubeconfig=self.kubeconfig) - cleanup_success = cleanup_runner.run_script( - data_config.eval_cleanup_script - ) - if cleanup_success: - logger.debug( - "Cleanup script executed successfully for %s", - data_config.eval_id, - ) - else: - logger.warning( - "Cleanup script failed for %s", data_config.eval_id - ) - except ScriptExecutionError as e: - logger.warning( - "Cleanup script failed for %s: %s", data_config.eval_id, e - ) - - return EvaluationResult( - eval_id=data_config.eval_id, - query=data_config.eval_query, - response=response, - eval_type=data_config.eval_type, - result="PASS" if success else "FAIL", + # Evaluate agent action based on eval type + success = self._evaluate_agent_action(data_config, response) + + return create_success_result( + data_config, response, success, conversation_id ) except (AgentAPIError, ScriptExecutionError, JudgeModelError) as e: logger.error("Evaluation failed for %s: %s", data_config.eval_id, e) - return EvaluationResult( - eval_id=data_config.eval_id, - query=data_config.eval_query, - response="", - eval_type=data_config.eval_type, - result="ERROR", - error=str(e), - ) + return create_error_result(data_config, str(e), conversation_id) - def _evaluate_response( - self, data_config: EvaluationDataConfig, response: str + def _evaluate_agent_action( + self, data_config: "EvaluationDataConfig", response: str ) -> bool: - """Evaluate response based on configuration type.""" + """Evaluate agent action based on configuration type.""" match data_config.eval_type: case "script": return self._evaluate_script(data_config) @@ -119,27 +75,27 @@ def _evaluate_response( logger.error("Unknown evaluation type: %s", data_config.eval_type) return False - def _evaluate_script(self, data_config: EvaluationDataConfig) -> bool: + def _evaluate_script(self, data_config: "EvaluationDataConfig") -> bool: """Evaluate using script execution.""" if not data_config.eval_verify_script: logger.error("No verify script provided for script evaluation") return False - script_runner = ScriptRunner(kubeconfig=self.kubeconfig) - return script_runner.run_script(data_config.eval_verify_script) + return self.script_runner.run_script(data_config.eval_verify_script) def _evaluate_substring( - self, data_config: EvaluationDataConfig, response: str + self, data_config: "EvaluationDataConfig", response: str ) -> bool: """Evaluate using substring matching.""" if not data_config.expected_keywords: return False response_lower = response.lower() - return any( - keyword.lower() in response_lower - for keyword in data_config.expected_keywords - ) + # All keywords must be present for evaluation to pass + for keyword in data_config.expected_keywords: + if keyword.lower() not in response_lower: + return False + return True def _extract_numeric_result(self, response: Optional[str]) -> int: """Extract numeric result from 
judge response.""" @@ -156,7 +112,7 @@ def _extract_numeric_result(self, response: Optional[str]) -> int: return int(response) def _evaluate_judge_llm( - self, data_config: EvaluationDataConfig, response: str + self, data_config: "EvaluationDataConfig", response: str ) -> bool: """Evaluate using judge LLM.""" if not self.judge_manager: @@ -179,6 +135,6 @@ def _evaluate_judge_llm( result = self._extract_numeric_result(judge_resp) return result == 1 - def get_judge_manager(self) -> Optional[JudgeModelManager]: + def get_judge_manager(self) -> Optional["JudgeModelManager"]: """Get the judge model manager.""" return self.judge_manager diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/models.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/models.py index 65c1140b..568888cd 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/models.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/models.py @@ -1,30 +1,292 @@ """Data models for agent evaluation.""" -from dataclasses import dataclass -from typing import Optional +from pathlib import Path +from typing import Any, Callable, Optional, Union +from pydantic import BaseModel, Field, ValidationInfo, field_validator, model_validator -@dataclass -class EvaluationResult: - """Evaluation result data structure.""" +VALID_EVAL_TYPES = ["judge-llm", "script", "sub-string"] +VALID_EVAL_RESULTS = ["PASS", "FAIL", "ERROR"] - eval_id: str - query: str - response: str - eval_type: str - result: str - error: Optional[str] = None +def _validate_eval_type(eval_type: str) -> str: + """Validate evaluation type.""" + if eval_type not in VALID_EVAL_TYPES: + raise ValueError( + f"eval_type must be one of {VALID_EVAL_TYPES}, got '{eval_type}'" + ) + return eval_type -@dataclass -class EvaluationDataConfig: # pylint: disable=too-many-instance-attributes + +def _validate_script_path( + script_file: Optional[Union[str, Path]], script_name: str +) -> Optional[Path]: + """Validate script path exists and convert to absolute Path.""" + if script_file is not None: + if isinstance(script_file, str): + script_file = script_file.strip() + if not script_file: + raise ValueError(f"{script_name} cannot be empty string") + script_file = Path(script_file) + + # Convert to absolute path + script_path = script_file.resolve() + + # Validate file exists + if not script_path.exists(): + raise ValueError(f"{script_name} file not found: {script_path}") + + if not script_path.is_file(): + raise ValueError(f"{script_name} is not a file: {script_path}") + + return script_path + + return None + + +def _calculate_stats_by_category( + results: list["EvaluationResult"], + key_extractor: Callable[["EvaluationResult"], str], +) -> dict[str, dict[str, Union[int, float]]]: + """Calculate statistics grouped by a category from each result.""" + category_stats: dict[str, dict[str, Union[int, float]]] = {} + + for result in results: + category = key_extractor(result) + if category not in category_stats: + category_stats[category] = {"passed": 0, "failed": 0, "errored": 0} + + if result.result == "PASS": + category_stats[category]["passed"] += 1 + elif result.result == "FAIL": + category_stats[category]["failed"] += 1 + elif result.result == "ERROR": + category_stats[category]["errored"] += 1 + + # Calculate success rates + for stats in category_stats.values(): + total = stats["passed"] + stats["failed"] + stats["errored"] + stats["total"] = total + stats["success_rate"] = ( + round((stats["passed"] / total) * 100, 2) if total > 0 else 0.0 + ) + + 
return category_stats + + +class EvaluationDataConfig(BaseModel): """Single evaluation data configuration.""" - eval_id: str - eval_query: str - eval_type: str = "judge-llm" - expected_response: Optional[str] = None - expected_keywords: Optional[list[str]] = None - eval_setup_script: Optional[str] = None - eval_verify_script: Optional[str] = None - eval_cleanup_script: Optional[str] = None + eval_id: str = Field(..., min_length=1, description="Unique evaluation identifier") + eval_query: str = Field(..., min_length=1, description="Query to send to the agent") + eval_type: str = Field( + ..., description="Type of evaluation (judge-llm, sub-string, script)" + ) + expected_response: Optional[str] = Field( + None, min_length=1, description="Expected response for judge-llm" + ) + expected_keywords: Optional[list[str]] = Field( + None, min_length=1, description="List of expected keywords for sub-string" + ) + eval_verify_script: Optional[Path] = Field( + None, description="Script path for script evaluation" + ) + conversation_group: Optional[str] = Field(None, min_length=1) + description: Optional[str] = Field( + None, min_length=1, max_length=500, description="Description of this evaluation" + ) + + @field_validator("eval_type") + @classmethod + def validate_eval_type(cls, v: str) -> str: + """Validate evaluation type.""" + return _validate_eval_type(v) + + @field_validator("expected_keywords") + @classmethod + def validate_keywords(cls, v: Optional[list[str]]) -> Optional[list[str]]: + """Ensure keywords is a list and validate content.""" + if v is not None: + if not isinstance(v, list): + v = [v] + # Remove empty strings and validate + v = [keyword.strip() for keyword in v if keyword and keyword.strip()] + if not v: + raise ValueError("expected_keywords cannot be empty after filtering") + return v + + @field_validator("eval_verify_script") + @classmethod + def validate_script_path(cls, v: Optional[Union[str, Path]]) -> Optional[Path]: + """Validate verify script path exists and convert to absolute Path.""" + return _validate_script_path(v, "eval_verify_script") + + @model_validator(mode="after") + def validate_eval_requirements(self) -> "EvaluationDataConfig": + """Validate eval type specific requirements.""" + if self.eval_type == "judge-llm": + if not self.expected_response: + raise ValueError( + "eval_type 'judge-llm' requires non-empty 'expected_response'" + ) + + elif self.eval_type == "sub-string": + if not self.expected_keywords or len(self.expected_keywords) == 0: + raise ValueError( + "eval_type 'sub-string' requires non-empty 'expected_keywords'" + ) + + elif self.eval_type == "script": + if not self.eval_verify_script: + raise ValueError( + "eval_type 'script' requires non-empty 'eval_verify_script'" + ) + + return self + + +class ConversationDataConfig(BaseModel): + """Configuration for a conversation group.""" + + conversation_group: str = Field( + ..., min_length=1, max_length=100, description="Conversation group identifier" + ) + conversation: list[EvaluationDataConfig] = Field( + ..., min_length=1, description="List of evaluations in this conversation group" + ) + description: Optional[str] = Field( + None, + min_length=1, + max_length=500, + description="Description of this conversation group", + ) + setup_script: Optional[Path] = Field( + None, description="Setup script path for conversation group" + ) + cleanup_script: Optional[Path] = Field( + None, description="Cleanup script path for conversation group" + ) + + @field_validator("conversation_group") + @classmethod + def 
validate_conversation_group(cls, v: str) -> str: + """Validate conversation group name.""" + v = v.strip() + if not v: + raise ValueError("conversation_group cannot be empty") + + return v + + @field_validator("setup_script", "cleanup_script") + @classmethod + def validate_script_path( + cls, v: Optional[Union[str, Path]], info: ValidationInfo + ) -> Optional[Path]: + """Validate script path exists and convert to absolute Path.""" + if info.field_name is None: + raise ValueError("Set a script name for field validator") + + return _validate_script_path(v, info.field_name) + + @model_validator(mode="after") + def validate_conversation_data(self) -> "ConversationDataConfig": + """Validate conversation data consistency.""" + if not self.conversation: + raise ValueError( + f"Conversation '{self.conversation_group}' must have at least one evaluation" + ) + + # Set conversation group for all evaluations + for eval_config in self.conversation: + eval_config.conversation_group = self.conversation_group + + # Check for duplicate eval_ids within conversation + eval_ids = [eval_config.eval_id for eval_config in self.conversation] + duplicates = [eval_id for eval_id in eval_ids if eval_ids.count(eval_id) > 1] + if duplicates: + raise ValueError( + f"Duplicate eval_id(s) in conversation '{self.conversation_group}': {duplicates}" + ) + + return self + + +class EvaluationResult(BaseModel): + """Result of a single evaluation.""" + + eval_id: str = Field(..., min_length=1, description="Evaluation identifier") + query: str = Field(..., min_length=1, description="Query sent to agent") + response: str = Field(..., description="Agent response") + eval_type: str = Field(..., description="Type of evaluation performed") + result: str = Field(..., description="Evaluation result") + conversation_group: Optional[str] = Field(None, description="Conversation group") + conversation_id: Optional[str] = Field(None, description="Conversation ID") + error: Optional[str] = Field(None, description="Error message if any") + + @field_validator("result") + @classmethod + def validate_result(cls, v: str) -> str: + """Validate result is one of the allowed values.""" + if v not in VALID_EVAL_RESULTS: + raise ValueError(f"Result must be one of {VALID_EVAL_RESULTS}, got '{v}'") + return v + + @field_validator("eval_type") + @classmethod + def validate_eval_type(cls, v: str) -> str: + """Validate evaluation type.""" + return _validate_eval_type(v) + + +class EvaluationStats(BaseModel): + """Statistics for evaluation runs.""" + + total_evaluations: int = Field(..., ge=0, description="Total number of evaluations") + total_conversations: int = Field( + ..., ge=0, description="Total number of conversations" + ) + passed: int = Field(..., ge=0, description="Number of passed evaluations") + failed: int = Field(..., ge=0, description="Number of failed evaluations") + errored: int = Field(..., ge=0, description="Number of errored evaluations") + success_rate: float = Field( + ..., ge=0.0, le=100.0, description="Success rate percentage" + ) + by_conversation: dict[str, dict[str, Any]] = Field( + default_factory=dict, description="Statistics by conversation" + ) + by_eval_type: dict[str, dict[str, Any]] = Field( + default_factory=dict, description="Statistics by evaluation type" + ) + + @classmethod + def from_results(cls, results: list[EvaluationResult]) -> "EvaluationStats": + """Create comprehensive statistics from evaluation results.""" + total = len(results) + passed = sum(1 for r in results if r.result == "PASS") + failed = sum(1 for r in 
results if r.result == "FAIL") + errored = sum(1 for r in results if r.result == "ERROR") + success_rate = (passed / total * 100) if total > 0 else 0.0 + + # Count unique conversations + conversations: set[str] = set( + r.conversation_group for r in results if r.conversation_group + ) + + # Calculate statistics by conversation + by_conversation = _calculate_stats_by_category( + results, lambda r: r.conversation_group or "unknown" + ) + + # Calculate statistics by eval_type + by_eval_type = _calculate_stats_by_category(results, lambda r: r.eval_type) + + return cls( + total_evaluations=total, + total_conversations=len(conversations), + passed=passed, + failed=failed, + errored=errored, + success_rate=success_rate, + by_conversation=by_conversation, + by_eval_type=by_eval_type, + ) diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py index 010cbcd0..8c3c3e28 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py @@ -1,13 +1,14 @@ """Results management for agent evaluation.""" +import json import logging from datetime import datetime from pathlib import Path -from typing import Optional import pandas as pd -from .models import EvaluationResult +from ..utils.exceptions import AgentEvaluationError +from .models import EvaluationResult, EvaluationStats logger = logging.getLogger(__name__) @@ -15,48 +16,76 @@ class ResultsManager: """Manages evaluation results and output.""" - def __init__(self, result_dir: str): + def __init__(self, results: list[EvaluationResult]): """Initialize results manager.""" - self.result_dir = result_dir - self.result_path = Path(result_dir) - - def save_results( - self, - results: list[EvaluationResult], - filename: Optional[str] = None, - ) -> None: - """Save evaluation results to CSV file.""" - # Create directory if it doesn't exist - self.result_path.mkdir(parents=True, exist_ok=True) - - # Generate filename with timestamp if not provided - if filename is None: + self.results = results + + self.results_stats = EvaluationStats.from_results(results) + + def save_results(self, result_dir: str) -> None: + """Save evaluation results/statistics to CSV and JSON files.""" + if not self.results: + logger.warning("No result to save") + return + + try: + output_dir = Path(result_dir) + output_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"agent_goal_eval_results_{timestamp}.csv" + csv_file = output_dir / f"agent_goal_eval_results_{timestamp}.csv" + json_file = output_dir / f"agent_goal_eval_summary_{timestamp}.json" - # Create full file path - file_path = self.result_path / filename + # Save CSV results + self._save_csv_results(csv_file) + # Save summary JSON + self._save_json_summary(json_file) - # Convert results to DataFrame + except Exception as e: + logger.error("Failed to save results: %s", e) + raise AgentEvaluationError(f"Failed to save results: {e}") from e + + def _save_csv_results(self, file_path: Path) -> None: + """Save results to CSV file.""" data = [] - for result in results: + for result in self.results: data.append( { + "conversation_group": result.conversation_group, + "conversation_id": result.conversation_id, "eval_id": result.eval_id, "query": result.query, "response": result.response, "eval_type": result.eval_type, "result": result.result, - "error": result.error or "", + "error": result.error, } ) df = 
pd.DataFrame(data) - # Save to CSV using pandas df.to_csv(file_path, index=False, encoding="utf-8") logger.info("Results saved to %s", file_path) - def get_output_dir(self) -> str: - """Get the output directory path.""" - return str(self.result_path) + def _save_json_summary(self, file_path: Path) -> None: + """Save eval summary to JSON file.""" + statistics = { + "summary": { + "total_evaluations": self.results_stats.total_evaluations, + "total_conversations": self.results_stats.total_conversations, + "passed": self.results_stats.passed, + "failed": self.results_stats.failed, + "errored": self.results_stats.errored, + "success_rate": round(self.results_stats.success_rate, 2), + }, + "by_conversation": self.results_stats.by_conversation, + "by_eval_type": self.results_stats.by_eval_type, + } + + with open(file_path, "w", encoding="utf-8") as f: + json.dump(statistics, f, indent=2, ensure_ascii=False) + logger.info("Summary saved to %s", file_path) + + def get_results_stats(self) -> EvaluationStats: + """Get result stats/summary.""" + return self.results_stats diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/script_runner.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/script_runner.py index f4ea0ad8..b3ba1208 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/script_runner.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/script_runner.py @@ -1,10 +1,10 @@ -"""Script execution for evaluation.""" +"""Script execution module for evaluation.""" import logging import os import subprocess from pathlib import Path -from typing import Optional +from typing import Optional, Union from ..utils.exceptions import ScriptExecutionError @@ -25,18 +25,16 @@ def get_environment(self) -> dict: env["KUBECONFIG"] = self.kubeconfig return env - def run_script(self, script_path: str, input_text: Optional[str] = None) -> bool: - """ - Execute a script and return success status. + def run_script(self, script_path: Union[str, Path]) -> bool: + """Execute a script and return success status.""" + if isinstance(script_path, str): + script_path = Path(script_path) + script_path = script_path.resolve() - Path normalization: Relative paths are converted to absolute path. 
- """ - script_file = Path(script_path).resolve() - - if not script_file.exists(): + if not script_path.exists(): raise ScriptExecutionError(f"Script not found: {script_path}") - if not script_file.is_file(): + if not script_path.is_file(): raise ScriptExecutionError(f"Script path is not a file: {script_path}") try: @@ -44,17 +42,13 @@ def run_script(self, script_path: str, input_text: Optional[str] = None) -> bool env = self.get_environment() # Make script executable - script_file.chmod(0o755) - - # Prepare command - cmd = ["bash", str(script_file)] + script_path.chmod(0o755) # Run script - logger.debug("Running script: %s", script_file) + logger.debug("Running script: %s", script_path) result = subprocess.run( - cmd, - input=input_text, + [str(script_path)], text=True, capture_output=True, env=env, diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/utils.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/utils.py new file mode 100644 index 00000000..d59e1716 --- /dev/null +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/utils.py @@ -0,0 +1,42 @@ +"""Utility functions for evaluation processing.""" + +from typing import Optional + +from .models import EvaluationDataConfig, EvaluationResult + + +def create_error_result( + eval_config: EvaluationDataConfig, + error_message: str, + conversation_id: Optional[str] = None, +) -> EvaluationResult: + """Create a standardized error result.""" + return EvaluationResult( + eval_id=eval_config.eval_id, + query=eval_config.eval_query, + response="", + eval_type=eval_config.eval_type, + result="ERROR", + conversation_group=eval_config.conversation_group, + conversation_id=conversation_id, + error=error_message, + ) + + +def create_success_result( + eval_config: EvaluationDataConfig, + response: str, + success: bool, + conversation_id: Optional[str] = None, +) -> EvaluationResult: + """Create a standardized success/fail result.""" + return EvaluationResult( + eval_id=eval_config.eval_id, + query=eval_config.eval_query, + response=response, + eval_type=eval_config.eval_type, + result="PASS" if success else "FAIL", + conversation_group=eval_config.conversation_group, + conversation_id=conversation_id, + error=None, + ) diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/utils/api_client.py b/lsc_agent_eval/src/lsc_agent_eval/core/utils/api_client.py index d5d6bb55..53645bf4 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/utils/api_client.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/utils/api_client.py @@ -47,18 +47,13 @@ def _read_token_file(self, token_file: str) -> str: raise AgentAPIError(f"Error reading token file: {e}") from e def query_agent( - self, query: str, provider: str, model: str, timeout: int = 300 - ) -> str: + self, api_input: dict[str, str], timeout: int = 300 + ) -> tuple[str, str]: """Query the agent and return response.""" if not self.client: raise AgentAPIError("HTTP client not initialized") try: - api_input = { - "query": query, - "provider": provider, - "model": model, - } response = self.client.post( "/v1/query", json=api_input, @@ -69,8 +64,11 @@ def query_agent( response_data = response.json() if "response" not in response_data: raise AgentAPIError("Agent response missing 'response' field") + agent_response = response_data["response"].strip() - return response_data["response"].strip() + conversation_id = response_data.get("conversation_id", "").strip() + + return agent_response, conversation_id except httpx.TimeoutException as e: raise AgentAPIError(f"Agent query timeout after 
{timeout} seconds") from e diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/utils/exceptions.py b/lsc_agent_eval/src/lsc_agent_eval/core/utils/exceptions.py index 2ba54ff0..869b5e7e 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/utils/exceptions.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/utils/exceptions.py @@ -5,8 +5,8 @@ class AgentEvaluationError(Exception): """Base exception for agent evaluation errors.""" -class ConfigurationError(AgentEvaluationError): - """Configuration-related errors.""" +class EvaluationDataError(AgentEvaluationError): + """Evaluation data loading, parsing, and validation errors.""" class AgentAPIError(AgentEvaluationError): diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_agent_goal_eval.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_agent_goal_eval.py index 87bf578e..d0b57f8b 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_agent_goal_eval.py +++ b/lsc_agent_eval/tests/core/agent_goal_eval/test_agent_goal_eval.py @@ -1,14 +1,17 @@ """Tests for agent goal evaluation orchestrator.""" -from unittest.mock import Mock, patch +from unittest.mock import MagicMock, Mock, patch import pytest from lsc_agent_eval.core.agent_goal_eval.agent_goal_eval import AgentGoalEval from lsc_agent_eval.core.agent_goal_eval.models import ( + ConversationDataConfig, EvaluationDataConfig, EvaluationResult, + EvaluationStats, ) +from lsc_agent_eval.core.utils.exceptions import AgentEvaluationError class TestAgentGoalEval: @@ -30,22 +33,25 @@ def mock_args(self): return args @pytest.fixture - def sample_configs(self): - """Sample evaluation configurations.""" - return [ - EvaluationDataConfig( - eval_id="test_001", - eval_query="What is Kubernetes?", - eval_type="judge-llm", - expected_response="Kubernetes is a container orchestration platform", - ), - EvaluationDataConfig( - eval_id="test_002", - eval_query="Deploy nginx", - eval_type="script", - eval_verify_script="./verify.sh", - ), - ] + def sample_conversation(self): + """Sample conversation data configuration.""" + return ConversationDataConfig( + conversation_group="test_conv", + conversation=[ + EvaluationDataConfig( + eval_id="test_001", + eval_query="What is Openshift?", + eval_type="judge-llm", + expected_response="OpenShift is Red Hat's enterprise Kubernetes platform.", + ), + EvaluationDataConfig( + eval_id="test_002", + eval_query="Deploy nginx", + eval_type="sub-string", + expected_keywords=["nginx", "deployment"], + ), + ], + ) @pytest.fixture def sample_results(self): @@ -57,13 +63,17 @@ def sample_results(self): response="Kubernetes is a container orchestration platform", eval_type="judge-llm", result="PASS", + conversation_group="test_conv", + conversation_id="conv-id-123", ), EvaluationResult( eval_id="test_002", query="Deploy nginx", - response="kubectl create deployment nginx --image=nginx", - eval_type="script", + response="oc create deployment nginx --image=nginx", + eval_type="sub-string", result="PASS", + conversation_group="test_conv", + conversation_id="conv-id-123", ), ] @@ -73,10 +83,10 @@ def sample_results(self): @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") + @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner") def test_init_with_judge_manager( self, - mock_results_manager, + 
mock_script_runner, mock_evaluation_runner, mock_judge_manager, mock_agent_client, @@ -90,22 +100,22 @@ def test_init_with_judge_manager( mock_config_manager.assert_called_once_with("test_data.yaml") mock_agent_client.assert_called_once_with("http://localhost:8080", None) mock_judge_manager.assert_called_once_with("openai", "gpt-4") + mock_script_runner.assert_called_once_with(None) mock_evaluation_runner.assert_called_once_with( mock_agent_client.return_value, + mock_script_runner.return_value, mock_judge_manager.return_value, - kubeconfig=None, ) - mock_results_manager.assert_called_once_with("results/") @patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager" ) @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") + @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner") def test_init_without_judge_manager( self, - mock_results_manager, + mock_script_runner, mock_evaluation_runner, mock_agent_client, mock_config_manager, @@ -121,8 +131,8 @@ def test_init_without_judge_manager( assert evaluator.judge_manager is None mock_evaluation_runner.assert_called_once_with( mock_agent_client.return_value, + mock_script_runner.return_value, None, - kubeconfig=None, ) @patch( @@ -131,10 +141,10 @@ def test_init_without_judge_manager( @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") + @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner") def test_init_with_kubeconfig( self, - mock_results_manager, + mock_script_runner, mock_evaluation_runner, mock_judge_manager, mock_agent_client, @@ -146,10 +156,11 @@ def test_init_with_kubeconfig( AgentGoalEval(mock_args) + mock_script_runner.assert_called_once_with("~/kubeconfig") mock_evaluation_runner.assert_called_once_with( mock_agent_client.return_value, + mock_script_runner.return_value, mock_judge_manager.return_value, - kubeconfig="~/kubeconfig", ) @patch( @@ -158,23 +169,34 @@ def test_init_with_kubeconfig( @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") + @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") def test_run_evaluation_success( self, mock_results_manager, + mock_script_runner, mock_evaluation_runner, mock_judge_manager, mock_agent_client, mock_config_manager, mock_args, - sample_configs, + sample_conversation, sample_results, ): """Test successful evaluation execution.""" # Setup mocks - mock_config_manager.return_value.get_eval_data.return_value = sample_configs + mock_config_manager.return_value.get_conversations.return_value = [ + sample_conversation + ] + mock_config_manager.return_value.get_eval_count.return_value = 2 mock_evaluation_runner.return_value.run_evaluation.side_effect = sample_results + # Mock results manager + mock_results_mgr_instance = MagicMock() + mock_results_manager.return_value = mock_results_mgr_instance + mock_stats = 
EvaluationStats.from_results(sample_results)
+        mock_results_mgr_instance.get_results_stats.return_value = mock_stats
+
         evaluator = AgentGoalEval(mock_args)
 
         # Capture print output
@@ -185,121 +207,15 @@ def test_run_evaluation_success(
         assert mock_evaluation_runner.return_value.run_evaluation.call_count == 2
 
         # Verify results were saved
-        mock_results_manager.return_value.save_results.assert_called_once_with(
-            sample_results
+        mock_results_mgr_instance.save_results.assert_called_once_with(
+            mock_args.result_dir
         )
 
         # Verify summary was printed
         mock_print.assert_called()
 
-    @patch(
-        "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager"
-    )
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager")
-    def test_run_evaluation_with_errors(
-        self,
-        mock_results_manager,
-        mock_evaluation_runner,
-        mock_judge_manager,
-        mock_agent_client,
-        mock_config_manager,
-        mock_args,
-        sample_configs,
-        capsys,
-    ):
-        """Test evaluation execution with errors."""
-        # Setup results with errors
-        results_with_errors = [
-            EvaluationResult(
-                eval_id="test_001",
-                query="What is Kubernetes?",
-                response="Kubernetes is a container orchestration platform",
-                eval_type="judge-llm",
-                result="PASS",
-            ),
-            EvaluationResult(
-                eval_id="test_002",
-                query="Deploy nginx",
-                response="",
-                eval_type="script",
-                result="ERROR",
-                error="Script execution failed",
-            ),
-        ]
-
-        mock_config_manager.return_value.get_eval_data.return_value = sample_configs
-        mock_evaluation_runner.return_value.run_evaluation.side_effect = (
-            results_with_errors
-        )
-
-        evaluator = AgentGoalEval(mock_args)
-
-        evaluator.run_evaluation()
-
-        # Capture stdout/stderr output
-        captured = capsys.readouterr()
-
-        # Verify error messages are printed to stdout
-        assert "✅ test_001: PASS" in captured.out
-        assert "⚠️ test_002: ERROR" in captured.out
-        assert "   Query: Deploy nginx" in captured.out
-        assert "   Evaluation type: script" in captured.out
-        assert "   Response: " in captured.out
-        assert "   Error message: Script execution failed" in captured.out
-
-        # Verify evaluations were run
-        assert mock_evaluation_runner.return_value.run_evaluation.call_count == 2
-
-        # Verify results were saved
-        mock_results_manager.return_value.save_results.assert_called_once_with(
-            results_with_errors
-        )
-
-    @patch(
-        "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager"
-    )
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager")
-    def test_run_evaluation_exception(
-        self,
-        mock_results_manager,
-        mock_evaluation_runner,
-        mock_judge_manager,
-        mock_agent_client,
-        mock_config_manager,
-        mock_args,
-    ):
-        """Test evaluation execution with exception."""
-        mock_config_manager.return_value.get_eval_data.side_effect = Exception(
-            "Config error"
-        )
-
-        evaluator = AgentGoalEval(mock_args)
-
-        with patch(
-            "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.logger"
-        ) as mock_logger:
-            with pytest.raises(Exception, match="Config error"):
-                evaluator.run_evaluation()
-
-            # Verify error was 
logged - mock_logger.error.assert_called() - args, kwargs = mock_logger.error.call_args - assert args[0] == "Evaluation failed: %s" - assert str(args[1]) == "Config error" - - def test_print_summary_all_pass(self, mock_args): - """Test print summary with all passing results.""" - results = [ - EvaluationResult("test_001", "query1", "response1", "judge-llm", "PASS"), - EvaluationResult("test_002", "query2", "response2", "script", "PASS"), - ] - + def test_get_result_summary_success(self, mock_args): + """Test result summary with available results.""" with ( patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager" @@ -313,32 +229,17 @@ def test_print_summary_all_pass(self, mock_args): patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner" ), - patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager"), + patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner"), ): - evaluator = AgentGoalEval(mock_args) + evaluator.result_summary = {"TOTAL": 5, "PASS": 3, "FAIL": 1, "ERROR": 1} - with patch("builtins.print") as mock_print: - evaluator._print_summary(results) - - # Check that summary was printed - print_calls = [call[0][0] for call in mock_print.call_args_list] - summary_text = "\n".join(print_calls) - - assert "Total Evaluations: 2" in summary_text - assert "Passed: 2" in summary_text - assert "Failed: 0" in summary_text - assert "Errored: 0" in summary_text - assert "Success Rate: 100.0%" in summary_text - - def test_print_summary_mixed_results(self, mock_args): - """Test print summary with mixed results.""" - results = [ - EvaluationResult("test_001", "query1", "response1", "judge-llm", "PASS"), - EvaluationResult("test_002", "query2", "response2", "script", "FAIL"), - EvaluationResult("test_003", "query3", "response3", "script", "ERROR"), - ] + result = evaluator.get_result_summary() + assert result == {"TOTAL": 5, "PASS": 3, "FAIL": 1, "ERROR": 1} + + def test_get_result_summary_no_results(self, mock_args): + """Test result summary with no available results.""" with ( patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager" @@ -352,23 +253,12 @@ def test_print_summary_mixed_results(self, mock_args): patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner" ), - patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager"), + patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner"), ): - evaluator = AgentGoalEval(mock_args) - with patch("builtins.print") as mock_print: - evaluator._print_summary(results) - - # Check that summary was printed - print_calls = [call[0][0] for call in mock_print.call_args_list] - summary_text = "\n".join(print_calls) - - assert "Total Evaluations: 3" in summary_text - assert "Passed: 1" in summary_text - assert "Failed: 1" in summary_text - assert "Errored: 1" in summary_text - assert "Success Rate: 33.3%" in summary_text + with pytest.raises(AgentEvaluationError, match="No results available"): + evaluator.get_result_summary() def test_cleanup_with_client(self, mock_args): """Test cleanup method with client.""" @@ -385,7 +275,7 @@ def test_cleanup_with_client(self, mock_args): patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner" ), - patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager"), + patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner"), ): mock_client = Mock() @@ -412,7 +302,7 @@ def test_cleanup_exception(self, mock_args): 
patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner" ), - patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager"), + patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner"), ): mock_client = Mock() @@ -431,63 +321,3 @@ def test_cleanup_exception(self, mock_args): args, kwargs = mock_logger.warning.call_args assert args[0] == "Error during cleanup: %s" assert str(args[1]) == "Cleanup error" - - @patch( - "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager" - ) - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") - def test_run_evaluation_cleanup_called( - self, - mock_results_manager, - mock_evaluation_runner, - mock_judge_manager, - mock_agent_client, - mock_config_manager, - mock_args, - sample_configs, - sample_results, - ): - """Test that cleanup is called even on success.""" - mock_config_manager.return_value.get_eval_data.return_value = sample_configs - mock_evaluation_runner.return_value.run_evaluation.side_effect = sample_results - - evaluator = AgentGoalEval(mock_args) - - with patch.object(evaluator, "_cleanup") as mock_cleanup: - evaluator.run_evaluation() - - # Verify cleanup was called - mock_cleanup.assert_called_once() - - @patch( - "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager" - ) - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") - def test_run_evaluation_cleanup_called_on_exception( - self, - mock_results_manager, - mock_evaluation_runner, - mock_judge_manager, - mock_agent_client, - mock_config_manager, - mock_args, - ): - """Test that cleanup is called even on exception.""" - mock_config_manager.return_value.get_eval_data.side_effect = Exception( - "Config error" - ) - - evaluator = AgentGoalEval(mock_args) - - with patch.object(evaluator, "_cleanup") as mock_cleanup: - with pytest.raises(Exception): - evaluator.run_evaluation() - - # Verify cleanup was called - mock_cleanup.assert_called_once() diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_eval_data.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_eval_data.py index 71f6c291..865e60d1 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_eval_data.py +++ b/lsc_agent_eval/tests/core/agent_goal_eval/test_eval_data.py @@ -8,112 +8,128 @@ import yaml from lsc_agent_eval.core.agent_goal_eval.eval_data import AgentGoalEvalDataManager -from lsc_agent_eval.core.agent_goal_eval.models import EvaluationDataConfig -from lsc_agent_eval.core.utils.exceptions import ConfigurationError +from lsc_agent_eval.core.agent_goal_eval.models import ( + ConversationDataConfig, + EvaluationDataConfig, +) +from lsc_agent_eval.core.utils.exceptions import EvaluationDataError class TestAgentGoalEvalDataManager: """Test AgentGoalEvalDataManager.""" @pytest.fixture - def valid_eval_data(self): - """Valid evaluation data for testing.""" - return [ - { - "eval_id": "test_001", - "eval_query": "What is Kubernetes?", - "eval_type": "judge-llm", - "expected_response": "Kubernetes is 
a container orchestration platform", - }, - { - "eval_id": "test_002", - "eval_query": "Deploy nginx", - "eval_type": "script", - "eval_verify_script": "./scripts/verify_nginx.sh", - }, - { - "eval_id": "test_003", - "eval_query": "Show pods", - "eval_type": "sub-string", - "expected_keywords": ["pod", "running"], - }, - ] + def valid_conversation_yaml_content(self): + """Valid YAML content with conversation-based structure.""" + return """ +- conversation_group: conv1 + description: Test namespace detection using substring matching + conversation: + - eval_id: eval1 + eval_query: is there a openshift-monitoring namespace ? + eval_type: sub-string + expected_keywords: + - 'yes' + - openshift-monitoring + description: Check for openshift-monitoring namespace existence + +- conversation_group: conv2 + description: Test namespace detection using LLM judge + conversation: + - eval_id: eval1 + eval_query: is there a openshift-lightspeed namespace ? + eval_type: judge-llm + expected_response: there is a openshift-lightspeed namespace. + description: Verify openshift-lightspeed namespace with LLM evaluation +""" @pytest.fixture - def valid_yaml_content(self, valid_eval_data): - """Valid YAML content as string.""" - return yaml.dump(valid_eval_data) - - def test_init_success(self, valid_yaml_content): + def multiturn_conversation_yaml_content(self): + """Valid YAML content with multi-turn conversation.""" + return """ +- conversation_group: conv1 + description: Basic conversation flow testing cluster operations + conversation: + - eval_id: eval1 + eval_query: Hi! + eval_type: judge-llm + expected_response: Hello! I'm an AI assistant for the Assisted Installer. + description: Initial greeting to start conversation + - eval_id: eval2 + eval_query: Get me active clusters + eval_type: judge-llm + expected_response: Active clusters are x1, x2. + description: Request for cluster information + - eval_id: eval3 + eval_query: Thank you + eval_type: judge-llm + expected_response: You're welcome! 
+ description: Closing statement +""" + + def test_init_success(self, valid_conversation_yaml_content): """Test successful initialization.""" with ( - patch("builtins.open", mock_open(read_data=valid_yaml_content)), + patch( + "builtins.open", mock_open(read_data=valid_conversation_yaml_content) + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 3 - assert manager.eval_data_file == Path("test.yaml") - assert isinstance(manager.eval_data[0], EvaluationDataConfig) + assert manager.eval_data_file == "test.yaml" + assert len(manager.conversations) == 2 + assert len(manager.get_conversations()) == 2 + assert manager.get_eval_count() == 2 def test_init_file_not_found(self): """Test initialization with non-existent file.""" - with patch("pathlib.Path.exists", return_value=False): - with pytest.raises(ConfigurationError, match="Eval data file not found"): - AgentGoalEvalDataManager("nonexistent.yaml") + with pytest.raises(EvaluationDataError, match="Eval data file not found"): + AgentGoalEvalDataManager("nonexistent.yaml") - def test_init_path_not_file(self): - """Test initialization when path is not a file.""" - with ( - patch("pathlib.Path.exists", return_value=True), - patch("pathlib.Path.is_file", return_value=False), - ): - - with pytest.raises(ConfigurationError, match="path is not a file"): - AgentGoalEvalDataManager("directory/") + def test_validate_eval_data_file_not_yaml(self): + """Test loading invalid YAML file.""" + invalid_yaml = "invalid: yaml: content: [" - def test_validate_eval_data_file_exists(self): - """Test file validation when file exists.""" with ( + patch("builtins.open", mock_open(read_data=invalid_yaml)), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), - patch("builtins.open", mock_open(read_data="[]")), ): - # Should not raise exception - manager = AgentGoalEvalDataManager("test.yaml") - assert manager.eval_data_file == Path("test.yaml") - - def test_load_eval_data_invalid_yaml(self): - """Test loading invalid YAML content.""" - invalid_yaml = "invalid: yaml: content: [" + with pytest.raises(EvaluationDataError, match="Invalid YAML"): + AgentGoalEvalDataManager("test.yaml") + def test_load_eval_data_file_read_error(self): + """Test loading when file read fails.""" with ( - patch("builtins.open", mock_open(read_data=invalid_yaml)), + patch("builtins.open", side_effect=IOError("Read error")), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises(ConfigurationError, match="Invalid YAML"): + with pytest.raises( + EvaluationDataError, match="Error loading eval data file" + ): AgentGoalEvalDataManager("test.yaml") def test_load_eval_data_not_list(self): """Test loading YAML that is not a list.""" - yaml_dict = yaml.dump({"key": "value"}) + non_list_yaml = yaml.dump({"not": "a list"}) with ( - patch("builtins.open", mock_open(read_data=yaml_dict)), + patch("builtins.open", mock_open(read_data=non_list_yaml)), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises(ConfigurationError, match="must contain a list"): + with pytest.raises(EvaluationDataError, match="must contain a list"): AgentGoalEvalDataManager("test.yaml") def test_load_eval_data_empty_list(self): - """Test loading empty evaluation list.""" + """Test loading YAML file with empty list.""" 
empty_yaml = yaml.dump([]) with ( @@ -122,25 +138,25 @@ def test_load_eval_data_empty_list(self): patch("pathlib.Path.is_file", return_value=True), ): - manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 0 - - def test_load_eval_data_file_read_error(self): - """Test loading when file read fails.""" - with ( - patch("builtins.open", side_effect=IOError("Read error")), - patch("pathlib.Path.exists", return_value=True), - patch("pathlib.Path.is_file", return_value=True), - ): - with pytest.raises( - ConfigurationError, match="Error loading eval data file" + EvaluationDataError, match="must contain at least one conversation" ): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_missing_eval_id(self): - """Test validation with missing eval_id.""" - invalid_data = [{"eval_query": "test query"}] + def test_validate_conversation_missing_group(self): + """Test validation with missing conversation_group.""" + invalid_data = [ + { + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "eval_type": "judge-llm", + "expected_response": "test response", + } + ] + } + ] yaml_content = yaml.dump(invalid_data) with ( @@ -149,14 +165,16 @@ def test_validate_eval_data_missing_eval_id(self): patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises( - ConfigurationError, match="Missing required field 'eval_id'" - ): + with pytest.raises(EvaluationDataError, match=".*Field required.*"): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_missing_eval_query(self): - """Test validation with missing eval_query.""" - invalid_data = [{"eval_id": "test_001"}] + def test_validate_conversation_missing_conversation_list(self): + """Test validation with missing conversation list.""" + invalid_data = [ + { + "conversation_group": "test_conv", + } + ] yaml_content = yaml.dump(invalid_data) with ( @@ -165,18 +183,21 @@ def test_validate_eval_data_missing_eval_query(self): patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises( - ConfigurationError, match="Missing required field 'eval_query'" - ): + with pytest.raises(EvaluationDataError, match=".*Field required.*"): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_invalid_eval_type(self): - """Test validation with invalid eval_type.""" + def test_validate_eval_missing_eval_id(self): + """Test validation with missing eval_id.""" invalid_data = [ { - "eval_id": "test_001", - "eval_query": "test query", - "eval_type": "invalid_type", + "conversation_group": "test_conv", + "conversation": [ + { + "eval_query": "test query", + "eval_type": "judge-llm", + "expected_response": "test response", + } + ], } ] yaml_content = yaml.dump(invalid_data) @@ -187,18 +208,21 @@ def test_validate_eval_data_invalid_eval_type(self): patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises( - ConfigurationError, match="Invalid eval_type: invalid_type" - ): + with pytest.raises(EvaluationDataError, match=".*Field required.*"): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_judge_llm_missing_expected_response(self): - """Test validation for judge-llm type missing expected_response.""" + def test_validate_eval_missing_eval_query(self): + """Test validation with missing eval_query.""" invalid_data = [ { - "eval_id": "test_001", - "eval_query": "test query", - "eval_type": "judge-llm", + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_type": "judge-llm", + "expected_response": "test 
response", + } + ], } ] yaml_content = yaml.dump(invalid_data) @@ -209,18 +233,21 @@ def test_validate_eval_data_judge_llm_missing_expected_response(self): patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises( - ConfigurationError, match="requires 'expected_response' field" - ): + with pytest.raises(EvaluationDataError, match=".*Field required.*"): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_sub_string_missing_keywords(self): - """Test validation for sub-string type missing expected_keywords.""" + def test_validate_eval_missing_eval_type(self): + """Test validation with missing eval_type.""" invalid_data = [ { - "eval_id": "test_001", - "eval_query": "test query", - "eval_type": "sub-string", + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "expected_response": "test response", + } + ], } ] yaml_content = yaml.dump(invalid_data) @@ -231,15 +258,23 @@ def test_validate_eval_data_sub_string_missing_keywords(self): patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises( - ConfigurationError, match="requires 'expected_keywords' field" - ): + with pytest.raises(EvaluationDataError, match=".*Field required.*"): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_script_missing_verify_script(self): - """Test validation for script type missing eval_verify_script.""" + def test_validate_eval_invalid_eval_type(self): + """Test validation with invalid eval_type.""" invalid_data = [ - {"eval_id": "test_001", "eval_query": "test query", "eval_type": "script"} + { + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "eval_type": "invalid_type", + "expected_response": "test response", + } + ], + } ] yaml_content = yaml.dump(invalid_data) @@ -250,21 +285,25 @@ def test_validate_eval_data_script_missing_verify_script(self): ): with pytest.raises( - ConfigurationError, match="requires 'eval_verify_script' field" + EvaluationDataError, match=".*eval_type must be one of.*" ): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_default_eval_type(self): - """Test validation with default eval_type (judge-llm).""" - data_with_default_type = [ + def test_validate_judge_llm_missing_expected_response(self): + """Test validation for judge-llm missing expected_response.""" + invalid_data = [ { - "eval_id": "test_001", - "eval_query": "test query", - "expected_response": "test response", - # eval_type not specified, should default to judge-llm + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "eval_type": "judge-llm", + } + ], } ] - yaml_content = yaml.dump(data_with_default_type) + yaml_content = yaml.dump(invalid_data) with ( patch("builtins.open", mock_open(read_data=yaml_content)), @@ -272,205 +311,209 @@ def test_validate_eval_data_default_eval_type(self): patch("pathlib.Path.is_file", return_value=True), ): - manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 1 - assert manager.eval_data[0].eval_type == "judge-llm" + with pytest.raises(EvaluationDataError, match=".*expected_response.*"): + AgentGoalEvalDataManager("test.yaml") + + def test_validate_sub_string_missing_keywords(self): + """Test validation for sub-string missing expected_keywords.""" + invalid_data = [ + { + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "eval_type": 
"sub-string", + } + ], + } + ] + yaml_content = yaml.dump(invalid_data) - def test_get_eval_data(self, valid_yaml_content): - """Test get_eval_data method.""" with ( - patch("builtins.open", mock_open(read_data=valid_yaml_content)), + patch("builtins.open", mock_open(read_data=yaml_content)), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): - manager = AgentGoalEvalDataManager("test.yaml") - eval_data = manager.get_eval_data() + with pytest.raises(EvaluationDataError, match=".*expected_keywords.*"): + AgentGoalEvalDataManager("test.yaml") - assert isinstance(eval_data, list) - assert len(eval_data) == 3 - assert all(isinstance(item, EvaluationDataConfig) for item in eval_data) - assert eval_data[0].eval_id == "test_001" - assert eval_data[1].eval_id == "test_002" - assert eval_data[2].eval_id == "test_003" + def test_validate_script_missing_verify_script(self): + """Test validation for script missing eval_verify_script.""" + invalid_data = [ + { + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "eval_type": "script", + } + ], + } + ] + yaml_content = yaml.dump(invalid_data) - def test_get_eval_count(self, valid_yaml_content): - """Test get_eval_count method.""" with ( - patch("builtins.open", mock_open(read_data=valid_yaml_content)), + patch("builtins.open", mock_open(read_data=yaml_content)), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): - manager = AgentGoalEvalDataManager("test.yaml") - count = manager.get_eval_count() - - assert count == 3 - assert count == len(manager.eval_data) + with pytest.raises(EvaluationDataError, match=".*eval_verify_script.*"): + AgentGoalEvalDataManager("test.yaml") - def test_get_eval_count_empty(self): - """Test get_eval_count with empty data.""" - empty_yaml = yaml.dump([]) + def test_duplicate_conversation_groups(self): + """Test validation with duplicate conversation_group names.""" + invalid_data = [ + { + "conversation_group": "duplicate_group", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query 1", + "eval_type": "judge-llm", + "expected_response": "test response 1", + } + ], + }, + { + "conversation_group": "duplicate_group", + "conversation": [ + { + "eval_id": "test2", + "eval_query": "test query 2", + "eval_type": "judge-llm", + "expected_response": "test response 2", + } + ], + }, + ] + yaml_content = yaml.dump(invalid_data) with ( - patch("builtins.open", mock_open(read_data=empty_yaml)), + patch("builtins.open", mock_open(read_data=yaml_content)), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): - manager = AgentGoalEvalDataManager("test.yaml") - count = manager.get_eval_count() - - assert count == 0 - - def test_judge_llm_validation_success(self): - """Test successful validation for judge-llm type.""" - judge_llm_data = [ - { - "eval_id": "test_judge", - "eval_query": "What is Docker?", - "eval_type": "judge-llm", - "expected_response": "Docker is a containerization platform", - } - ] - yaml_content = yaml.dump(judge_llm_data) + with pytest.raises( + EvaluationDataError, match="Duplicate conversation_group" + ): + AgentGoalEvalDataManager("test.yaml") + def test_get_conversations(self, valid_conversation_yaml_content): + """Test get conversations method.""" with ( - patch("builtins.open", mock_open(read_data=yaml_content)), + patch( + "builtins.open", 
mock_open(read_data=valid_conversation_yaml_content) + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 1 - assert manager.eval_data[0].eval_type == "judge-llm" - assert ( - manager.eval_data[0].expected_response - == "Docker is a containerization platform" - ) - - def test_script_validation_success(self): - """Test successful validation for script type.""" - script_data = [ - { - "eval_id": "test_script", - "eval_query": "Deploy application", - "eval_type": "script", - "eval_verify_script": "./verify_deployment.sh", - } - ] - yaml_content = yaml.dump(script_data) + conversations = manager.get_conversations() + + assert len(conversations) == 2 + assert isinstance(conversations[0], ConversationDataConfig) + assert conversations[0].conversation_group == "conv1" + assert conversations[1].conversation_group == "conv2" + def test_get_eval_data_via_conversations(self, valid_conversation_yaml_content): + """Test getting evaluation data via conversations.""" with ( - patch("builtins.open", mock_open(read_data=yaml_content)), + patch( + "builtins.open", mock_open(read_data=valid_conversation_yaml_content) + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 1 - assert manager.eval_data[0].eval_type == "script" - assert manager.eval_data[0].eval_verify_script == "./verify_deployment.sh" + conversations = manager.get_conversations() - def test_sub_string_validation_success(self): - """Test successful validation for sub-string type.""" - sub_string_data = [ - { - "eval_id": "test_substring", - "eval_query": "List services", - "eval_type": "sub-string", - "expected_keywords": ["service", "active", "running"], - } - ] - yaml_content = yaml.dump(sub_string_data) + eval_data = [] + for conversation in conversations: + eval_data.extend(conversation.conversation) + assert len(eval_data) == 2 + assert isinstance(eval_data[0], EvaluationDataConfig) + assert eval_data[0].eval_id == "eval1" + assert eval_data[1].eval_id == "eval1" + + def test_get_eval_count(self, valid_conversation_yaml_content): + """Test get_eval_count method.""" with ( - patch("builtins.open", mock_open(read_data=yaml_content)), + patch( + "builtins.open", mock_open(read_data=valid_conversation_yaml_content) + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 1 - assert manager.eval_data[0].eval_type == "sub-string" - assert manager.eval_data[0].expected_keywords == [ - "service", - "active", - "running", - ] - - def test_mixed_eval_types(self): - """Test loading data with mixed evaluation types.""" - mixed_data = [ - { - "eval_id": "judge_test", - "eval_query": "What is Kubernetes?", - "eval_type": "judge-llm", - "expected_response": "Container orchestration", - }, - { - "eval_id": "script_test", - "eval_query": "Deploy nginx", - "eval_type": "script", - "eval_verify_script": "./verify.sh", - }, - { - "eval_id": "substring_test", - "eval_query": "List pods", - "eval_type": "sub-string", - "expected_keywords": ["pod", "running"], - }, - ] - yaml_content = yaml.dump(mixed_data) + count = manager.get_eval_count() + assert count == 2 + + def test_conversation_count_via_conversations( + self, valid_conversation_yaml_content + ): + """Test 
getting conversation count via conversations list.""" with ( - patch("builtins.open", mock_open(read_data=yaml_content)), + patch( + "builtins.open", mock_open(read_data=valid_conversation_yaml_content) + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 3 + count = len(manager.conversations) - types = [item.eval_type for item in manager.eval_data] - assert "judge-llm" in types - assert "script" in types - assert "sub-string" in types - - def test_eval_data_with_optional_fields(self): - """Test evaluation data with optional fields.""" - data_with_optional = [ - { - "eval_id": "test_with_optional", - "eval_query": "Deploy app", - "eval_type": "script", - "eval_verify_script": "./verify.sh", - "eval_setup_script": "./setup.sh", - "eval_cleanup_script": "./cleanup.sh", - } - ] - yaml_content = yaml.dump(data_with_optional) + assert count == 2 + def test_multiturn_conversation_loading(self, multiturn_conversation_yaml_content): + """Test loading multi-turn conversation.""" with ( - patch("builtins.open", mock_open(read_data=yaml_content)), + patch( + "builtins.open", + mock_open(read_data=multiturn_conversation_yaml_content), + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 1 - eval_item = manager.eval_data[0] - assert eval_item.eval_setup_script == "./setup.sh" - assert eval_item.eval_cleanup_script == "./cleanup.sh" + + assert len(manager.conversations) == 1 + assert manager.get_eval_count() == 3 + + conversations = manager.get_conversations() + conv = conversations[0] + assert conv.conversation_group == "conv1" + assert len(conv.conversation) == 3 + assert conv.conversation[0].eval_id == "eval1" + assert conv.conversation[1].eval_id == "eval2" + assert conv.conversation[2].eval_id == "eval3" def test_load_real_yaml_file_integration(self): """Integration test with a real temporary YAML file.""" eval_data = [ { - "eval_id": "integration_test", - "eval_query": "Test query", - "eval_type": "judge-llm", - "expected_response": "Test response", + "conversation_group": "integration_test", + "description": "Integration test conversation", + "conversation": [ + { + "eval_id": "integration_test_eval", + "eval_query": "Test query", + "eval_type": "judge-llm", + "expected_response": "Test response", + "description": "Integration test evaluation", + } + ], } ] @@ -480,7 +523,13 @@ def test_load_real_yaml_file_integration(self): try: manager = AgentGoalEvalDataManager(temp_file_path) - assert len(manager.eval_data) == 1 - assert manager.eval_data[0].eval_id == "integration_test" + + assert len(manager.conversations) == 1 + assert manager.get_eval_count() == 1 + + conversations = manager.get_conversations() + assert conversations[0].conversation_group == "integration_test" + assert conversations[0].description == "Integration test conversation" + finally: Path(temp_file_path).unlink() # Clean up temporary file diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_evaluator.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_evaluator.py index a4c1d96b..69159b3a 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_evaluator.py +++ b/lsc_agent_eval/tests/core/agent_goal_eval/test_evaluator.py @@ -1,6 +1,8 @@ """Tests for evaluation runner.""" -from unittest.mock import Mock, patch +import os +import tempfile +from 
unittest.mock import Mock import pytest @@ -9,6 +11,7 @@ EvaluationDataConfig, EvaluationResult, ) +from lsc_agent_eval.core.agent_goal_eval.script_runner import ScriptRunner from lsc_agent_eval.core.utils.api_client import AgentHttpClient from lsc_agent_eval.core.utils.exceptions import AgentAPIError, ScriptExecutionError from lsc_agent_eval.core.utils.judge import JudgeModelManager @@ -21,9 +24,24 @@ class TestEvaluationRunner: def mock_agent_client(self): """Mock agent client.""" mock_client = Mock(spec=AgentHttpClient) - mock_client.query_agent.return_value = "Test agent response" + + # Mock agent API: return conversation_id from input or generate one + def mock_query_agent(api_input, timeout=300): + return ( + "Test agent response", + api_input.get("conversation_id", "generated-conversation-id"), + ) + + mock_client.query_agent.side_effect = mock_query_agent return mock_client + @pytest.fixture + def mock_script_runner(self): + """Mock script runner.""" + mock_runner = Mock(spec=ScriptRunner) + mock_runner.run_script.return_value = True + return mock_runner + @pytest.fixture def mock_judge_manager(self): """Mock judge manager.""" @@ -36,385 +54,361 @@ def sample_config_judge_llm(self): """Sample judge-llm evaluation configuration.""" return EvaluationDataConfig( eval_id="test_001", - eval_query="What is Kubernetes?", + eval_query="What is Openshift Virtualization?", eval_type="judge-llm", - expected_response="Kubernetes is a container orchestration platform", + expected_response="OpenShift Virtualization is an extension of the OpenShift Container Platform", ) @pytest.fixture - def sample_config_script(self): + def get_test_script_path(self): + """Create a temporary test script file and cleanup.""" + # Setup + with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False) as f: + f.write('#!/bin/bash\necho "test script"\nexit 0') + script_path = f.name + os.chmod(script_path, 0o755) + + yield script_path + + # Cleanup + os.unlink(script_path) + + @pytest.fixture + def sample_config_script(self, get_test_script_path): """Sample script evaluation configuration.""" return EvaluationDataConfig( eval_id="test_002", eval_query="Deploy nginx", eval_type="script", - eval_verify_script="./verify.sh", + eval_verify_script=get_test_script_path, ) @pytest.fixture def sample_config_substring(self): - """Sample substring evaluation configuration.""" + """Sample sub-string evaluation configuration.""" return EvaluationDataConfig( eval_id="test_003", - eval_query="What is Docker?", + eval_query="What is Podman?", eval_type="sub-string", - expected_keywords=["container", "docker"], + expected_keywords=["container", "podman"], ) - def test_init(self, mock_agent_client, mock_judge_manager): + def test_init(self, mock_agent_client, mock_script_runner, mock_judge_manager): """Test EvaluationRunner initialization.""" runner = EvaluationRunner( - mock_agent_client, mock_judge_manager, kubeconfig="~/kubeconfig" + mock_agent_client, + mock_script_runner, + mock_judge_manager, ) assert runner.agent_client == mock_agent_client + assert runner.script_runner == mock_script_runner assert runner.judge_manager == mock_judge_manager - assert runner.kubeconfig == "~/kubeconfig" - def test_init_without_judge_manager(self, mock_agent_client): + def test_init_without_judge_manager(self, mock_agent_client, mock_script_runner): """Test EvaluationRunner initialization without judge manager.""" - runner = EvaluationRunner(mock_agent_client) + runner = EvaluationRunner(mock_agent_client, mock_script_runner) assert 
runner.agent_client == mock_agent_client + assert runner.script_runner == mock_script_runner assert runner.judge_manager is None - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") def test_run_evaluation_judge_llm_success( self, - mock_script_runner, mock_agent_client, + mock_script_runner, mock_judge_manager, sample_config_judge_llm, ): """Test successful judge-llm evaluation.""" - # Mock agent response - mock_agent_client.query_agent.return_value = ( - "Kubernetes is a container orchestration platform" + runner = EvaluationRunner( + mock_agent_client, mock_script_runner, mock_judge_manager ) - # Mock judge response - mock_judge_manager.evaluate_response.return_value = "1" - - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(sample_config_judge_llm, "openai", "gpt-4") + result = runner.run_evaluation( + sample_config_judge_llm, + "watsonx", + "ibm/granite-3-3-8b-instruct", + "conv-id-123", + ) assert isinstance(result, EvaluationResult) assert result.eval_id == "test_001" - assert result.result == "PASS" + assert result.query == "What is Openshift Virtualization?" assert result.eval_type == "judge-llm" + assert result.result == "PASS" + assert result.conversation_id == "conv-id-123" assert result.error is None - # Verify agent was queried + # Verify agent was called mock_agent_client.query_agent.assert_called_once_with( - "What is Kubernetes?", "openai", "gpt-4" + { + "query": "What is Openshift Virtualization?", + "provider": "watsonx", + "model": "ibm/granite-3-3-8b-instruct", + "conversation_id": "conv-id-123", + } ) # Verify judge was called mock_judge_manager.evaluate_response.assert_called_once() - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_script_success( - self, mock_script_runner_class, mock_agent_client, sample_config_script + def test_run_evaluation_judge_llm_failure( + self, + mock_agent_client, + mock_script_runner, + mock_judge_manager, + sample_config_judge_llm, ): - """Test successful script evaluation.""" - # Mock agent response - mock_agent_client.query_agent.return_value = ( - "kubectl create deployment nginx --image=nginx" - ) + """Test failed judge-llm evaluation.""" + # Mock judge to return 0 (failure) + mock_judge_manager.evaluate_response.return_value = "0" - # Mock script runner instance - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = True - mock_script_runner_class.return_value = mock_script_runner_instance + runner = EvaluationRunner( + mock_agent_client, mock_script_runner, mock_judge_manager + ) - runner = EvaluationRunner(mock_agent_client) - result = runner.run_evaluation(sample_config_script, "openai", "gpt-4") + result = runner.run_evaluation( + sample_config_judge_llm, + "openai", + "gpt-4", + "conv-id-123", + ) - assert isinstance(result, EvaluationResult) - assert result.eval_id == "test_002" - assert result.result == "PASS" - assert result.eval_type == "script" + assert result.result == "FAIL" assert result.error is None - # Verify ScriptRunner was created with the right kubeconfig - mock_script_runner_class.assert_called_with(kubeconfig=None) - # Verify script was executed - mock_script_runner_instance.run_script.assert_called_once_with("./verify.sh") - - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_script_failure( - self, mock_script_runner_class, mock_agent_client, sample_config_script + def test_run_evaluation_script_success( + self, 
mock_agent_client, mock_script_runner, sample_config_script ): - """Test script evaluation failure.""" - # Mock agent response - mock_agent_client.query_agent.return_value = ( - "kubectl create deployment nginx --image=nginx" - ) - - # Mock script runner instance returning failure - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = False - mock_script_runner_class.return_value = mock_script_runner_instance + """Test successful script evaluation.""" + runner = EvaluationRunner(mock_agent_client, mock_script_runner) - runner = EvaluationRunner(mock_agent_client) - result = runner.run_evaluation(sample_config_script, "openai", "gpt-4") + result = runner.run_evaluation( + sample_config_script, + "openai", + "gpt-4", + "conv-id-123", + ) assert isinstance(result, EvaluationResult) assert result.eval_id == "test_002" - assert result.result == "FAIL" assert result.eval_type == "script" + assert result.result == "PASS" assert result.error is None - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_script_with_kubeconfig( - self, mock_script_runner_class, mock_agent_client, sample_config_script - ): - """Test script evaluation with kubeconfig.""" - # Mock agent response - mock_agent_client.query_agent.return_value = ( - "kubectl create deployment nginx --image=nginx" + # Verify agent was called + mock_agent_client.query_agent.assert_called_once() + + # Verify script was run + mock_script_runner.run_script.assert_called_once_with( + sample_config_script.eval_verify_script ) - # Mock script runner instance - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = True - mock_script_runner_class.return_value = mock_script_runner_instance + def test_run_evaluation_script_failure( + self, mock_agent_client, mock_script_runner, sample_config_script + ): + """Test failed script evaluation.""" + # Mock script to return False (failure) + mock_script_runner.run_script.return_value = False - runner = EvaluationRunner(mock_agent_client, kubeconfig="~/kubeconfig") - result = runner.run_evaluation(sample_config_script, "openai", "gpt-4") + runner = EvaluationRunner(mock_agent_client, mock_script_runner) - assert result.result == "PASS" + result = runner.run_evaluation( + sample_config_script, + "openai", + "gpt-4", + "conv-id-123", + ) - # Verify ScriptRunner was created with kubeconfig - mock_script_runner_class.assert_called_with(kubeconfig="~/kubeconfig") - # Verify script was executed - mock_script_runner_instance.run_script.assert_called_once_with("./verify.sh") + assert result.result == "FAIL" + assert result.error is None - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") def test_run_evaluation_script_execution_error( - self, mock_script_runner_class, mock_agent_client, sample_config_script + self, mock_agent_client, mock_script_runner, sample_config_script ): """Test script evaluation with execution error.""" - # Mock agent response - mock_agent_client.query_agent.return_value = ( - "kubectl create deployment nginx --image=nginx" - ) - - # Mock script runner instance raising error - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.side_effect = ScriptExecutionError( + # Mock script to raise exception + mock_script_runner.run_script.side_effect = ScriptExecutionError( "Script failed" ) - mock_script_runner_class.return_value = mock_script_runner_instance - runner = EvaluationRunner(mock_agent_client) - result = 
runner.run_evaluation(sample_config_script, "openai", "gpt-4") + runner = EvaluationRunner(mock_agent_client, mock_script_runner) + + result = runner.run_evaluation( + sample_config_script, + "openai", + "gpt-4", + "conv-id-123", + ) - assert isinstance(result, EvaluationResult) - assert result.eval_id == "test_002" assert result.result == "ERROR" - assert result.error == "Script failed" + assert "Script failed" in result.error def test_run_evaluation_substring_success( - self, mock_agent_client, sample_config_substring + self, mock_agent_client, mock_script_runner, sample_config_substring ): - """Test successful substring evaluation.""" + """Test successful sub-string evaluation.""" + # Mock agent response containing expected keywords - mock_agent_client.query_agent.return_value = "Docker is a container platform" + def mock_query_agent(api_input, timeout=300): + return ( + "Podman is an open-source container engine developed by Red Hat", + api_input.get("conversation_id", "test-conversation-id"), + ) - runner = EvaluationRunner(mock_agent_client) - result = runner.run_evaluation(sample_config_substring, "openai", "gpt-4") + mock_agent_client.query_agent.side_effect = mock_query_agent + + runner = EvaluationRunner(mock_agent_client, mock_script_runner) + + result = runner.run_evaluation( + sample_config_substring, + "openai", + "gpt-4", + "conv-id-123", + ) - assert isinstance(result, EvaluationResult) assert result.eval_id == "test_003" assert result.result == "PASS" assert result.eval_type == "sub-string" + assert result.error is None def test_run_evaluation_substring_failure( - self, mock_agent_client, sample_config_substring + self, mock_agent_client, mock_script_runner, sample_config_substring ): - """Test substring evaluation failure.""" + """Test sub-string evaluation failure.""" + # Mock agent response not containing expected keywords - mock_agent_client.query_agent.return_value = "This is about virtual machines" + def mock_query_agent(api_input, timeout=300): + return ( + "No information available", + api_input.get("conversation_id", "test-conversation-id"), + ) - runner = EvaluationRunner(mock_agent_client) - result = runner.run_evaluation(sample_config_substring, "openai", "gpt-4") + mock_agent_client.query_agent.side_effect = mock_query_agent + + runner = EvaluationRunner(mock_agent_client, mock_script_runner) + + result = runner.run_evaluation( + sample_config_substring, + "openai", + "gpt-4", + None, + ) - assert isinstance(result, EvaluationResult) assert result.eval_id == "test_003" assert result.result == "FAIL" assert result.eval_type == "sub-string" + assert result.error is None - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_with_setup_script( - self, mock_script_runner_class, mock_agent_client, mock_judge_manager + def test_run_evaluation_agent_api_error( + self, mock_agent_client, mock_script_runner, sample_config_judge_llm ): - """Test evaluation with setup script.""" - config = EvaluationDataConfig( - eval_id="test_setup", - eval_query="Test query", - eval_type="judge-llm", - expected_response="Test response", - eval_setup_script="./setup.sh", + """Test evaluation with agent API error.""" + # Mock agent client to raise API error + mock_agent_client.query_agent.side_effect = AgentAPIError( + "API connection failed" ) - # Mock script runner instance for setup - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = True - mock_script_runner_class.return_value = 
mock_script_runner_instance - - # Mock agent and judge responses - mock_agent_client.query_agent.return_value = "Test response" - mock_judge_manager.evaluate_response.return_value = "1" + runner = EvaluationRunner(mock_agent_client, mock_script_runner) - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(config, "openai", "gpt-4") + result = runner.run_evaluation( + sample_config_judge_llm, + "openai", + "gpt-4", + "conv-id-123", + ) - assert result.result == "PASS" - # Verify setup script was called - mock_script_runner_instance.run_script.assert_called_with("./setup.sh") + assert result.eval_id == "test_001" + assert result.result == "ERROR" + assert result.eval_type == "judge-llm" + assert "API connection failed" in result.error - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_setup_script_failure( - self, mock_script_runner_class, mock_agent_client, mock_judge_manager + def test_substring_evaluation_logic( + self, mock_agent_client, mock_script_runner, mock_judge_manager ): - """Test evaluation with setup script failure.""" + """Test sub-string evaluation with different keyword combinations.""" + runner = EvaluationRunner( + mock_agent_client, mock_script_runner, mock_judge_manager + ) + config = EvaluationDataConfig( - eval_id="test_setup_fail", + eval_id="substring_test", eval_query="Test query", - eval_type="judge-llm", - expected_response="Test response", - eval_setup_script="./setup.sh", + eval_type="sub-string", + expected_keywords=["keyword1", "keyword2"], ) - # Mock failing setup script execution - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = False - mock_script_runner_class.return_value = mock_script_runner_instance + # Test all keywords present - should PASS + def mock_query_agent_all_keywords(api_input, timeout=300): + return ( + "Response with keyword1 and keyword2", + api_input.get("conversation_id", "test-conversation-id"), + ) - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(config, "openai", "gpt-4") + mock_agent_client.query_agent.side_effect = mock_query_agent_all_keywords + result = runner.run_evaluation(config, "openai", "gpt-4", "conv-id-123") + assert result.result == "PASS" - assert result.result == "ERROR" - assert "Setup script failed" in result.error + # Test some keywords missing (only one present) - should FAIL + def mock_query_agent_one_keyword(api_input, timeout=300): + return ( + "Response with only keyword1", + api_input.get("conversation_id", "test-conversation-id"), + ) - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_with_cleanup_script( - self, mock_script_runner_class, mock_agent_client, mock_judge_manager - ): - """Test evaluation with cleanup script.""" - config = EvaluationDataConfig( - eval_id="test_cleanup", - eval_query="Test query", - eval_type="judge-llm", - expected_response="Test response", - eval_cleanup_script="./cleanup.sh", - ) + mock_agent_client.query_agent.side_effect = mock_query_agent_one_keyword + result = runner.run_evaluation(config, "openai", "gpt-4", "conv-id-123") + assert result.result == "FAIL" - # Mock successful cleanup script execution - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = True - mock_script_runner_class.return_value = mock_script_runner_instance + # Test no keywords present - should FAIL + def 
mock_query_agent_no_keywords(api_input, timeout=300): + return ( + "Response with no matching terms", + api_input.get("conversation_id", "test-conversation-id"), + ) - # Mock agent and judge responses - mock_agent_client.query_agent.return_value = "Test response" - mock_judge_manager.evaluate_response.return_value = "1" + mock_agent_client.query_agent.side_effect = mock_query_agent_no_keywords + result = runner.run_evaluation(config, "openai", "gpt-4", "conv-id-123") + assert result.result == "FAIL" - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(config, "openai", "gpt-4") + # Test case insensitive matching + def mock_query_agent_case_insensitive(api_input, timeout=300): + return ( + "Response with KEYWORD1 and Keyword2", + api_input.get("conversation_id", "test-conversation-id"), + ) + mock_agent_client.query_agent.side_effect = mock_query_agent_case_insensitive + result = runner.run_evaluation(config, "openai", "gpt-4", "conv-id-123") assert result.result == "PASS" - # Verify cleanup script was called - mock_script_runner_instance.run_script.assert_called_with("./cleanup.sh") - def test_run_evaluation_agent_api_error( - self, mock_agent_client, mock_judge_manager, sample_config_judge_llm + def test_conversation_id_propagation( + self, mock_agent_client, mock_script_runner, mock_judge_manager ): - """Test evaluation with agent API error.""" - # Mock agent API error - mock_agent_client.query_agent.side_effect = AgentAPIError( - "API connection failed" + """Test that conversation ID is properly propagated to results.""" + runner = EvaluationRunner( + mock_agent_client, mock_script_runner, mock_judge_manager ) - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(sample_config_judge_llm, "openai", "gpt-4") - - assert isinstance(result, EvaluationResult) - assert result.result == "ERROR" - assert "API connection failed" in result.error - - def test_run_evaluation_unknown_type(self, mock_agent_client): - """Test evaluation with unknown evaluation type.""" config = EvaluationDataConfig( - eval_id="test_unknown", + eval_id="conv_id_test", eval_query="Test query", - eval_type="unknown-type", + eval_type="judge-llm", + expected_response="Test response", ) - # Mock agent response - mock_agent_client.query_agent.return_value = "Test response" + test_conv_id = "conv-id-456" + result = runner.run_evaluation(config, "openai", "gpt-4", test_conv_id) - runner = EvaluationRunner(mock_agent_client) - result = runner.run_evaluation(config, "openai", "gpt-4") + assert result.conversation_id == test_conv_id - assert isinstance(result, EvaluationResult) - assert result.result == "FAIL" - - def test_get_judge_manager(self, mock_agent_client, mock_judge_manager): - """Test get_judge_manager method.""" - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - assert runner.get_judge_manager() == mock_judge_manager - - runner_no_judge = EvaluationRunner(mock_agent_client) - assert runner_no_judge.get_judge_manager() is None - - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_judge_llm_failure( - self, - mock_script_runner, - mock_agent_client, - mock_judge_manager, - sample_config_judge_llm, - ): - """Test judge-llm evaluation failure.""" - # Mock agent response - mock_agent_client.query_agent.return_value = "Some incorrect response" - - # Mock judge response indicating failure - mock_judge_manager.evaluate_response.return_value = "0" - - runner = 
EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(sample_config_judge_llm, "openai", "gpt-4") - - assert isinstance(result, EvaluationResult) - assert result.eval_id == "test_001" - assert result.result == "FAIL" - assert result.eval_type == "judge-llm" - assert result.error is None - - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_judge_llm_error( - self, - mock_script_runner, - mock_agent_client, - mock_judge_manager, - sample_config_judge_llm, - ): - """Test judge-llm evaluation error.""" - # Mock agent response - mock_agent_client.query_agent.return_value = "Some incorrect response" - - # Mock judge response indicating failure - mock_judge_manager.evaluate_response.return_value = "00" - - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(sample_config_judge_llm, "openai", "gpt-4") - - assert isinstance(result, EvaluationResult) - assert result.eval_id == "test_001" - assert result.result == "ERROR" - assert result.eval_type == "judge-llm" - assert result.error == ( - "Invalid response from the judge model. " - "Expected value either 0/1. Actual value: 00" + # Verify ID was passed to agent client + mock_agent_client.query_agent.assert_called_once_with( + { + "query": "Test query", + "provider": "openai", + "model": "gpt-4", + "conversation_id": test_conv_id, + } ) diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_models.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_models.py index e646d333..18c271f8 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_models.py +++ b/lsc_agent_eval/tests/core/agent_goal_eval/test_models.py @@ -1,8 +1,16 @@ """Tests for agent evaluation models.""" +from pathlib import Path +from unittest.mock import mock_open, patch + +import pytest +from pydantic import ValidationError + from lsc_agent_eval.core.agent_goal_eval.models import ( + ConversationDataConfig, EvaluationDataConfig, EvaluationResult, + EvaluationStats, ) @@ -57,25 +65,35 @@ def test_evaluation_result_defaults(self): assert result.error is None + def test_evaluation_result_invalid_result_type(self): + """Test EvaluationResult with invalid result type.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationResult( + eval_id="test_004", + query="Test query", + response="Test response", + eval_type="judge-llm", + result="INVALID", + ) -class TestEvaluationDataConfig: - """Test EvaluationDataConfig data class.""" + assert "Result must be one of" in str(exc_info.value) - def test_evaluation_data_config_minimal(self): - """Test creating minimal EvaluationDataConfig.""" - config = EvaluationDataConfig( - eval_id="test_001", - eval_query="What is Kubernetes?", - ) + def test_evaluation_result_invalid_eval_type(self): + """Test EvaluationResult with invalid eval type.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationResult( + eval_id="test_005", + query="Test query", + response="Test response", + eval_type="invalid-type", + result="PASS", + ) - assert config.eval_id == "test_001" - assert config.eval_query == "What is Kubernetes?" 
- assert config.eval_type == "judge-llm" # default - assert config.expected_response is None - assert config.expected_keywords is None - assert config.eval_setup_script is None - assert config.eval_verify_script is None - assert config.eval_cleanup_script is None + assert "eval_type must be one of" in str(exc_info.value) + + +class TestEvaluationDataConfig: + """Test EvaluationDataConfig data class.""" def test_evaluation_data_config_judge_llm(self): """Test EvaluationDataConfig for judge-llm evaluation.""" @@ -91,19 +109,19 @@ def test_evaluation_data_config_judge_llm(self): assert config.eval_type == "judge-llm" assert config.expected_response == "Containers are lightweight virtualization" assert config.expected_keywords is None - assert config.eval_setup_script is None assert config.eval_verify_script is None - assert config.eval_cleanup_script is None + assert config.description is None - def test_evaluation_data_config_script(self): + @patch("builtins.open", mock_open()) + @patch("pathlib.Path.is_file", return_value=True) + @patch("pathlib.Path.exists", return_value=True) + def test_evaluation_data_config_script(self, mock_exists, mock_is_file): """Test EvaluationDataConfig for script evaluation.""" config = EvaluationDataConfig( eval_id="script_test", eval_query="Deploy nginx pod", eval_type="script", - eval_setup_script="./setup.sh", - eval_verify_script="./verify.sh", - eval_cleanup_script="./cleanup.sh", + eval_verify_script="/mock/script/path.sh", ) assert config.eval_id == "script_test" @@ -111,9 +129,11 @@ def test_evaluation_data_config_script(self): assert config.eval_type == "script" assert config.expected_response is None assert config.expected_keywords is None - assert config.eval_setup_script == "./setup.sh" - assert config.eval_verify_script == "./verify.sh" - assert config.eval_cleanup_script == "./cleanup.sh" + assert isinstance(config.eval_verify_script, Path) + + # Verify path validation was called + mock_exists.assert_called() + mock_is_file.assert_called() def test_evaluation_data_config_substring(self): """Test EvaluationDataConfig for sub-string evaluation.""" @@ -129,28 +149,240 @@ def test_evaluation_data_config_substring(self): assert config.eval_type == "sub-string" assert config.expected_response is None assert config.expected_keywords == ["isolation", "portability", "efficiency"] - assert config.eval_setup_script is None assert config.eval_verify_script is None - assert config.eval_cleanup_script is None - def test_evaluation_data_config_all_fields(self): - """Test EvaluationDataConfig with all fields.""" + def test_evaluation_data_config_with_description(self): + """Test EvaluationDataConfig with description.""" config = EvaluationDataConfig( eval_id="full_test", eval_query="What is OpenShift?", eval_type="judge-llm", expected_response="OpenShift is a Kubernetes platform", - expected_keywords=["kubernetes", "platform", "container"], - eval_setup_script="./setup.sh", - eval_verify_script="./verify.sh", - eval_cleanup_script="./cleanup.sh", + description="Test evaluation for OpenShift knowledge", ) assert config.eval_id == "full_test" assert config.eval_query == "What is OpenShift?" 
assert config.eval_type == "judge-llm" assert config.expected_response == "OpenShift is a Kubernetes platform" - assert config.expected_keywords == ["kubernetes", "platform", "container"] - assert config.eval_setup_script == "./setup.sh" - assert config.eval_verify_script == "./verify.sh" - assert config.eval_cleanup_script == "./cleanup.sh" + assert config.description == "Test evaluation for OpenShift knowledge" + assert config.expected_keywords is None + assert config.eval_verify_script is None + + def test_evaluation_data_config_missing_eval_type(self): + """Test EvaluationDataConfig with missing eval_type (should fail).""" + with pytest.raises(ValidationError) as exc_info: + EvaluationDataConfig( + eval_id="test_001", + eval_query="What is Kubernetes?", + ) + + assert "Field required" in str(exc_info.value) + + def test_evaluation_data_config_judge_llm_missing_expected_response(self): + """Test judge-llm evaluation missing expected_response.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationDataConfig( + eval_id="test_judge", + eval_query="Test query", + eval_type="judge-llm", + ) + + assert "requires non-empty 'expected_response'" in str(exc_info.value) + + def test_evaluation_data_config_substring_missing_keywords(self): + """Test sub-string evaluation missing expected_keywords.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationDataConfig( + eval_id="test_substring", + eval_query="Test query", + eval_type="sub-string", + ) + + assert "requires non-empty 'expected_keywords'" in str(exc_info.value) + + def test_evaluation_data_config_script_missing_verify_script(self): + """Test script evaluation missing eval_verify_script.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationDataConfig( + eval_id="test_script", + eval_query="Test query", + eval_type="script", + ) + + assert "requires non-empty 'eval_verify_script'" in str(exc_info.value) + + def test_evaluation_data_config_script_nonexistent_file(self): + """Test script evaluation with non-existent script file.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationDataConfig( + eval_id="test_script", + eval_query="Test query", + eval_type="script", + eval_verify_script="/non/existent/script.sh", + ) + + assert "file not found" in str(exc_info.value) + + +class TestConversationDataConfig: + """Test Conversation data config.""" + + def test_conversation_config_minimal(self): + """Test creating minimal Conversation data config.""" + config = ConversationDataConfig( + conversation_group="test_conv", + conversation=[ + EvaluationDataConfig( + eval_id="test_001", + eval_query="What is Kubernetes?", + eval_type="judge-llm", + expected_response="Kubernetes is a platform", + ) + ], + ) + + assert config.conversation_group == "test_conv" + assert len(config.conversation) == 1 + assert config.conversation[0].eval_id == "test_001" + assert config.description is None + assert config.setup_script is None + assert config.cleanup_script is None + + @patch("builtins.open", mock_open()) + @patch("pathlib.Path.is_file", return_value=True) + @patch("pathlib.Path.exists", return_value=True) + def test_conversation_config_with_scripts(self, mock_exists, mock_is_file): + """Test Conversation data config with setup and cleanup scripts.""" + config = ConversationDataConfig( + conversation_group="test_conv_scripts", + description="Test conversation with scripts", + setup_script="/mock/setup.sh", + cleanup_script="/mock/cleanup.sh", + conversation=[ + EvaluationDataConfig( + eval_id="test_001", + 
eval_query="Test query", + eval_type="judge-llm", + expected_response="Test response", + ) + ], + ) + + assert config.conversation_group == "test_conv_scripts" + assert config.description == "Test conversation with scripts" + assert isinstance(config.setup_script, Path) + assert isinstance(config.cleanup_script, Path) + + def test_conversation_config_empty_group_name(self): + """Test Conversation data config with empty group name.""" + with pytest.raises(ValidationError) as exc_info: + ConversationDataConfig( + conversation_group=" ", # Empty after strip + conversation=[ + EvaluationDataConfig( + eval_id="test_001", + eval_query="Test query", + eval_type="judge-llm", + expected_response="Test response", + ) + ], + ) + + assert "cannot be empty" in str(exc_info.value) + + def test_conversation_config_nonexistent_script(self): + """Test Conversation data config with non-existent script.""" + with pytest.raises(ValidationError) as exc_info: + ConversationDataConfig( + conversation_group="test_conv", + setup_script="/non/existent/setup.sh", + conversation=[ + EvaluationDataConfig( + eval_id="test_001", + eval_query="Test query", + eval_type="judge-llm", + expected_response="Test response", + ) + ], + ) + + assert "file not found" in str(exc_info.value) + + def test_conversation_config_duplicate_eval_ids(self): + """Test Conversation data config with duplicate eval_ids.""" + with pytest.raises(ValidationError) as exc_info: + ConversationDataConfig( + conversation_group="test_conv", + conversation=[ + EvaluationDataConfig( + eval_id="duplicate_id", + eval_query="First query", + eval_type="judge-llm", + expected_response="First response", + ), + EvaluationDataConfig( + eval_id="duplicate_id", + eval_query="Second query", + eval_type="judge-llm", + expected_response="Second response", + ), + ], + ) + + assert "Duplicate eval_id" in str(exc_info.value) + + +class TestEvaluationStats: + """Test Evaluation statistics data class.""" + + def test_evaluation_stats_from_results(self): + """Test Evaluation statistics creation method.""" + results = [ + EvaluationResult( + eval_id="test_001", + query="Query 1", + response="Response 1", + eval_type="judge-llm", + result="PASS", + conversation_group="conv1", + ), + EvaluationResult( + eval_id="test_002", + query="Query 2", + response="Response 2", + eval_type="script", + result="FAIL", + conversation_group="conv1", + ), + EvaluationResult( + eval_id="test_003", + query="Query 3", + response="Response 3", + eval_type="sub-string", + result="PASS", + conversation_group="conv2", + ), + ] + + stats = EvaluationStats.from_results(results) + + assert stats.total_evaluations == 3 + assert stats.total_conversations == 2 + assert stats.passed == 2 + assert stats.failed == 1 + assert stats.errored == 0 + assert abs(stats.success_rate - 66.67) < 0.01 + + # Check stats by conversation + assert "conv1" in stats.by_conversation + assert "conv2" in stats.by_conversation + assert stats.by_conversation["conv1"]["total"] == 2 + assert stats.by_conversation["conv1"]["passed"] == 1 + assert stats.by_conversation["conv2"]["total"] == 1 + assert stats.by_conversation["conv2"]["passed"] == 1 + + # Check stats by eval_type + assert "judge-llm" in stats.by_eval_type + assert "script" in stats.by_eval_type + assert "sub-string" in stats.by_eval_type diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_results.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_results.py index cffcac15..a20bf192 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_results.py +++ 
b/lsc_agent_eval/tests/core/agent_goal_eval/test_results.py @@ -1,389 +1,298 @@ """Tests for results manager.""" -from unittest.mock import Mock, patch +import json +import tempfile +from pathlib import Path +from unittest.mock import mock_open, patch +import pandas as pd import pytest -from lsc_agent_eval.core.agent_goal_eval.models import EvaluationResult +from lsc_agent_eval.core.agent_goal_eval.models import EvaluationResult, EvaluationStats from lsc_agent_eval.core.agent_goal_eval.results import ResultsManager +from lsc_agent_eval.core.utils.exceptions import AgentEvaluationError class TestResultsManager: """Test ResultsManager.""" - def test_init(self): - """Test ResultsManager initialization.""" - manager = ResultsManager("test_results/") - - assert manager.result_dir == "test_results/" - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_success(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test successful results saving.""" - # Setup test data - results = [ + @pytest.fixture + def sample_results(self): + """Sample evaluation results.""" + return [ EvaluationResult( eval_id="test_001", query="What is Kubernetes?", response="Kubernetes is a container orchestration platform", eval_type="judge-llm", result="PASS", + conversation_group="conv1", + conversation_id="conv-id-123", ), EvaluationResult( eval_id="test_002", query="Deploy nginx", - response="kubectl create deployment nginx --image=nginx", + response="oc create deployment nginx --image=nginx", eval_type="script", - result="PASS", + result="FAIL", + conversation_group="conv1", + conversation_id="conv-id-123", ), - ] - - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify directory creation - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - # Verify DataFrame was created with correct data - mock_dataframe.assert_called_once() - call_args = mock_dataframe.call_args[0][0] - assert len(call_args) == 2 - assert call_args[0]["eval_id"] == "test_001" - assert call_args[1]["eval_id"] == "test_002" - - # Verify to_csv was called - mock_df_instance.to_csv.assert_called_once() - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_with_error(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test results saving with error field.""" - # Setup test data with error - results = [ - EvaluationResult( - eval_id="test_001", - query="Test query", - response="", - eval_type="script", - result="ERROR", - error="Script execution failed", - ), - ] - - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify DataFrame was created with error field - mock_dataframe.assert_called_once() - call_args = mock_dataframe.call_args[0][0] - assert len(call_args) == 1 - assert call_args[0]["error"] == "Script execution failed" - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_empty_list(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test saving empty results list.""" - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results([]) - - # 
Verify DataFrame was created with empty list - mock_dataframe.assert_called_once_with([]) - mock_df_instance.to_csv.assert_called_once() - - @patch("pathlib.Path.mkdir", side_effect=OSError("Permission denied")) - def test_save_results_mkdir_error(self, mock_mkdir): - """Test results saving with directory creation error.""" - results = [ EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", + eval_id="test_003", + query="List pods", + response="pod1, pod2", + eval_type="sub-string", result="PASS", + conversation_group="conv2", + conversation_id="conv-id-456", ), ] - manager = ResultsManager("test_results/") + @pytest.fixture + def empty_results(self): + """Empty results list.""" + return [] - with pytest.raises(OSError, match="Permission denied"): - manager.save_results(results) - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv", side_effect=IOError("File write error")) - @patch("pandas.DataFrame") - def test_save_results_file_error(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test results saving with file write error.""" - results = [ - EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", - result="PASS", - ), - ] + def test_init(self, sample_results): + """Test ResultsManager initialization.""" + manager = ResultsManager(sample_results) - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - mock_df_instance.to_csv.side_effect = IOError("File write error") + assert manager.results == sample_results + assert isinstance(manager.results_stats, EvaluationStats) + assert manager.results_stats.total_evaluations == 3 + assert manager.results_stats.passed == 2 + assert manager.results_stats.failed == 1 - manager = ResultsManager("test_results/") + def test_init_empty_results(self, empty_results): + """Test ResultsManager initialization with empty results.""" + manager = ResultsManager(empty_results) - with pytest.raises(IOError, match="File write error"): - manager.save_results(results) + assert manager.results == [] + assert manager.results_stats.total_evaluations == 0 @patch("pathlib.Path.mkdir") @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - @patch("lsc_agent_eval.core.agent_goal_eval.results.datetime") - def test_save_results_filename_generation( - self, mock_datetime, mock_dataframe, mock_to_csv, mock_mkdir + @patch("builtins.open", new_callable=mock_open) + def test_save_results_success( + self, mock_file_open, mock_to_csv, mock_mkdir, sample_results ): - """Test CSV filename generation with timestamp.""" - # Setup mock datetime - mock_datetime.now.return_value.strftime.return_value = "20240108_103000" + """Test successful results saving.""" + manager = ResultsManager(sample_results) - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance + manager.save_results("test_results/") - results = [ - EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", - result="PASS", - ), - ] + # Verify directory creation + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) + # Verify CSV saving + mock_to_csv.assert_called_once() - # Verify to_csv was called with correct path - mock_df_instance.to_csv.assert_called_once() - call_args = mock_df_instance.to_csv.call_args - file_path = call_args[0][0] - assert 
"agent_goal_eval_results_20240108_103000.csv" in str(file_path) + # Verify JSON saving + mock_file_open.assert_called() @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_csv_parameters(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test CSV parameters are correct.""" - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance + @patch("pandas.DataFrame.to_csv", side_effect=Exception("CSV error")) + def test_save_results_csv_error(self, mock_to_csv, mock_mkdir, sample_results): + """Test results saving with CSV error.""" + manager = ResultsManager(sample_results) - results = [ - EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", - result="PASS", - ), - ] - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify to_csv was called with correct parameters - mock_df_instance.to_csv.assert_called_once() - call_args = mock_df_instance.to_csv.call_args - assert not call_args[1]["index"] - assert call_args[1]["encoding"] == "utf-8" - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_data_conversion( - self, mock_dataframe, mock_to_csv, mock_mkdir - ): - """Test EvaluationResult to dict conversion.""" - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance + with pytest.raises(AgentEvaluationError, match="Failed to save results"): + manager.save_results("test_results/") + @patch("pathlib.Path.mkdir", side_effect=OSError("Permission denied")) + def test_save_results_mkdir_error(self, mock_mkdir, sample_results): + """Test results saving with directory creation error.""" + manager = ResultsManager(sample_results) + + with pytest.raises(AgentEvaluationError, match="Failed to save results"): + manager.save_results("test_results/") + + def test_csv_data_conversion(self, sample_results): + """Test CSV data conversion.""" + manager = ResultsManager(sample_results) + + data = [] + for result in manager.results: + data.append( + { + "conversation_group": result.conversation_group, + "conversation_id": result.conversation_id, + "eval_id": result.eval_id, + "result": result.result, + "eval_type": result.eval_type, + "query": result.query, + "response": result.response, + "error": result.error, + } + ) + + assert len(data) == 3 + assert data[0]["eval_id"] == "test_001" + assert data[0]["result"] == "PASS" + assert data[1]["result"] == "FAIL" + assert data[2]["eval_type"] == "sub-string" + + def test_get_results_stats(self, sample_results): + """Test get results stats method.""" + manager = ResultsManager(sample_results) + stats = manager.get_results_stats() + + assert isinstance(stats, EvaluationStats) + assert stats.total_evaluations == 3 + assert stats.total_conversations == 2 + assert stats.passed == 2 + assert stats.failed == 1 + assert stats.errored == 0 + assert round(stats.success_rate, 2) == round(2 / 3 * 100, 2) + + # Check conversation breakdown + assert "conv1" in stats.by_conversation + assert "conv2" in stats.by_conversation + assert stats.by_conversation["conv1"]["total"] == 2 + assert stats.by_conversation["conv2"]["total"] == 1 + + # Check eval type breakdown + assert "judge-llm" in stats.by_eval_type + assert "script" in stats.by_eval_type + assert "sub-string" in stats.by_eval_type + + def test_results_with_errors(self): + """Test results with error conditions.""" results = [ 
EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", + eval_id="test_error", + query="Failing query", + response="", eval_type="judge-llm", - result="PASS", - error=None, + result="ERROR", + error="API connection failed", + conversation_group="test_conv", + conversation_id="conv-id-789", ), ] - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify DataFrame was created with correct data - mock_dataframe.assert_called_once() - call_args = mock_dataframe.call_args[0][0] - expected_row = { - "eval_id": "test_001", - "query": "Test query", - "response": "Test response", - "eval_type": "judge-llm", - "result": "PASS", - "error": "", - } - assert call_args[0] == expected_row + manager = ResultsManager(results) + stats = manager.get_results_stats() - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_multiple_results( - self, mock_dataframe, mock_to_csv, mock_mkdir - ): - """Test saving multiple results.""" - # Setup test data - results = [ - EvaluationResult("test_001", "query1", "response1", "judge-llm", "PASS"), - EvaluationResult("test_002", "query2", "response2", "script", "FAIL"), - EvaluationResult("test_003", "query3", "response3", "sub-string", "PASS"), - ] + assert stats.total_evaluations == 1 + assert stats.passed == 0 + assert stats.failed == 0 + assert stats.errored == 1 + assert stats.success_rate == 0.0 - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify DataFrame was created with all results - mock_dataframe.assert_called_once() - call_args = mock_dataframe.call_args[0][0] - assert len(call_args) == 3 - - # Verify each result was converted correctly - assert call_args[0]["eval_id"] == "test_001" - assert call_args[1]["eval_id"] == "test_002" - assert call_args[2]["eval_id"] == "test_003" - - def test_result_dir_with_trailing_slash(self): - """Test result directory with trailing slash.""" - manager = ResultsManager("test_results/") - assert manager.result_dir == "test_results/" - - def test_result_dir_without_trailing_slash(self): - """Test result directory without trailing slash.""" - manager = ResultsManager("test_results") - assert manager.result_dir == "test_results" - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_encoding(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test CSV file is saved with UTF-8 encoding.""" + def test_results_mixed_types(self): + """Test results with mixed evaluation types.""" results = [ EvaluationResult( - eval_id="test_001", - query="What is Kubernetes?", - response="Kubernetes is a container orchestration platform", + eval_id="judge_test", + query="Judge query", + response="Judge response", eval_type="judge-llm", result="PASS", + conversation_group="mixed_conv", + conversation_id="conv-id-mixed", ), - ] - - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify to_csv was called with UTF-8 encoding - call_args = mock_df_instance.to_csv.call_args - assert call_args[1]["encoding"] == "utf-8" - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_no_index(self, mock_dataframe, 
mock_to_csv, mock_mkdir): - """Test CSV file index handling.""" - results = [ EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", - result="PASS", + eval_id="script_test", + query="Script query", + response="Script response", + eval_type="script", + result="FAIL", + conversation_group="mixed_conv", + conversation_id="conv-id-mixed", ), - ] - - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify to_csv was called with index=False - call_args = mock_df_instance.to_csv.call_args - assert not call_args[1]["index"] - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_none_error_handling( - self, mock_dataframe, mock_to_csv, mock_mkdir - ): - """Test handling of None error values.""" - results = [ EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", + eval_id="substring_test", + query="Substring query", + response="Substring response", + eval_type="sub-string", result="PASS", - error=None, + conversation_group="mixed_conv", + conversation_id="conv-id-mixed", ), ] - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify None error is converted to empty string - mock_dataframe.assert_called_once() - call_args = mock_dataframe.call_args[0][0] - assert call_args[0]["error"] == "" - - def test_get_output_dir(self): - """Test get_output_dir method.""" - manager = ResultsManager("test_results/") - output_dir = manager.get_output_dir() - assert output_dir == str(manager.result_path) + manager = ResultsManager(results) + stats = manager.get_results_stats() + + assert stats.total_evaluations == 3 + assert stats.total_conversations == 1 + assert len(stats.by_eval_type) == 3 + assert stats.by_eval_type["judge-llm"]["passed"] == 1 + assert stats.by_eval_type["script"]["failed"] == 1 + assert stats.by_eval_type["sub-string"]["passed"] == 1 + + def test_json_statistics_structure(self, sample_results): + """Test JSON statistics structure.""" + manager = ResultsManager(sample_results) + stats = manager.get_results_stats() + + # Convert to dict as would be saved to JSON + stats_dict = stats.model_dump() + + assert "total_evaluations" in stats_dict + assert "total_conversations" in stats_dict + assert "passed" in stats_dict + assert "failed" in stats_dict + assert "errored" in stats_dict + assert "success_rate" in stats_dict + assert "by_conversation" in stats_dict + assert "by_eval_type" in stats_dict + + # Verify structure of nested stats + assert isinstance(stats_dict["by_conversation"], dict) + assert isinstance(stats_dict["by_eval_type"], dict) + + def test_filename_generation_format(self, sample_results): + """Test that filename generation follows expected format.""" + manager = ResultsManager(sample_results) + + with patch( + "lsc_agent_eval.core.agent_goal_eval.results.datetime" + ) as mock_datetime: + mock_datetime.now.return_value.strftime.return_value = "20240101_120000" + + with ( + patch.object(manager, "_save_csv_results"), + patch.object(manager, "_save_json_summary"), + patch("pathlib.Path.mkdir"), + ): + + manager.save_results("test_results/") + + # Verify the filename format is called correctly + 
mock_datetime.now.assert_called_once() + mock_datetime.now.return_value.strftime.assert_called_once_with( + "%Y%m%d_%H%M%S" + ) + + def test_integration_with_real_files(self, sample_results): + """Integration test with real temporary files.""" + manager = ResultsManager(sample_results) + + with tempfile.TemporaryDirectory() as temp_dir: + manager.save_results(temp_dir) + + # Check that files were created + result_files = list(Path(temp_dir).glob("agent_goal_eval_results_*.csv")) + summary_files = list(Path(temp_dir).glob("agent_goal_eval_summary_*.json")) + + assert len(result_files) == 1 + assert len(summary_files) == 1 + + # Verify CSV content + csv_data = pd.read_csv(result_files[0]) + assert len(csv_data) == 3 + assert "eval_id" in csv_data.columns + assert "result" in csv_data.columns + assert "conversation_group" in csv_data.columns + + # Verify JSON content + with open(summary_files[0], "r") as f: + json_data = json.load(f) + + assert json_data["summary"]["total_evaluations"] == 3 + assert json_data["summary"]["passed"] == 2 + assert "by_conversation" in json_data + assert "by_eval_type" in json_data diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_script_runner.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_script_runner.py index 2c1bee00..a73dff6d 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_script_runner.py +++ b/lsc_agent_eval/tests/core/agent_goal_eval/test_script_runner.py @@ -40,8 +40,7 @@ def test_run_script_success( mock_exists.assert_called_once() mock_chmod.assert_called_once_with(0o755) mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=None, + [str(Path("test_script.sh").resolve())], text=True, capture_output=True, env=os.environ.copy(), @@ -82,8 +81,7 @@ def test_run_script_with_kubeconfig( expected_env = os.environ.copy() expected_env["KUBECONFIG"] = "./kubeconfig" mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=None, + [str(Path("test_script.sh").resolve())], text=True, capture_output=True, env=expected_env, @@ -186,8 +184,7 @@ def test_run_script_absolute_path( assert result mock_subprocess_run.assert_called_once_with( - ["bash", absolute_path], - input=None, + [absolute_path], text=True, capture_output=True, env=os.environ.copy(), @@ -217,8 +214,7 @@ def test_run_script_relative_path( expected_path = str(Path("scripts/test.sh").resolve()) mock_subprocess_run.assert_called_once_with( - ["bash", expected_path], - input=None, + [expected_path], text=True, capture_output=True, env=os.environ.copy(), @@ -248,8 +244,7 @@ def test_run_script_environment_preservation( # Verify environment includes test variable expected_env = os.environ.copy() mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=None, + [str(Path("test_script.sh").resolve())], text=True, capture_output=True, env=expected_env, @@ -278,8 +273,7 @@ def test_run_script_kubeconfig_absolute_path( expected_env = os.environ.copy() expected_env["KUBECONFIG"] = kubeconfig_path mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=None, + [str(Path("test_script.sh").resolve())], text=True, capture_output=True, env=expected_env, @@ -306,8 +300,7 @@ def test_run_script_no_kubeconfig( assert result mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=None, + [str(Path("test_script.sh").resolve())], text=True, capture_output=True, 
env=os.environ.copy(), @@ -337,35 +330,6 @@ def test_run_script_capture_output( assert result # Note: Instance method returns boolean, not the result object - @patch("subprocess.run") - @patch("pathlib.Path.is_file") - @patch("pathlib.Path.exists") - @patch("pathlib.Path.chmod") - def test_run_script_with_input_text( - self, mock_chmod, mock_exists, mock_is_file, mock_subprocess_run - ): - """Test script execution with input text.""" - mock_exists.return_value = True - mock_is_file.return_value = True - mock_result = Mock() - mock_result.returncode = 0 - mock_subprocess_run.return_value = mock_result - - input_text = "test input" - runner = ScriptRunner() - result = runner.run_script("test_script.sh", input_text=input_text) - - assert result - mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=input_text, - text=True, - capture_output=True, - env=os.environ.copy(), - timeout=300, - check=False, - ) - def test_script_runner_init(self): """Test ScriptRunner initialization.""" runner = ScriptRunner() diff --git a/lsc_agent_eval/tests/core/utils/test_api_client.py b/lsc_agent_eval/tests/core/utils/test_api_client.py index 8fed7817..345d15e5 100644 --- a/lsc_agent_eval/tests/core/utils/test_api_client.py +++ b/lsc_agent_eval/tests/core/utils/test_api_client.py @@ -67,7 +67,11 @@ def test_query_agent_success(self): """Test successful agent query.""" # Mock HTTP response mock_response = Mock() - mock_response.json.return_value = {"response": "Test agent response"} + response_text = "OpenShift Virtualization is an extension of the OpenShift Container Platform" + mock_response.json.return_value = { + "response": response_text, + "conversation_id": "conv-id-123", + } mock_response.raise_for_status.return_value = None # Mock HTTP client @@ -77,16 +81,18 @@ def test_query_agent_success(self): with patch("httpx.Client", return_value=mock_client): client = AgentHttpClient("http://localhost:8080") - result = client.query_agent("What is Kubernetes?", "openai", "gpt-4") + api_input = { + "query": "What is Openshift Virtualization?", + "provider": "watsonx", + "model": "ibm/granite-3-3-8b-instruct", + } + result_response, result_conversation_id = client.query_agent(api_input) - assert result == "Test agent response" + assert result_response == response_text + assert result_conversation_id == "conv-id-123" mock_client.post.assert_called_once_with( "/v1/query", - json={ - "query": "What is Kubernetes?", - "provider": "openai", - "model": "gpt-4", - }, + json=api_input, timeout=300, ) @@ -106,8 +112,9 @@ def test_query_agent_http_error(self): with patch("httpx.Client", return_value=mock_client): client = AgentHttpClient("http://localhost:8080") + api_input = {"query": "Test query", "provider": "openai", "model": "gpt-4"} with pytest.raises(AgentAPIError, match="Agent API error: 500"): - client.query_agent("Test query", "openai", "gpt-4") + client.query_agent(api_input) def test_query_agent_timeout(self): """Test agent query with timeout.""" @@ -118,8 +125,13 @@ def test_query_agent_timeout(self): with patch("httpx.Client", return_value=mock_client): client = AgentHttpClient("http://localhost:8080") + api_input = { + "query": "Test query", + "provider": "agent_provider", + "model": "agent_model", + } with pytest.raises(AgentAPIError, match="Agent query timeout"): - client.query_agent("Test query", "openai", "gpt-4") + client.query_agent(api_input) def test_query_agent_missing_response_field(self): """Test agent query with missing response field.""" @@ -135,10 
+147,11 @@ def test_query_agent_missing_response_field(self): with patch("httpx.Client", return_value=mock_client): client = AgentHttpClient("http://localhost:8080") + api_input = {"query": "Test query", "provider": "openai", "model": "gpt-4"} with pytest.raises( AgentAPIError, match="Agent response missing 'response' field" ): - client.query_agent("Test query", "openai", "gpt-4") + client.query_agent(api_input) def test_query_agent_client_not_initialized(self): """Test agent query when client is not initialized.""" diff --git a/lsc_agent_eval/tests/core/utils/test_exceptions.py b/lsc_agent_eval/tests/core/utils/test_exceptions.py index 94152bea..4c63e9f5 100644 --- a/lsc_agent_eval/tests/core/utils/test_exceptions.py +++ b/lsc_agent_eval/tests/core/utils/test_exceptions.py @@ -3,7 +3,7 @@ from lsc_agent_eval.core.utils.exceptions import ( AgentAPIError, AgentEvaluationError, - ConfigurationError, + EvaluationDataError, JudgeModelError, ScriptExecutionError, ) @@ -25,20 +25,20 @@ def test_agent_evaluation_error_inheritance(self): assert isinstance(error, Exception) -class TestConfigurationError: - """Test ConfigurationError.""" +class TestEvaluationDataError: + """Test EvaluationDataError.""" - def test_configuration_error_creation(self): - """Test creating ConfigurationError.""" - error = ConfigurationError("Invalid configuration") + def test_evaluation_data_error_creation(self): + """Test creating EvaluationDataError.""" + error = EvaluationDataError("Invalid configuration") assert str(error) == "Invalid configuration" - assert isinstance(error, ConfigurationError) + assert isinstance(error, EvaluationDataError) assert isinstance(error, AgentEvaluationError) - def test_configuration_error_inheritance(self): - """Test ConfigurationError inheritance.""" - error = ConfigurationError("Config error") - assert isinstance(error, ConfigurationError) + def test_evaluation_data_error_inheritance(self): + """Test EvaluationDataError inheritance.""" + error = EvaluationDataError("Config error") + assert isinstance(error, EvaluationDataError) assert isinstance(error, AgentEvaluationError) assert isinstance(error, Exception) @@ -103,7 +103,7 @@ class TestExceptionHierarchy: def test_all_exceptions_inherit_from_base(self): """Test that all custom exceptions inherit from AgentEvaluationError.""" exceptions = [ - ConfigurationError("config error"), + EvaluationDataError("config error"), AgentAPIError("api error"), ScriptExecutionError("script error"), JudgeModelError("judge error"), @@ -112,13 +112,3 @@ def test_all_exceptions_inherit_from_base(self): for exc in exceptions: assert isinstance(exc, AgentEvaluationError) assert isinstance(exc, Exception) - - def test_exception_with_none_message(self): - """Test exceptions with None message.""" - error = AgentEvaluationError(None) - assert str(error) == "None" - - def test_exception_with_empty_message(self): - """Test exceptions with empty message.""" - error = AgentEvaluationError("") - assert str(error) == ""
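
The sub-string checks exercised in `test_substring_evaluation_logic` boil down to a case-insensitive all-keywords match: the evaluation passes only when every entry of `expected_keywords` occurs in the agent response. A minimal sketch of that logic, assuming nothing beyond what the tests assert (the function name is illustrative, not the package source):

```python
def substring_eval(response: str, expected_keywords: list[str]) -> bool:
    """Return True only if ALL keywords occur in the response (case-insensitive)."""
    lowered = response.lower()
    return all(keyword.lower() in lowered for keyword in expected_keywords)


# Mirrors the cases in test_substring_evaluation_logic
assert substring_eval("Response with KEYWORD1 and Keyword2", ["keyword1", "keyword2"])
assert not substring_eval("Response with only keyword1", ["keyword1", "keyword2"])
```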
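
For the conversation-ID handling this patch introduces, the tests pin down two shapes: the payload sent to `/v1/query` (`query`, `provider`, `model`, `conversation_id`) and the `(response, conversation_id)` tuple now returned by `query_agent`. The sketch below shows how a runner could wire those together; `_FakeAgentClient` and `run_evaluation_sketch` are stand-in names for illustration, not the real `AgentHttpClient` or `EvaluationRunner` internals.

```python
from types import SimpleNamespace


class _FakeAgentClient:
    """Stand-in for the agent client: returns (response, conversation_id) like the mocked client."""

    def query_agent(self, api_input: dict, timeout: int = 300):
        # Echo the caller's conversation ID, or pretend the server assigned a new one.
        return "stub agent response", api_input.get("conversation_id") or "new-conv-id"


def run_evaluation_sketch(agent_client, config, provider, model, conversation_id):
    """Build the query payload, call the agent, and keep the conversation ID with the result."""
    api_input = {
        "query": config.eval_query,
        "provider": provider,
        "model": model,
        "conversation_id": conversation_id,
    }
    response, returned_conv_id = agent_client.query_agent(api_input)
    return {
        "eval_id": config.eval_id,
        "query": config.eval_query,
        "response": response,
        "conversation_id": returned_conv_id,
    }


# Usage mirroring test_conversation_id_propagation
config = SimpleNamespace(eval_id="conv_id_test", eval_query="Test query")
result = run_evaluation_sketch(_FakeAgentClient(), config, "openai", "gpt-4", "conv-id-456")
assert result["conversation_id"] == "conv-id-456"
```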
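
The statistics asserted in `test_evaluation_stats_from_results` and `test_get_results_stats` can be reproduced with a plain aggregation over the result rows: overall counts, a percentage success rate, and per-conversation / per-eval-type breakdowns. The function below is an assumption about the shape of `EvaluationStats.from_results`, not a copy of `models.py`; it works on plain dicts to stay self-contained.

```python
from collections import defaultdict


def _bucket():
    return {"total": 0, "passed": 0, "failed": 0, "errored": 0}


def stats_from_results(results):
    """Aggregate PASS/FAIL/ERROR counts overall, per conversation group, and per eval type."""
    outcome_key = {"PASS": "passed", "FAIL": "failed", "ERROR": "errored"}
    totals = {"passed": 0, "failed": 0, "errored": 0}
    by_conversation = defaultdict(_bucket)
    by_eval_type = defaultdict(_bucket)

    for row in results:
        key = outcome_key[row["result"]]
        totals[key] += 1
        for name, table in ((row["conversation_group"], by_conversation), (row["eval_type"], by_eval_type)):
            table[name]["total"] += 1
            table[name][key] += 1

    total = len(results)
    return {
        "total_evaluations": total,
        "total_conversations": len(by_conversation),
        **totals,
        "success_rate": (totals["passed"] / total * 100) if total else 0.0,
        "by_conversation": dict(by_conversation),
        "by_eval_type": dict(by_eval_type),
    }


# Same mix as the test fixtures: 2 PASS, 1 FAIL across two conversation groups
rows = [
    {"result": "PASS", "conversation_group": "conv1", "eval_type": "judge-llm"},
    {"result": "FAIL", "conversation_group": "conv1", "eval_type": "script"},
    {"result": "PASS", "conversation_group": "conv2", "eval_type": "sub-string"},
]
stats = stats_from_results(rows)
assert stats["total_conversations"] == 2 and round(stats["success_rate"], 2) == 66.67
```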
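
Finally, the results tests glob for `agent_goal_eval_results_<timestamp>.csv` and `agent_goal_eval_summary_<timestamp>.json` and expect the JSON to carry a `summary` block plus `by_conversation` / `by_eval_type` breakdowns. The writer below is illustrative only, under those assumptions: `save_results_sketch` is a made-up helper (not `ResultsManager`'s actual method), and `stats` is expected to have the shape produced by the aggregation sketch above.

```python
import json
from datetime import datetime
from pathlib import Path

import pandas as pd


def save_results_sketch(result_dir: str, rows: list[dict], stats: dict) -> None:
    """Write a timestamped CSV of per-evaluation rows and a JSON summary of the stats."""
    out = Path(result_dir)
    out.mkdir(parents=True, exist_ok=True)
    tag = datetime.now().strftime("%Y%m%d_%H%M%S")  # format checked in test_filename_generation_format

    # One CSV row per evaluation result
    pd.DataFrame(rows).to_csv(out / f"agent_goal_eval_results_{tag}.csv", index=False, encoding="utf-8")

    # JSON layout matching what test_integration_with_real_files reads back
    summary = {
        "summary": {
            k: stats[k]
            for k in ("total_evaluations", "total_conversations", "passed", "failed", "errored", "success_rate")
        },
        "by_conversation": stats["by_conversation"],
        "by_eval_type": stats["by_eval_type"],
    }
    with open(out / f"agent_goal_eval_summary_{tag}.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
```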