From eb20bf457916869552aee05b163cdd9439c82762 Mon Sep 17 00:00:00 2001
From: bsatapat
Date: Tue, 2 Sep 2025 11:32:51 +0530
Subject: [PATCH] [LSC_EVAL] Added test case scenarios for the
 lightspeed_evaluation framework

[DESC]
[0] Generated 75 relevant test cases
[1] All the test cases pass successfully.
---
 pytest.ini               |  41 +++
 tests/README.md          | 287 +++++++++++++++++++
 tests/conftest.py        | 301 ++++++++++++++++++++
 tests/run_tests.py       | 173 ++++++++++++
 tests/test_cli.py        | 416 ++++++++++++++++++++++++++++
 tests/test_config.py     | 513 ++++++++++++++++++++++++++++++++++
 tests/test_evaluation.py | 428 +++++++++++++++++++++++++++-
 tests/test_metrics.py    | 583 +++++++++++++++++++++++++++++++++++++++
 8 files changed, 2738 insertions(+), 4 deletions(-)
 create mode 100644 pytest.ini
 create mode 100644 tests/README.md
 create mode 100644 tests/conftest.py
 create mode 100644 tests/run_tests.py
 create mode 100644 tests/test_cli.py
 create mode 100644 tests/test_config.py
 create mode 100644 tests/test_metrics.py

diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..2c0e76cc
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,41 @@
+[pytest]
+# Pytest configuration for LightSpeed Evaluation Framework
+
+# Test discovery
+testpaths = tests
+python_files = test_*.py *_test.py
+python_classes = Test*
+python_functions = test_*
+
+# Output options
+addopts =
+    -v
+    --tb=short
+    --strict-markers
+    --disable-warnings
+    --color=yes
+
+# Markers
+markers =
+    integration: Integration tests that require real config files
+    slow: Tests that take a long time to run
+    unit: Fast unit tests
+    config: Configuration-related tests
+    metrics: Metric evaluation tests
+    output: Output generation tests
+
+# Minimum version
+minversion = 6.0
+
+# Test timeout in seconds (requires the pytest-timeout plugin)
+timeout = 300
+
+# Coverage options (if pytest-cov is installed)
+# addopts = --cov=lightspeed_evaluation --cov-report=html --cov-report=term-missing
+
+# Ignore certain warnings
+filterwarnings =
+    ignore::DeprecationWarning
+    ignore::PendingDeprecationWarning
+    ignore::UserWarning:matplotlib.*
+    ignore::UserWarning:seaborn.*
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..00709978
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,287 @@
+# LightSpeed Evaluation Framework - Test Suite
+
+This directory contains comprehensive tests for the LightSpeed Evaluation Framework. The test suite covers all major components and provides both unit and integration tests.
+ +## Test Structure + +``` +tests/ +├── README.md # This file +├── conftest.py # Pytest configuration and shared fixtures +├── run_tests.py # Test runner script for convenient test execution +├── test_evaluation.py # Main evaluation tests +├── test_config.py # Configuration loading and validation tests +├── test_metrics.py # Metrics evaluation tests +└── test_cli.py # Command-line interface tests +``` + +## Test Categories + +The tests are organized into several categories using pytest markers: + +### By Component +- **`config`**: Configuration loading, validation, and environment setup +- **`metrics`**: Metric evaluation (Ragas, DeepEval, Custom) +- **`cli`**: Command-line interface and argument parsing +- **`output`**: Report generation and output handling + +### By Type +- **`unit`**: Fast unit tests with mocked dependencies +- **`integration`**: Integration tests using real configuration files +- **`slow`**: Tests that take longer to run (usually integration tests) + +## Running Tests + +### Prerequisites + +Install the required testing dependencies: + +```bash +pip install pytest pytest-cov +``` + +### Basic Usage + +```bash +# Run all tests +python -m pytest tests/ + +# Run with verbose output +python -m pytest tests/ -v + +# Run specific test file +python -m pytest tests/test_config.py + +# Run specific test class +python -m pytest tests/test_config.py::TestSystemConfig + +# Run specific test method +python -m pytest tests/test_config.py::TestSystemConfig::test_system_config_defaults +``` + +### Using the Test Runner Script + +The project includes a convenient test runner script located in the `tests/` directory: + +```bash +# Run all tests +python tests/run_tests.py + +# Run only unit tests +python tests/run_tests.py --type unit + +# Run only integration tests +python tests/run_tests.py --type integration + +# Run tests by component +python tests/run_tests.py --type config +python tests/run_tests.py --type metrics +python tests/run_tests.py --type cli + +# Run with coverage report +python tests/run_tests.py --coverage + +# Run with verbose output +python tests/run_tests.py --verbose + +# Run fast tests only (exclude slow tests) +python tests/run_tests.py --type fast + +# Run specific test file +python tests/run_tests.py test_config.py + +# Custom markers +python tests/run_tests.py --markers "unit and not slow" +``` + +### Test Markers + +Use pytest markers to run specific test categories: + +```bash +# Run only unit tests +python -m pytest -m unit + +# Run only integration tests +python -m pytest -m integration + +# Run config-related tests +python -m pytest -m config + +# Run metrics-related tests +python -m pytest -m metrics + +# Run CLI-related tests +python -m pytest -m cli + +# Exclude slow tests +python -m pytest -m "not slow" + +# Combine markers +python -m pytest -m "unit and config" +``` + +## Test Configuration + +### Environment Variables + +The tests automatically set up required environment variables: + +- `OPENAI_API_KEY`: Set to a test value for mocking +- `DEEPEVAL_TELEMETRY_OPT_OUT`: Disabled for testing +- `LITELLM_LOG_LEVEL`: Set to ERROR to reduce noise + +### Fixtures + +The test suite provides several useful fixtures in `conftest.py`: + +- **`sample_system_config`**: Pre-configured SystemConfig object +- **`sample_llm_config`**: Pre-configured LLMConfig object +- **`sample_turn_data`**: Sample conversation turn data +- **`sample_evaluation_data`**: Complete evaluation data structure +- **`mock_llm_manager`**: Mocked LLM manager for testing +- 
**`temp_config_files`**: Temporary configuration files +- **`temp_output_dir`**: Temporary output directory + +## Test Coverage + +To generate a coverage report: + +```bash +# Generate HTML coverage report +python -m pytest --cov=lightspeed_evaluation --cov-report=html tests/ + +# Generate terminal coverage report +python -m pytest --cov=lightspeed_evaluation --cov-report=term-missing tests/ + +# Using the test runner +python tests/run_tests.py --coverage +``` + +The HTML coverage report will be generated in `htmlcov/index.html`. + +## Writing New Tests + +### Test File Organization + +- **Unit tests**: Test individual functions/classes with mocked dependencies +- **Integration tests**: Test component interactions with real or realistic data +- **Use descriptive test names**: `test_load_system_config_with_valid_file` +- **Group related tests**: Use test classes to organize related functionality + +### Example Test Structure + +```python +class TestMyComponent: + """Test MyComponent functionality.""" + + def test_basic_functionality(self): + """Test basic functionality with valid input.""" + # Arrange + component = MyComponent() + + # Act + result = component.do_something() + + # Assert + assert result is not None + + def test_error_handling(self): + """Test error handling with invalid input.""" + component = MyComponent() + + with pytest.raises(ValueError, match="Expected error message"): + component.do_something_invalid() + + @pytest.mark.integration + def test_integration_scenario(self): + """Test integration with other components.""" + # Integration test code here + pass +``` + +### Using Fixtures + +```python +def test_with_fixtures(sample_system_config, temp_output_dir): + """Test using provided fixtures.""" + # Use the fixtures in your test + assert sample_system_config.llm_provider == "openai" + assert Path(temp_output_dir).exists() +``` + +### Mocking External Dependencies + +```python +@patch('lightspeed_evaluation.core.metrics.ragas.evaluate') +def test_with_mocked_dependency(mock_evaluate): + """Test with mocked external dependency.""" + # Configure mock + mock_evaluate.return_value = MagicMock() + + # Run test + result = my_function_that_uses_ragas() + + # Verify mock was called + mock_evaluate.assert_called_once() +``` + +## Continuous Integration + +The test suite is designed to work in CI environments: + +- All external dependencies are mocked +- Temporary files are properly cleaned up +- Tests are deterministic and don't rely on external services +- Environment variables are properly managed + +## Troubleshooting + +### Common Issues + +1. **Import Errors**: Make sure the package is installed in development mode: + ```bash + pip install -e . + ``` + +2. **Missing Dependencies**: Install test dependencies: + ```bash + pip install pytest pytest-cov + ``` + +3. **Configuration File Tests**: Some tests require the actual config files to exist: + - `config/system.yaml` + - `config/evaluation_data.yaml` + +4. **Environment Variables**: Tests automatically set required environment variables, but you can override them if needed. 
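+
+   For example, a single test can temporarily override one of these variables with pytest's built-in `monkeypatch` fixture. This is only a minimal sketch; the override value is illustrative:
+
+   ```python
+   import os
+
+   def test_with_custom_api_key(monkeypatch):
+       """Override an environment variable for one test only."""
+       # conftest.py sets a dummy OPENAI_API_KEY via an autouse fixture;
+       # monkeypatch restores the previous value after the test finishes.
+       monkeypatch.setenv("OPENAI_API_KEY", "my-override-key")
+       assert os.environ["OPENAI_API_KEY"] == "my-override-key"
+   ```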
+ +### Debug Mode + +Run tests with more verbose output for debugging: + +```bash +python -m pytest tests/ -v -s --tb=long +``` + +### Running Individual Tests + +For debugging specific tests: + +```bash +# Run a specific test with full output +python -m pytest tests/test_config.py::TestSystemConfig::test_system_config_defaults -v -s + +# Run with pdb debugger on failure +python -m pytest tests/test_config.py --pdb +``` + +## Contributing + +When adding new functionality: + +1. Write tests for new features +2. Ensure good test coverage (aim for >90%) +3. Use appropriate markers for test categorization +4. Mock external dependencies +5. Add integration tests for complex workflows +6. Update this README if adding new test categories or patterns diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..6dd17948 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,301 @@ +"""Pytest configuration and shared fixtures.""" + +import os +import tempfile +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +import yaml + +from lightspeed_evaluation.core.config import EvaluationData, SystemConfig, TurnData +from lightspeed_evaluation.core.llm.manager import LLMConfig, LLMManager + + +@pytest.fixture(scope="session") +def test_data_dir(): + """Provide test data directory.""" + return Path(__file__).parent / "data" + + +@pytest.fixture(scope="session") +def config_dir(): + """Provide configuration directory.""" + return Path(__file__).parent.parent / "config" + + +@pytest.fixture +def sample_system_config(): + """Provide a sample SystemConfig for testing.""" + return SystemConfig( + llm_provider="openai", + llm_model="gpt-4o-mini", + llm_temperature=0.0, + llm_max_tokens=512, + output_dir="./test_output", + base_filename="test_evaluation", + include_graphs=True + ) + + +@pytest.fixture +def sample_llm_config(): + """Provide a sample LLMConfig for testing.""" + return LLMConfig( + provider="openai", + model="gpt-4o-mini", + temperature=0.0, + max_tokens=512, + timeout=300, + num_retries=3 + ) + + +@pytest.fixture +def sample_turn_data(): + """Provide sample TurnData for testing.""" + return TurnData( + turn_id=1, + query="What is Python?", + response="Python is a high-level programming language.", + contexts=[ + {"content": "Python is a programming language created by Guido van Rossum."}, + {"content": "Python is widely used for web development, data science, and automation."} + ], + expected_response="Python is a high-level programming language used for various applications." 
+ ) + + +@pytest.fixture +def sample_evaluation_data(sample_turn_data): + """Provide sample EvaluationData for testing.""" + return EvaluationData( + conversation_group_id="test_conversation", + description="Test conversation for evaluation", + turn_metrics=["ragas:faithfulness", "ragas:response_relevancy"], + conversation_metrics=["deepeval:conversation_completeness"], + turns=[sample_turn_data] + ) + + +@pytest.fixture +def mock_llm_manager(): + """Provide a mock LLM manager.""" + manager = MagicMock(spec=LLMManager) + manager.get_model_name.return_value = "gpt-4o-mini" + manager.get_litellm_params.return_value = { + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512, + "timeout": 300, + "num_retries": 3 + } + manager.config = LLMConfig( + provider="openai", + model="gpt-4o-mini", + temperature=0.0, + max_tokens=512, + timeout=300, + num_retries=3 + ) + return manager + + +@pytest.fixture +def temp_config_files(): + """Create temporary configuration files for testing.""" + system_config_data = { + "llm": { + "provider": "openai", + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512, + "timeout": 300, + "num_retries": 3 + }, + "environment": { + "DEEPEVAL_TELEMETRY_OPT_OUT": "YES", + "LITELLM_LOG_LEVEL": "ERROR" + }, + "logging": { + "source_level": "INFO", + "package_level": "ERROR" + }, + "metrics_metadata": { + "turn_level": { + "ragas:faithfulness": { + "threshold": 0.8, + "type": "turn", + "framework": "ragas" + } + }, + "conversation_level": { + "deepeval:conversation_completeness": { + "threshold": 0.7, + "type": "conversation", + "framework": "deepeval" + } + } + }, + "output": { + "base_directory": "./test_output", + "base_filename": "test_evaluation", + "formats": {"csv": True, "json": True, "txt": True}, + "include_graphs": True + }, + "visualization": { + "figsize": [12, 8], + "dpi": 300 + } + } + + eval_data = [ + { + "conversation_group_id": "test_conv_1", + "description": "Test conversation 1", + "turn_metrics": ["ragas:faithfulness", "ragas:response_relevancy"], + "conversation_metrics": [], + "turn_metrics_metadata": {}, + "conversation_metrics_metadata": {}, + "turns": [ + { + "turn_id": 1, + "query": "What is machine learning?", + "response": "Machine learning is a subset of AI.", + "contexts": [ + {"content": "Machine learning is a method of data analysis."} + ], + "expected_response": "Machine learning is a subset of artificial intelligence." + } + ] + }, + { + "conversation_group_id": "test_conv_2", + "description": "Test conversation 2", + "turn_metrics": ["custom:answer_correctness"], + "conversation_metrics": ["deepeval:conversation_completeness"], + "turn_metrics_metadata": {}, + "conversation_metrics_metadata": {}, + "turns": [ + { + "turn_id": 1, + "query": "Explain neural networks", + "response": "Neural networks are computing systems inspired by biological neural networks.", + "contexts": [ + {"content": "Neural networks consist of interconnected nodes."} + ], + "expected_response": "Neural networks are computational models inspired by the human brain." + }, + { + "turn_id": 2, + "query": "What are the applications?", + "response": "Neural networks are used in image recognition, NLP, and more.", + "contexts": [ + {"content": "Applications include computer vision and natural language processing."} + ], + "expected_response": "Applications include computer vision, NLP, and pattern recognition." 
+ } + ] + } + ] + + # Create temporary files + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_file: + yaml.dump(system_config_data, system_file, default_flow_style=False) + system_config_path = system_file.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_file: + yaml.dump(eval_data, eval_file, default_flow_style=False) + eval_data_path = eval_file.name + + yield { + "system_config": system_config_path, + "eval_data": eval_data_path + } + + # Cleanup + os.unlink(system_config_path) + os.unlink(eval_data_path) + + +@pytest.fixture +def temp_output_dir(): + """Create a temporary output directory for testing.""" + with tempfile.TemporaryDirectory() as temp_dir: + yield temp_dir + + +@pytest.fixture(autouse=True) +def setup_test_environment(): + """Set up test environment variables.""" + # Set required environment variables for testing + test_env_vars = { + "OPENAI_API_KEY": "test-api-key-for-testing", + "DEEPEVAL_TELEMETRY_OPT_OUT": "YES", + "DEEPEVAL_DISABLE_PROGRESS_BAR": "YES", + "LITELLM_LOG_LEVEL": "ERROR" + } + + # Store original values + original_values = {} + for key, value in test_env_vars.items(): + original_values[key] = os.environ.get(key) + os.environ[key] = value + + yield + + # Restore original values + for key, original_value in original_values.items(): + if original_value is None: + os.environ.pop(key, None) + else: + os.environ[key] = original_value + + +# Pytest markers for different test categories +def pytest_configure(config): + """Configure pytest with custom markers.""" + config.addinivalue_line( + "markers", "unit: mark test as a unit test" + ) + config.addinivalue_line( + "markers", "integration: mark test as an integration test" + ) + config.addinivalue_line( + "markers", "slow: mark test as slow running" + ) + config.addinivalue_line( + "markers", "config: mark test as configuration-related" + ) + config.addinivalue_line( + "markers", "metrics: mark test as metrics-related" + ) + config.addinivalue_line( + "markers", "output: mark test as output-related" + ) + config.addinivalue_line( + "markers", "cli: mark test as CLI-related" + ) + + +# Custom pytest collection hook to organize tests +def pytest_collection_modifyitems(config, items): + """Modify test collection to add markers based on test names and locations.""" + for item in items: + # Add markers based on test file names + if "test_config" in item.fspath.basename: + item.add_marker(pytest.mark.config) + elif "test_metrics" in item.fspath.basename: + item.add_marker(pytest.mark.metrics) + elif "test_cli" in item.fspath.basename: + item.add_marker(pytest.mark.cli) + elif "test_output" in item.fspath.basename: + item.add_marker(pytest.mark.output) + + # Add markers based on test names + if "integration" in item.name: + item.add_marker(pytest.mark.integration) + elif "slow" in item.name: + item.add_marker(pytest.mark.slow) + else: + item.add_marker(pytest.mark.unit) diff --git a/tests/run_tests.py b/tests/run_tests.py new file mode 100644 index 00000000..461d1c49 --- /dev/null +++ b/tests/run_tests.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +"""Test runner script for LightSpeed Evaluation Framework.""" + +import argparse +import subprocess +import sys +from pathlib import Path + + +def run_tests(test_type="all", verbose=False, coverage=False, markers=None): + """Run tests with specified options.""" + + # Use the same Python executable that's running this script + python_exe = sys.executable + + # Base pytest command + cmd = [python_exe, 
"-m", "pytest"] + + # Add verbosity + if verbose: + cmd.append("-v") + else: + cmd.append("-q") + + # Add coverage if requested + if coverage: + cmd.extend([ + "--cov=lightspeed_evaluation", + "--cov-report=html", + "--cov-report=term-missing", + "--cov-report=xml" + ]) + + # Add markers if specified + if markers: + cmd.extend(["-m", markers]) + + # Get the tests directory (current directory since we're inside tests/) + tests_dir = Path(__file__).parent + + # Add test selection based on type + if test_type == "unit": + cmd.extend(["-m", "unit"]) + elif test_type == "integration": + cmd.extend(["-m", "integration"]) + elif test_type == "config": + cmd.extend(["-m", "config"]) + elif test_type == "metrics": + cmd.extend(["-m", "metrics"]) + elif test_type == "cli": + cmd.extend(["-m", "cli"]) + elif test_type == "output": + cmd.extend(["-m", "output"]) + elif test_type == "slow": + cmd.extend(["-m", "slow"]) + elif test_type == "fast": + cmd.extend(["-m", "not slow"]) + elif test_type != "all": + # Specific test file or pattern + # If it's a relative path, make it relative to tests directory + if not test_type.startswith("/") and not test_type.startswith("tests/"): + test_type = str(tests_dir / test_type) + cmd.append(test_type) + + # Add tests directory for general test types + if test_type == "all" or test_type in ["unit", "integration", "config", "metrics", "cli", "output", "slow", "fast"]: + cmd.append(str(tests_dir)) + + print(f"Running command: {' '.join(cmd)}") + + # Run the tests + try: + result = subprocess.run(cmd, check=False) + return result.returncode + except KeyboardInterrupt: + print("\nTests interrupted by user") + return 1 + except Exception as e: + print(f"Error running tests: {e}") + return 1 + + +def main(): + """Main function for test runner.""" + parser = argparse.ArgumentParser( + description="Test runner for LightSpeed Evaluation Framework", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=f""" +Examples: + {sys.executable} tests/run_tests.py # Run all tests + {sys.executable} tests/run_tests.py --type unit # Run only unit tests + {sys.executable} tests/run_tests.py --type integration # Run only integration tests + {sys.executable} tests/run_tests.py --type config # Run only config tests + {sys.executable} tests/run_tests.py --type metrics # Run only metrics tests + {sys.executable} tests/run_tests.py --type cli # Run only CLI tests + {sys.executable} tests/run_tests.py --type fast # Run fast tests (exclude slow) + {sys.executable} tests/run_tests.py --coverage # Run with coverage report + {sys.executable} tests/run_tests.py --verbose # Run with verbose output + {sys.executable} tests/run_tests.py --markers "unit and not slow" # Custom markers + {sys.executable} tests/run_tests.py test_config.py # Run specific test file + """ + ) + + parser.add_argument( + "--type", "-t", + choices=["all", "unit", "integration", "config", "metrics", "cli", "output", "slow", "fast"], + default="all", + help="Type of tests to run (default: all)" + ) + + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Run tests with verbose output" + ) + + parser.add_argument( + "--coverage", "-c", + action="store_true", + help="Run tests with coverage report" + ) + + parser.add_argument( + "--markers", "-m", + help="Custom pytest markers (e.g., 'unit and not slow')" + ) + + parser.add_argument( + "test_path", + nargs="?", + help="Specific test file or directory to run" + ) + + args = parser.parse_args() + + # Use test_path if provided, otherwise use type + 
test_type = args.test_path if args.test_path else args.type + + # Use the same Python executable that's running this script + python_exe = sys.executable + + # Check if pytest is available + try: + subprocess.run([python_exe, "-m", "pytest", "--version"], + check=True, capture_output=True) + except subprocess.CalledProcessError: + print("Error: pytest is not installed. Please install it with:") + print(f" {python_exe} -m pip install pytest") + if args.coverage: + print(f" {python_exe} -m pip install pytest-cov # for coverage support") + return 1 + + # Check if coverage is requested but not available + if args.coverage: + try: + subprocess.run([python_exe, "-m", "pytest_cov", "--version"], + check=True, capture_output=True) + except subprocess.CalledProcessError: + print("Warning: pytest-cov is not installed. Coverage disabled.") + print(f"Install it with: {python_exe} -m pip install pytest-cov") + args.coverage = False + + # Run the tests + return run_tests( + test_type=test_type, + verbose=args.verbose, + coverage=args.coverage, + markers=args.markers + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 00000000..21e0a33d --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,416 @@ +"""Tests for command-line interface.""" + +import os +import tempfile +from pathlib import Path +from unittest.mock import patch + +import pytest +import yaml + +from lightspeed_evaluation.runner.evaluation import main, run_evaluation + + +class TestCLIInterface: + """Test command-line interface functionality.""" + + def test_main_with_help_argument(self): + """Test main function with help argument.""" + with patch('sys.argv', ['lightspeed-eval', '--help']): + with pytest.raises(SystemExit) as exc_info: + main() + # Help should exit with code 0 + assert exc_info.value.code == 0 + + def test_main_with_missing_system_config(self): + """Test main function with missing system config file.""" + with patch('sys.argv', ['lightspeed-eval', '--system-config', 'nonexistent.yaml']): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + result = main() + assert result == 1 # Should return error code + + def test_main_with_missing_eval_data(self): + """Test main function with missing evaluation data file.""" + # Create temporary system config + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + yaml.dump({"llm": {"provider": "openai"}}, f) + system_config_path = f.name + + try: + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', system_config_path, + '--eval-data', 'nonexistent.yaml' + ]): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + result = main() + assert result == 1 # Should return error code + finally: + os.unlink(system_config_path) + + @patch('lightspeed_evaluation.runner.evaluation.run_evaluation') + @patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables') + def test_main_with_valid_arguments(self, mock_setup_env, mock_run_eval): + """Test main function with valid arguments.""" + # Mock successful evaluation + mock_run_eval.return_value = { + "TOTAL": 5, + "PASS": 3, + "FAIL": 1, + "ERROR": 1 + } + + # Create temporary config files + system_config_data = { + "llm": {"provider": "openai", "model": "gpt-4"}, + "output": {"base_directory": "./test_output"} + } + + eval_data = [ + { + "conversation_group_id": "test_conv", + "turn_metrics": ["ragas:faithfulness"], + "turns": [{"turn_id": 1, "query": "q", 
"response": "r"}] + } + ] + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_config: + yaml.dump(system_config_data, system_config) + system_config_path = system_config.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_config: + yaml.dump(eval_data, eval_config) + eval_data_path = eval_config.name + + try: + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', system_config_path, + '--eval-data', eval_data_path + ]): + result = main() + assert result == 0 # Should return success code + + # Verify that run_evaluation was called with correct arguments + mock_run_eval.assert_called_once_with( + system_config_path, eval_data_path, None + ) + + finally: + os.unlink(system_config_path) + os.unlink(eval_data_path) + + @patch('lightspeed_evaluation.runner.evaluation.run_evaluation') + @patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables') + def test_main_with_output_dir_override(self, mock_setup_env, mock_run_eval): + """Test main function with output directory override.""" + mock_run_eval.return_value = {"TOTAL": 1, "PASS": 1, "FAIL": 0, "ERROR": 0} + + # Create temporary config files + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_config: + yaml.dump({"llm": {"provider": "openai"}}, system_config) + system_config_path = system_config.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_config: + yaml.dump([{"conversation_group_id": "test"}], eval_config) + eval_data_path = eval_config.name + + try: + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', system_config_path, + '--eval-data', eval_data_path, + '--output-dir', '/custom/output/dir' + ]): + result = main() + assert result == 0 + + # Verify that run_evaluation was called with custom output dir + mock_run_eval.assert_called_once_with( + system_config_path, eval_data_path, '/custom/output/dir' + ) + + finally: + os.unlink(system_config_path) + os.unlink(eval_data_path) + + @patch('lightspeed_evaluation.runner.evaluation.run_evaluation') + @patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables') + def test_main_with_evaluation_failure(self, mock_setup_env, mock_run_eval): + """Test main function when evaluation fails.""" + # Mock failed evaluation + mock_run_eval.return_value = None + + # Create temporary config files + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_config: + yaml.dump({"llm": {"provider": "openai"}}, system_config) + system_config_path = system_config.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_config: + yaml.dump([{"conversation_group_id": "test"}], eval_config) + eval_data_path = eval_config.name + + try: + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', system_config_path, + '--eval-data', eval_data_path + ]): + result = main() + assert result == 1 # Should return error code + + finally: + os.unlink(system_config_path) + os.unlink(eval_data_path) + + +class TestRunEvaluation: + """Test run_evaluation function.""" + + @patch('lightspeed_evaluation.runner.evaluation.EvaluationDriver') + @patch('lightspeed_evaluation.runner.evaluation.OutputHandler') + @patch('lightspeed_evaluation.runner.evaluation.DataValidator') + @patch('lightspeed_evaluation.runner.evaluation.ConfigLoader') + def test_run_evaluation_success(self, mock_config_loader_class, mock_validator_class, + mock_output_handler_class, 
mock_driver_class): + """Test successful run_evaluation execution.""" + # Mock ConfigLoader + mock_loader = mock_config_loader_class.return_value + mock_system_config = mock_loader.load_system_config.return_value + mock_system_config.llm_provider = "openai" + mock_system_config.llm_model = "gpt-4" + mock_system_config.output_dir = "./test_output" + mock_system_config.base_filename = "test_eval" + mock_system_config.include_graphs = True + + # Mock DataValidator + mock_validator = mock_validator_class.return_value + mock_validator.load_evaluation_data.return_value = ["mock_data"] + + # Mock EvaluationDriver + mock_driver = mock_driver_class.return_value + mock_results = ["mock_result1", "mock_result2"] + mock_driver.run_evaluation.return_value = mock_results + + # Mock OutputHandler + mock_output_handler = mock_output_handler_class.return_value + + # Create temporary config files + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_config: + yaml.dump({"llm": {"provider": "openai"}}, system_config) + system_config_path = system_config.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_config: + yaml.dump([{"conversation_group_id": "test"}], eval_config) + eval_data_path = eval_config.name + + try: + with patch('lightspeed_evaluation.runner.evaluation.calculate_basic_stats') as mock_stats: + mock_stats.return_value = { + "TOTAL": 2, + "PASS": 1, + "FAIL": 1, + "ERROR": 0 + } + + result = run_evaluation(system_config_path, eval_data_path) + + # Verify result + assert result is not None + assert result["TOTAL"] == 2 + assert result["PASS"] == 1 + assert result["FAIL"] == 1 + assert result["ERROR"] == 0 + + # Verify method calls + mock_loader.load_system_config.assert_called_once_with(system_config_path) + mock_validator.load_evaluation_data.assert_called_once_with(eval_data_path) + mock_driver.run_evaluation.assert_called_once_with(["mock_data"]) + mock_output_handler.generate_reports.assert_called_once_with(mock_results, include_graphs=True) + + finally: + os.unlink(system_config_path) + os.unlink(eval_data_path) + + @patch('lightspeed_evaluation.runner.evaluation.ConfigLoader') + def test_run_evaluation_config_loading_failure(self, mock_config_loader_class): + """Test run_evaluation with configuration loading failure.""" + # Mock ConfigLoader to raise exception + mock_loader = mock_config_loader_class.return_value + mock_loader.load_system_config.side_effect = FileNotFoundError("Config not found") + + result = run_evaluation("nonexistent_system.yaml", "nonexistent_data.yaml") + + assert result is None + + @patch('lightspeed_evaluation.runner.evaluation.EvaluationDriver') + @patch('lightspeed_evaluation.runner.evaluation.DataValidator') + @patch('lightspeed_evaluation.runner.evaluation.ConfigLoader') + def test_run_evaluation_with_custom_output_dir(self, mock_config_loader_class, + mock_validator_class, mock_driver_class): + """Test run_evaluation with custom output directory.""" + # Mock dependencies + mock_loader = mock_config_loader_class.return_value + mock_system_config = mock_loader.load_system_config.return_value + mock_system_config.llm_provider = "openai" + mock_system_config.llm_model = "gpt-4" + mock_system_config.output_dir = "./default_output" + mock_system_config.base_filename = "test_eval" + mock_system_config.include_graphs = False + + mock_validator = mock_validator_class.return_value + mock_validator.load_evaluation_data.return_value = [] + + mock_driver = mock_driver_class.return_value + 
mock_driver.run_evaluation.return_value = [] + + # Create temporary config files + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_config: + yaml.dump({"llm": {"provider": "openai"}}, system_config) + system_config_path = system_config.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_config: + yaml.dump([], eval_config) + eval_data_path = eval_config.name + + try: + with patch('lightspeed_evaluation.runner.evaluation.OutputHandler') as mock_output_handler_class: + with patch('lightspeed_evaluation.runner.evaluation.calculate_basic_stats') as mock_stats: + mock_stats.return_value = {"TOTAL": 0, "PASS": 0, "FAIL": 0, "ERROR": 0} + + custom_output_dir = "/custom/output/path" + result = run_evaluation(system_config_path, eval_data_path, custom_output_dir) + + # Verify that OutputHandler was called with custom output directory + mock_output_handler_class.assert_called_once_with( + output_dir=custom_output_dir, + base_filename="test_eval", + system_config=mock_system_config + ) + + assert result is not None + + finally: + os.unlink(system_config_path) + os.unlink(eval_data_path) + + +class TestCLIArgumentParsing: + """Test CLI argument parsing.""" + + def test_default_arguments(self): + """Test CLI with default arguments.""" + with patch('sys.argv', ['lightspeed-eval']): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + # Mock Path.exists to return False for default paths + mock_path.return_value.exists.return_value = False + + result = main() + assert result == 1 # Should fail due to missing files + + def test_custom_config_paths(self): + """Test CLI with custom configuration paths.""" + custom_system_config = "/path/to/custom/system.yaml" + custom_eval_data = "/path/to/custom/eval_data.yaml" + + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', custom_system_config, + '--eval-data', custom_eval_data + ]): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + # Mock Path.exists to return False + mock_path.return_value.exists.return_value = False + + result = main() + assert result == 1 # Should fail due to missing files + + def test_argument_validation(self): + """Test argument validation in CLI.""" + # Test that the argument parser accepts the expected arguments + import argparse + from lightspeed_evaluation.runner.evaluation import main + + # This test verifies that the argument parser is set up correctly + # by checking that it doesn't raise an exception with valid arguments + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', 'test_system.yaml', + '--eval-data', 'test_eval.yaml', + '--output-dir', '/test/output' + ]): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + mock_path.return_value.exists.return_value = False + + # Should not raise an ArgumentError + result = main() + assert result == 1 # Fails due to missing files, but args are valid + + +class TestCLIRealWorldScenarios: + """Test CLI with real-world scenarios.""" + + @patch('lightspeed_evaluation.runner.evaluation.run_evaluation') + def test_cli_with_custom_output_directory(self, mock_run_eval): + """Test CLI with custom output directory.""" + mock_run_eval.return_value = {"TOTAL": 5, "PASS": 4, 
"FAIL": 1, "ERROR": 0} + + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', 'config/system.yaml', + '--eval-data', 'config/evaluation_data.yaml', + '--output-dir', '/custom/output/path' + ]): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + mock_path.return_value.exists.return_value = True + + result = main() + assert result == 0 + + @patch('lightspeed_evaluation.runner.evaluation.run_evaluation') + def test_cli_evaluation_with_mixed_results(self, mock_run_eval): + """Test CLI when evaluation has mixed results.""" + mock_run_eval.return_value = {"TOTAL": 10, "PASS": 6, "FAIL": 3, "ERROR": 1} + + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', 'config/system.yaml', + '--eval-data', 'config/evaluation_data.yaml' + ]): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + mock_path.return_value.exists.return_value = True + + result = main() + # Should still return 0 (success) as the evaluation completed + assert result == 0 + + def test_cli_with_environment_variables(self): + """Test CLI behavior with environment variables.""" + env_vars = { + 'OPENAI_API_KEY': 'test-key-123', + 'DEEPEVAL_TELEMETRY_OPT_OUT': 'YES', + 'LITELLM_LOG_LEVEL': 'ERROR' + } + + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', 'config/system.yaml', + '--eval-data', 'config/evaluation_data.yaml' + ]): + with patch.dict(os.environ, env_vars): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables') as mock_setup: + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + mock_path.return_value.exists.return_value = False + + result = main() + + # Verify environment setup was called + mock_setup.assert_called_once() + + # Should fail due to missing files, but env setup should have been called + assert result == 1 diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 00000000..a54ca75a --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,513 @@ +"""Tests for configuration components.""" + +import os +import tempfile +from pathlib import Path +from unittest.mock import patch + +import pytest +import yaml + +from lightspeed_evaluation.core.config import ( + ConfigLoader, + DataValidator, + EvaluationData, + LLMConfig, + SystemConfig, + TurnData, + setup_environment_variables, +) + + +class TestSystemConfig: + """Test SystemConfig model.""" + + def test_system_config_defaults(self): + """Test SystemConfig with default values.""" + config = SystemConfig() + + assert config.llm_provider == "openai" + assert config.llm_model == "gpt-4o-mini" + assert config.llm_temperature == 0.0 + assert config.output_dir == "./eval_output" + assert config.include_graphs is True + + def test_system_config_custom_values(self): + """Test SystemConfig with custom values.""" + config = SystemConfig( + llm_provider="anthropic", + llm_model="claude-3-sonnet", + llm_temperature=0.5, + output_dir="./custom_output", + include_graphs=False + ) + + assert config.llm_provider == "anthropic" + assert config.llm_model == "claude-3-sonnet" + assert config.llm_temperature == 0.5 + assert config.output_dir == "./custom_output" + assert config.include_graphs is False + + +class TestLLMConfig: + """Test LLMConfig model.""" + + def test_llm_config_validation(self): + """Test LLMConfig validation.""" + config = 
LLMConfig( + provider="openai", + model="gpt-4", + temperature=0.7, + max_tokens=1000, + timeout=60, + num_retries=2 + ) + + assert config.provider == "openai" + assert config.model == "gpt-4" + assert config.temperature == 0.7 + assert config.max_tokens == 1000 + assert config.timeout == 60 + assert config.num_retries == 2 + + def test_llm_config_invalid_temperature(self): + """Test LLMConfig with invalid temperature.""" + with pytest.raises(ValueError): + LLMConfig( + provider="openai", + model="gpt-4", + temperature=3.0 # Invalid: > 2.0 + ) + + def test_llm_config_from_dict(self): + """Test creating LLMConfig from dictionary.""" + config_dict = { + "provider": "anthropic", + "model": "claude-3-haiku", + "temperature": 0.3, + "max_tokens": 800, + "timeout": 120, + "num_retries": 1 + } + + config = LLMConfig.from_dict(config_dict) + + assert config.provider == "anthropic" + assert config.model == "claude-3-haiku" + assert config.temperature == 0.3 + + +class TestTurnData: + """Test TurnData model.""" + + def test_valid_turn_data(self): + """Test valid TurnData creation.""" + turn = TurnData( + turn_id=1, + query="What is AI?", + response="AI is artificial intelligence.", + contexts=[{"content": "AI context"}], + expected_response="AI stands for artificial intelligence." + ) + + assert turn.turn_id == 1 + assert turn.query == "What is AI?" + assert turn.response == "AI is artificial intelligence." + assert len(turn.contexts) == 1 + assert turn.contexts[0]["content"] == "AI context" + + def test_turn_data_validation_empty_query(self): + """Test TurnData validation with empty query.""" + with pytest.raises(ValueError, match="Query and response cannot be empty"): + TurnData( + turn_id=1, + query="", + response="Valid response" + ) + + def test_turn_data_validation_invalid_turn_id(self): + """Test TurnData validation with invalid turn_id.""" + with pytest.raises(ValueError, match="Turn ID must be positive"): + TurnData( + turn_id=0, # Invalid: must be positive + query="Valid query", + response="Valid response" + ) + + def test_turn_data_context_validation(self): + """Test TurnData context validation.""" + with pytest.raises(ValueError, match='Context 0 must have a "content" field'): + TurnData( + turn_id=1, + query="Valid query", + response="Valid response", + contexts=[{"invalid": "no content field"}] + ) + + +class TestEvaluationData: + """Test EvaluationData model.""" + + def test_valid_evaluation_data(self): + """Test valid EvaluationData creation.""" + turn = TurnData( + turn_id=1, + query="Test query", + response="Test response" + ) + + eval_data = EvaluationData( + conversation_group_id="test_conv", + description="Test conversation", + turn_metrics=["ragas:faithfulness"], + conversation_metrics=["deepeval:completeness"], + turns=[turn] + ) + + assert eval_data.conversation_group_id == "test_conv" + assert eval_data.description == "Test conversation" + assert len(eval_data.turn_metrics) == 1 + assert len(eval_data.conversation_metrics) == 1 + assert len(eval_data.turns) == 1 + + def test_evaluation_data_empty_conversation_id(self): + """Test EvaluationData with empty conversation_group_id.""" + with pytest.raises(ValueError, match="Conversation group ID cannot be empty"): + EvaluationData( + conversation_group_id="", + turns=[TurnData(turn_id=1, query="q", response="r")] + ) + + def test_evaluation_data_empty_turns(self): + """Test EvaluationData with empty turns.""" + with pytest.raises(ValueError, match="Conversation must have at least one turn"): + EvaluationData( + 
conversation_group_id="test_conv", + turns=[] + ) + + def test_evaluation_data_invalid_metric_format(self): + """Test EvaluationData with invalid metric format.""" + turn = TurnData(turn_id=1, query="q", response="r") + + with pytest.raises(ValueError, match='must be in format "framework:metric_name"'): + EvaluationData( + conversation_group_id="test_conv", + turn_metrics=["invalid_metric"], # Missing colon + turns=[turn] + ) + + +class TestConfigLoader: + """Test ConfigLoader functionality.""" + + def test_config_loader_initialization(self): + """Test ConfigLoader initialization.""" + loader = ConfigLoader() + + assert loader.system_config is None + assert loader.evaluation_data is None + assert loader.logger is None + + @patch('lightspeed_evaluation.core.config.loader.setup_logging') + def test_load_system_config_with_mock(self, mock_setup_logging): + """Test loading system config with mocked dependencies.""" + # Create temporary config file + config_data = { + "llm": { + "provider": "openai", + "model": "gpt-4", + "temperature": 0.5 + }, + "output": { + "base_directory": "./test_output" + }, + "logging": { + "source_level": "DEBUG" + }, + "metrics_metadata": { + "turn_level": {}, + "conversation_level": {} + } + } + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + yaml.dump(config_data, f) + config_path = f.name + + try: + loader = ConfigLoader() + system_config = loader.load_system_config(config_path) + + assert system_config.llm_provider == "openai" + assert system_config.llm_model == "gpt-4" + assert system_config.llm_temperature == 0.5 + assert system_config.output_dir == "./test_output" + + finally: + os.unlink(config_path) + + +class TestEnvironmentSetup: + """Test environment variable setup.""" + + def test_setup_environment_variables_success(self): + """Test successful environment variable setup.""" + config_data = { + "environment": { + "TEST_VAR": "test_value", + "ANOTHER_VAR": "another_value" + } + } + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + yaml.dump(config_data, f) + config_path = f.name + + try: + # Clear any existing values + os.environ.pop("TEST_VAR", None) + os.environ.pop("ANOTHER_VAR", None) + + setup_environment_variables(config_path) + + assert os.environ.get("TEST_VAR") == "test_value" + assert os.environ.get("ANOTHER_VAR") == "another_value" + + finally: + os.unlink(config_path) + # Clean up + os.environ.pop("TEST_VAR", None) + os.environ.pop("ANOTHER_VAR", None) + + def test_setup_environment_variables_fallback(self): + """Test environment variable setup with fallback.""" + # Test with non-existent file + setup_environment_variables("nonexistent_config.yaml") + + # Should set fallback values + assert os.environ.get("DEEPEVAL_TELEMETRY_OPT_OUT") == "YES" + assert os.environ.get("LITELLM_LOG_LEVEL") == "ERROR" + + +class TestDataValidator: + """Test DataValidator functionality.""" + + def test_data_validator_initialization(self): + """Test DataValidator initialization.""" + validator = DataValidator() + + assert validator.validation_errors == [] + assert validator.evaluation_data is None + + def test_load_evaluation_data_from_yaml(self): + """Test loading evaluation data from YAML file.""" + eval_data = [ + { + "conversation_group_id": "test_conv", + "turn_metrics": ["ragas:faithfulness"], + "conversation_metrics": [], + "turns": [ + { + "turn_id": 1, + "query": "Test query", + "response": "Test response", + "contexts": [ + {"content": "Test context for faithfulness metric"} + ] + } + ] + 
} + ] + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + yaml.dump(eval_data, f) + data_path = f.name + + try: + # Mock the metric validation to avoid the validation error + with patch('lightspeed_evaluation.core.config.validator.TURN_LEVEL_METRICS', {"ragas:faithfulness"}): + with patch('lightspeed_evaluation.core.config.validator.CONVERSATION_LEVEL_METRICS', set()): + validator = DataValidator() + loaded_data = validator.load_evaluation_data(data_path) + + assert len(loaded_data) == 1 + assert loaded_data[0].conversation_group_id == "test_conv" + assert len(loaded_data[0].turns) == 1 + + finally: + os.unlink(data_path) + + +class TestConfigurationScenarios: + """Test realistic configuration scenarios.""" + + def test_system_config_with_different_providers(self): + """Test SystemConfig with different LLM providers.""" + providers_config = [ + {"provider": "openai", "model": "gpt-4o-mini", "temperature": 0.0}, + {"provider": "anthropic", "model": "claude-3-sonnet", "temperature": 0.1}, + ] + + for config_data in providers_config: + config = SystemConfig( + llm_provider=config_data["provider"], + llm_model=config_data["model"], + llm_temperature=config_data["temperature"] + ) + + assert config.llm_provider == config_data["provider"] + assert config.llm_model == config_data["model"] + assert config.llm_temperature == config_data["temperature"] + + def test_evaluation_data_with_multiple_metrics(self): + """Test EvaluationData with comprehensive metric configurations.""" + eval_data = EvaluationData( + conversation_group_id="comprehensive_eval", + description="Full evaluation with multiple metrics", + turn_metrics=[ + "ragas:faithfulness", + "ragas:response_relevancy", + "custom:answer_correctness" + ], + conversation_metrics=[ + "deepeval:conversation_completeness" + ], + turn_metrics_metadata={ + "ragas:faithfulness": {"threshold": 0.85}, + "custom:answer_correctness": {"threshold": 0.80} + }, + conversation_metrics_metadata={ + "deepeval:conversation_completeness": {"threshold": 0.75} + }, + turns=[ + TurnData( + turn_id=1, + query="What are the benefits of cloud computing?", + response="Cloud computing offers scalability, cost-effectiveness, and accessibility.", + contexts=[ + {"content": "Cloud computing provides on-demand access to computing resources."}, + {"content": "Benefits include reduced infrastructure costs and improved scalability."} + ], + expected_response="Cloud computing provides scalable, cost-effective computing resources." + ) + ] + ) + + assert len(eval_data.turn_metrics) == 3 + assert len(eval_data.conversation_metrics) == 1 + assert len(eval_data.turns) == 1 + assert eval_data.turn_metrics_metadata["ragas:faithfulness"]["threshold"] == 0.85 + + def test_turn_data_with_rich_context(self): + """Test TurnData with comprehensive context information.""" + turn = TurnData( + turn_id=1, + query="How does machine learning model training work?", + response="Machine learning model training involves feeding data to algorithms that learn patterns and make predictions.", + contexts=[ + {"content": "Machine learning training requires large datasets and computational resources."}, + {"content": "The training process involves iterative optimization of model parameters."}, + {"content": "Validation datasets help prevent overfitting during training."} + ], + expected_response="ML training feeds data to algorithms to learn patterns through iterative optimization." 
+ ) + + assert len(turn.contexts) == 3 + assert all("content" in ctx for ctx in turn.contexts) + assert "machine learning" in turn.query.lower() + assert "training" in turn.response.lower() + + def test_load_evaluation_data_invalid_yaml(self): + """Test loading invalid YAML evaluation data.""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + f.write("invalid: yaml: content: [") + data_path = f.name + + try: + validator = DataValidator() + with pytest.raises((ValueError, yaml.YAMLError)): + validator.load_evaluation_data(data_path) + + finally: + os.unlink(data_path) + + +class TestConfigurationScenarios: + """Test realistic configuration scenarios.""" + + def test_system_config_with_different_providers(self): + """Test SystemConfig with different LLM providers.""" + providers_config = [ + {"provider": "openai", "model": "gpt-4o-mini", "temperature": 0.0}, + {"provider": "anthropic", "model": "claude-3-sonnet", "temperature": 0.1}, + ] + + for config_data in providers_config: + config = SystemConfig( + llm_provider=config_data["provider"], + llm_model=config_data["model"], + llm_temperature=config_data["temperature"] + ) + + assert config.llm_provider == config_data["provider"] + assert config.llm_model == config_data["model"] + assert config.llm_temperature == config_data["temperature"] + + def test_evaluation_data_with_multiple_metrics(self): + """Test EvaluationData with comprehensive metric configurations.""" + eval_data = EvaluationData( + conversation_group_id="comprehensive_eval", + description="Full evaluation with multiple metrics", + turn_metrics=[ + "ragas:faithfulness", + "ragas:response_relevancy", + "custom:answer_correctness" + ], + conversation_metrics=[ + "deepeval:conversation_completeness" + ], + turn_metrics_metadata={ + "ragas:faithfulness": {"threshold": 0.85}, + "custom:answer_correctness": {"threshold": 0.80} + }, + conversation_metrics_metadata={ + "deepeval:conversation_completeness": {"threshold": 0.75} + }, + turns=[ + TurnData( + turn_id=1, + query="What are the benefits of cloud computing?", + response="Cloud computing offers scalability, cost-effectiveness, and accessibility.", + contexts=[ + {"content": "Cloud computing provides on-demand access to computing resources."}, + {"content": "Benefits include reduced infrastructure costs and improved scalability."} + ], + expected_response="Cloud computing provides scalable, cost-effective computing resources." + ) + ] + ) + + assert len(eval_data.turn_metrics) == 3 + assert len(eval_data.conversation_metrics) == 1 + assert len(eval_data.turns) == 1 + assert eval_data.turn_metrics_metadata["ragas:faithfulness"]["threshold"] == 0.85 + + def test_turn_data_with_rich_context(self): + """Test TurnData with comprehensive context information.""" + turn = TurnData( + turn_id=1, + query="How does machine learning model training work?", + response="Machine learning model training involves feeding data to algorithms that learn patterns and make predictions.", + contexts=[ + {"content": "Machine learning training requires large datasets and computational resources."}, + {"content": "The training process involves iterative optimization of model parameters."}, + {"content": "Validation datasets help prevent overfitting during training."} + ], + expected_response="ML training feeds data to algorithms to learn patterns through iterative optimization." 
+ ) + + assert len(turn.contexts) == 3 + assert all("content" in ctx for ctx in turn.contexts) + assert "machine learning" in turn.query.lower() + assert "training" in turn.response.lower() diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 259c8822..bfc95c56 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -1,6 +1,426 @@ -"""Evaluation tests""" +"""Comprehensive tests for LightSpeed Evaluation Framework.""" +import json +import os +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch -def test_evaluation(): - """Test evaluation""" - assert True +import pytest +import yaml + +from lightspeed_evaluation import ( + ConfigLoader, + DataValidator, + EvaluationDriver, + OutputHandler, +) +from lightspeed_evaluation.core.config import EvaluationData, EvaluationResult, TurnData +from lightspeed_evaluation.runner.evaluation import main, run_evaluation + + +class TestConfigLoading: + """Test configuration loading functionality.""" + + def test_load_system_config_success(self): + """Test successful loading of system configuration.""" + config_path = "config/system.yaml" + + # Skip if config file doesn't exist + if not Path(config_path).exists(): + pytest.skip(f"Config file {config_path} not found") + + loader = ConfigLoader() + system_config = loader.load_system_config(config_path) + + # Verify basic configuration + assert system_config.llm_provider == "openai" + assert system_config.llm_model == "gpt-4o-mini" + assert system_config.llm_temperature == 0.0 + assert system_config.output_dir == "./eval_output" + assert system_config.include_graphs is True + + def test_load_evaluation_data_success(self): + """Test successful loading of evaluation data.""" + data_path = "config/evaluation_data.yaml" + + # Skip if data file doesn't exist + if not Path(data_path).exists(): + pytest.skip(f"Data file {data_path} not found") + + validator = DataValidator() + evaluation_data = validator.load_evaluation_data(data_path) + + # Verify data structure + assert len(evaluation_data) == 3 # Based on the sample data + assert evaluation_data[0].conversation_group_id == "conv_group_1" + assert len(evaluation_data[0].turns) == 1 + assert evaluation_data[0].turns[0].query == "User query" + + def test_load_nonexistent_config_file(self): + """Test loading non-existent configuration file.""" + loader = ConfigLoader() + + with pytest.raises(FileNotFoundError): + loader.load_system_config("nonexistent_config.yaml") + + def test_load_invalid_yaml_config(self): + """Test loading invalid YAML configuration.""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + f.write("invalid: yaml: content: [") + invalid_config_path = f.name + + try: + loader = ConfigLoader() + with pytest.raises(yaml.YAMLError): + loader.load_system_config(invalid_config_path) + finally: + os.unlink(invalid_config_path) + + +class TestDataValidation: + """Test data validation functionality.""" + + def test_valid_evaluation_data(self): + """Test validation of valid evaluation data.""" + valid_data = [ + EvaluationData( + conversation_group_id="test_conv", + turn_metrics=["ragas:faithfulness"], + conversation_metrics=[], + turns=[ + TurnData( + turn_id=1, + query="Test query", + response="Test response", + contexts=[{"content": "Test context"}], + expected_response="Expected response" + ) + ] + ) + ] + + validator = DataValidator() + result = validator.validate_evaluation_data(valid_data) + assert result is True + + def 
test_invalid_evaluation_data_empty_turns(self): + """Test validation fails for empty turns.""" + with pytest.raises(ValueError, match="Conversation must have at least one turn"): + EvaluationData( + conversation_group_id="test_conv", + turn_metrics=["ragas:faithfulness"], + conversation_metrics=[], + turns=[] # Empty turns should fail + ) + + def test_invalid_evaluation_data_empty_query(self): + """Test validation fails for empty query.""" + with pytest.raises(ValueError, match="Query and response cannot be empty"): + TurnData( + turn_id=1, + query="", # Empty query should fail + response="Test response" + ) + + +class TestEvaluationDriver: + """Test EvaluationDriver functionality.""" + + @pytest.fixture + def mock_config_loader(self): + """Create a mock config loader.""" + loader = MagicMock(spec=ConfigLoader) + loader.get_llm_config_dict.return_value = { + "llm": { + "provider": "openai", + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512, + "timeout": 300, + "num_retries": 3 + } + } + # Add system_config attribute + loader.system_config = MagicMock() + loader.system_config.default_turn_metrics_metadata = {} + loader.system_config.default_conversation_metrics_metadata = {} + return loader + + @pytest.fixture + def sample_evaluation_data(self): + """Create sample evaluation data.""" + return [ + EvaluationData( + conversation_group_id="test_conv", + turn_metrics=["ragas:faithfulness"], + conversation_metrics=[], + turns=[ + TurnData( + turn_id=1, + query="What is Python?", + response="Python is a programming language.", + contexts=[{"content": "Python is a high-level programming language."}], + expected_response="Python is a programming language used for development." + ) + ] + ) + ] + + @patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) + def test_evaluation_driver_initialization(self, mock_config_loader): + """Test EvaluationDriver initialization.""" + driver = EvaluationDriver(mock_config_loader) + assert driver.config_loader == mock_config_loader + assert driver.data_validator is not None + assert driver.metrics_manager is not None + + @patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) + @patch('lightspeed_evaluation.core.metrics.ragas.RagasMetrics.evaluate') + def test_evaluation_driver_run_evaluation(self, mock_ragas_evaluate, mock_config_loader, sample_evaluation_data): + """Test running evaluation with mocked metrics.""" + # Mock the ragas evaluation to return a score + mock_ragas_evaluate.return_value = (0.85, "Mocked faithfulness evaluation") + + driver = EvaluationDriver(mock_config_loader) + results = driver.run_evaluation(sample_evaluation_data) + + assert len(results) == 1 + assert results[0].conversation_group_id == "test_conv" + assert results[0].metric_identifier == "ragas:faithfulness" + assert results[0].score == 0.85 + + +class TestOutputGeneration: + """Test output and report generation.""" + + @pytest.fixture + def sample_results(self): + """Create sample evaluation results.""" + return [ + EvaluationResult( + conversation_group_id="test_conv", + turn_id=1, + metric_identifier="ragas:faithfulness", + result="PASS", + score=0.85, + threshold=0.8, + reason="Good faithfulness score", + query="Test query", + response="Test response", + execution_time=1.5 + ), + EvaluationResult( + conversation_group_id="test_conv", + turn_id=1, + metric_identifier="ragas:response_relevancy", + result="FAIL", + score=0.65, + threshold=0.8, + reason="Low relevancy score", + query="Test query", + response="Test response", + execution_time=1.2 + ) + ] + + def 
test_output_handler_initialization(self): + """Test OutputHandler initialization.""" + with tempfile.TemporaryDirectory() as temp_dir: + handler = OutputHandler( + output_dir=temp_dir, + base_filename="test_evaluation" + ) + assert handler.output_dir == Path(temp_dir) + assert handler.base_filename == "test_evaluation" + + def test_generate_reports(self, sample_results): + """Test report generation.""" + with tempfile.TemporaryDirectory() as temp_dir: + handler = OutputHandler( + output_dir=temp_dir, + base_filename="test_evaluation" + ) + + # Generate reports without graphs to avoid matplotlib issues in tests + handler.generate_reports(sample_results, include_graphs=False) + + # Check that files were created + output_files = list(Path(temp_dir).glob("test_evaluation_*")) + assert len(output_files) >= 3 # CSV, JSON, TXT files + + +class TestIntegrationWithRealConfigs: + """Integration tests using real configuration files.""" + + @pytest.mark.integration + @patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) + @patch('lightspeed_evaluation.core.metrics.ragas.RagasMetrics.evaluate') + @patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalMetrics.evaluate') + @patch('lightspeed_evaluation.core.metrics.custom.CustomMetrics.evaluate') + def test_full_evaluation_pipeline(self, mock_custom, mock_deepeval, mock_ragas): + """Test the complete evaluation pipeline with real config files.""" + system_config_path = "config/system.yaml" + eval_data_path = "config/evaluation_data.yaml" + + # Skip if config files don't exist + if not (Path(system_config_path).exists() and Path(eval_data_path).exists()): + pytest.skip("Config files not found") + + # Mock all metric evaluations + mock_ragas.return_value = (0.85, "Mocked ragas evaluation") + mock_deepeval.return_value = (0.75, "Mocked deepeval evaluation") + mock_custom.return_value = (0.80, "Mocked custom evaluation") + + with tempfile.TemporaryDirectory() as temp_dir: + summary = run_evaluation( + system_config_path=system_config_path, + evaluation_data_path=eval_data_path, + output_dir=temp_dir + ) + + # Verify summary statistics + assert summary is not None + assert "TOTAL" in summary + assert "PASS" in summary + assert "FAIL" in summary + assert "ERROR" in summary + assert summary["TOTAL"] > 0 + + # Verify output files were created + output_files = list(Path(temp_dir).glob("evaluation_*")) + assert len(output_files) >= 3 # At least CSV, JSON, TXT + + @pytest.mark.integration + def test_evaluation_with_mixed_results(self): + """Test evaluation pipeline with mixed pass/fail results.""" + # Create test data with scenarios that should pass and fail + test_data = [ + EvaluationData( + conversation_group_id="high_quality_conv", + turn_metrics=["ragas:faithfulness", "ragas:response_relevancy"], + turns=[ + TurnData( + turn_id=1, + query="What is renewable energy?", + response="Renewable energy comes from natural sources that replenish themselves, such as solar, wind, and hydroelectric power.", + contexts=[{"content": "Renewable energy sources are naturally replenishing and include solar, wind, water, and geothermal power."}] + ) + ] + ), + EvaluationData( + conversation_group_id="low_quality_conv", + turn_metrics=["ragas:faithfulness"], + turns=[ + TurnData( + turn_id=1, + query="Explain quantum computing", + response="Quantum computing uses quantum bits.", + contexts=[{"content": "Quantum computing leverages quantum mechanical phenomena like superposition and entanglement to process information in fundamentally different ways than classical 
computers."}] + ) + ] + ) + ] + + with patch('lightspeed_evaluation.core.metrics.ragas.RagasMetrics.evaluate') as mock_ragas: + # Mock different scores for different conversations + def side_effect(metric_name, conv_data, scope): + if conv_data.conversation_group_id == "high_quality_conv": + return (0.92, "High quality response with good faithfulness") + else: + return (0.45, "Low quality response, lacks detail") + + mock_ragas.side_effect = side_effect + + mock_config_loader = MagicMock(spec=ConfigLoader) + mock_config_loader.get_llm_config_dict.return_value = { + "llm": {"provider": "openai", "model": "gpt-4o-mini", "temperature": 0.0} + } + mock_config_loader.system_config = MagicMock() + mock_config_loader.system_config.default_turn_metrics_metadata = { + "ragas:faithfulness": {"threshold": 0.8} + } + + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + driver = EvaluationDriver(mock_config_loader) + results = driver.run_evaluation(test_data) + + # Should have mixed results + assert len(results) == 3 # 2 from first conv, 1 from second + pass_results = [r for r in results if r.result == "PASS"] + fail_results = [r for r in results if r.result == "FAIL"] + + assert len(pass_results) >= 1 + assert len(fail_results) >= 1 + + def test_evaluation_with_missing_context_data(self): + """Test evaluation behavior when required context data is missing.""" + test_data = [ + EvaluationData( + conversation_group_id="missing_context_conv", + turn_metrics=["ragas:faithfulness"], # Requires context + turns=[ + TurnData( + turn_id=1, + query="What is AI?", + response="AI is artificial intelligence.", + contexts=[] # Missing required context + ) + ] + ) + ] + + validator = DataValidator() + + # Should fail validation due to missing context for faithfulness metric + with patch('lightspeed_evaluation.core.config.validator.TURN_LEVEL_METRICS', {"ragas:faithfulness"}): + result = validator.validate_evaluation_data(test_data) + assert result is False + assert len(validator.validation_errors) > 0 + assert "requires contexts" in validator.validation_errors[0] + + def test_evaluation_with_threshold_variations(self): + """Test evaluation with different threshold configurations.""" + test_data = [ + EvaluationData( + conversation_group_id="threshold_test_conv", + turn_metrics=["ragas:faithfulness"], + turn_metrics_metadata={ + "ragas:faithfulness": {"threshold": 0.9} # High threshold + }, + turns=[ + TurnData( + turn_id=1, + query="Explain photosynthesis", + response="Photosynthesis is how plants make food using sunlight.", + contexts=[{"content": "Photosynthesis is the process by which plants convert light energy into chemical energy."}] + ) + ] + ) + ] + + with patch('lightspeed_evaluation.core.metrics.ragas.RagasMetrics.evaluate') as mock_ragas: + mock_ragas.return_value = (0.85, "Good faithfulness score") # Below 0.9 threshold + + mock_config_loader = MagicMock(spec=ConfigLoader) + mock_config_loader.get_llm_config_dict.return_value = { + "llm": {"provider": "openai", "model": "gpt-4o-mini"} + } + mock_config_loader.system_config = MagicMock() + mock_config_loader.system_config.default_turn_metrics_metadata = {} + + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + driver = EvaluationDriver(mock_config_loader) + results = driver.run_evaluation(test_data) + + assert len(results) == 1 + assert results[0].result == "FAIL" # 0.85 < 0.9 threshold + assert results[0].threshold == 0.9 + + +# Pytest configuration +def pytest_configure(config): + """Configure pytest with custom markers.""" + 
config.addinivalue_line( + "markers", "integration: mark test as integration test" + ) diff --git a/tests/test_metrics.py b/tests/test_metrics.py new file mode 100644 index 00000000..50bac2ae --- /dev/null +++ b/tests/test_metrics.py @@ -0,0 +1,583 @@ +"""Tests for metrics components.""" + +import os +from unittest.mock import MagicMock, patch + +import pytest + +from lightspeed_evaluation.core.config import EvaluationData, TurnData +from lightspeed_evaluation.core.llm.manager import LLMConfig, LLMManager +from lightspeed_evaluation.core.metrics.custom import CustomMetrics +from lightspeed_evaluation.core.metrics.deepeval import DeepEvalMetrics +from lightspeed_evaluation.core.metrics.ragas import RagasMetrics +from lightspeed_evaluation.core.output.statistics import EvaluationScope + + +class TestLLMManager: + """Test LLM Manager functionality.""" + + def test_llm_manager_initialization(self): + """Test LLM Manager initialization with OpenAI.""" + config = LLMConfig( + provider="openai", + model="gpt-4o-mini", + temperature=0.0, + max_tokens=512, + timeout=300, + num_retries=3 + ) + + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + manager = LLMManager(config) + + assert manager.config == config + assert manager.model_name == "gpt-4o-mini" + + def test_llm_manager_missing_api_key(self): + """Test LLM Manager with missing API key.""" + config = LLMConfig(provider="openai", model="gpt-4o-mini") + + with patch.dict(os.environ, {}, clear=True): + with pytest.raises(Exception, match="OPENAI_API_KEY"): + LLMManager(config) + + def test_get_litellm_params(self): + """Test getting LiteLLM parameters.""" + config = LLMConfig( + provider="openai", + model="gpt-4o-mini", + temperature=0.0, + max_tokens=512, + timeout=300, + num_retries=3 + ) + + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + manager = LLMManager(config) + params = manager.get_litellm_params() + + assert params["model"] == "gpt-4o-mini" + assert params["temperature"] == 0.0 + assert params["max_tokens"] == 512 + assert params["timeout"] == 300 + assert params["num_retries"] == 3 + + +class TestCustomMetrics: + """Test Custom Metrics functionality.""" + + @pytest.fixture + def mock_llm_manager(self): + """Create a mock LLM manager.""" + manager = MagicMock(spec=LLMManager) + manager.get_model_name.return_value = "gpt-4o-mini" + manager.get_litellm_params.return_value = { + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512, + "timeout": 300, + "num_retries": 3 + } + return manager + + def test_custom_metrics_initialization(self, mock_llm_manager): + """Test CustomMetrics initialization.""" + metrics = CustomMetrics(mock_llm_manager) + + assert metrics.model_name == "gpt-4o-mini" + assert "answer_correctness" in metrics.supported_metrics + + @patch('lightspeed_evaluation.core.metrics.custom.litellm.completion') + def test_answer_correctness_evaluation(self, mock_completion, mock_llm_manager): + """Test answer correctness evaluation with expected response.""" + # Mock LiteLLM response + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.content = "Score: 0.85\nReason: Response accurately describes Python as a programming language" + mock_completion.return_value = mock_response + + metrics = CustomMetrics(mock_llm_manager) + + turn_data = TurnData( + turn_id=1, + query="What is Python?", + response="Python is a programming language used for web development, data science, and automation.", + contexts=[{"content": "Python is a high-level 
programming language."}], + expected_response="Python is a high-level programming language used for various applications." + ) + + scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + score, reason = metrics.evaluate("answer_correctness", None, scope) + + assert score == 0.85 + assert "Custom answer correctness" in reason + + def test_score_parsing_different_formats(self, mock_llm_manager): + """Test parsing scores in different formats.""" + metrics = CustomMetrics(mock_llm_manager) + + # Test different score formats + test_cases = [ + ("Score: 0.75\nReason: Good", 0.75), + ("8.5/10 - Excellent response", 0.85), + ("Rating: 4 out of 5", 0.8), + ("The score is 90%", 0.9), + ] + + for response_text, expected_score in test_cases: + score, reason = metrics._parse_score_response(response_text) + assert score == expected_score + + def test_unsupported_metric(self, mock_llm_manager): + """Test evaluation of unsupported metric.""" + metrics = CustomMetrics(mock_llm_manager) + + scope = EvaluationScope(is_conversation=False) + score, reason = metrics.evaluate("unsupported_metric", None, scope) + + assert score is None + assert "Unsupported custom metric" in reason + + +class TestRagasMetrics: + """Test Ragas Metrics functionality.""" + + @pytest.fixture + def mock_llm_manager(self): + """Create a mock LLM manager.""" + manager = MagicMock(spec=LLMManager) + manager.get_model_name.return_value = "gpt-4o-mini" + manager.get_litellm_params.return_value = { + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512 + } + return manager + + @patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager') + def test_ragas_metrics_initialization(self, mock_ragas_llm_manager, mock_llm_manager): + """Test RagasMetrics initialization.""" + metrics = RagasMetrics(mock_llm_manager) + + # Verify that RagasLLMManager was called with correct parameters + mock_ragas_llm_manager.assert_called_once_with("gpt-4o-mini", mock_llm_manager.get_litellm_params()) + + assert "faithfulness" in metrics.supported_metrics + assert "response_relevancy" in metrics.supported_metrics + assert "context_recall" in metrics.supported_metrics + + def test_faithfulness_evaluation_with_context(self, mock_llm_manager): + """Test faithfulness evaluation with proper context data.""" + with patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager'): + metrics = RagasMetrics(mock_llm_manager) + + # Mock the _evaluate_metric method directly + with patch.object(metrics, '_evaluate_metric', return_value=(0.92, "Ragas faithfulness: 0.92")): + turn_data = TurnData( + turn_id=1, + query="What are the benefits of renewable energy?", + response="Renewable energy reduces carbon emissions and provides sustainable power generation.", + contexts=[ + {"content": "Renewable energy sources like solar and wind power help reduce greenhouse gas emissions."}, + {"content": "Sustainable energy systems provide long-term environmental benefits."} + ] + ) + + scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + score, reason = metrics.evaluate("faithfulness", None, scope) + + assert score == 0.92 + assert "Ragas faithfulness" in reason + + def test_response_relevancy_evaluation(self, mock_llm_manager): + """Test response relevancy evaluation.""" + with patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager'): + metrics = RagasMetrics(mock_llm_manager) + + with patch.object(metrics, '_evaluate_metric', return_value=(0.88, "Ragas response relevancy: 0.88")): + 
turn_data = TurnData( + turn_id=1, + query="How does machine learning work?", + response="Machine learning uses algorithms to learn patterns from data and make predictions." + ) + + scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + score, reason = metrics.evaluate("response_relevancy", None, scope) + + assert score == 0.88 + assert "response relevancy" in reason + + def test_conversation_level_metric_error(self, mock_llm_manager): + """Test error when using turn-level metric for conversation.""" + with patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager'): + metrics = RagasMetrics(mock_llm_manager) + + scope = EvaluationScope(is_conversation=True) + score, reason = metrics.evaluate("faithfulness", None, scope) + + assert score is None + assert "turn-level metric" in reason + + +class TestDeepEvalMetrics: + """Test DeepEval Metrics functionality.""" + + @pytest.fixture + def mock_llm_manager(self): + """Create a mock LLM manager.""" + manager = MagicMock(spec=LLMManager) + manager.get_model_name.return_value = "gpt-4o-mini" + manager.get_litellm_params.return_value = { + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512 + } + return manager + + @patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager') + def test_deepeval_metrics_initialization(self, mock_deepeval_llm_manager, mock_llm_manager): + """Test DeepEvalMetrics initialization.""" + metrics = DeepEvalMetrics(mock_llm_manager) + + # Verify that DeepEvalLLMManager was called with correct parameters + mock_deepeval_llm_manager.assert_called_once_with("gpt-4o-mini", mock_llm_manager.get_litellm_params()) + + assert "conversation_completeness" in metrics.supported_metrics + assert "conversation_relevancy" in metrics.supported_metrics + assert "knowledge_retention" in metrics.supported_metrics + + @patch('lightspeed_evaluation.core.metrics.deepeval.ConversationCompletenessMetric') + def test_conversation_completeness_evaluation(self, mock_metric_class, mock_llm_manager): + """Test conversation completeness evaluation with multi-turn conversation.""" + # Mock metric instance + mock_metric = MagicMock() + mock_metric.score = 0.82 + mock_metric.reason = "Conversation addresses user needs comprehensively" + mock_metric_class.return_value = mock_metric + + with patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager'): + metrics = DeepEvalMetrics(mock_llm_manager) + + conv_data = EvaluationData( + conversation_group_id="customer_support_conv", + turns=[ + TurnData(turn_id=1, query="I need help with my account", response="I can help you with your account. What specific issue are you experiencing?"), + TurnData(turn_id=2, query="I can't log in", response="Let me help you reset your password. Please check your email for instructions."), + TurnData(turn_id=3, query="I got the email, thanks!", response="Great! 
Is there anything else I can help you with today?") + ] + ) + + scope = EvaluationScope(is_conversation=True) + + score, reason = metrics.evaluate("conversation_completeness", conv_data, scope) + + assert score == 0.82 + assert "comprehensively" in reason + + def test_turn_level_metric_error(self, mock_llm_manager): + """Test error when using conversation-level metric for turn.""" + with patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager'): + metrics = DeepEvalMetrics(mock_llm_manager) + + scope = EvaluationScope(is_conversation=False) + score, reason = metrics.evaluate("conversation_completeness", None, scope) + + assert score is None + assert "conversation-level metric" in reason + + +class TestMetricsIntegration: + """Integration tests for metrics components.""" + + @pytest.mark.integration + @patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) + def test_metrics_manager_integration(self): + """Test integration between different metric types.""" + from lightspeed_evaluation.drivers.evaluation import MetricsManager + + config = LLMConfig(provider="openai", model="gpt-4o-mini") + llm_manager = LLMManager(config) + + metrics_manager = MetricsManager(llm_manager) + + # Verify all handlers are initialized + assert "ragas" in metrics_manager.handlers + assert "deepeval" in metrics_manager.handlers + assert "custom" in metrics_manager.handlers + + # Verify supported frameworks + frameworks = metrics_manager.get_supported_frameworks() + assert "ragas" in frameworks + assert "deepeval" in frameworks + assert "custom" in frameworks + + def test_evaluation_scope_factory_methods(self): + """Test EvaluationScope creation for different scenarios.""" + # Turn-level scope + turn_data = TurnData( + turn_id=1, + query="What is machine learning?", + response="Machine learning is a subset of AI that enables computers to learn from data." 
+ ) + + turn_scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + assert turn_scope.turn_idx == 0 + assert turn_scope.turn_data == turn_data + assert turn_scope.is_conversation is False + + # Conversation-level scope + conv_scope = EvaluationScope(is_conversation=True) + + assert conv_scope.turn_idx is None + assert conv_scope.turn_data is None + assert conv_scope.is_conversation is True + + +class TestRealWorldScenarios: + """Test real-world evaluation scenarios.""" + + @pytest.fixture + def mock_llm_manager(self): + """Create a mock LLM manager.""" + manager = MagicMock(spec=LLMManager) + manager.get_model_name.return_value = "gpt-4o-mini" + manager.get_litellm_params.return_value = { + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512, + "timeout": 300, + "num_retries": 3 + } + return manager + + @patch('lightspeed_evaluation.core.metrics.custom.litellm.completion') + def test_technical_documentation_evaluation(self, mock_completion, mock_llm_manager): + """Test evaluation of technical documentation responses.""" + # Mock LLM response for technical accuracy + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.content = "Score: 0.88\nReason: Response provides accurate technical information about Kubernetes with proper context and examples" + mock_completion.return_value = mock_response + + metrics = CustomMetrics(mock_llm_manager) + + turn_data = TurnData( + turn_id=1, + query="How do I deploy a microservice using Kubernetes?", + response="To deploy a microservice using Kubernetes, you need to create a Deployment manifest that specifies the container image, replicas, and resource requirements. Then use kubectl apply to deploy it to your cluster. You'll also need a Service to expose the microservice to other components.", + contexts=[ + {"content": "Kubernetes deployments manage the lifecycle of containerized applications and ensure desired state."}, + {"content": "Services in Kubernetes provide stable network endpoints for accessing pods."} + ], + expected_response="Deploy microservices in Kubernetes by creating Deployment and Service manifests, then applying them with kubectl." + ) + + scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + score, reason = metrics.evaluate("answer_correctness", None, scope) + + assert score == 0.88 + assert "technical information" in reason + + def test_customer_support_conversation_evaluation(self, mock_llm_manager): + """Test evaluation of customer support conversation completeness.""" + with patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager'): + with patch('lightspeed_evaluation.core.metrics.deepeval.ConversationCompletenessMetric') as mock_metric_class: + # Mock metric for customer support scenario + mock_metric = MagicMock() + mock_metric.score = 0.91 + mock_metric.reason = "Conversation fully addresses customer issue with clear resolution steps" + mock_metric_class.return_value = mock_metric + + metrics = DeepEvalMetrics(mock_llm_manager) + + conv_data = EvaluationData( + conversation_group_id="customer_billing_issue", + turns=[ + TurnData(turn_id=1, query="I was charged twice for my subscription", + response="I understand your concern about the duplicate charge. Let me look into your account to investigate this billing issue."), + TurnData(turn_id=2, query="When will this be resolved?", + response="I can see the duplicate charge in your account. 
I'm processing a refund right now, which should appear in 3-5 business days."), + TurnData(turn_id=3, query="Thank you for the help", + response="You're welcome! I've sent you a confirmation email with the refund details. Is there anything else I can help you with today?") + ] + ) + + scope = EvaluationScope(is_conversation=True) + + score, reason = metrics.evaluate("conversation_completeness", conv_data, scope) + + assert score == 0.91 + assert "fully addresses" in reason + + def test_code_explanation_faithfulness(self, mock_llm_manager): + """Test faithfulness evaluation for code explanation scenarios.""" + with patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager'): + metrics = RagasMetrics(mock_llm_manager) + + with patch.object(metrics, '_evaluate_metric', return_value=(0.94, "Ragas faithfulness: 0.94")): + turn_data = TurnData( + turn_id=1, + query="Explain what this Python function does: def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)", + response="This is a recursive function that calculates the nth Fibonacci number. It uses the base case where if n is 0 or 1, it returns n directly. Otherwise, it recursively calls itself with n-1 and n-2 and adds the results together.", + contexts=[ + {"content": "The Fibonacci sequence is defined as F(0)=0, F(1)=1, and F(n)=F(n-1)+F(n-2) for n>1."}, + {"content": "Recursive functions call themselves with modified parameters until reaching a base case."} + ] + ) + + scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + score, reason = metrics.evaluate("faithfulness", None, scope) + + assert score == 0.94 + assert "faithfulness" in reason + + @patch('lightspeed_evaluation.core.metrics.custom.litellm.completion') + def test_multilingual_content_evaluation(self, mock_completion, mock_llm_manager): + """Test evaluation of responses involving multilingual content.""" + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.content = "Score: 0.82\nReason: Response correctly explains the concept in English while acknowledging the Spanish term" + mock_completion.return_value = mock_response + + metrics = CustomMetrics(mock_llm_manager) + + turn_data = TurnData( + turn_id=1, + query="What does 'inteligencia artificial' mean and how is it used in technology?", + response="'Inteligencia artificial' is Spanish for 'artificial intelligence'. It refers to computer systems that can perform tasks typically requiring human intelligence, such as learning, reasoning, and problem-solving. It's widely used in technology for applications like machine learning, natural language processing, and computer vision.", + contexts=[ + {"content": "Artificial intelligence (AI) encompasses machine learning, neural networks, and automated decision-making systems."} + ], + expected_response="Inteligencia artificial means artificial intelligence in Spanish, referring to computer systems that simulate human intelligence for various technological applications." 
+        )
+
+        scope = EvaluationScope(
+            turn_idx=0,
+            turn_data=turn_data,
+            is_conversation=False
+        )
+
+        score, reason = metrics.evaluate("answer_correctness", None, scope)
+
+        assert score == 0.82
+        assert "Spanish term" in reason
+
+    def test_complex_multi_turn_technical_conversation(self, mock_llm_manager):
+        """Test evaluation of complex multi-turn technical conversations."""
+        with patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager'):
+            with patch('lightspeed_evaluation.core.metrics.deepeval.KnowledgeRetentionMetric') as mock_metric_class:
+                mock_metric = MagicMock()
+                mock_metric.score = 0.87
+                mock_metric.reason = "Good knowledge retention across technical discussion about Docker and Kubernetes"
+                mock_metric_class.return_value = mock_metric
+
+                metrics = DeepEvalMetrics(mock_llm_manager)
+
+                conv_data = EvaluationData(
+                    conversation_group_id="docker_kubernetes_discussion",
+                    turns=[
+                        TurnData(turn_id=1, query="What's the difference between Docker and Kubernetes?",
+                                 response="Docker is a containerization platform that packages applications, while Kubernetes is an orchestration system that manages Docker containers at scale."),
+                        TurnData(turn_id=2, query="How do they work together in a microservices architecture?",
+                                 response="In microservices, Docker containers package individual services, and Kubernetes orchestrates these containers, handling deployment, scaling, and service discovery across the cluster."),
+                        TurnData(turn_id=3, query="What about the networking between these Docker containers you mentioned?",
+                                 response="Kubernetes provides networking through Services and Ingress controllers. Each Docker container gets an IP address, and Services create stable endpoints for communication between the containerized microservices.")
+                    ]
+                )
+
+                scope = EvaluationScope(is_conversation=True)
+
+                score, reason = metrics.evaluate("knowledge_retention", conv_data, scope)
+
+                assert score == 0.87
+                assert "knowledge retention" in reason
+
+    def test_evaluation_with_incomplete_responses(self, mock_llm_manager):
+        """Test evaluation of incomplete or partial responses."""
+        with patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager'):
+            metrics = RagasMetrics(mock_llm_manager)
+
+            with patch.object(metrics, '_evaluate_metric', return_value=(0.34, "Ragas response relevancy: 0.34")):
+                turn_data = TurnData(
+                    turn_id=1,
+                    query="Explain the complete process of photosynthesis including light and dark reactions",
+                    response="Photosynthesis uses sunlight."  # Incomplete response
+                )
+
+                scope = EvaluationScope(
+                    turn_idx=0,
+                    turn_data=turn_data,
+                    is_conversation=False
+                )
+
+                score, reason = metrics.evaluate("response_relevancy", None, scope)
+
+                assert score == 0.34  # Low score for incomplete response
+                assert "response relevancy" in reason
+
+    @patch('lightspeed_evaluation.core.metrics.custom.litellm.completion')
+    def test_evaluation_with_edge_case_scoring(self, mock_completion, mock_llm_manager):
+        """Test evaluation with edge case scoring scenarios."""
+        # Test different score formats that might come from LLM
+        test_cases = [
+            ("Perfect score: 1.0\nReason: Excellent", 1.0),
+            ("Score: 0\nReason: Completely incorrect", 0.0),
+            ("Rating: 7.5 out of 10", 0.75),
+            ("85% accuracy", 0.85),
+            ("Score: 0.999", 0.999),
+        ]
+
+        metrics = CustomMetrics(mock_llm_manager)
+
+        for response_text, expected_score in test_cases:
+            mock_response = MagicMock()
+            mock_response.choices = [MagicMock()]
+            mock_response.choices[0].message.content = response_text
+            mock_completion.return_value = mock_response
+
+            turn_data = TurnData(
+                turn_id=1,
+                query="Test query",
+                response="Test response",
+                expected_response="Expected response"
+            )
+
+            scope = EvaluationScope(
+                turn_idx=0,
+                turn_data=turn_data,
+                is_conversation=False
+            )
+
+            score, reason = metrics.evaluate("answer_correctness", None, scope)
+
+            assert score == expected_score, f"Failed for response: {response_text}"