From eb20bf457916869552aee05b163cdd9439c82762 Mon Sep 17 00:00:00 2001
From: bsatapat
Date: Tue, 2 Sep 2025 11:32:51 +0530
Subject: [PATCH] [LSC_EVAL] Added test case scenarios for the
 lightspeed_evaluation framework

[DESC]
[0] Generated 75 relevant test cases
[1] All the test cases pass successfully.
---
 pytest.ini               |  41 +++
 tests/README.md          | 287 +++++++++++++++++++
 tests/conftest.py        | 301 ++++++++++++++++++++
 tests/run_tests.py       | 173 ++++++++++++
 tests/test_cli.py        | 416 ++++++++++++++++++++++++++++
 tests/test_config.py     | 513 ++++++++++++++++++++++++++++++++++
 tests/test_evaluation.py | 428 +++++++++++++++++++++++++++-
 tests/test_metrics.py    | 583 +++++++++++++++++++++++++++++++++++++++
 8 files changed, 2738 insertions(+), 4 deletions(-)
 create mode 100644 pytest.ini
 create mode 100644 tests/README.md
 create mode 100644 tests/conftest.py
 create mode 100644 tests/run_tests.py
 create mode 100644 tests/test_cli.py
 create mode 100644 tests/test_config.py
 create mode 100644 tests/test_metrics.py

diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..2c0e76cc
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,41 @@
+[pytest]
+# Pytest configuration for LightSpeed Evaluation Framework
+
+# Test discovery
+testpaths = tests
+python_files = test_*.py *_test.py
+python_classes = Test*
+python_functions = test_*
+
+# Output options
+addopts =
+    -v
+    --tb=short
+    --strict-markers
+    --disable-warnings
+    --color=yes
+
+# Markers
+markers =
+    integration: Integration tests that require real config files
+    slow: Tests that take a long time to run
+    unit: Fast unit tests
+    config: Configuration-related tests
+    metrics: Metric evaluation tests
+    output: Output generation tests
+
+# Minimum version
+minversion = 6.0
+
+# Test timeout in seconds (requires the pytest-timeout plugin)
+timeout = 300
+
+# Coverage options (if pytest-cov is installed)
+# addopts = --cov=lightspeed_evaluation --cov-report=html --cov-report=term-missing
+
+# Ignore certain warnings
+filterwarnings =
+    ignore::DeprecationWarning
+    ignore::PendingDeprecationWarning
+    ignore::UserWarning:matplotlib.*
+    ignore::UserWarning:seaborn.*
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..00709978
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,287 @@
+# LightSpeed Evaluation Framework - Test Suite
+
+This directory contains comprehensive tests for the LightSpeed Evaluation Framework. The test suite covers all major components and provides both unit and integration tests.
+ +## Test Structure + +``` +tests/ +├── README.md # This file +├── conftest.py # Pytest configuration and shared fixtures +├── run_tests.py # Test runner script for convenient test execution +├── test_evaluation.py # Main evaluation tests +├── test_config.py # Configuration loading and validation tests +├── test_metrics.py # Metrics evaluation tests +└── test_cli.py # Command-line interface tests +``` + +## Test Categories + +The tests are organized into several categories using pytest markers: + +### By Component +- **`config`**: Configuration loading, validation, and environment setup +- **`metrics`**: Metric evaluation (Ragas, DeepEval, Custom) +- **`cli`**: Command-line interface and argument parsing +- **`output`**: Report generation and output handling + +### By Type +- **`unit`**: Fast unit tests with mocked dependencies +- **`integration`**: Integration tests using real configuration files +- **`slow`**: Tests that take longer to run (usually integration tests) + +## Running Tests + +### Prerequisites + +Install the required testing dependencies: + +```bash +pip install pytest pytest-cov +``` + +### Basic Usage + +```bash +# Run all tests +python -m pytest tests/ + +# Run with verbose output +python -m pytest tests/ -v + +# Run specific test file +python -m pytest tests/test_config.py + +# Run specific test class +python -m pytest tests/test_config.py::TestSystemConfig + +# Run specific test method +python -m pytest tests/test_config.py::TestSystemConfig::test_system_config_defaults +``` + +### Using the Test Runner Script + +The project includes a convenient test runner script located in the `tests/` directory: + +```bash +# Run all tests +python tests/run_tests.py + +# Run only unit tests +python tests/run_tests.py --type unit + +# Run only integration tests +python tests/run_tests.py --type integration + +# Run tests by component +python tests/run_tests.py --type config +python tests/run_tests.py --type metrics +python tests/run_tests.py --type cli + +# Run with coverage report +python tests/run_tests.py --coverage + +# Run with verbose output +python tests/run_tests.py --verbose + +# Run fast tests only (exclude slow tests) +python tests/run_tests.py --type fast + +# Run specific test file +python tests/run_tests.py test_config.py + +# Custom markers +python tests/run_tests.py --markers "unit and not slow" +``` + +### Test Markers + +Use pytest markers to run specific test categories: + +```bash +# Run only unit tests +python -m pytest -m unit + +# Run only integration tests +python -m pytest -m integration + +# Run config-related tests +python -m pytest -m config + +# Run metrics-related tests +python -m pytest -m metrics + +# Run CLI-related tests +python -m pytest -m cli + +# Exclude slow tests +python -m pytest -m "not slow" + +# Combine markers +python -m pytest -m "unit and config" +``` + +## Test Configuration + +### Environment Variables + +The tests automatically set up required environment variables: + +- `OPENAI_API_KEY`: Set to a test value for mocking +- `DEEPEVAL_TELEMETRY_OPT_OUT`: Disabled for testing +- `LITELLM_LOG_LEVEL`: Set to ERROR to reduce noise + +### Fixtures + +The test suite provides several useful fixtures in `conftest.py`: + +- **`sample_system_config`**: Pre-configured SystemConfig object +- **`sample_llm_config`**: Pre-configured LLMConfig object +- **`sample_turn_data`**: Sample conversation turn data +- **`sample_evaluation_data`**: Complete evaluation data structure +- **`mock_llm_manager`**: Mocked LLM manager for testing +- 
**`temp_config_files`**: Temporary configuration files +- **`temp_output_dir`**: Temporary output directory + +## Test Coverage + +To generate a coverage report: + +```bash +# Generate HTML coverage report +python -m pytest --cov=lightspeed_evaluation --cov-report=html tests/ + +# Generate terminal coverage report +python -m pytest --cov=lightspeed_evaluation --cov-report=term-missing tests/ + +# Using the test runner +python tests/run_tests.py --coverage +``` + +The HTML coverage report will be generated in `htmlcov/index.html`. + +## Writing New Tests + +### Test File Organization + +- **Unit tests**: Test individual functions/classes with mocked dependencies +- **Integration tests**: Test component interactions with real or realistic data +- **Use descriptive test names**: `test_load_system_config_with_valid_file` +- **Group related tests**: Use test classes to organize related functionality + +### Example Test Structure + +```python +class TestMyComponent: + """Test MyComponent functionality.""" + + def test_basic_functionality(self): + """Test basic functionality with valid input.""" + # Arrange + component = MyComponent() + + # Act + result = component.do_something() + + # Assert + assert result is not None + + def test_error_handling(self): + """Test error handling with invalid input.""" + component = MyComponent() + + with pytest.raises(ValueError, match="Expected error message"): + component.do_something_invalid() + + @pytest.mark.integration + def test_integration_scenario(self): + """Test integration with other components.""" + # Integration test code here + pass +``` + +### Using Fixtures + +```python +def test_with_fixtures(sample_system_config, temp_output_dir): + """Test using provided fixtures.""" + # Use the fixtures in your test + assert sample_system_config.llm_provider == "openai" + assert Path(temp_output_dir).exists() +``` + +### Mocking External Dependencies + +```python +@patch('lightspeed_evaluation.core.metrics.ragas.evaluate') +def test_with_mocked_dependency(mock_evaluate): + """Test with mocked external dependency.""" + # Configure mock + mock_evaluate.return_value = MagicMock() + + # Run test + result = my_function_that_uses_ragas() + + # Verify mock was called + mock_evaluate.assert_called_once() +``` + +## Continuous Integration + +The test suite is designed to work in CI environments: + +- All external dependencies are mocked +- Temporary files are properly cleaned up +- Tests are deterministic and don't rely on external services +- Environment variables are properly managed + +## Troubleshooting + +### Common Issues + +1. **Import Errors**: Make sure the package is installed in development mode: + ```bash + pip install -e . + ``` + +2. **Missing Dependencies**: Install test dependencies: + ```bash + pip install pytest pytest-cov + ``` + +3. **Configuration File Tests**: Some tests require the actual config files to exist: + - `config/system.yaml` + - `config/evaluation_data.yaml` + +4. **Environment Variables**: Tests automatically set required environment variables, but you can override them if needed. 
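+
+   For example, a single test can temporarily override one of these variables with pytest's built-in `monkeypatch` fixture. This is only a minimal sketch; the override value is illustrative:
+
+   ```python
+   import os
+
+   def test_with_custom_api_key(monkeypatch):
+       """Override an environment variable for one test only."""
+       # conftest.py sets a dummy OPENAI_API_KEY via an autouse fixture;
+       # monkeypatch restores the previous value after the test finishes.
+       monkeypatch.setenv("OPENAI_API_KEY", "my-override-key")
+       assert os.environ["OPENAI_API_KEY"] == "my-override-key"
+   ```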
+ +### Debug Mode + +Run tests with more verbose output for debugging: + +```bash +python -m pytest tests/ -v -s --tb=long +``` + +### Running Individual Tests + +For debugging specific tests: + +```bash +# Run a specific test with full output +python -m pytest tests/test_config.py::TestSystemConfig::test_system_config_defaults -v -s + +# Run with pdb debugger on failure +python -m pytest tests/test_config.py --pdb +``` + +## Contributing + +When adding new functionality: + +1. Write tests for new features +2. Ensure good test coverage (aim for >90%) +3. Use appropriate markers for test categorization +4. Mock external dependencies +5. Add integration tests for complex workflows +6. Update this README if adding new test categories or patterns diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..6dd17948 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,301 @@ +"""Pytest configuration and shared fixtures.""" + +import os +import tempfile +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +import yaml + +from lightspeed_evaluation.core.config import EvaluationData, SystemConfig, TurnData +from lightspeed_evaluation.core.llm.manager import LLMConfig, LLMManager + + +@pytest.fixture(scope="session") +def test_data_dir(): + """Provide test data directory.""" + return Path(__file__).parent / "data" + + +@pytest.fixture(scope="session") +def config_dir(): + """Provide configuration directory.""" + return Path(__file__).parent.parent / "config" + + +@pytest.fixture +def sample_system_config(): + """Provide a sample SystemConfig for testing.""" + return SystemConfig( + llm_provider="openai", + llm_model="gpt-4o-mini", + llm_temperature=0.0, + llm_max_tokens=512, + output_dir="./test_output", + base_filename="test_evaluation", + include_graphs=True + ) + + +@pytest.fixture +def sample_llm_config(): + """Provide a sample LLMConfig for testing.""" + return LLMConfig( + provider="openai", + model="gpt-4o-mini", + temperature=0.0, + max_tokens=512, + timeout=300, + num_retries=3 + ) + + +@pytest.fixture +def sample_turn_data(): + """Provide sample TurnData for testing.""" + return TurnData( + turn_id=1, + query="What is Python?", + response="Python is a high-level programming language.", + contexts=[ + {"content": "Python is a programming language created by Guido van Rossum."}, + {"content": "Python is widely used for web development, data science, and automation."} + ], + expected_response="Python is a high-level programming language used for various applications." 
+ ) + + +@pytest.fixture +def sample_evaluation_data(sample_turn_data): + """Provide sample EvaluationData for testing.""" + return EvaluationData( + conversation_group_id="test_conversation", + description="Test conversation for evaluation", + turn_metrics=["ragas:faithfulness", "ragas:response_relevancy"], + conversation_metrics=["deepeval:conversation_completeness"], + turns=[sample_turn_data] + ) + + +@pytest.fixture +def mock_llm_manager(): + """Provide a mock LLM manager.""" + manager = MagicMock(spec=LLMManager) + manager.get_model_name.return_value = "gpt-4o-mini" + manager.get_litellm_params.return_value = { + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512, + "timeout": 300, + "num_retries": 3 + } + manager.config = LLMConfig( + provider="openai", + model="gpt-4o-mini", + temperature=0.0, + max_tokens=512, + timeout=300, + num_retries=3 + ) + return manager + + +@pytest.fixture +def temp_config_files(): + """Create temporary configuration files for testing.""" + system_config_data = { + "llm": { + "provider": "openai", + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512, + "timeout": 300, + "num_retries": 3 + }, + "environment": { + "DEEPEVAL_TELEMETRY_OPT_OUT": "YES", + "LITELLM_LOG_LEVEL": "ERROR" + }, + "logging": { + "source_level": "INFO", + "package_level": "ERROR" + }, + "metrics_metadata": { + "turn_level": { + "ragas:faithfulness": { + "threshold": 0.8, + "type": "turn", + "framework": "ragas" + } + }, + "conversation_level": { + "deepeval:conversation_completeness": { + "threshold": 0.7, + "type": "conversation", + "framework": "deepeval" + } + } + }, + "output": { + "base_directory": "./test_output", + "base_filename": "test_evaluation", + "formats": {"csv": True, "json": True, "txt": True}, + "include_graphs": True + }, + "visualization": { + "figsize": [12, 8], + "dpi": 300 + } + } + + eval_data = [ + { + "conversation_group_id": "test_conv_1", + "description": "Test conversation 1", + "turn_metrics": ["ragas:faithfulness", "ragas:response_relevancy"], + "conversation_metrics": [], + "turn_metrics_metadata": {}, + "conversation_metrics_metadata": {}, + "turns": [ + { + "turn_id": 1, + "query": "What is machine learning?", + "response": "Machine learning is a subset of AI.", + "contexts": [ + {"content": "Machine learning is a method of data analysis."} + ], + "expected_response": "Machine learning is a subset of artificial intelligence." + } + ] + }, + { + "conversation_group_id": "test_conv_2", + "description": "Test conversation 2", + "turn_metrics": ["custom:answer_correctness"], + "conversation_metrics": ["deepeval:conversation_completeness"], + "turn_metrics_metadata": {}, + "conversation_metrics_metadata": {}, + "turns": [ + { + "turn_id": 1, + "query": "Explain neural networks", + "response": "Neural networks are computing systems inspired by biological neural networks.", + "contexts": [ + {"content": "Neural networks consist of interconnected nodes."} + ], + "expected_response": "Neural networks are computational models inspired by the human brain." + }, + { + "turn_id": 2, + "query": "What are the applications?", + "response": "Neural networks are used in image recognition, NLP, and more.", + "contexts": [ + {"content": "Applications include computer vision and natural language processing."} + ], + "expected_response": "Applications include computer vision, NLP, and pattern recognition." 
+ } + ] + } + ] + + # Create temporary files + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_file: + yaml.dump(system_config_data, system_file, default_flow_style=False) + system_config_path = system_file.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_file: + yaml.dump(eval_data, eval_file, default_flow_style=False) + eval_data_path = eval_file.name + + yield { + "system_config": system_config_path, + "eval_data": eval_data_path + } + + # Cleanup + os.unlink(system_config_path) + os.unlink(eval_data_path) + + +@pytest.fixture +def temp_output_dir(): + """Create a temporary output directory for testing.""" + with tempfile.TemporaryDirectory() as temp_dir: + yield temp_dir + + +@pytest.fixture(autouse=True) +def setup_test_environment(): + """Set up test environment variables.""" + # Set required environment variables for testing + test_env_vars = { + "OPENAI_API_KEY": "test-api-key-for-testing", + "DEEPEVAL_TELEMETRY_OPT_OUT": "YES", + "DEEPEVAL_DISABLE_PROGRESS_BAR": "YES", + "LITELLM_LOG_LEVEL": "ERROR" + } + + # Store original values + original_values = {} + for key, value in test_env_vars.items(): + original_values[key] = os.environ.get(key) + os.environ[key] = value + + yield + + # Restore original values + for key, original_value in original_values.items(): + if original_value is None: + os.environ.pop(key, None) + else: + os.environ[key] = original_value + + +# Pytest markers for different test categories +def pytest_configure(config): + """Configure pytest with custom markers.""" + config.addinivalue_line( + "markers", "unit: mark test as a unit test" + ) + config.addinivalue_line( + "markers", "integration: mark test as an integration test" + ) + config.addinivalue_line( + "markers", "slow: mark test as slow running" + ) + config.addinivalue_line( + "markers", "config: mark test as configuration-related" + ) + config.addinivalue_line( + "markers", "metrics: mark test as metrics-related" + ) + config.addinivalue_line( + "markers", "output: mark test as output-related" + ) + config.addinivalue_line( + "markers", "cli: mark test as CLI-related" + ) + + +# Custom pytest collection hook to organize tests +def pytest_collection_modifyitems(config, items): + """Modify test collection to add markers based on test names and locations.""" + for item in items: + # Add markers based on test file names + if "test_config" in item.fspath.basename: + item.add_marker(pytest.mark.config) + elif "test_metrics" in item.fspath.basename: + item.add_marker(pytest.mark.metrics) + elif "test_cli" in item.fspath.basename: + item.add_marker(pytest.mark.cli) + elif "test_output" in item.fspath.basename: + item.add_marker(pytest.mark.output) + + # Add markers based on test names + if "integration" in item.name: + item.add_marker(pytest.mark.integration) + elif "slow" in item.name: + item.add_marker(pytest.mark.slow) + else: + item.add_marker(pytest.mark.unit) diff --git a/tests/run_tests.py b/tests/run_tests.py new file mode 100644 index 00000000..461d1c49 --- /dev/null +++ b/tests/run_tests.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +"""Test runner script for LightSpeed Evaluation Framework.""" + +import argparse +import subprocess +import sys +from pathlib import Path + + +def run_tests(test_type="all", verbose=False, coverage=False, markers=None): + """Run tests with specified options.""" + + # Use the same Python executable that's running this script + python_exe = sys.executable + + # Base pytest command + cmd = [python_exe, 
"-m", "pytest"] + + # Add verbosity + if verbose: + cmd.append("-v") + else: + cmd.append("-q") + + # Add coverage if requested + if coverage: + cmd.extend([ + "--cov=lightspeed_evaluation", + "--cov-report=html", + "--cov-report=term-missing", + "--cov-report=xml" + ]) + + # Add markers if specified + if markers: + cmd.extend(["-m", markers]) + + # Get the tests directory (current directory since we're inside tests/) + tests_dir = Path(__file__).parent + + # Add test selection based on type + if test_type == "unit": + cmd.extend(["-m", "unit"]) + elif test_type == "integration": + cmd.extend(["-m", "integration"]) + elif test_type == "config": + cmd.extend(["-m", "config"]) + elif test_type == "metrics": + cmd.extend(["-m", "metrics"]) + elif test_type == "cli": + cmd.extend(["-m", "cli"]) + elif test_type == "output": + cmd.extend(["-m", "output"]) + elif test_type == "slow": + cmd.extend(["-m", "slow"]) + elif test_type == "fast": + cmd.extend(["-m", "not slow"]) + elif test_type != "all": + # Specific test file or pattern + # If it's a relative path, make it relative to tests directory + if not test_type.startswith("/") and not test_type.startswith("tests/"): + test_type = str(tests_dir / test_type) + cmd.append(test_type) + + # Add tests directory for general test types + if test_type == "all" or test_type in ["unit", "integration", "config", "metrics", "cli", "output", "slow", "fast"]: + cmd.append(str(tests_dir)) + + print(f"Running command: {' '.join(cmd)}") + + # Run the tests + try: + result = subprocess.run(cmd, check=False) + return result.returncode + except KeyboardInterrupt: + print("\nTests interrupted by user") + return 1 + except Exception as e: + print(f"Error running tests: {e}") + return 1 + + +def main(): + """Main function for test runner.""" + parser = argparse.ArgumentParser( + description="Test runner for LightSpeed Evaluation Framework", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=f""" +Examples: + {sys.executable} tests/run_tests.py # Run all tests + {sys.executable} tests/run_tests.py --type unit # Run only unit tests + {sys.executable} tests/run_tests.py --type integration # Run only integration tests + {sys.executable} tests/run_tests.py --type config # Run only config tests + {sys.executable} tests/run_tests.py --type metrics # Run only metrics tests + {sys.executable} tests/run_tests.py --type cli # Run only CLI tests + {sys.executable} tests/run_tests.py --type fast # Run fast tests (exclude slow) + {sys.executable} tests/run_tests.py --coverage # Run with coverage report + {sys.executable} tests/run_tests.py --verbose # Run with verbose output + {sys.executable} tests/run_tests.py --markers "unit and not slow" # Custom markers + {sys.executable} tests/run_tests.py test_config.py # Run specific test file + """ + ) + + parser.add_argument( + "--type", "-t", + choices=["all", "unit", "integration", "config", "metrics", "cli", "output", "slow", "fast"], + default="all", + help="Type of tests to run (default: all)" + ) + + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Run tests with verbose output" + ) + + parser.add_argument( + "--coverage", "-c", + action="store_true", + help="Run tests with coverage report" + ) + + parser.add_argument( + "--markers", "-m", + help="Custom pytest markers (e.g., 'unit and not slow')" + ) + + parser.add_argument( + "test_path", + nargs="?", + help="Specific test file or directory to run" + ) + + args = parser.parse_args() + + # Use test_path if provided, otherwise use type + 
test_type = args.test_path if args.test_path else args.type + + # Use the same Python executable that's running this script + python_exe = sys.executable + + # Check if pytest is available + try: + subprocess.run([python_exe, "-m", "pytest", "--version"], + check=True, capture_output=True) + except subprocess.CalledProcessError: + print("Error: pytest is not installed. Please install it with:") + print(f" {python_exe} -m pip install pytest") + if args.coverage: + print(f" {python_exe} -m pip install pytest-cov # for coverage support") + return 1 + + # Check if coverage is requested but not available + if args.coverage: + try: + subprocess.run([python_exe, "-m", "pytest_cov", "--version"], + check=True, capture_output=True) + except subprocess.CalledProcessError: + print("Warning: pytest-cov is not installed. Coverage disabled.") + print(f"Install it with: {python_exe} -m pip install pytest-cov") + args.coverage = False + + # Run the tests + return run_tests( + test_type=test_type, + verbose=args.verbose, + coverage=args.coverage, + markers=args.markers + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 00000000..21e0a33d --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,416 @@ +"""Tests for command-line interface.""" + +import os +import tempfile +from pathlib import Path +from unittest.mock import patch + +import pytest +import yaml + +from lightspeed_evaluation.runner.evaluation import main, run_evaluation + + +class TestCLIInterface: + """Test command-line interface functionality.""" + + def test_main_with_help_argument(self): + """Test main function with help argument.""" + with patch('sys.argv', ['lightspeed-eval', '--help']): + with pytest.raises(SystemExit) as exc_info: + main() + # Help should exit with code 0 + assert exc_info.value.code == 0 + + def test_main_with_missing_system_config(self): + """Test main function with missing system config file.""" + with patch('sys.argv', ['lightspeed-eval', '--system-config', 'nonexistent.yaml']): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + result = main() + assert result == 1 # Should return error code + + def test_main_with_missing_eval_data(self): + """Test main function with missing evaluation data file.""" + # Create temporary system config + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + yaml.dump({"llm": {"provider": "openai"}}, f) + system_config_path = f.name + + try: + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', system_config_path, + '--eval-data', 'nonexistent.yaml' + ]): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + result = main() + assert result == 1 # Should return error code + finally: + os.unlink(system_config_path) + + @patch('lightspeed_evaluation.runner.evaluation.run_evaluation') + @patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables') + def test_main_with_valid_arguments(self, mock_setup_env, mock_run_eval): + """Test main function with valid arguments.""" + # Mock successful evaluation + mock_run_eval.return_value = { + "TOTAL": 5, + "PASS": 3, + "FAIL": 1, + "ERROR": 1 + } + + # Create temporary config files + system_config_data = { + "llm": {"provider": "openai", "model": "gpt-4"}, + "output": {"base_directory": "./test_output"} + } + + eval_data = [ + { + "conversation_group_id": "test_conv", + "turn_metrics": ["ragas:faithfulness"], + "turns": [{"turn_id": 1, "query": "q", 
"response": "r"}] + } + ] + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_config: + yaml.dump(system_config_data, system_config) + system_config_path = system_config.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_config: + yaml.dump(eval_data, eval_config) + eval_data_path = eval_config.name + + try: + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', system_config_path, + '--eval-data', eval_data_path + ]): + result = main() + assert result == 0 # Should return success code + + # Verify that run_evaluation was called with correct arguments + mock_run_eval.assert_called_once_with( + system_config_path, eval_data_path, None + ) + + finally: + os.unlink(system_config_path) + os.unlink(eval_data_path) + + @patch('lightspeed_evaluation.runner.evaluation.run_evaluation') + @patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables') + def test_main_with_output_dir_override(self, mock_setup_env, mock_run_eval): + """Test main function with output directory override.""" + mock_run_eval.return_value = {"TOTAL": 1, "PASS": 1, "FAIL": 0, "ERROR": 0} + + # Create temporary config files + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_config: + yaml.dump({"llm": {"provider": "openai"}}, system_config) + system_config_path = system_config.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_config: + yaml.dump([{"conversation_group_id": "test"}], eval_config) + eval_data_path = eval_config.name + + try: + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', system_config_path, + '--eval-data', eval_data_path, + '--output-dir', '/custom/output/dir' + ]): + result = main() + assert result == 0 + + # Verify that run_evaluation was called with custom output dir + mock_run_eval.assert_called_once_with( + system_config_path, eval_data_path, '/custom/output/dir' + ) + + finally: + os.unlink(system_config_path) + os.unlink(eval_data_path) + + @patch('lightspeed_evaluation.runner.evaluation.run_evaluation') + @patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables') + def test_main_with_evaluation_failure(self, mock_setup_env, mock_run_eval): + """Test main function when evaluation fails.""" + # Mock failed evaluation + mock_run_eval.return_value = None + + # Create temporary config files + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_config: + yaml.dump({"llm": {"provider": "openai"}}, system_config) + system_config_path = system_config.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_config: + yaml.dump([{"conversation_group_id": "test"}], eval_config) + eval_data_path = eval_config.name + + try: + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', system_config_path, + '--eval-data', eval_data_path + ]): + result = main() + assert result == 1 # Should return error code + + finally: + os.unlink(system_config_path) + os.unlink(eval_data_path) + + +class TestRunEvaluation: + """Test run_evaluation function.""" + + @patch('lightspeed_evaluation.runner.evaluation.EvaluationDriver') + @patch('lightspeed_evaluation.runner.evaluation.OutputHandler') + @patch('lightspeed_evaluation.runner.evaluation.DataValidator') + @patch('lightspeed_evaluation.runner.evaluation.ConfigLoader') + def test_run_evaluation_success(self, mock_config_loader_class, mock_validator_class, + mock_output_handler_class, 
mock_driver_class): + """Test successful run_evaluation execution.""" + # Mock ConfigLoader + mock_loader = mock_config_loader_class.return_value + mock_system_config = mock_loader.load_system_config.return_value + mock_system_config.llm_provider = "openai" + mock_system_config.llm_model = "gpt-4" + mock_system_config.output_dir = "./test_output" + mock_system_config.base_filename = "test_eval" + mock_system_config.include_graphs = True + + # Mock DataValidator + mock_validator = mock_validator_class.return_value + mock_validator.load_evaluation_data.return_value = ["mock_data"] + + # Mock EvaluationDriver + mock_driver = mock_driver_class.return_value + mock_results = ["mock_result1", "mock_result2"] + mock_driver.run_evaluation.return_value = mock_results + + # Mock OutputHandler + mock_output_handler = mock_output_handler_class.return_value + + # Create temporary config files + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_config: + yaml.dump({"llm": {"provider": "openai"}}, system_config) + system_config_path = system_config.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_config: + yaml.dump([{"conversation_group_id": "test"}], eval_config) + eval_data_path = eval_config.name + + try: + with patch('lightspeed_evaluation.runner.evaluation.calculate_basic_stats') as mock_stats: + mock_stats.return_value = { + "TOTAL": 2, + "PASS": 1, + "FAIL": 1, + "ERROR": 0 + } + + result = run_evaluation(system_config_path, eval_data_path) + + # Verify result + assert result is not None + assert result["TOTAL"] == 2 + assert result["PASS"] == 1 + assert result["FAIL"] == 1 + assert result["ERROR"] == 0 + + # Verify method calls + mock_loader.load_system_config.assert_called_once_with(system_config_path) + mock_validator.load_evaluation_data.assert_called_once_with(eval_data_path) + mock_driver.run_evaluation.assert_called_once_with(["mock_data"]) + mock_output_handler.generate_reports.assert_called_once_with(mock_results, include_graphs=True) + + finally: + os.unlink(system_config_path) + os.unlink(eval_data_path) + + @patch('lightspeed_evaluation.runner.evaluation.ConfigLoader') + def test_run_evaluation_config_loading_failure(self, mock_config_loader_class): + """Test run_evaluation with configuration loading failure.""" + # Mock ConfigLoader to raise exception + mock_loader = mock_config_loader_class.return_value + mock_loader.load_system_config.side_effect = FileNotFoundError("Config not found") + + result = run_evaluation("nonexistent_system.yaml", "nonexistent_data.yaml") + + assert result is None + + @patch('lightspeed_evaluation.runner.evaluation.EvaluationDriver') + @patch('lightspeed_evaluation.runner.evaluation.DataValidator') + @patch('lightspeed_evaluation.runner.evaluation.ConfigLoader') + def test_run_evaluation_with_custom_output_dir(self, mock_config_loader_class, + mock_validator_class, mock_driver_class): + """Test run_evaluation with custom output directory.""" + # Mock dependencies + mock_loader = mock_config_loader_class.return_value + mock_system_config = mock_loader.load_system_config.return_value + mock_system_config.llm_provider = "openai" + mock_system_config.llm_model = "gpt-4" + mock_system_config.output_dir = "./default_output" + mock_system_config.base_filename = "test_eval" + mock_system_config.include_graphs = False + + mock_validator = mock_validator_class.return_value + mock_validator.load_evaluation_data.return_value = [] + + mock_driver = mock_driver_class.return_value + 
mock_driver.run_evaluation.return_value = [] + + # Create temporary config files + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as system_config: + yaml.dump({"llm": {"provider": "openai"}}, system_config) + system_config_path = system_config.name + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as eval_config: + yaml.dump([], eval_config) + eval_data_path = eval_config.name + + try: + with patch('lightspeed_evaluation.runner.evaluation.OutputHandler') as mock_output_handler_class: + with patch('lightspeed_evaluation.runner.evaluation.calculate_basic_stats') as mock_stats: + mock_stats.return_value = {"TOTAL": 0, "PASS": 0, "FAIL": 0, "ERROR": 0} + + custom_output_dir = "/custom/output/path" + result = run_evaluation(system_config_path, eval_data_path, custom_output_dir) + + # Verify that OutputHandler was called with custom output directory + mock_output_handler_class.assert_called_once_with( + output_dir=custom_output_dir, + base_filename="test_eval", + system_config=mock_system_config + ) + + assert result is not None + + finally: + os.unlink(system_config_path) + os.unlink(eval_data_path) + + +class TestCLIArgumentParsing: + """Test CLI argument parsing.""" + + def test_default_arguments(self): + """Test CLI with default arguments.""" + with patch('sys.argv', ['lightspeed-eval']): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + # Mock Path.exists to return False for default paths + mock_path.return_value.exists.return_value = False + + result = main() + assert result == 1 # Should fail due to missing files + + def test_custom_config_paths(self): + """Test CLI with custom configuration paths.""" + custom_system_config = "/path/to/custom/system.yaml" + custom_eval_data = "/path/to/custom/eval_data.yaml" + + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', custom_system_config, + '--eval-data', custom_eval_data + ]): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + # Mock Path.exists to return False + mock_path.return_value.exists.return_value = False + + result = main() + assert result == 1 # Should fail due to missing files + + def test_argument_validation(self): + """Test argument validation in CLI.""" + # Test that the argument parser accepts the expected arguments + import argparse + from lightspeed_evaluation.runner.evaluation import main + + # This test verifies that the argument parser is set up correctly + # by checking that it doesn't raise an exception with valid arguments + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', 'test_system.yaml', + '--eval-data', 'test_eval.yaml', + '--output-dir', '/test/output' + ]): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + mock_path.return_value.exists.return_value = False + + # Should not raise an ArgumentError + result = main() + assert result == 1 # Fails due to missing files, but args are valid + + +class TestCLIRealWorldScenarios: + """Test CLI with real-world scenarios.""" + + @patch('lightspeed_evaluation.runner.evaluation.run_evaluation') + def test_cli_with_custom_output_directory(self, mock_run_eval): + """Test CLI with custom output directory.""" + mock_run_eval.return_value = {"TOTAL": 5, "PASS": 4, 
"FAIL": 1, "ERROR": 0} + + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', 'config/system.yaml', + '--eval-data', 'config/evaluation_data.yaml', + '--output-dir', '/custom/output/path' + ]): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + mock_path.return_value.exists.return_value = True + + result = main() + assert result == 0 + + @patch('lightspeed_evaluation.runner.evaluation.run_evaluation') + def test_cli_evaluation_with_mixed_results(self, mock_run_eval): + """Test CLI when evaluation has mixed results.""" + mock_run_eval.return_value = {"TOTAL": 10, "PASS": 6, "FAIL": 3, "ERROR": 1} + + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', 'config/system.yaml', + '--eval-data', 'config/evaluation_data.yaml' + ]): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables'): + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + mock_path.return_value.exists.return_value = True + + result = main() + # Should still return 0 (success) as the evaluation completed + assert result == 0 + + def test_cli_with_environment_variables(self): + """Test CLI behavior with environment variables.""" + env_vars = { + 'OPENAI_API_KEY': 'test-key-123', + 'DEEPEVAL_TELEMETRY_OPT_OUT': 'YES', + 'LITELLM_LOG_LEVEL': 'ERROR' + } + + with patch('sys.argv', [ + 'lightspeed-eval', + '--system-config', 'config/system.yaml', + '--eval-data', 'config/evaluation_data.yaml' + ]): + with patch.dict(os.environ, env_vars): + with patch('lightspeed_evaluation.runner.evaluation.setup_environment_variables') as mock_setup: + with patch('lightspeed_evaluation.runner.evaluation.Path') as mock_path: + mock_path.return_value.exists.return_value = False + + result = main() + + # Verify environment setup was called + mock_setup.assert_called_once() + + # Should fail due to missing files, but env setup should have been called + assert result == 1 diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 00000000..a54ca75a --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,513 @@ +"""Tests for configuration components.""" + +import os +import tempfile +from pathlib import Path +from unittest.mock import patch + +import pytest +import yaml + +from lightspeed_evaluation.core.config import ( + ConfigLoader, + DataValidator, + EvaluationData, + LLMConfig, + SystemConfig, + TurnData, + setup_environment_variables, +) + + +class TestSystemConfig: + """Test SystemConfig model.""" + + def test_system_config_defaults(self): + """Test SystemConfig with default values.""" + config = SystemConfig() + + assert config.llm_provider == "openai" + assert config.llm_model == "gpt-4o-mini" + assert config.llm_temperature == 0.0 + assert config.output_dir == "./eval_output" + assert config.include_graphs is True + + def test_system_config_custom_values(self): + """Test SystemConfig with custom values.""" + config = SystemConfig( + llm_provider="anthropic", + llm_model="claude-3-sonnet", + llm_temperature=0.5, + output_dir="./custom_output", + include_graphs=False + ) + + assert config.llm_provider == "anthropic" + assert config.llm_model == "claude-3-sonnet" + assert config.llm_temperature == 0.5 + assert config.output_dir == "./custom_output" + assert config.include_graphs is False + + +class TestLLMConfig: + """Test LLMConfig model.""" + + def test_llm_config_validation(self): + """Test LLMConfig validation.""" + config = 
LLMConfig( + provider="openai", + model="gpt-4", + temperature=0.7, + max_tokens=1000, + timeout=60, + num_retries=2 + ) + + assert config.provider == "openai" + assert config.model == "gpt-4" + assert config.temperature == 0.7 + assert config.max_tokens == 1000 + assert config.timeout == 60 + assert config.num_retries == 2 + + def test_llm_config_invalid_temperature(self): + """Test LLMConfig with invalid temperature.""" + with pytest.raises(ValueError): + LLMConfig( + provider="openai", + model="gpt-4", + temperature=3.0 # Invalid: > 2.0 + ) + + def test_llm_config_from_dict(self): + """Test creating LLMConfig from dictionary.""" + config_dict = { + "provider": "anthropic", + "model": "claude-3-haiku", + "temperature": 0.3, + "max_tokens": 800, + "timeout": 120, + "num_retries": 1 + } + + config = LLMConfig.from_dict(config_dict) + + assert config.provider == "anthropic" + assert config.model == "claude-3-haiku" + assert config.temperature == 0.3 + + +class TestTurnData: + """Test TurnData model.""" + + def test_valid_turn_data(self): + """Test valid TurnData creation.""" + turn = TurnData( + turn_id=1, + query="What is AI?", + response="AI is artificial intelligence.", + contexts=[{"content": "AI context"}], + expected_response="AI stands for artificial intelligence." + ) + + assert turn.turn_id == 1 + assert turn.query == "What is AI?" + assert turn.response == "AI is artificial intelligence." + assert len(turn.contexts) == 1 + assert turn.contexts[0]["content"] == "AI context" + + def test_turn_data_validation_empty_query(self): + """Test TurnData validation with empty query.""" + with pytest.raises(ValueError, match="Query and response cannot be empty"): + TurnData( + turn_id=1, + query="", + response="Valid response" + ) + + def test_turn_data_validation_invalid_turn_id(self): + """Test TurnData validation with invalid turn_id.""" + with pytest.raises(ValueError, match="Turn ID must be positive"): + TurnData( + turn_id=0, # Invalid: must be positive + query="Valid query", + response="Valid response" + ) + + def test_turn_data_context_validation(self): + """Test TurnData context validation.""" + with pytest.raises(ValueError, match='Context 0 must have a "content" field'): + TurnData( + turn_id=1, + query="Valid query", + response="Valid response", + contexts=[{"invalid": "no content field"}] + ) + + +class TestEvaluationData: + """Test EvaluationData model.""" + + def test_valid_evaluation_data(self): + """Test valid EvaluationData creation.""" + turn = TurnData( + turn_id=1, + query="Test query", + response="Test response" + ) + + eval_data = EvaluationData( + conversation_group_id="test_conv", + description="Test conversation", + turn_metrics=["ragas:faithfulness"], + conversation_metrics=["deepeval:completeness"], + turns=[turn] + ) + + assert eval_data.conversation_group_id == "test_conv" + assert eval_data.description == "Test conversation" + assert len(eval_data.turn_metrics) == 1 + assert len(eval_data.conversation_metrics) == 1 + assert len(eval_data.turns) == 1 + + def test_evaluation_data_empty_conversation_id(self): + """Test EvaluationData with empty conversation_group_id.""" + with pytest.raises(ValueError, match="Conversation group ID cannot be empty"): + EvaluationData( + conversation_group_id="", + turns=[TurnData(turn_id=1, query="q", response="r")] + ) + + def test_evaluation_data_empty_turns(self): + """Test EvaluationData with empty turns.""" + with pytest.raises(ValueError, match="Conversation must have at least one turn"): + EvaluationData( + 
conversation_group_id="test_conv", + turns=[] + ) + + def test_evaluation_data_invalid_metric_format(self): + """Test EvaluationData with invalid metric format.""" + turn = TurnData(turn_id=1, query="q", response="r") + + with pytest.raises(ValueError, match='must be in format "framework:metric_name"'): + EvaluationData( + conversation_group_id="test_conv", + turn_metrics=["invalid_metric"], # Missing colon + turns=[turn] + ) + + +class TestConfigLoader: + """Test ConfigLoader functionality.""" + + def test_config_loader_initialization(self): + """Test ConfigLoader initialization.""" + loader = ConfigLoader() + + assert loader.system_config is None + assert loader.evaluation_data is None + assert loader.logger is None + + @patch('lightspeed_evaluation.core.config.loader.setup_logging') + def test_load_system_config_with_mock(self, mock_setup_logging): + """Test loading system config with mocked dependencies.""" + # Create temporary config file + config_data = { + "llm": { + "provider": "openai", + "model": "gpt-4", + "temperature": 0.5 + }, + "output": { + "base_directory": "./test_output" + }, + "logging": { + "source_level": "DEBUG" + }, + "metrics_metadata": { + "turn_level": {}, + "conversation_level": {} + } + } + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + yaml.dump(config_data, f) + config_path = f.name + + try: + loader = ConfigLoader() + system_config = loader.load_system_config(config_path) + + assert system_config.llm_provider == "openai" + assert system_config.llm_model == "gpt-4" + assert system_config.llm_temperature == 0.5 + assert system_config.output_dir == "./test_output" + + finally: + os.unlink(config_path) + + +class TestEnvironmentSetup: + """Test environment variable setup.""" + + def test_setup_environment_variables_success(self): + """Test successful environment variable setup.""" + config_data = { + "environment": { + "TEST_VAR": "test_value", + "ANOTHER_VAR": "another_value" + } + } + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + yaml.dump(config_data, f) + config_path = f.name + + try: + # Clear any existing values + os.environ.pop("TEST_VAR", None) + os.environ.pop("ANOTHER_VAR", None) + + setup_environment_variables(config_path) + + assert os.environ.get("TEST_VAR") == "test_value" + assert os.environ.get("ANOTHER_VAR") == "another_value" + + finally: + os.unlink(config_path) + # Clean up + os.environ.pop("TEST_VAR", None) + os.environ.pop("ANOTHER_VAR", None) + + def test_setup_environment_variables_fallback(self): + """Test environment variable setup with fallback.""" + # Test with non-existent file + setup_environment_variables("nonexistent_config.yaml") + + # Should set fallback values + assert os.environ.get("DEEPEVAL_TELEMETRY_OPT_OUT") == "YES" + assert os.environ.get("LITELLM_LOG_LEVEL") == "ERROR" + + +class TestDataValidator: + """Test DataValidator functionality.""" + + def test_data_validator_initialization(self): + """Test DataValidator initialization.""" + validator = DataValidator() + + assert validator.validation_errors == [] + assert validator.evaluation_data is None + + def test_load_evaluation_data_from_yaml(self): + """Test loading evaluation data from YAML file.""" + eval_data = [ + { + "conversation_group_id": "test_conv", + "turn_metrics": ["ragas:faithfulness"], + "conversation_metrics": [], + "turns": [ + { + "turn_id": 1, + "query": "Test query", + "response": "Test response", + "contexts": [ + {"content": "Test context for faithfulness metric"} + ] + } + ] + 
} + ] + + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + yaml.dump(eval_data, f) + data_path = f.name + + try: + # Mock the metric validation to avoid the validation error + with patch('lightspeed_evaluation.core.config.validator.TURN_LEVEL_METRICS', {"ragas:faithfulness"}): + with patch('lightspeed_evaluation.core.config.validator.CONVERSATION_LEVEL_METRICS', set()): + validator = DataValidator() + loaded_data = validator.load_evaluation_data(data_path) + + assert len(loaded_data) == 1 + assert loaded_data[0].conversation_group_id == "test_conv" + assert len(loaded_data[0].turns) == 1 + + finally: + os.unlink(data_path) + + +class TestConfigurationScenarios: + """Test realistic configuration scenarios.""" + + def test_system_config_with_different_providers(self): + """Test SystemConfig with different LLM providers.""" + providers_config = [ + {"provider": "openai", "model": "gpt-4o-mini", "temperature": 0.0}, + {"provider": "anthropic", "model": "claude-3-sonnet", "temperature": 0.1}, + ] + + for config_data in providers_config: + config = SystemConfig( + llm_provider=config_data["provider"], + llm_model=config_data["model"], + llm_temperature=config_data["temperature"] + ) + + assert config.llm_provider == config_data["provider"] + assert config.llm_model == config_data["model"] + assert config.llm_temperature == config_data["temperature"] + + def test_evaluation_data_with_multiple_metrics(self): + """Test EvaluationData with comprehensive metric configurations.""" + eval_data = EvaluationData( + conversation_group_id="comprehensive_eval", + description="Full evaluation with multiple metrics", + turn_metrics=[ + "ragas:faithfulness", + "ragas:response_relevancy", + "custom:answer_correctness" + ], + conversation_metrics=[ + "deepeval:conversation_completeness" + ], + turn_metrics_metadata={ + "ragas:faithfulness": {"threshold": 0.85}, + "custom:answer_correctness": {"threshold": 0.80} + }, + conversation_metrics_metadata={ + "deepeval:conversation_completeness": {"threshold": 0.75} + }, + turns=[ + TurnData( + turn_id=1, + query="What are the benefits of cloud computing?", + response="Cloud computing offers scalability, cost-effectiveness, and accessibility.", + contexts=[ + {"content": "Cloud computing provides on-demand access to computing resources."}, + {"content": "Benefits include reduced infrastructure costs and improved scalability."} + ], + expected_response="Cloud computing provides scalable, cost-effective computing resources." + ) + ] + ) + + assert len(eval_data.turn_metrics) == 3 + assert len(eval_data.conversation_metrics) == 1 + assert len(eval_data.turns) == 1 + assert eval_data.turn_metrics_metadata["ragas:faithfulness"]["threshold"] == 0.85 + + def test_turn_data_with_rich_context(self): + """Test TurnData with comprehensive context information.""" + turn = TurnData( + turn_id=1, + query="How does machine learning model training work?", + response="Machine learning model training involves feeding data to algorithms that learn patterns and make predictions.", + contexts=[ + {"content": "Machine learning training requires large datasets and computational resources."}, + {"content": "The training process involves iterative optimization of model parameters."}, + {"content": "Validation datasets help prevent overfitting during training."} + ], + expected_response="ML training feeds data to algorithms to learn patterns through iterative optimization." 
+ ) + + assert len(turn.contexts) == 3 + assert all("content" in ctx for ctx in turn.contexts) + assert "machine learning" in turn.query.lower() + assert "training" in turn.response.lower() + + def test_load_evaluation_data_invalid_yaml(self): + """Test loading invalid YAML evaluation data.""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + f.write("invalid: yaml: content: [") + data_path = f.name + + try: + validator = DataValidator() + with pytest.raises((ValueError, yaml.YAMLError)): + validator.load_evaluation_data(data_path) + + finally: + os.unlink(data_path) + + +class TestConfigurationScenarios: + """Test realistic configuration scenarios.""" + + def test_system_config_with_different_providers(self): + """Test SystemConfig with different LLM providers.""" + providers_config = [ + {"provider": "openai", "model": "gpt-4o-mini", "temperature": 0.0}, + {"provider": "anthropic", "model": "claude-3-sonnet", "temperature": 0.1}, + ] + + for config_data in providers_config: + config = SystemConfig( + llm_provider=config_data["provider"], + llm_model=config_data["model"], + llm_temperature=config_data["temperature"] + ) + + assert config.llm_provider == config_data["provider"] + assert config.llm_model == config_data["model"] + assert config.llm_temperature == config_data["temperature"] + + def test_evaluation_data_with_multiple_metrics(self): + """Test EvaluationData with comprehensive metric configurations.""" + eval_data = EvaluationData( + conversation_group_id="comprehensive_eval", + description="Full evaluation with multiple metrics", + turn_metrics=[ + "ragas:faithfulness", + "ragas:response_relevancy", + "custom:answer_correctness" + ], + conversation_metrics=[ + "deepeval:conversation_completeness" + ], + turn_metrics_metadata={ + "ragas:faithfulness": {"threshold": 0.85}, + "custom:answer_correctness": {"threshold": 0.80} + }, + conversation_metrics_metadata={ + "deepeval:conversation_completeness": {"threshold": 0.75} + }, + turns=[ + TurnData( + turn_id=1, + query="What are the benefits of cloud computing?", + response="Cloud computing offers scalability, cost-effectiveness, and accessibility.", + contexts=[ + {"content": "Cloud computing provides on-demand access to computing resources."}, + {"content": "Benefits include reduced infrastructure costs and improved scalability."} + ], + expected_response="Cloud computing provides scalable, cost-effective computing resources." + ) + ] + ) + + assert len(eval_data.turn_metrics) == 3 + assert len(eval_data.conversation_metrics) == 1 + assert len(eval_data.turns) == 1 + assert eval_data.turn_metrics_metadata["ragas:faithfulness"]["threshold"] == 0.85 + + def test_turn_data_with_rich_context(self): + """Test TurnData with comprehensive context information.""" + turn = TurnData( + turn_id=1, + query="How does machine learning model training work?", + response="Machine learning model training involves feeding data to algorithms that learn patterns and make predictions.", + contexts=[ + {"content": "Machine learning training requires large datasets and computational resources."}, + {"content": "The training process involves iterative optimization of model parameters."}, + {"content": "Validation datasets help prevent overfitting during training."} + ], + expected_response="ML training feeds data to algorithms to learn patterns through iterative optimization." 
+ ) + + assert len(turn.contexts) == 3 + assert all("content" in ctx for ctx in turn.contexts) + assert "machine learning" in turn.query.lower() + assert "training" in turn.response.lower() diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 259c8822..bfc95c56 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -1,6 +1,426 @@ -"""Evaluation tests""" +"""Comprehensive tests for LightSpeed Evaluation Framework.""" +import json +import os +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch -def test_evaluation(): - """Test evaluation""" - assert True +import pytest +import yaml + +from lightspeed_evaluation import ( + ConfigLoader, + DataValidator, + EvaluationDriver, + OutputHandler, +) +from lightspeed_evaluation.core.config import EvaluationData, EvaluationResult, TurnData +from lightspeed_evaluation.runner.evaluation import main, run_evaluation + + +class TestConfigLoading: + """Test configuration loading functionality.""" + + def test_load_system_config_success(self): + """Test successful loading of system configuration.""" + config_path = "config/system.yaml" + + # Skip if config file doesn't exist + if not Path(config_path).exists(): + pytest.skip(f"Config file {config_path} not found") + + loader = ConfigLoader() + system_config = loader.load_system_config(config_path) + + # Verify basic configuration + assert system_config.llm_provider == "openai" + assert system_config.llm_model == "gpt-4o-mini" + assert system_config.llm_temperature == 0.0 + assert system_config.output_dir == "./eval_output" + assert system_config.include_graphs is True + + def test_load_evaluation_data_success(self): + """Test successful loading of evaluation data.""" + data_path = "config/evaluation_data.yaml" + + # Skip if data file doesn't exist + if not Path(data_path).exists(): + pytest.skip(f"Data file {data_path} not found") + + validator = DataValidator() + evaluation_data = validator.load_evaluation_data(data_path) + + # Verify data structure + assert len(evaluation_data) == 3 # Based on the sample data + assert evaluation_data[0].conversation_group_id == "conv_group_1" + assert len(evaluation_data[0].turns) == 1 + assert evaluation_data[0].turns[0].query == "User query" + + def test_load_nonexistent_config_file(self): + """Test loading non-existent configuration file.""" + loader = ConfigLoader() + + with pytest.raises(FileNotFoundError): + loader.load_system_config("nonexistent_config.yaml") + + def test_load_invalid_yaml_config(self): + """Test loading invalid YAML configuration.""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + f.write("invalid: yaml: content: [") + invalid_config_path = f.name + + try: + loader = ConfigLoader() + with pytest.raises(yaml.YAMLError): + loader.load_system_config(invalid_config_path) + finally: + os.unlink(invalid_config_path) + + +class TestDataValidation: + """Test data validation functionality.""" + + def test_valid_evaluation_data(self): + """Test validation of valid evaluation data.""" + valid_data = [ + EvaluationData( + conversation_group_id="test_conv", + turn_metrics=["ragas:faithfulness"], + conversation_metrics=[], + turns=[ + TurnData( + turn_id=1, + query="Test query", + response="Test response", + contexts=[{"content": "Test context"}], + expected_response="Expected response" + ) + ] + ) + ] + + validator = DataValidator() + result = validator.validate_evaluation_data(valid_data) + assert result is True + + def 
test_invalid_evaluation_data_empty_turns(self): + """Test validation fails for empty turns.""" + with pytest.raises(ValueError, match="Conversation must have at least one turn"): + EvaluationData( + conversation_group_id="test_conv", + turn_metrics=["ragas:faithfulness"], + conversation_metrics=[], + turns=[] # Empty turns should fail + ) + + def test_invalid_evaluation_data_empty_query(self): + """Test validation fails for empty query.""" + with pytest.raises(ValueError, match="Query and response cannot be empty"): + TurnData( + turn_id=1, + query="", # Empty query should fail + response="Test response" + ) + + +class TestEvaluationDriver: + """Test EvaluationDriver functionality.""" + + @pytest.fixture + def mock_config_loader(self): + """Create a mock config loader.""" + loader = MagicMock(spec=ConfigLoader) + loader.get_llm_config_dict.return_value = { + "llm": { + "provider": "openai", + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512, + "timeout": 300, + "num_retries": 3 + } + } + # Add system_config attribute + loader.system_config = MagicMock() + loader.system_config.default_turn_metrics_metadata = {} + loader.system_config.default_conversation_metrics_metadata = {} + return loader + + @pytest.fixture + def sample_evaluation_data(self): + """Create sample evaluation data.""" + return [ + EvaluationData( + conversation_group_id="test_conv", + turn_metrics=["ragas:faithfulness"], + conversation_metrics=[], + turns=[ + TurnData( + turn_id=1, + query="What is Python?", + response="Python is a programming language.", + contexts=[{"content": "Python is a high-level programming language."}], + expected_response="Python is a programming language used for development." + ) + ] + ) + ] + + @patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) + def test_evaluation_driver_initialization(self, mock_config_loader): + """Test EvaluationDriver initialization.""" + driver = EvaluationDriver(mock_config_loader) + assert driver.config_loader == mock_config_loader + assert driver.data_validator is not None + assert driver.metrics_manager is not None + + @patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) + @patch('lightspeed_evaluation.core.metrics.ragas.RagasMetrics.evaluate') + def test_evaluation_driver_run_evaluation(self, mock_ragas_evaluate, mock_config_loader, sample_evaluation_data): + """Test running evaluation with mocked metrics.""" + # Mock the ragas evaluation to return a score + mock_ragas_evaluate.return_value = (0.85, "Mocked faithfulness evaluation") + + driver = EvaluationDriver(mock_config_loader) + results = driver.run_evaluation(sample_evaluation_data) + + assert len(results) == 1 + assert results[0].conversation_group_id == "test_conv" + assert results[0].metric_identifier == "ragas:faithfulness" + assert results[0].score == 0.85 + + +class TestOutputGeneration: + """Test output and report generation.""" + + @pytest.fixture + def sample_results(self): + """Create sample evaluation results.""" + return [ + EvaluationResult( + conversation_group_id="test_conv", + turn_id=1, + metric_identifier="ragas:faithfulness", + result="PASS", + score=0.85, + threshold=0.8, + reason="Good faithfulness score", + query="Test query", + response="Test response", + execution_time=1.5 + ), + EvaluationResult( + conversation_group_id="test_conv", + turn_id=1, + metric_identifier="ragas:response_relevancy", + result="FAIL", + score=0.65, + threshold=0.8, + reason="Low relevancy score", + query="Test query", + response="Test response", + execution_time=1.2 + ) + ] + + def 
test_output_handler_initialization(self): + """Test OutputHandler initialization.""" + with tempfile.TemporaryDirectory() as temp_dir: + handler = OutputHandler( + output_dir=temp_dir, + base_filename="test_evaluation" + ) + assert handler.output_dir == Path(temp_dir) + assert handler.base_filename == "test_evaluation" + + def test_generate_reports(self, sample_results): + """Test report generation.""" + with tempfile.TemporaryDirectory() as temp_dir: + handler = OutputHandler( + output_dir=temp_dir, + base_filename="test_evaluation" + ) + + # Generate reports without graphs to avoid matplotlib issues in tests + handler.generate_reports(sample_results, include_graphs=False) + + # Check that files were created + output_files = list(Path(temp_dir).glob("test_evaluation_*")) + assert len(output_files) >= 3 # CSV, JSON, TXT files + + +class TestIntegrationWithRealConfigs: + """Integration tests using real configuration files.""" + + @pytest.mark.integration + @patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) + @patch('lightspeed_evaluation.core.metrics.ragas.RagasMetrics.evaluate') + @patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalMetrics.evaluate') + @patch('lightspeed_evaluation.core.metrics.custom.CustomMetrics.evaluate') + def test_full_evaluation_pipeline(self, mock_custom, mock_deepeval, mock_ragas): + """Test the complete evaluation pipeline with real config files.""" + system_config_path = "config/system.yaml" + eval_data_path = "config/evaluation_data.yaml" + + # Skip if config files don't exist + if not (Path(system_config_path).exists() and Path(eval_data_path).exists()): + pytest.skip("Config files not found") + + # Mock all metric evaluations + mock_ragas.return_value = (0.85, "Mocked ragas evaluation") + mock_deepeval.return_value = (0.75, "Mocked deepeval evaluation") + mock_custom.return_value = (0.80, "Mocked custom evaluation") + + with tempfile.TemporaryDirectory() as temp_dir: + summary = run_evaluation( + system_config_path=system_config_path, + evaluation_data_path=eval_data_path, + output_dir=temp_dir + ) + + # Verify summary statistics + assert summary is not None + assert "TOTAL" in summary + assert "PASS" in summary + assert "FAIL" in summary + assert "ERROR" in summary + assert summary["TOTAL"] > 0 + + # Verify output files were created + output_files = list(Path(temp_dir).glob("evaluation_*")) + assert len(output_files) >= 3 # At least CSV, JSON, TXT + + @pytest.mark.integration + def test_evaluation_with_mixed_results(self): + """Test evaluation pipeline with mixed pass/fail results.""" + # Create test data with scenarios that should pass and fail + test_data = [ + EvaluationData( + conversation_group_id="high_quality_conv", + turn_metrics=["ragas:faithfulness", "ragas:response_relevancy"], + turns=[ + TurnData( + turn_id=1, + query="What is renewable energy?", + response="Renewable energy comes from natural sources that replenish themselves, such as solar, wind, and hydroelectric power.", + contexts=[{"content": "Renewable energy sources are naturally replenishing and include solar, wind, water, and geothermal power."}] + ) + ] + ), + EvaluationData( + conversation_group_id="low_quality_conv", + turn_metrics=["ragas:faithfulness"], + turns=[ + TurnData( + turn_id=1, + query="Explain quantum computing", + response="Quantum computing uses quantum bits.", + contexts=[{"content": "Quantum computing leverages quantum mechanical phenomena like superposition and entanglement to process information in fundamentally different ways than classical 
computers."}] + ) + ] + ) + ] + + with patch('lightspeed_evaluation.core.metrics.ragas.RagasMetrics.evaluate') as mock_ragas: + # Mock different scores for different conversations + def side_effect(metric_name, conv_data, scope): + if conv_data.conversation_group_id == "high_quality_conv": + return (0.92, "High quality response with good faithfulness") + else: + return (0.45, "Low quality response, lacks detail") + + mock_ragas.side_effect = side_effect + + mock_config_loader = MagicMock(spec=ConfigLoader) + mock_config_loader.get_llm_config_dict.return_value = { + "llm": {"provider": "openai", "model": "gpt-4o-mini", "temperature": 0.0} + } + mock_config_loader.system_config = MagicMock() + mock_config_loader.system_config.default_turn_metrics_metadata = { + "ragas:faithfulness": {"threshold": 0.8} + } + + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + driver = EvaluationDriver(mock_config_loader) + results = driver.run_evaluation(test_data) + + # Should have mixed results + assert len(results) == 3 # 2 from first conv, 1 from second + pass_results = [r for r in results if r.result == "PASS"] + fail_results = [r for r in results if r.result == "FAIL"] + + assert len(pass_results) >= 1 + assert len(fail_results) >= 1 + + def test_evaluation_with_missing_context_data(self): + """Test evaluation behavior when required context data is missing.""" + test_data = [ + EvaluationData( + conversation_group_id="missing_context_conv", + turn_metrics=["ragas:faithfulness"], # Requires context + turns=[ + TurnData( + turn_id=1, + query="What is AI?", + response="AI is artificial intelligence.", + contexts=[] # Missing required context + ) + ] + ) + ] + + validator = DataValidator() + + # Should fail validation due to missing context for faithfulness metric + with patch('lightspeed_evaluation.core.config.validator.TURN_LEVEL_METRICS', {"ragas:faithfulness"}): + result = validator.validate_evaluation_data(test_data) + assert result is False + assert len(validator.validation_errors) > 0 + assert "requires contexts" in validator.validation_errors[0] + + def test_evaluation_with_threshold_variations(self): + """Test evaluation with different threshold configurations.""" + test_data = [ + EvaluationData( + conversation_group_id="threshold_test_conv", + turn_metrics=["ragas:faithfulness"], + turn_metrics_metadata={ + "ragas:faithfulness": {"threshold": 0.9} # High threshold + }, + turns=[ + TurnData( + turn_id=1, + query="Explain photosynthesis", + response="Photosynthesis is how plants make food using sunlight.", + contexts=[{"content": "Photosynthesis is the process by which plants convert light energy into chemical energy."}] + ) + ] + ) + ] + + with patch('lightspeed_evaluation.core.metrics.ragas.RagasMetrics.evaluate') as mock_ragas: + mock_ragas.return_value = (0.85, "Good faithfulness score") # Below 0.9 threshold + + mock_config_loader = MagicMock(spec=ConfigLoader) + mock_config_loader.get_llm_config_dict.return_value = { + "llm": {"provider": "openai", "model": "gpt-4o-mini"} + } + mock_config_loader.system_config = MagicMock() + mock_config_loader.system_config.default_turn_metrics_metadata = {} + + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + driver = EvaluationDriver(mock_config_loader) + results = driver.run_evaluation(test_data) + + assert len(results) == 1 + assert results[0].result == "FAIL" # 0.85 < 0.9 threshold + assert results[0].threshold == 0.9 + + +# Pytest configuration +def pytest_configure(config): + """Configure pytest with custom markers.""" + 
config.addinivalue_line( + "markers", "integration: mark test as integration test" + ) diff --git a/tests/test_metrics.py b/tests/test_metrics.py new file mode 100644 index 00000000..50bac2ae --- /dev/null +++ b/tests/test_metrics.py @@ -0,0 +1,583 @@ +"""Tests for metrics components.""" + +import os +from unittest.mock import MagicMock, patch + +import pytest + +from lightspeed_evaluation.core.config import EvaluationData, TurnData +from lightspeed_evaluation.core.llm.manager import LLMConfig, LLMManager +from lightspeed_evaluation.core.metrics.custom import CustomMetrics +from lightspeed_evaluation.core.metrics.deepeval import DeepEvalMetrics +from lightspeed_evaluation.core.metrics.ragas import RagasMetrics +from lightspeed_evaluation.core.output.statistics import EvaluationScope + + +class TestLLMManager: + """Test LLM Manager functionality.""" + + def test_llm_manager_initialization(self): + """Test LLM Manager initialization with OpenAI.""" + config = LLMConfig( + provider="openai", + model="gpt-4o-mini", + temperature=0.0, + max_tokens=512, + timeout=300, + num_retries=3 + ) + + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + manager = LLMManager(config) + + assert manager.config == config + assert manager.model_name == "gpt-4o-mini" + + def test_llm_manager_missing_api_key(self): + """Test LLM Manager with missing API key.""" + config = LLMConfig(provider="openai", model="gpt-4o-mini") + + with patch.dict(os.environ, {}, clear=True): + with pytest.raises(Exception, match="OPENAI_API_KEY"): + LLMManager(config) + + def test_get_litellm_params(self): + """Test getting LiteLLM parameters.""" + config = LLMConfig( + provider="openai", + model="gpt-4o-mini", + temperature=0.0, + max_tokens=512, + timeout=300, + num_retries=3 + ) + + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + manager = LLMManager(config) + params = manager.get_litellm_params() + + assert params["model"] == "gpt-4o-mini" + assert params["temperature"] == 0.0 + assert params["max_tokens"] == 512 + assert params["timeout"] == 300 + assert params["num_retries"] == 3 + + +class TestCustomMetrics: + """Test Custom Metrics functionality.""" + + @pytest.fixture + def mock_llm_manager(self): + """Create a mock LLM manager.""" + manager = MagicMock(spec=LLMManager) + manager.get_model_name.return_value = "gpt-4o-mini" + manager.get_litellm_params.return_value = { + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512, + "timeout": 300, + "num_retries": 3 + } + return manager + + def test_custom_metrics_initialization(self, mock_llm_manager): + """Test CustomMetrics initialization.""" + metrics = CustomMetrics(mock_llm_manager) + + assert metrics.model_name == "gpt-4o-mini" + assert "answer_correctness" in metrics.supported_metrics + + @patch('lightspeed_evaluation.core.metrics.custom.litellm.completion') + def test_answer_correctness_evaluation(self, mock_completion, mock_llm_manager): + """Test answer correctness evaluation with expected response.""" + # Mock LiteLLM response + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.content = "Score: 0.85\nReason: Response accurately describes Python as a programming language" + mock_completion.return_value = mock_response + + metrics = CustomMetrics(mock_llm_manager) + + turn_data = TurnData( + turn_id=1, + query="What is Python?", + response="Python is a programming language used for web development, data science, and automation.", + contexts=[{"content": "Python is a high-level 
programming language."}], + expected_response="Python is a high-level programming language used for various applications." + ) + + scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + score, reason = metrics.evaluate("answer_correctness", None, scope) + + assert score == 0.85 + assert "Custom answer correctness" in reason + + def test_score_parsing_different_formats(self, mock_llm_manager): + """Test parsing scores in different formats.""" + metrics = CustomMetrics(mock_llm_manager) + + # Test different score formats + test_cases = [ + ("Score: 0.75\nReason: Good", 0.75), + ("8.5/10 - Excellent response", 0.85), + ("Rating: 4 out of 5", 0.8), + ("The score is 90%", 0.9), + ] + + for response_text, expected_score in test_cases: + score, reason = metrics._parse_score_response(response_text) + assert score == expected_score + + def test_unsupported_metric(self, mock_llm_manager): + """Test evaluation of unsupported metric.""" + metrics = CustomMetrics(mock_llm_manager) + + scope = EvaluationScope(is_conversation=False) + score, reason = metrics.evaluate("unsupported_metric", None, scope) + + assert score is None + assert "Unsupported custom metric" in reason + + +class TestRagasMetrics: + """Test Ragas Metrics functionality.""" + + @pytest.fixture + def mock_llm_manager(self): + """Create a mock LLM manager.""" + manager = MagicMock(spec=LLMManager) + manager.get_model_name.return_value = "gpt-4o-mini" + manager.get_litellm_params.return_value = { + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512 + } + return manager + + @patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager') + def test_ragas_metrics_initialization(self, mock_ragas_llm_manager, mock_llm_manager): + """Test RagasMetrics initialization.""" + metrics = RagasMetrics(mock_llm_manager) + + # Verify that RagasLLMManager was called with correct parameters + mock_ragas_llm_manager.assert_called_once_with("gpt-4o-mini", mock_llm_manager.get_litellm_params()) + + assert "faithfulness" in metrics.supported_metrics + assert "response_relevancy" in metrics.supported_metrics + assert "context_recall" in metrics.supported_metrics + + def test_faithfulness_evaluation_with_context(self, mock_llm_manager): + """Test faithfulness evaluation with proper context data.""" + with patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager'): + metrics = RagasMetrics(mock_llm_manager) + + # Mock the _evaluate_metric method directly + with patch.object(metrics, '_evaluate_metric', return_value=(0.92, "Ragas faithfulness: 0.92")): + turn_data = TurnData( + turn_id=1, + query="What are the benefits of renewable energy?", + response="Renewable energy reduces carbon emissions and provides sustainable power generation.", + contexts=[ + {"content": "Renewable energy sources like solar and wind power help reduce greenhouse gas emissions."}, + {"content": "Sustainable energy systems provide long-term environmental benefits."} + ] + ) + + scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + score, reason = metrics.evaluate("faithfulness", None, scope) + + assert score == 0.92 + assert "Ragas faithfulness" in reason + + def test_response_relevancy_evaluation(self, mock_llm_manager): + """Test response relevancy evaluation.""" + with patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager'): + metrics = RagasMetrics(mock_llm_manager) + + with patch.object(metrics, '_evaluate_metric', return_value=(0.88, "Ragas response relevancy: 0.88")): + 
turn_data = TurnData( + turn_id=1, + query="How does machine learning work?", + response="Machine learning uses algorithms to learn patterns from data and make predictions." + ) + + scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + score, reason = metrics.evaluate("response_relevancy", None, scope) + + assert score == 0.88 + assert "response relevancy" in reason + + def test_conversation_level_metric_error(self, mock_llm_manager): + """Test error when using turn-level metric for conversation.""" + with patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager'): + metrics = RagasMetrics(mock_llm_manager) + + scope = EvaluationScope(is_conversation=True) + score, reason = metrics.evaluate("faithfulness", None, scope) + + assert score is None + assert "turn-level metric" in reason + + +class TestDeepEvalMetrics: + """Test DeepEval Metrics functionality.""" + + @pytest.fixture + def mock_llm_manager(self): + """Create a mock LLM manager.""" + manager = MagicMock(spec=LLMManager) + manager.get_model_name.return_value = "gpt-4o-mini" + manager.get_litellm_params.return_value = { + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512 + } + return manager + + @patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager') + def test_deepeval_metrics_initialization(self, mock_deepeval_llm_manager, mock_llm_manager): + """Test DeepEvalMetrics initialization.""" + metrics = DeepEvalMetrics(mock_llm_manager) + + # Verify that DeepEvalLLMManager was called with correct parameters + mock_deepeval_llm_manager.assert_called_once_with("gpt-4o-mini", mock_llm_manager.get_litellm_params()) + + assert "conversation_completeness" in metrics.supported_metrics + assert "conversation_relevancy" in metrics.supported_metrics + assert "knowledge_retention" in metrics.supported_metrics + + @patch('lightspeed_evaluation.core.metrics.deepeval.ConversationCompletenessMetric') + def test_conversation_completeness_evaluation(self, mock_metric_class, mock_llm_manager): + """Test conversation completeness evaluation with multi-turn conversation.""" + # Mock metric instance + mock_metric = MagicMock() + mock_metric.score = 0.82 + mock_metric.reason = "Conversation addresses user needs comprehensively" + mock_metric_class.return_value = mock_metric + + with patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager'): + metrics = DeepEvalMetrics(mock_llm_manager) + + conv_data = EvaluationData( + conversation_group_id="customer_support_conv", + turns=[ + TurnData(turn_id=1, query="I need help with my account", response="I can help you with your account. What specific issue are you experiencing?"), + TurnData(turn_id=2, query="I can't log in", response="Let me help you reset your password. Please check your email for instructions."), + TurnData(turn_id=3, query="I got the email, thanks!", response="Great! 
Is there anything else I can help you with today?") + ] + ) + + scope = EvaluationScope(is_conversation=True) + + score, reason = metrics.evaluate("conversation_completeness", conv_data, scope) + + assert score == 0.82 + assert "comprehensively" in reason + + def test_turn_level_metric_error(self, mock_llm_manager): + """Test error when using conversation-level metric for turn.""" + with patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager'): + metrics = DeepEvalMetrics(mock_llm_manager) + + scope = EvaluationScope(is_conversation=False) + score, reason = metrics.evaluate("conversation_completeness", None, scope) + + assert score is None + assert "conversation-level metric" in reason + + +class TestMetricsIntegration: + """Integration tests for metrics components.""" + + @pytest.mark.integration + @patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) + def test_metrics_manager_integration(self): + """Test integration between different metric types.""" + from lightspeed_evaluation.drivers.evaluation import MetricsManager + + config = LLMConfig(provider="openai", model="gpt-4o-mini") + llm_manager = LLMManager(config) + + metrics_manager = MetricsManager(llm_manager) + + # Verify all handlers are initialized + assert "ragas" in metrics_manager.handlers + assert "deepeval" in metrics_manager.handlers + assert "custom" in metrics_manager.handlers + + # Verify supported frameworks + frameworks = metrics_manager.get_supported_frameworks() + assert "ragas" in frameworks + assert "deepeval" in frameworks + assert "custom" in frameworks + + def test_evaluation_scope_factory_methods(self): + """Test EvaluationScope creation for different scenarios.""" + # Turn-level scope + turn_data = TurnData( + turn_id=1, + query="What is machine learning?", + response="Machine learning is a subset of AI that enables computers to learn from data." 
+ ) + + turn_scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + assert turn_scope.turn_idx == 0 + assert turn_scope.turn_data == turn_data + assert turn_scope.is_conversation is False + + # Conversation-level scope + conv_scope = EvaluationScope(is_conversation=True) + + assert conv_scope.turn_idx is None + assert conv_scope.turn_data is None + assert conv_scope.is_conversation is True + + +class TestRealWorldScenarios: + """Test real-world evaluation scenarios.""" + + @pytest.fixture + def mock_llm_manager(self): + """Create a mock LLM manager.""" + manager = MagicMock(spec=LLMManager) + manager.get_model_name.return_value = "gpt-4o-mini" + manager.get_litellm_params.return_value = { + "model": "gpt-4o-mini", + "temperature": 0.0, + "max_tokens": 512, + "timeout": 300, + "num_retries": 3 + } + return manager + + @patch('lightspeed_evaluation.core.metrics.custom.litellm.completion') + def test_technical_documentation_evaluation(self, mock_completion, mock_llm_manager): + """Test evaluation of technical documentation responses.""" + # Mock LLM response for technical accuracy + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.content = "Score: 0.88\nReason: Response provides accurate technical information about Kubernetes with proper context and examples" + mock_completion.return_value = mock_response + + metrics = CustomMetrics(mock_llm_manager) + + turn_data = TurnData( + turn_id=1, + query="How do I deploy a microservice using Kubernetes?", + response="To deploy a microservice using Kubernetes, you need to create a Deployment manifest that specifies the container image, replicas, and resource requirements. Then use kubectl apply to deploy it to your cluster. You'll also need a Service to expose the microservice to other components.", + contexts=[ + {"content": "Kubernetes deployments manage the lifecycle of containerized applications and ensure desired state."}, + {"content": "Services in Kubernetes provide stable network endpoints for accessing pods."} + ], + expected_response="Deploy microservices in Kubernetes by creating Deployment and Service manifests, then applying them with kubectl." + ) + + scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + score, reason = metrics.evaluate("answer_correctness", None, scope) + + assert score == 0.88 + assert "technical information" in reason + + def test_customer_support_conversation_evaluation(self, mock_llm_manager): + """Test evaluation of customer support conversation completeness.""" + with patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager'): + with patch('lightspeed_evaluation.core.metrics.deepeval.ConversationCompletenessMetric') as mock_metric_class: + # Mock metric for customer support scenario + mock_metric = MagicMock() + mock_metric.score = 0.91 + mock_metric.reason = "Conversation fully addresses customer issue with clear resolution steps" + mock_metric_class.return_value = mock_metric + + metrics = DeepEvalMetrics(mock_llm_manager) + + conv_data = EvaluationData( + conversation_group_id="customer_billing_issue", + turns=[ + TurnData(turn_id=1, query="I was charged twice for my subscription", + response="I understand your concern about the duplicate charge. Let me look into your account to investigate this billing issue."), + TurnData(turn_id=2, query="When will this be resolved?", + response="I can see the duplicate charge in your account. 
I'm processing a refund right now, which should appear in 3-5 business days."), + TurnData(turn_id=3, query="Thank you for the help", + response="You're welcome! I've sent you a confirmation email with the refund details. Is there anything else I can help you with today?") + ] + ) + + scope = EvaluationScope(is_conversation=True) + + score, reason = metrics.evaluate("conversation_completeness", conv_data, scope) + + assert score == 0.91 + assert "fully addresses" in reason + + def test_code_explanation_faithfulness(self, mock_llm_manager): + """Test faithfulness evaluation for code explanation scenarios.""" + with patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager'): + metrics = RagasMetrics(mock_llm_manager) + + with patch.object(metrics, '_evaluate_metric', return_value=(0.94, "Ragas faithfulness: 0.94")): + turn_data = TurnData( + turn_id=1, + query="Explain what this Python function does: def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)", + response="This is a recursive function that calculates the nth Fibonacci number. It uses the base case where if n is 0 or 1, it returns n directly. Otherwise, it recursively calls itself with n-1 and n-2 and adds the results together.", + contexts=[ + {"content": "The Fibonacci sequence is defined as F(0)=0, F(1)=1, and F(n)=F(n-1)+F(n-2) for n>1."}, + {"content": "Recursive functions call themselves with modified parameters until reaching a base case."} + ] + ) + + scope = EvaluationScope( + turn_idx=0, + turn_data=turn_data, + is_conversation=False + ) + + score, reason = metrics.evaluate("faithfulness", None, scope) + + assert score == 0.94 + assert "faithfulness" in reason + + @patch('lightspeed_evaluation.core.metrics.custom.litellm.completion') + def test_multilingual_content_evaluation(self, mock_completion, mock_llm_manager): + """Test evaluation of responses involving multilingual content.""" + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.content = "Score: 0.82\nReason: Response correctly explains the concept in English while acknowledging the Spanish term" + mock_completion.return_value = mock_response + + metrics = CustomMetrics(mock_llm_manager) + + turn_data = TurnData( + turn_id=1, + query="What does 'inteligencia artificial' mean and how is it used in technology?", + response="'Inteligencia artificial' is Spanish for 'artificial intelligence'. It refers to computer systems that can perform tasks typically requiring human intelligence, such as learning, reasoning, and problem-solving. It's widely used in technology for applications like machine learning, natural language processing, and computer vision.", + contexts=[ + {"content": "Artificial intelligence (AI) encompasses machine learning, neural networks, and automated decision-making systems."} + ], + expected_response="Inteligencia artificial means artificial intelligence in Spanish, referring to computer systems that simulate human intelligence for various technological applications." 
+        )
+
+        scope = EvaluationScope(
+            turn_idx=0,
+            turn_data=turn_data,
+            is_conversation=False
+        )
+
+        score, reason = metrics.evaluate("answer_correctness", None, scope)
+
+        assert score == 0.82
+        assert "Spanish term" in reason
+
+    def test_complex_multi_turn_technical_conversation(self, mock_llm_manager):
+        """Test evaluation of complex multi-turn technical conversations."""
+        with patch('lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager'):
+            with patch('lightspeed_evaluation.core.metrics.deepeval.KnowledgeRetentionMetric') as mock_metric_class:
+                mock_metric = MagicMock()
+                mock_metric.score = 0.87
+                mock_metric.reason = "Good knowledge retention across technical discussion about Docker and Kubernetes"
+                mock_metric_class.return_value = mock_metric
+
+                metrics = DeepEvalMetrics(mock_llm_manager)
+
+                conv_data = EvaluationData(
+                    conversation_group_id="docker_kubernetes_discussion",
+                    turns=[
+                        TurnData(turn_id=1, query="What's the difference between Docker and Kubernetes?",
+                                 response="Docker is a containerization platform that packages applications, while Kubernetes is an orchestration system that manages Docker containers at scale."),
+                        TurnData(turn_id=2, query="How do they work together in a microservices architecture?",
+                                 response="In microservices, Docker containers package individual services, and Kubernetes orchestrates these containers, handling deployment, scaling, and service discovery across the cluster."),
+                        TurnData(turn_id=3, query="What about the networking between these Docker containers you mentioned?",
+                                 response="Kubernetes provides networking through Services and Ingress controllers. Each Docker container gets an IP address, and Services create stable endpoints for communication between the containerized microservices.")
+                    ]
+                )
+
+                scope = EvaluationScope(is_conversation=True)
+
+                score, reason = metrics.evaluate("knowledge_retention", conv_data, scope)
+
+                assert score == 0.87
+                assert "knowledge retention" in reason
+
+    def test_evaluation_with_incomplete_responses(self, mock_llm_manager):
+        """Test evaluation of incomplete or partial responses."""
+        with patch('lightspeed_evaluation.core.metrics.ragas.RagasLLMManager'):
+            metrics = RagasMetrics(mock_llm_manager)
+
+            with patch.object(metrics, '_evaluate_metric', return_value=(0.34, "Ragas response relevancy: 0.34")):
+                turn_data = TurnData(
+                    turn_id=1,
+                    query="Explain the complete process of photosynthesis including light and dark reactions",
+                    response="Photosynthesis uses sunlight."  # Incomplete response
+                )
+
+                scope = EvaluationScope(
+                    turn_idx=0,
+                    turn_data=turn_data,
+                    is_conversation=False
+                )
+
+                score, reason = metrics.evaluate("response_relevancy", None, scope)
+
+                assert score == 0.34  # Low score for incomplete response
+                assert "response relevancy" in reason
+
+    @patch('lightspeed_evaluation.core.metrics.custom.litellm.completion')
+    def test_evaluation_with_edge_case_scoring(self, mock_completion, mock_llm_manager):
+        """Test evaluation with edge case scoring scenarios."""
+        # Test different score formats that might come from LLM
+        test_cases = [
+            ("Perfect score: 1.0\nReason: Excellent", 1.0),
+            ("Score: 0\nReason: Completely incorrect", 0.0),
+            ("Rating: 7.5 out of 10", 0.75),
+            ("85% accuracy", 0.85),
+            ("Score: 0.999", 0.999),
+        ]
+
+        metrics = CustomMetrics(mock_llm_manager)
+
+        for response_text, expected_score in test_cases:
+            mock_response = MagicMock()
+            mock_response.choices = [MagicMock()]
+            mock_response.choices[0].message.content = response_text
+            mock_completion.return_value = mock_response
+
+            turn_data = TurnData(
+                turn_id=1,
+                query="Test query",
+                response="Test response",
+                expected_response="Expected response"
+            )
+
+            scope = EvaluationScope(
+                turn_idx=0,
+                turn_data=turn_data,
+                is_conversation=False
+            )
+
+            score, reason = metrics.evaluate("answer_correctness", None, scope)
+
+            assert score == expected_score, f"Failed for response: {response_text}"