1 change: 1 addition & 0 deletions src/lightspeed_evaluation/core/constants.py
@@ -19,6 +19,7 @@

DEFAULT_OUTPUT_DIR = "./eval_output"
DEFAULT_BASE_FILENAME = "evaluation"
MAX_RUN_NAME_LENGTH = 100

SUPPORTED_OUTPUT_TYPES = ["csv", "json", "txt"]
SUPPORTED_CSV_COLUMNS = [
10 changes: 9 additions & 1 deletion src/lightspeed_evaluation/core/models/data.py
@@ -6,7 +6,10 @@

from pydantic import BaseModel, ConfigDict, Field, field_validator

from lightspeed_evaluation.core.constants import SUPPORTED_RESULT_STATUSES
from lightspeed_evaluation.core.constants import (
MAX_RUN_NAME_LENGTH,
SUPPORTED_RESULT_STATUSES,
)

logger = logging.getLogger(__name__)

@@ -148,6 +151,11 @@ class EvaluationData(BaseModel):
min_length=1,
description="Optional description of the conversation group",
)
run_name: Optional[str] = Field(
default=None,
max_length=MAX_RUN_NAME_LENGTH,
description=f"Optional name for this evaluation run (max {MAX_RUN_NAME_LENGTH} chars)",
)

# Conversation-level metrics
conversation_metrics: Optional[list[str]] = Field(
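As a quick illustration of what the new constraint enforces, here is a hedged sketch against a stripped-down stand-in model (the real `EvaluationData` has many more required fields; `_RunNameDemo` exists only for this example):

```python
# Minimal sketch of the run_name length constraint added to EvaluationData.
# _RunNameDemo is illustrative only; it mirrors just the new field.
from typing import Optional

from pydantic import BaseModel, Field, ValidationError

MAX_RUN_NAME_LENGTH = 100  # mirrors core.constants


class _RunNameDemo(BaseModel):
    run_name: Optional[str] = Field(default=None, max_length=MAX_RUN_NAME_LENGTH)


print(_RunNameDemo(run_name="rh124_filesystem_basics").run_name)  # accepted
try:
    _RunNameDemo(run_name="x" * (MAX_RUN_NAME_LENGTH + 1))
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # "string_too_long" in pydantic v2
```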
19 changes: 17 additions & 2 deletions src/lightspeed_evaluation/core/output/generator.py
@@ -28,11 +28,20 @@ def __init__(
output_dir: str = DEFAULT_OUTPUT_DIR,
base_filename: str = "evaluation",
system_config: Optional[Any] = None,
run_name: Optional[str] = None,
) -> None:
"""Initialize Output handler."""
"""Initialize Output handler.

Args:
output_dir: Directory for output files
base_filename: Base name for output files
system_config: System configuration
run_name: Optional run name to prepend to filenames
"""
self.output_dir = Path(output_dir)
self.base_filename = base_filename
self.system_config = system_config
self.run_name = run_name
self.output_dir.mkdir(parents=True, exist_ok=True)

print(f"✅ Output handler initialized: {self.output_dir}")
@@ -41,7 +50,10 @@ def generate_reports(self, results: list[EvaluationResult]) -> None:
"""Generate all output reports based on configuration."""
# Prepare timestamped base filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base_filename = f"{self.base_filename}_{timestamp}"
if self.run_name:
base_filename = f"{self.run_name}_{self.base_filename}_{timestamp}"
else:
base_filename = f"{self.base_filename}_{timestamp}"

# Get enabled outputs from system config
enabled_outputs = (
@@ -180,6 +192,7 @@ def _generate_json_summary(

summary = {
"timestamp": datetime.now().isoformat(),
"run_name": self.run_name,
"total_evaluations": len(results),
"summary_stats": {
"overall": basic_stats,
@@ -226,6 +239,8 @@ def _generate_text_summary(
f.write("=" * 50 + "\n\n")

f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
if self.run_name:
f.write(f"Run Name: {self.run_name}\n")
f.write(f"Total Evaluations: {len(results)}\n\n")

# Overall statistics
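For reference, the filename prefixing these hunks introduce composes as follows (a minimal standalone sketch; the run name shown is an assumed, already-sanitized value):

```python
# Sketch of the timestamped base filename built in generate_reports().
from datetime import datetime

run_name = "rh124_filesystem_basics"   # assumed, already sanitized
base_filename = "evaluation"           # DEFAULT_BASE_FILENAME
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

if run_name:
    base = f"{run_name}_{base_filename}_{timestamp}"
else:
    base = f"{base_filename}_{timestamp}"

# e.g. "rh124_filesystem_basics_evaluation_20250101_093000"; the .csv/.json/.txt
# extensions are appended per enabled output type.
print(base)
```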
21 changes: 19 additions & 2 deletions src/lightspeed_evaluation/core/system/validator.py
@@ -13,6 +13,7 @@
CONVERSATION_LEVEL_METRICS,
TURN_LEVEL_METRICS,
)
from lightspeed_evaluation.core.utils import sanitize_run_name

# Metric requirements mapping
METRIC_REQUIREMENTS = {
@@ -82,8 +83,15 @@ def __init__(self, api_enabled: bool = False) -> None:
self.api_enabled = api_enabled
self.original_data_path: Optional[str] = None

def load_evaluation_data(self, data_path: str) -> list[EvaluationData]:
"""Load and validate evaluation data from YAML file."""
def load_evaluation_data(
self, data_path: str, run_name_override: Optional[str] = None
) -> list[EvaluationData]:
"""Load and validate evaluation data from YAML file.

Args:
data_path: Path to evaluation data YAML file
run_name_override: Optional run name to override YAML/filename default
"""
self.original_data_path = data_path

try:
@@ -108,6 +116,15 @@ def load_evaluation_data(self, data_path: str) -> list[EvaluationData]:
evaluation_data = []
for i, data_dict in enumerate(raw_data):
try:
# Set run_name with priority: CLI override > YAML value > filename
if run_name_override is not None:
# CLI override takes highest priority
data_dict["run_name"] = sanitize_run_name(run_name_override)
elif "run_name" not in data_dict or data_dict["run_name"] is None:
# Default to YAML filename if not provided
yaml_filename = Path(data_path).stem
data_dict["run_name"] = sanitize_run_name(yaml_filename)

Review comment (Contributor) on lines +119 to +127:

⚠️ Potential issue | 🟠 Major

Sanitize YAML-provided run_name values.

The current logic sanitizes the CLI override and filename fallback but doesn't sanitize run_name values provided in the YAML file. This could allow filesystem-unsafe characters to pass through validation.

Apply this diff to ensure all run_name values are sanitized:

                # Set run_name with priority: CLI override > YAML value > filename
                if run_name_override is not None:
                    # CLI override takes highest priority
                    data_dict["run_name"] = sanitize_run_name(run_name_override)
                elif "run_name" not in data_dict or data_dict["run_name"] is None:
                    # Default to YAML filename if not provided
                    yaml_filename = Path(data_path).stem
                    data_dict["run_name"] = sanitize_run_name(yaml_filename)
+               else:
+                   # Sanitize YAML-provided run_name
+                   data_dict["run_name"] = sanitize_run_name(data_dict["run_name"])

eval_data = EvaluationData(**data_dict)
evaluation_data.append(eval_data)
except ValidationError as e:
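Putting the hunk above and the reviewer's point together, the resolution order reads as "CLI override > YAML value > filename". The following hedged sketch spells it out, including sanitizing YAML-provided values; `_resolve_run_name` is a hypothetical helper, not part of this PR:

```python
# Illustrative helper for the "CLI override > YAML value > filename" priority,
# including the reviewer's suggestion to sanitize YAML-provided values too.
from pathlib import Path
from typing import Optional

from lightspeed_evaluation.core.utils import sanitize_run_name


def _resolve_run_name(
    yaml_value: Optional[str], data_path: str, override: Optional[str]
) -> str:
    if override is not None:
        return sanitize_run_name(override)  # CLI flag wins
    if yaml_value is not None:
        return sanitize_run_name(yaml_value)  # sanitize YAML values as well
    return sanitize_run_name(Path(data_path).stem)  # fall back to the filename


assert _resolve_run_name(None, "config/rh124 data.yaml", None) == "rh124_data"
assert _resolve_run_name("Module 1: Intro", "data.yaml", None) == "Module_1_Intro"
assert _resolve_run_name("ignored", "data.yaml", "cli/run") == "cli_run"
```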
5 changes: 5 additions & 0 deletions src/lightspeed_evaluation/core/utils/__init__.py
@@ -0,0 +1,5 @@
"""Core utility functions."""

from lightspeed_evaluation.core.utils.sanitize import sanitize_run_name

__all__ = ["sanitize_run_name"]
49 changes: 49 additions & 0 deletions src/lightspeed_evaluation/core/utils/sanitize.py
@@ -0,0 +1,49 @@
"""Utility functions for sanitizing user input."""

import re

from lightspeed_evaluation.core.constants import MAX_RUN_NAME_LENGTH


def sanitize_run_name(run_name: str) -> str:
Review comment (Contributor): Consider pathvalidate; the advantage would be that we wouldn't need to maintain and test this function ourselves. (A sketch of that approach follows after this file's diff.)

"""Sanitize run name for safe filesystem usage.
Replaces filesystem-unsafe characters with underscores, collapses
multiple spaces/underscores, and enforces max length.
Args:
run_name: Raw run name string to sanitize
Returns:
Sanitized run name safe for filesystem usage. Returns empty string
if input is empty or becomes empty after sanitization.
Examples:
>>> sanitize_run_name("test/run:123")
'test_run_123'
>>> sanitize_run_name(" multiple spaces ")
'multiple_spaces'
>>> sanitize_run_name("rh124: filesystem basics")
'rh124_filesystem_basics'
"""
if not run_name:
return ""

# Strip leading/trailing whitespace
sanitized = run_name.strip()

# Replace invalid filesystem characters with underscores
# Invalid chars: / \ : * ? " ' ` < > | and control characters (0x00-0x1f)
sanitized = re.sub(r'[/\\:*?"\'`<>|\x00-\x1f]', "_", sanitized)

# Replace multiple spaces/underscores with single underscore
sanitized = re.sub(r"[\s_]+", "_", sanitized)

# Strip leading/trailing underscores that may have been created
sanitized = sanitized.strip("_")

# Enforce max length, strip trailing underscores if truncated
if len(sanitized) > MAX_RUN_NAME_LENGTH:
sanitized = sanitized[:MAX_RUN_NAME_LENGTH].rstrip("_")

return sanitized
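As flagged in the inline comment above, pathvalidate could replace the hand-rolled regex logic. A rough sketch, assuming pathvalidate's `sanitize_filename` accepts the `replacement_text`, `max_len`, and `platform` keyword arguments in the installed version:

```python
# Hedged sketch of a pathvalidate-based sanitize_run_name; not part of this PR.
# Assumes pathvalidate.sanitize_filename supports the replacement_text, max_len,
# and platform keyword arguments -- verify against the installed version.
import re

from pathvalidate import sanitize_filename

from lightspeed_evaluation.core.constants import MAX_RUN_NAME_LENGTH


def sanitize_run_name(run_name: str) -> str:
    """Delegate unsafe-character handling to pathvalidate."""
    if not run_name:
        return ""
    sanitized = sanitize_filename(
        run_name.strip(),
        replacement_text="_",
        max_len=MAX_RUN_NAME_LENGTH,
        platform="universal",
    )
    # Keep the project's extra rules: collapse space/underscore runs and trim
    # leading/trailing underscores.
    return re.sub(r"[\s_]+", "_", sanitized).strip("_")
```

The trade-off is less custom code to maintain versus slightly different edge-case behaviour (e.g. quotes and backticks), which the new unit tests would need to reflect.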
24 changes: 21 additions & 3 deletions src/lightspeed_evaluation/runner/evaluation.py
@@ -6,18 +6,23 @@
from typing import Optional

# Import only lightweight modules at top level
from lightspeed_evaluation.core.constants import MAX_RUN_NAME_LENGTH
from lightspeed_evaluation.core.system import ConfigLoader


def run_evaluation( # pylint: disable=too-many-locals
system_config_path: str, evaluation_data_path: str, output_dir: Optional[str] = None
system_config_path: str,
evaluation_data_path: str,
output_dir: Optional[str] = None,
run_name: Optional[str] = None,
) -> Optional[dict[str, int]]:
"""Run the complete evaluation pipeline using EvaluationPipeline.

Args:
system_config_path: Path to system.yaml
evaluation_data_path: Path to evaluation_data.yaml
output_dir: Optional override for output directory
run_name: Optional name for evaluation run (overrides YAML/filename default)

Returns:
dict: Summary statistics with keys TOTAL, PASS, FAIL, ERROR
@@ -49,7 +54,9 @@ def run_evaluation( # pylint: disable=too-many-locals

# Step 2: Load and validate evaluation data
data_validator = DataValidator(api_enabled=system_config.api.enabled)
evaluation_data = data_validator.load_evaluation_data(evaluation_data_path)
evaluation_data = data_validator.load_evaluation_data(
evaluation_data_path, run_name_override=run_name
)

print(f"✅ System config: {llm_config.provider}/{llm_config.model}")
print(f"✅ Evaluation data: {len(evaluation_data)} conversation groups")
@@ -66,10 +73,15 @@

# Step 4: Generate reports and calculate stats
print("\n📊 Generating Reports...")
# Extract run_name from first conversation (all should have same run_name)
run_name_for_output = (
evaluation_data[0].run_name if evaluation_data else None
)
output_handler = OutputHandler(
output_dir=output_dir or output_config.output_dir,
base_filename=output_config.base_filename,
system_config=system_config,
run_name=run_name_for_output,
)

# Generate reports based on configuration
@@ -118,10 +130,16 @@ def main() -> int:
help="Path to evaluation data file (default: config/evaluation_data.yaml)",
)
parser.add_argument("--output-dir", help="Override output directory (optional)")
parser.add_argument(
"--run-name",
help=f"Name for this evaluation run (overrides YAML value, max {MAX_RUN_NAME_LENGTH} chars)",
)

args = parser.parse_args()

summary = run_evaluation(args.system_config, args.eval_data, args.output_dir)
summary = run_evaluation(
args.system_config, args.eval_data, args.output_dir, args.run_name
)

return 0 if summary is not None else 1

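A programmatic equivalent of the new flag, for orientation (paths are placeholders; the system config path is an assumption, since its argparse default isn't shown in this diff):

```python
# Drives the same pipeline the CLI's --run-name flag feeds into.
from lightspeed_evaluation.runner.evaluation import run_evaluation

summary = run_evaluation(
    system_config_path="config/system.yaml",         # assumed default path
    evaluation_data_path="config/evaluation_data.yaml",
    output_dir=None,                                  # keep the configured output dir
    run_name="RH124: filesystem basics",              # sanitized to "RH124_filesystem_basics"
)
print(summary)  # e.g. {"TOTAL": ..., "PASS": ..., "FAIL": ..., "ERROR": ...}
```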
114 changes: 114 additions & 0 deletions tests/unit/core/test_sanitize.py
@@ -0,0 +1,114 @@
"""Tests for sanitization utilities."""

import pytest

from lightspeed_evaluation.core.constants import MAX_RUN_NAME_LENGTH
from lightspeed_evaluation.core.utils import sanitize_run_name


class TestSanitizeRunName:
"""Test cases for sanitize_run_name function."""

def test_basic_alphanumeric(self):
"""Test that basic alphanumeric strings pass through unchanged."""
assert sanitize_run_name("test123") == "test123"
assert sanitize_run_name("rh124_filesystem_basics") == "rh124_filesystem_basics"

def test_empty_string(self):
"""Test that empty string returns empty string."""
assert sanitize_run_name("") == ""

def test_whitespace_trimming(self):
"""Test that leading/trailing whitespace is removed."""
assert sanitize_run_name(" test ") == "test"
assert sanitize_run_name("\ttest\n") == "test"

def test_filesystem_unsafe_characters(self):
"""Test that filesystem-unsafe characters are replaced with underscores."""
assert sanitize_run_name("test/run") == "test_run"
assert sanitize_run_name("test\\run") == "test_run"
assert sanitize_run_name("test:run") == "test_run"
assert sanitize_run_name("test*run") == "test_run"
assert sanitize_run_name("test?run") == "test_run"
assert sanitize_run_name('test"run') == "test_run"
assert sanitize_run_name("test'run") == "test_run"
assert sanitize_run_name("test`run") == "test_run"
assert sanitize_run_name("test<run") == "test_run"
assert sanitize_run_name("test>run") == "test_run"
assert sanitize_run_name("test|run") == "test_run"

def test_multiple_special_characters(self):
"""Test strings with multiple special characters."""
assert sanitize_run_name("test/run:123") == "test_run_123"
assert sanitize_run_name("rh124: filesystem basics") == "rh124_filesystem_basics"
assert sanitize_run_name("test's `command`") == "test_s_command"

def test_space_collapsing(self):
"""Test that multiple spaces are collapsed to single underscore."""
assert sanitize_run_name("multiple spaces") == "multiple_spaces"
assert sanitize_run_name("test run") == "test_run"

def test_underscore_collapsing(self):
"""Test that multiple underscores are collapsed to single underscore."""
assert sanitize_run_name("test___run") == "test_run"
assert sanitize_run_name("test_____run") == "test_run"

def test_mixed_whitespace_underscore_collapsing(self):
"""Test that mixed spaces and underscores collapse properly."""
assert sanitize_run_name("test _ _ run") == "test_run"
assert sanitize_run_name("test _ run") == "test_run"

def test_leading_trailing_underscores_stripped(self):
"""Test that leading/trailing underscores created during sanitization are removed."""
assert sanitize_run_name("/test/") == "test"
assert sanitize_run_name(":test:") == "test"
assert sanitize_run_name("_test_") == "test"

def test_max_length_enforcement(self):
"""Test that strings exceeding max length are truncated."""
long_string = "a" * (MAX_RUN_NAME_LENGTH + 50)
result = sanitize_run_name(long_string)
assert len(result) <= MAX_RUN_NAME_LENGTH
assert result == "a" * MAX_RUN_NAME_LENGTH

def test_max_length_with_trailing_underscores(self):
"""Test that truncation removes trailing underscores."""
# Create a string that when truncated would end with underscore
long_string = "a" * (MAX_RUN_NAME_LENGTH - 1) + "_" + "b" * 50
result = sanitize_run_name(long_string)
assert len(result) <= MAX_RUN_NAME_LENGTH
assert not result.endswith("_")

def test_control_characters(self):
"""Test that control characters are replaced."""
assert sanitize_run_name("test\x00run") == "test_run"
assert sanitize_run_name("test\x1frun") == "test_run"

def test_unicode_characters_preserved(self):
"""Test that Unicode characters (emojis, kanji, etc.) are preserved."""
# Emojis
assert sanitize_run_name("test🚀run") == "test🚀run"
assert sanitize_run_name("📊evaluation") == "📊evaluation"

# Japanese kanji
assert sanitize_run_name("テスト実行") == "テスト実行"
assert sanitize_run_name("test_日本語_run") == "test_日本語_run"

# Chinese characters
assert sanitize_run_name("测试运行") == "测试运行"

# Mix of Unicode and ASCII
assert sanitize_run_name("test_🎯_goal") == "test_🎯_goal"

def test_unicode_with_unsafe_characters(self):
"""Test Unicode strings with filesystem-unsafe characters."""
assert sanitize_run_name("テスト/実行") == "テスト_実行"
assert sanitize_run_name("test🚀:run") == "test🚀_run"
assert sanitize_run_name("評価 💯 test") == "評価_💯_test"

def test_real_world_yaml_filenames(self):
"""Test realistic YAML filename scenarios."""
assert sanitize_run_name("rh124_lesson_01") == "rh124_lesson_01"
assert sanitize_run_name("filesystem-basics") == "filesystem-basics"
assert sanitize_run_name("Module 1: Introduction") == "Module_1_Introduction"
assert sanitize_run_name("test (copy)") == "test_(copy)" # Parentheses are valid