README.md (3 additions, 0 deletions)
@@ -110,6 +110,9 @@ core:
# 50 is OK on a typical laptop. Check your Judge-LLM service for max requests per minute
max_threads: 50

# If false, don't fail on invalid conversations (e.g. missing context for some metrics)
fail_on_invalid_data: true

# Judge-LLM Configuration
llm:
provider: openai # openai, watsonx, azure, gemini etc.
config/system.yaml (2 additions, 1 deletion)
@@ -3,6 +3,7 @@
# Core evaluation parameters
core:
max_threads: 50 # Maximum number of threads, set to null for Python default. 50 is OK for bigger datasets
fail_on_invalid_data: true # If false, don't fail on invalid conversations (e.g. missing context for some metrics)

# LLM as a judge configuration
llm:
@@ -28,7 +29,7 @@ embedding:
# To get real time data. Currently it supports lightspeed-stack API.
# But can be easily integrated with other APIs with minimal change.
api:
enabled: true # Enable API calls instead of using pre-filled data
api_base: http://localhost:8080 # Base API URL
endpoint_type: streaming # Use "streaming" or "query" endpoint
timeout: 300 # API request timeout in seconds
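For orientation, a minimal sketch of how the new `core.fail_on_invalid_data` key can be read from `system.yaml`. This assumes PyYAML and a hand-rolled lookup; in the repo itself the key is parsed into `CoreConfig` (see `models/system.py` below).

```python
# Minimal sketch, assuming PyYAML; the repo's own loader parses
# system.yaml into SystemConfig/CoreConfig instead of doing this by hand.
import yaml

with open("config/system.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Default to True so that omitting the key keeps the fail-fast behavior.
fail_on_invalid_data = config.get("core", {}).get("fail_on_invalid_data", True)
print(f"fail_on_invalid_data={fail_on_invalid_data}")
```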
src/lightspeed_evaluation/core/models/data.py (22 additions, 0 deletions)
@@ -85,6 +85,17 @@ class TurnData(BaseModel):
default=None, description="Path to verify script for script-based evaluation"
)

# Turn metrics that failed validation, tracked so they can be skipped later
_invalid_metrics: set[str] = set()

def add_invalid_metric(self, metric: str) -> None:
"""Add metric to the invalid turn metrics."""
self._invalid_metrics.add(metric)

def is_metric_invalid(self, metric: str) -> bool:
"""Returns True if the metric didn't pass the validation."""
return metric in self._invalid_metrics

@field_validator("turn_metrics")
@classmethod
def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]:
@@ -334,6 +345,17 @@ class EvaluationData(BaseModel):
description="Path to cleanup script to run after conversation ends",
)

# Conversation metrics that failed validation, tracked so they can be skipped later
_invalid_metrics: set[str] = set()

def add_invalid_metric(self, metric: str) -> None:
"""Add metric to the invalid turn metrics."""
self._invalid_metrics.add(metric)

def is_metric_invalid(self, metric: str) -> bool:
"""Returns True if the metric didn't pass the validation."""
return metric in self._invalid_metrics

@field_validator("conversation_metrics")
@classmethod
def validate_conversation_metrics(
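The same `_invalid_metrics` bookkeeping is added to both `TurnData` and `EvaluationData`. A simplified standalone sketch of the pattern; the real classes are Pydantic models with many more fields, and the metric names below are hypothetical:

```python
# Standalone sketch of the invalid-metric bookkeeping. On the real Pydantic
# models the leading underscore makes _invalid_metrics a private attribute,
# so it is excluded from validation and serialization.
class MetricTracker:
    def __init__(self) -> None:
        self._invalid_metrics: set[str] = set()

    def add_invalid_metric(self, metric: str) -> None:
        """Record a metric that failed validation."""
        self._invalid_metrics.add(metric)

    def is_metric_invalid(self, metric: str) -> bool:
        """Return True if the metric failed validation."""
        return metric in self._invalid_metrics


turn = MetricTracker()
turn.add_invalid_metric("ragas:answer_relevancy")  # hypothetical metric name
assert turn.is_metric_invalid("ragas:answer_relevancy")
assert not turn.is_metric_invalid("ragas:faithfulness")
```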
src/lightspeed_evaluation/core/models/system.py (4 additions, 0 deletions)
@@ -262,6 +262,10 @@ class CoreConfig(BaseModel):
description="Maximum threads for multithreading eval",
gt=0,
)
fail_on_invalid_data: bool = Field(
default=True,
description="If False don't fail on invalid conversations",
)


class SystemConfig(BaseModel):
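Because the field defaults to `True`, existing configs that omit the key keep the old fail-fast behavior. A trimmed sketch, with the other `CoreConfig` fields omitted:

```python
# Trimmed sketch of CoreConfig showing only the new field and its default.
from pydantic import BaseModel, Field


class CoreConfig(BaseModel):
    fail_on_invalid_data: bool = Field(
        default=True,
        description="If False, don't fail on invalid conversations",
    )


assert CoreConfig().fail_on_invalid_data is True
assert CoreConfig(fail_on_invalid_data=False).fail_on_invalid_data is False
```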
src/lightspeed_evaluation/core/system/validator.py (18 additions, 2 deletions)
@@ -79,12 +79,15 @@ def format_pydantic_error(error: ValidationError) -> str:
class DataValidator:
"""Data validator for evaluation data."""

def __init__(self, api_enabled: bool = False) -> None:
def __init__(
self, api_enabled: bool = False, fail_on_invalid_data: bool = True
) -> None:
"""Initialize validator."""
self.validation_errors: list[str] = []
self.evaluation_data: Optional[list[EvaluationData]] = None
self.api_enabled = api_enabled
self.original_data_path: Optional[str] = None
self.fail_on_invalid_data = fail_on_invalid_data

def load_evaluation_data(self, data_path: str) -> list[EvaluationData]:
"""Load and validate evaluation data from YAML file."""
@@ -152,7 +155,12 @@ def validate_evaluation_data(self, evaluation_data: list[EvaluationData]) -> bool:
print("❌ Validation Errors:")
for error in self.validation_errors:
print(f" • {error}")
return False

if self.fail_on_invalid_data:
return False

print("❌ Validation Errors!, ignoring as instructed")
return True

validation_msg = "✅ All data validation passed"
if self.api_enabled:
@@ -169,6 +177,7 @@ def _validate_metrics_availability(self, data: EvaluationData) -> None:
if turn_data.turn_metrics:
for metric in turn_data.turn_metrics:
if metric not in TURN_LEVEL_METRICS:
turn_data.add_invalid_metric(metric)
self.validation_errors.append(
f"Conversation {conversation_id}, Turn {turn_data.turn_id}: "
f"Unknown turn metric '{metric}'"
@@ -178,6 +187,7 @@ def _validate_metric_requirements(self, data: EvaluationData) -> None:
if data.conversation_metrics:
for metric in data.conversation_metrics:
if metric not in CONVERSATION_LEVEL_METRICS:
data.add_invalid_metric(metric)
self.validation_errors.append(
f"Conversation {conversation_id}: Unknown conversation metric '{metric}'"
)
@@ -188,6 +198,10 @@ def _validate_metric_requirements(self, data: EvaluationData) -> None:

field_errors = self._check_metric_requirements(data, self.api_enabled)

# No errors
if not field_errors:
return

# Add conversation group ID prefix to errors
for error in field_errors:
self.validation_errors.append(
@@ -237,6 +251,8 @@ def _check_metric_requirements(
or (isinstance(field_value, str) and not field_value.strip())
or (isinstance(field_value, list) and not field_value)
):
turn_data.add_invalid_metric(metric)

api_context = (
" when API is disabled"
if field_name in api_populated_fields and not api_enabled
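The net effect on `validate_evaluation_data`: errors are always printed, but the return value now depends on the flag. A condensed sketch of that control flow, not the full method:

```python
# Condensed sketch of the new control flow; the real method also records
# which metrics are invalid and prints a success message on the clean path.
def validate(validation_errors: list[str], fail_on_invalid_data: bool) -> bool:
    if validation_errors:
        print("❌ Validation Errors:")
        for error in validation_errors:
            print(f"  • {error}")
        if fail_on_invalid_data:
            return False
        print("❌ Validation errors found, ignoring as instructed")
    return True


assert validate([], fail_on_invalid_data=True)
assert not validate(["Unknown turn metric 'x'"], fail_on_invalid_data=True)
assert validate(["Unknown turn metric 'x'"], fail_on_invalid_data=False)
```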
src/lightspeed_evaluation/pipeline/evaluation/pipeline.py (4 additions, 1 deletion)
@@ -62,7 +62,10 @@ def _initialize_components(self) -> None:
raise ValueError(
"SystemConfig must be loaded before initializing components"
)
self.data_validator = DataValidator(api_enabled=config.api.enabled)
self.data_validator = DataValidator(
api_enabled=config.api.enabled,
fail_on_invalid_data=config.core.fail_on_invalid_data,
)

# Metric manager
metric_manager = MetricManager(config)
src/lightspeed_evaluation/pipeline/evaluation/processor.py (14 additions, 0 deletions)
@@ -189,6 +189,13 @@ def _evaluate_turn(
results = []

for metric_identifier in turn_metrics:
if turn_data.is_metric_invalid(metric_identifier):
logger.error(
"Invalid turn metric '%s', check Validation Errors",
metric_identifier,
)
continue

request = EvaluationRequest.for_turn(
conv_data, metric_identifier, turn_idx, turn_data
)
@@ -204,6 +211,13 @@ def _evaluate_conversation(
results = []

for metric_identifier in conversation_metrics:
if conv_data.is_metric_invalid(metric_identifier):
logger.error(
"Invalid conversation metric '%s', check Validation Errors",
metric_identifier,
)
continue

request = EvaluationRequest.for_conversation(conv_data, metric_identifier)
result = self.components.metrics_evaluator.evaluate_metric(request)
if result:
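`_evaluate_turn` and `_evaluate_conversation` now apply the same guard, so metrics flagged during validation are logged and skipped instead of being sent to the evaluator. A distilled sketch of the shared pattern, with the evaluator pieces simplified to a callback:

```python
# Distilled sketch of the skip pattern shared by _evaluate_turn and
# _evaluate_conversation; `data` stands in for TurnData/EvaluationData.
import logging
from typing import Callable, Optional

logger = logging.getLogger(__name__)


def evaluate_metrics(
    data,
    metrics: list[str],
    evaluate_one: Callable[[str], Optional[object]],
) -> list[object]:
    results = []
    for metric_identifier in metrics:
        if data.is_metric_invalid(metric_identifier):
            # Skip rather than fail mid-run; the validator already
            # reported why this metric is invalid.
            logger.error(
                "Skipping invalid metric '%s', see validation errors",
                metric_identifier,
            )
            continue
        result = evaluate_one(metric_identifier)
        if result:
            results.append(result)
    return results
```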
src/lightspeed_evaluation/runner/evaluation.py (4 additions, 1 deletion)
@@ -48,7 +48,10 @@ def run_evaluation( # pylint: disable=too-many-locals
output_config = system_config.output

# Step 2: Load and validate evaluation data
data_validator = DataValidator(api_enabled=system_config.api.enabled)
data_validator = DataValidator(
api_enabled=system_config.api.enabled,
fail_on_invalid_data=system_config.core.fail_on_invalid_data,
)
evaluation_data = data_validator.load_evaluation_data(evaluation_data_path)

print(f"✅ System config: {llm_config.provider}/{llm_config.model}")