Skip to content

Commit c83e295

Browse files
committed
refactor: addressed PR feedback; alignment with pre-existing structure
1 parent 9a0b600 commit c83e295

File tree

9 files changed

+164
-430
lines changed

9 files changed

+164
-430
lines changed

config/registry/geval_metrics.yaml

Lines changed: 0 additions & 136 deletions
This file was deleted.

config/system.yaml

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ core:
66

77
# LLM as a judge configuration
88
llm:
9-
provider: "openai" # LLM Provider (openai, watsonx, gemini, hosted_vllm etc..)
10-
model: "gpt-4o-mini" # Model name for the provider
9+
provider: vertex # LLM provider (openai, watsonx, gemini, vertex, hosted_vllm, etc.)
10+
model: gemini-2.0-flash # Model name for the provider
1111
temperature: 0.0 # Generation temperature
1212
max_tokens: 512 # Maximum tokens in response
1313
timeout: 300 # Request timeout in seconds
@@ -28,7 +28,7 @@ embedding:
2828
# API used to fetch real-time data. Currently only the lightspeed-stack API is supported.
2929
# But can be easily integrated with other APIs with minimal change.
3030
api:
31-
enabled: true # Enable API calls instead of using pre-filled data
31+
enabled: false # Enable API calls instead of using pre-filled data
3232
api_base: http://localhost:8080 # Base API URL
3333
endpoint_type: streaming # Use "streaming" or "query" endpoint
3434
timeout: 300 # API request timeout in seconds
@@ -42,14 +42,6 @@ api:
4242
cache_dir: ".caches/api_cache" # Directory with lightspeed-stack cache
4343
cache_enabled: true # Is lightspeed-stack cache enabled?
4444
# Authentication via API_KEY environment variable only for MCP server
45-
46-
# GEval Configuration
47-
# Configurable custom metrics using DeepEval's GEval framework
48-
geval:
49-
enabled: true # Enable GEval metrics evaluation
50-
registry_path: "config/registry/geval_metrics.yaml" # Path to GEval metrics registry
51-
default_turn_metrics: [] # Optional: auto-apply turn-level GEval metrics
52-
default_conversation_metrics: [] # Optional: auto-apply conversation-level GEval metrics
5345

5446
# Default metrics metadata
5547
metrics_metadata:
@@ -99,6 +91,26 @@ metrics_metadata:
9991
"script:action_eval":
10092
description: "Script-based evaluation for infrastructure/environment validation"
10193

94+
# GEval turn-level metrics
95+
"geval:technical_accuracy":
96+
criteria: |
97+
Assess whether the response provides technically accurate information,
98+
commands, code, syntax, and follows relevant industry or
99+
domain-specific best practices. The response should
100+
contain valid syntax and use appropriate functions, modules, or tools.
101+
evaluation_params:
102+
- query
103+
- response
104+
- expected_response
105+
evaluation_steps:
106+
- "Verify that the provided syntax (e.g., code, commands, configuration) is valid and follows the language/tool's formatting rules."
107+
- "Check if the response uses appropriate modules, functions, libraries, or parameters for the given task."
108+
- "Assess whether the solution aligns with relevant official documentation or established best practices for the specific domain."
109+
- "Verify the response directly and accurately addresses the user's specific query or task."
110+
- "Check for potential security issues, significant inefficiencies, or anti-patterns."
111+
threshold: 0.7
112+
description: "General technical accuracy of provided commands, code, or technical information"
113+
102114
# Conversation-level metrics metadata
103115
conversation_level:
104116
# DeepEval metrics
@@ -115,6 +127,24 @@ metrics_metadata:
115127
threshold: 0.7
116128
description: "How well the model retains information from previous turns"
117129

130+
# GEval conversation-level metrics
131+
"geval:conversation_coherence":
132+
criteria: |
133+
Evaluate whether the conversation maintains context and provides coherent
134+
responses across multiple turns. The assistant should reference previous
135+
exchanges and build upon earlier context.
136+
evaluation_params:
137+
- query
138+
- response
139+
evaluation_steps:
140+
- "Check if the assistant remembers information from previous turns"
141+
- "Verify responses build logically on previous context"
142+
- "Assess whether the conversation flows naturally"
143+
- "Check for contradictions with earlier statements"
144+
threshold: 0.6
145+
description: "Context maintenance and coherence across conversation turns"
146+
147+
118148
# Output Configuration
119149
output:
120150
output_dir: "./eval_output"

src/lightspeed_evaluation/core/metrics/deepeval.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@
2121

2222
from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
2323
from lightspeed_evaluation.core.llm.manager import LLMManager
24-
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
2524
from lightspeed_evaluation.core.metrics.geval import GEvalHandler
25+
from lightspeed_evaluation.core.metrics.manager import MetricManager
26+
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
2627

2728
logger = logging.getLogger(__name__)
2829

@@ -35,12 +36,16 @@ class DeepEvalMetrics: # pylint: disable=too-few-public-methods
3536
both evaluation types for efficiency.
3637
"""
3738

38-
def __init__(self, llm_manager: LLMManager, registry_path: str | None = None):
39+
def __init__(
40+
self,
41+
llm_manager: LLMManager,
42+
metric_manager: MetricManager,
43+
):
3944
"""Initialize with LLM Manager and Metric Manager.
4045
4146
Args:
4247
llm_manager: Pre-configured LLMManager with validated parameters
43-
registry_path: Optional path to GEval metrics registry YAML
48+
metric_manager: MetricManager for accessing metric metadata
4449
"""
4550
# Setup cache if enabled (shared across all DeepEval operations)
4651
if llm_manager.get_config().cache_enabled and litellm.cache is None:
@@ -54,10 +59,10 @@ def __init__(self, llm_manager: LLMManager, registry_path: str | None = None):
5459
llm_manager.get_model_name(), llm_manager.get_llm_params()
5560
)
5661

57-
# Initialize GEval handler with shared LLM manager
62+
# Initialize GEval handler with shared LLM manager and metric manager
5863
self.geval_handler = GEvalHandler(
5964
deepeval_llm_manager=self.llm_manager,
60-
registry_path=registry_path,
65+
metric_manager=metric_manager,
6166
)
6267

6368
# Standard DeepEval metrics routing

0 commit comments

Comments
 (0)