66
77# LLM as a judge configuration
88llm :
9- provider : " openai " # LLM Provider (openai, watsonx, gemini, hosted_vllm etc..)
10- model : " gpt-4o-mini " # Model name for the provider
9+ provider : vertex # LLM Provider (openai, watsonx, gemini, hosted_vllm, etc.)
10+ model : gemini-2.0-flash # Model name for the provider
1111 temperature : 0.0 # Generation temperature
1212 max_tokens : 512 # Maximum tokens in response
1313 timeout : 300 # Request timeout in seconds
@@ -28,7 +28,7 @@ embedding:
2828# Fetches real-time data. Currently only the lightspeed-stack API is supported,
2929# but other APIs can be integrated with minimal changes.
3030api :
31- enabled : true # Enable API calls instead of using pre-filled data
31+ enabled : false # Enable API calls instead of using pre-filled data
3232 api_base : http://localhost:8080 # Base API URL
3333 endpoint_type : streaming # Use "streaming" or "query" endpoint
3434 timeout : 300 # API request timeout in seconds
4242 cache_dir : " .caches/api_cache" # Directory with lightspeed-stack cache
4343 cache_enabled : true # Is lightspeed-stack cache enabled?
4444 # Authentication via API_KEY environment variable only for MCP server
45-
46- # GEval Configuration
47- # Configurable custom metrics using DeepEval's GEval framework
48- geval :
49- enabled : true # Enable GEval metrics evaluation
50- registry_path : " config/registry/geval_metrics.yaml" # Path to GEval metrics registry
51- default_turn_metrics : [] # Optional: auto-apply turn-level GEval metrics
52- default_conversation_metrics : [] # Optional: auto-apply conversation-level GEval metrics
5345
5446# Default metrics metadata
5547metrics_metadata :
@@ -99,6 +91,26 @@ metrics_metadata:
9991 " script:action_eval " :
10092 description : " Script-based evaluation for infrastructure/environment validation"
10193
94+ # GEval turn-level metrics
95+ " geval:technical_accuracy " :
96+ criteria : |
97+ Assess whether the response provides technically accurate information,
98+ commands, code, syntax, and follows relevant industry or
99+ domain-specific best practices. The response should
100+ contain valid syntax and use appropriate functions, modules, or tools.
101+ evaluation_params :
102+ - query
103+ - response
104+ - expected_response
105+ evaluation_steps :
106+ - " Verify that the provided syntax (e.g., code, commands, configuration) is valid and follows the language/tool's formatting rules."
107+ - " Check if the response uses appropriate modules, functions, libraries, or parameters for the given task."
108+ - " Assess whether the solution aligns with relevant official documentation or established best practices for the specific domain."
109+ - " Verify the response directly and accurately addresses the user's specific query or task."
110+ - " Check for potential security issues, significant inefficiencies, or anti-patterns."
111+ threshold : 0.7
112+ description : " General technical accuracy of provided commands, code, or technical information"
113+
102114 # Conversation-level metrics metadata
103115 conversation_level :
104116 # DeepEval metrics
@@ -115,6 +127,24 @@ metrics_metadata:
115127 threshold : 0.7
116128 description : " How well the model retains information from previous turns"
117129
130+ # GEval conversation-level metrics
131+ " geval:conversation_coherence " :
132+ criteria : |
133+ Evaluate whether the conversation maintains context and provides coherent
134+ responses across multiple turns. The assistant should reference previous
135+ exchanges and build upon earlier context.
136+ evaluation_params :
137+ - query
138+ - response
139+ evaluation_steps :
140+ - " Check if the assistant remembers information from previous turns"
141+ - " Verify responses build logically on previous context"
142+ - " Assess whether the conversation flows naturally"
143+ - " Check for contradictions with earlier statements"
144+ threshold : 0.6
145+ description : " Context maintenance and coherence across conversation turns"
146+
147+
118148# Output Configuration
119149output :
120150 output_dir : " ./eval_output"
0 commit comments