@@ -285,12 +285,6 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
285285
286286 # Convert evaluation_params to enum values if valid, otherwise use defaults
287287 converted_params = self ._convert_evaluation_params (evaluation_params )
288- if not converted_params :
289- # If no valid params, use sensible defaults for turn evaluation
290- converted_params = [
291- LLMTestCaseParams .INPUT ,
292- LLMTestCaseParams .ACTUAL_OUTPUT ,
293- ]
294288
295289 # Create GEval metric with runtime configuration
296290 metric_kwargs : dict [str , Any ] = {
@@ -302,6 +296,18 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
302296 "top_logprobs" : 5 ,
303297 }
304298
299+ # Only set evaluation_params if we have valid enum conversions
300+ # or if no params were provided at all (then use defaults)
301+ if converted_params is None :
302+ if not evaluation_params :
303+ metric_kwargs ["evaluation_params" ] = [
304+ LLMTestCaseParams .INPUT ,
305+ LLMTestCaseParams .ACTUAL_OUTPUT ,
306+ ]
307+ # else: leave unset so GEval can auto-detect from custom strings
308+ else :
309+ metric_kwargs ["evaluation_params" ] = converted_params
310+
305311 # Add evaluation steps if provided
306312 if evaluation_steps :
307313 metric_kwargs ["evaluation_steps" ] = evaluation_steps
@@ -320,12 +326,7 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
320326 test_case_kwargs ["expected_output" ] = turn_data .expected_response
321327
322328 if turn_data .contexts :
323- # Normalize contexts: handle both dict and string formats
324- normalized_contexts = [
325- ctx .get ("content" , str (ctx )) if isinstance (ctx , dict ) else str (ctx )
326- for ctx in turn_data .contexts
327- ]
328- test_case_kwargs ["context" ] = normalized_contexts
329+ test_case_kwargs ["context" ] = turn_data .contexts
329330
330331 # Create test case for a single turn
331332 test_case = LLMTestCase (** test_case_kwargs )
@@ -385,12 +386,6 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
385386 """
386387 # Convert evaluation_params to enum values if valid, otherwise use defaults
387388 converted_params = self ._convert_evaluation_params (evaluation_params )
388- if not converted_params :
389- # If no valid params, use sensible defaults for conversation evaluation
390- converted_params = [
391- LLMTestCaseParams .INPUT ,
392- LLMTestCaseParams .ACTUAL_OUTPUT ,
393- ]
394389
395390 # Configure the GEval metric for conversation-level evaluation
396391 metric_kwargs : dict [str , Any ] = {
@@ -402,6 +397,15 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
402397 "top_logprobs" : 5 , # Vertex/Gemini throws an error if over 20.
403398 }
404399
400+ if converted_params is None :
401+ if not evaluation_params :
402+ metric_kwargs ["evaluation_params" ] = [
403+ LLMTestCaseParams .INPUT ,
404+ LLMTestCaseParams .ACTUAL_OUTPUT ,
405+ ]
406+ else :
407+ metric_kwargs ["evaluation_params" ] = converted_params
408+
405409 # Add evaluation steps if provided
406410 if evaluation_steps :
407411 metric_kwargs ["evaluation_steps" ] = evaluation_steps