diff --git a/src/lighteval/models/endpoints/litellm_model.py b/src/lighteval/models/endpoints/litellm_model.py index b69764ebc..20fb87d04 100644 --- a/src/lighteval/models/endpoints/litellm_model.py +++ b/src/lighteval/models/endpoints/litellm_model.py @@ -294,10 +294,14 @@ def greedy_until( for response, context in zip(responses, contexts): result: list[str] = [choice.message.content for choice in response.choices] + reasonings: list[str | None] = [ + getattr(choice.message, "reasoning_content", None) for choice in response.choices + ] cur_response = ModelResponse( # In empty responses, the model should return an empty string instead of None text=result if result[0] else [""], + reasonings=reasonings, input=context, ) results.append(cur_response) diff --git a/src/lighteval/models/model_output.py b/src/lighteval/models/model_output.py index 3663d88bd..fcb96099e 100644 --- a/src/lighteval/models/model_output.py +++ b/src/lighteval/models/model_output.py @@ -127,6 +127,7 @@ class ModelResponse: text: list[str] = field(default_factory=list) # The text of the response output_tokens: list[list[int]] = field(default_factory=list) # Model generations text_post_processed: list[str] | None = None # The text of the response postprocessed + reasonings: list[str | None] = field(default_factory=list) # The reasoning content of the response # Model logprob outputs logprobs: list[float] = field(default_factory=list) # Log probabilities of the response