huggingface · qgallouedec · Feb 11, 2026 · Feb 10, 2026 · Feb 11, 2026 · Feb 11, 2026
diff --git a/trl/trainer/reward_trainer.py b/trl/trainer/reward_trainer.py
@@ -304,6 +304,7 @@ def __init__(
             # Distributed training requires device_map=None ("auto" fails)
             if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]:
                 model_init_kwargs["device_map"] = None
+            model_init_kwargs["num_labels"] = 1  # the only output of the model is the reward score
             with ignore_seqcls_score_missing_key():
                 model = create_model_from_path(model, AutoModelForSequenceClassification, **model_init_kwargs)
         else: