2 changes: 2 additions & 0 deletions docs/source/Instruction/Command-line-parameters.md
@@ -522,6 +522,8 @@ RLHF parameters inherit from the [training parameters](#训练参数).
- center_rewards_coefficient: Used in RM training. A coefficient that incentivizes the reward model to output rewards with zero mean; see this [paper](https://huggingface.co/papers/2312.09244) for details. Recommended value: 0.01.
- loss_scale: Overrides the template parameter. During RLHF training, the default is 'last_round'.
- temperature: Default is 0.9; this parameter is used in PPO, GRPO, and GKD.
- top_k: Top-k parameter for rollout sampling; -1 means no top-k filtering. Default is -1.
- top_p: Top-p parameter for rollout sampling; 1.0 means no top-p filtering. Default is 1.0.

#### GKD Arguments
- lmbda: Default is 0.5. This parameter is used in GKD. It is the lambda that controls the proportion of student data (i.e., the fraction of on-policy, student-generated outputs). If lmbda is 0, no student-generated data is used.
2 changes: 2 additions & 0 deletions docs/source_en/Instruction/Command-line-parameters.md
@@ -532,6 +532,8 @@ RLHF arguments inherit from the [training arguments](#training-arguments).
- center_rewards_coefficient: A coefficient used in reward model (RM) training to incentivize the model to output rewards with zero mean. See this [paper](https://huggingface.co/papers/2312.09244) for details. Recommended value: 0.01.
- loss_scale: Overrides the template parameter. During RLHF training, the default is `'last_round'`.
- temperature: Default is 0.9; this parameter is used in PPO, GRPO, and GKD.
- top_k: Top-k parameter for rollout sampling. -1 means no top-k filtering is applied. Default is -1.
- top_p: Top-p parameter for rollout sampling. 1.0 means no top-p filtering is applied. Default is 1.0.
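
To build intuition for these sentinel values, here is a minimal, self-contained sketch of top-k / top-p filtering over a token distribution. The `filter_probs` helper is hypothetical and for illustration only (it is not ms-swift's sampling code); with the defaults `top_k=-1` and `top_p=1.0`, the distribution passes through unchanged.

```python
import numpy as np

def filter_probs(probs: np.ndarray, top_k: int = -1, top_p: float = 1.0) -> np.ndarray:
    """Zero out tokens excluded by top-k / top-p filtering, then renormalize."""
    probs = probs.astype(float)
    if top_k > 0:
        # Top-k: keep only the k most probable tokens.
        cutoff = np.sort(probs)[-top_k]
        probs[probs < cutoff] = 0.0
    if top_p < 1.0:
        # Top-p: keep the smallest set of tokens whose cumulative mass reaches top_p.
        order = np.argsort(probs)[::-1]
        keep = np.searchsorted(np.cumsum(probs[order]), top_p) + 1
        probs[order[keep:]] = 0.0
    return probs / probs.sum()

p = np.array([0.5, 0.3, 0.15, 0.05])
print(filter_probs(p))             # defaults (-1, 1.0): unchanged
print(filter_probs(p, top_k=2))    # [0.625, 0.375, 0.0, 0.0]
print(filter_probs(p, top_p=0.8))  # same two tokens survive
```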

#### GKD Arguments
- lmbda: Default is 0.5. This parameter is used in GKD. It is the lambda that controls the proportion of student data (i.e., the fraction of on-policy, student-generated outputs). If lmbda is 0, no student-generated data is used.
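
As a rough sketch of the mixing described above (the function and its signature are hypothetical, not ms-swift's API): each training example uses the student's own on-policy generation with probability lmbda, and the fixed dataset response otherwise.

```python
import random

def pick_training_output(prompt, dataset_response, student_generate, lmbda=0.5):
    # lmbda = 1.0 -> always train on on-policy student generations;
    # lmbda = 0.0 -> never use student-generated data.
    if random.random() < lmbda:
        return student_generate(prompt)
    return dataset_response
```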
4 changes: 2 additions & 2 deletions swift/megatron/arguments/megatron_args.py
@@ -80,8 +80,8 @@ class RLHFMegatronArgumentsMixin:
     epsilon: float = 0.2
     epsilon_high: Optional[float] = None
     delta: Optional[float] = None
-    top_k: int = 50
-    top_p: float = 0.9
+    top_k: int = -1
+    top_p: float = 1.0
     repetition_penalty: float = 1.
 
     use_vllm: bool = True
9 changes: 5 additions & 4 deletions swift/rlhf_trainers/args_mixin.py
@@ -104,9 +104,10 @@ class RolloutTrainerArgumentsMixin(VllmArguments):
         the inference backend for generation.
 
     Args:
-        top_k (int): The number of highest probability vocabulary tokens to keep for top-k-filtering. Defaults to 50.
+        top_k (int): The number of highest probability vocabulary tokens to keep for top-k-filtering. -1 means
+            no filtering. Defaults to -1.
         top_p (float): If set to a float < 1, only the smallest set of most probable tokens with probabilities that
-            add up to top_p or higher are kept for generation. Defaults to 0.9.
+            add up to top_p or higher are kept for generation. Defaults to 1.0.
         repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty. Defaults to 1.0.
         stop_words (List[str]): A list of strings that will stop the generation when they are generated. Defaults to an
             empty list.
@@ -150,8 +151,8 @@ class RolloutTrainerArgumentsMixin(VllmArguments):
         Only effective when using vLLM backend (`use_vllm=True`).
     """
     # generation args
-    top_k: int = 50
-    top_p: float = 0.9
+    top_k: int = -1
+    top_p: float = 1.0
     repetition_penalty: float = 1.
     stop_words: List[str] = field(default_factory=list)

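Since rollout generation typically runs through vLLM (`use_vllm=True`), these defaults plausibly map onto vLLM's `SamplingParams`, where `top_k=-1` and `top_p=1.0` likewise disable the respective filters. A sketch of that assumed mapping, not the trainer's actual wiring:

```python
from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.9,         # RLHF default documented above
    top_k=-1,                # no top-k filtering
    top_p=1.0,               # no top-p filtering
    repetition_penalty=1.0,  # no repetition penalty
    stop=[],                 # stop_words would map to `stop`
)
```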