diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml
index 44e2491227..34367dee20 100755
--- a/examples/configs/dpo.yaml
+++ b/examples/configs/dpo.yaml
@@ -55,8 +55,6 @@ policy:
   # makes the training sequence length divisible by the tensor parallel size
   # this is useful for sequence parallel training
   make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
-  ## NOTE: there is a known issue with gradient clipping when using Dtensor
-  ## if using dtensor, set max_grad_norm to NULL
   max_grad_norm: 1.0
 
   optimizer: