Merged
2 changes: 0 additions & 2 deletions examples/configs/dpo.yaml
@@ -55,8 +55,6 @@ policy:
   # makes the training sequence length divisible by the tensor parallel size
   # this is useful for sequence parallel training
   make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
-  ## NOTE: there is a known issue with gradient clipping when using Dtensor
-  ## if using dtensor, set max_grad_norm to NULL
   max_grad_norm: 1.0
 
   optimizer:
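For context, after this change `max_grad_norm: 1.0` is kept unconditionally rather than nulled out under DTensor. A minimal sketch of reading that value, assuming PyYAML is available; the inline fragment is a stand-in for the real `examples/configs/dpo.yaml` and hardcodes a number in place of the `${policy.dtensor_cfg.tensor_parallel_size}` interpolation, which plain PyYAML would load as a literal string rather than resolve (resolution is an OmegaConf/Hydra-style feature):

```python
import yaml  # PyYAML, assumed installed

# Inline stand-in for the relevant fragment of examples/configs/dpo.yaml.
# The real file uses ${policy.dtensor_cfg.tensor_parallel_size} here, which
# requires an interpolation-aware loader such as OmegaConf to resolve.
fragment = """
policy:
  make_sequence_length_divisible_by: 8
  max_grad_norm: 1.0
"""

cfg = yaml.safe_load(fragment)

# With the removed NOTE gone, max_grad_norm stays at 1.0 for all backends;
# the old workaround of setting it to null under DTensor no longer applies.
print(cfg["policy"]["max_grad_norm"])  # → 1.0
```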