NVIDIA-NeMo · SahilJain314 · Apr 23, 2025 · Apr 18, 2025 · Apr 19, 2025 · Apr 20, 2025
@@ -68,7 +68,7 @@ We have a reference GRPO experiment config set up trained for math benchmarks us
 
 #### Single Node
 
-To run GRPO on a single GPU for `Llama-3.2-1B-Instruct`:
+To run GRPO on a single GPU for `Qwen/Qwen2.5-1.5B`:
 
 ```sh
 # Run the GRPO math example using a 1.5B parameter model
@@ -87,10 +87,10 @@ You can override any of the parameters listed in the yaml configuration file. Fo
 
 ```sh
 uv run python examples/run_grpo_math.py \
-  policy.model_name="Qwen/Qwen2-1.5B" \
-  checkpointing.checkpoint_dir="results/qwen1_5b_math" \
+  policy.model_name="Llama-3.2-1B-Instruct" \
+  checkpointing.checkpoint_dir="results/llama1b_math" \
   logger.wandb_enabled=True \
-  logger.wandb.name="grpo-qwen1_5b_math" \
+  logger.wandb.name="grpo-llama1b_math" \
   logger.num_val_samples_to_print=10
 ```
 

@@ -24,7 +24,7 @@ checkpointing:
   save_period: 10
 
 policy:
-  model_name: "meta-llama/Llama-3.2-1B-Instruct"
+  model_name: "Qwen/Qwen2.5-1.5B"
   tokenizer:
     name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
   train_global_batch_size: 512
@@ -37,7 +37,7 @@ policy:
   activation_checkpointing_enabled: false
 
   dtensor_cfg:
-    enabled: false
+    enabled: true
     cpu_offload: False
     sequence_parallel: false
     activation_checkpointing: false

@@ -18,6 +18,9 @@ policy:
   fsdp_offload_enabled: false
   activation_checkpointing_enabled: false
 
+  dtensor_cfg:
+    enabled: False
+
   optimizer:
     name: "torch.optim.AdamW"
     kwargs:

@@ -24,6 +24,7 @@
     FSDPModule,
 )
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.modeling_utils import _get_tied_weight_keys
 from nemo_reinforcer.models.dtensor.parallelize import _parallelize_model
 
 from nemo_reinforcer.algorithms.interfaces import LossFunction
@@ -140,6 +141,7 @@ def __init__(
             device_map="cpu",  # load weights onto CPU initially
             torch_dtype=torch.float32,  # use full precision in sft until https://github.com/NVIDIA/reinforcer/issues/13 is fixed
         )
+
         self.tokenizer = tokenizer
         # ------------------------------------------------
         # 3) Move to GPU + Composable FSDP
@@ -152,6 +154,13 @@ def __init__(
             f"World size({world_size}) must be divisible by TP size({tp_size}) to use DTensor"
         )
 
+        num_tied_weights = len(_get_tied_weight_keys(self.model))
+        skip_tie_check = self.cfg.get("skip_tie_check", False)
+        if num_tied_weights != 0 and tp_size > 1 and not skip_tie_check:
+            raise ValueError(
+                f"Using dtensor policy with tp size {tp_size} for model ({model_name}) that has tied weights (num_tied_weights={num_tied_weights}) is not supported (https://github.com/NVIDIA/reinforcer/issues/227). Please use dtensor policy with tensor parallel == 1 instead."
+            )
+
         mesh_2d = torch.distributed.device_mesh.init_device_mesh(
             "cuda", (dp_size, tp_size), mesh_dim_names=("dp", "tp")
         )

@@ -38,6 +38,7 @@
 )
 
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.modeling_utils import _get_tied_weight_keys
 from nemo_reinforcer.models.policy import PolicyConfig
 from nemo_reinforcer.models.policy.utils import import_class_from_path
 from nemo_reinforcer.distributed.virtual_cluster import (
@@ -92,6 +93,15 @@ def __init__(
             device_map="cpu",  # load weights onto CPU initially
             torch_dtype=torch.float32,  # use full precision in sft until https://github.com/NVIDIA/reinforcer/issues/13 is fixed
         )
+
+        # Check if the model has tied weights
+        num_tied_weights = len(_get_tied_weight_keys(self.model))
+        skip_tie_check = self.cfg.get("skip_tie_check", False)
+        if num_tied_weights != 0 and not skip_tie_check:
+            raise ValueError(
+                f"Using FSDP1 with a model ({model_name}) that has tied weights (num_tied_weights={num_tied_weights}) is not supported (https://github.com/NVIDIA/reinforcer/issues/227). Please use dtensor policy with tensor parallel == 1 instead."
+            )
+
         if init_reference_model:
             self.reference_model = AutoModelForCausalLM.from_pretrained(
                 model_name,

@@ -25,7 +25,6 @@
 from nemo_reinforcer.models.generation.vllm import VllmGeneration, VllmConfig
 from nemo_reinforcer.models.policy import PolicyConfig
 
-
 # Define basic vLLM test config
 basic_vllm_test_config: VllmConfig = {
     "backend": "vllm",
@@ -55,6 +54,7 @@ def get_basic_hf_test_config(enable_dtensor: bool = False) -> PolicyConfig:
         "tokenizer": {
             "name": basic_vllm_test_config["tokenizer"]["name"],
         },
+        "skip_tie_check": True,
         # Required training parameters
         "train_global_batch_size": 1,
         "train_micro_batch_size": 1,