Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion docs/en/developer_guide/debug.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,8 @@ Specifically, miles currently provides the following parameters for separate deb

4. `--load-debug-rollout-data /your/saved/debug/data_{rollout_id}.pt`

When enabled, data will be loaded from `args.load_debug_rollout_data.format(rollout_id=rollout_id)`, and SGLang will not be initialized (automatically setting `debug_train_only=True`). This method allows you to fix the input for the training part to tune it, for example, by switching between different parallelization strategies.
When enabled, data will be loaded from `args.load_debug_rollout_data.format(rollout_id=rollout_id)`, and SGLang will not be initialized (automatically setting `debug_train_only=True`). This method allows you to fix the input for the training part to tune it, for example, by switching between different parallelization strategies.

5. `--debug-first-weight-sync /your/saved/debug/first_weight_sync/`

When enabled, miles saves the first Hugging Face checkpoint that Megatron syncs to SGLang into this directory, compares it against the original checkpoint given by `--hf-checkpoint`, and reports any layers that are not bitwise identical (including mismatch percentages and average relative differences). The run terminates after the first sync completes.
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ def __init__(
self.quantization_config = quantization_config
self.weight_version = 0
self._model_update_groups = None
self._debug_first_weight_sync = None
self._debug_first_weight_sync_done = False

def connect_rollout_engines(
self, rollout_engines: Sequence[ActorHandle], rollout_engine_lock: ActorHandle
Expand Down Expand Up @@ -76,6 +78,17 @@ def update_weights(self) -> None:
Pause → flush → non-expert (TP) → expert (EP) → continue. Progress on PP source.
"""
self.weight_version += 1
debug_first_weight_sync = (
self.args.debug_first_weight_sync and not self._debug_first_weight_sync_done and self.weight_version == 1
)
if debug_first_weight_sync:
from miles.utils.hf_checkpoint_debug import DebugFirstWeightSync

self._debug_first_weight_sync = DebugFirstWeightSync(
output_dir=self.args.debug_first_weight_sync,
source_checkpoint=self.args.hf_checkpoint,
write_rank=getattr(self, "_is_pp_src_rank", False),
)

if dist.get_rank() == 0:
ray.get([engine.pause_generation.remote() for engine in self.rollout_engines])
Expand Down Expand Up @@ -131,6 +144,11 @@ def update_weights(self) -> None:
rollout_engines=self.rollout_engines,
)
dist.barrier(group=get_gloo_group())
if debug_first_weight_sync and self._debug_first_weight_sync is not None:
self._debug_first_weight_sync.finalize_and_compare(group=get_gloo_group())
self._debug_first_weight_sync_done = True
self._debug_first_weight_sync = None
raise RuntimeError("debug-first-weight-sync complete; terminating as requested.")

def _update_weight_from_distributed(
self,
Expand Down Expand Up @@ -224,6 +242,8 @@ def _update_bucket_weights_from_distributed(
"""
Lock → broadcast → clear → unlock → pbar++. Lock prevents NCCL deadlock.
"""
if self._debug_first_weight_sync is not None:
self._debug_first_weight_sync.write_chunk(converted_named_tensors)
# lock the rollout engines to prevent dead lock on broadcast.
while not ray.get(self.rollout_engine_lock.acquire.remote()):
time.sleep(0.1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ def __init__(
self.model_name = model_name
self.quantization_config = quantization_config
self.weight_version = 0
self._debug_first_weight_sync = None
self._debug_first_weight_sync_done = False

self._hf_weight_iterator = HfWeightIteratorBase.create(
args=args, model=model, model_name=model_name, quantization_config=quantization_config
Expand Down Expand Up @@ -109,6 +111,17 @@ def update_weights(self) -> None:
version++, flush caches, process buckets. Progress on rank 0.
"""
self.weight_version += 1
debug_first_weight_sync = (
self.args.debug_first_weight_sync and not self._debug_first_weight_sync_done and self.weight_version == 1
)
if debug_first_weight_sync:
from miles.utils.hf_checkpoint_debug import DebugFirstWeightSync

self._debug_first_weight_sync = DebugFirstWeightSync(
output_dir=self.args.debug_first_weight_sync,
source_checkpoint=self.args.hf_checkpoint,
write_rank=dist.get_rank() == 0,
)

rank = dist.get_rank()
if rank == 0:
Expand All @@ -124,6 +137,8 @@ def update_weights(self) -> None:
megatron_local_weights = self.weights_getter()

for hf_named_tensors in self._hf_weight_iterator.get_hf_weight_chunks(megatron_local_weights):
if debug_first_weight_sync and self._debug_first_weight_sync is not None:
self._debug_first_weight_sync.write_chunk(hf_named_tensors)
refs, long_lived_tensors = self._send_hf_params(hf_named_tensors)
ray.get(refs)
del long_lived_tensors
Expand All @@ -140,6 +155,11 @@ def update_weights(self) -> None:
rollout_engines=self.rollout_engines,
)
dist.barrier(group=get_gloo_group())
if debug_first_weight_sync and self._debug_first_weight_sync is not None:
self._debug_first_weight_sync.finalize_and_compare(group=get_gloo_group())
self._debug_first_weight_sync_done = True
self._debug_first_weight_sync = None
raise RuntimeError("debug-first-weight-sync complete; terminating as requested.")

def _send_hf_params(self, hf_named_tensors) -> tuple[list[ObjectRef], Any]:
all_refs = []
Expand Down
13 changes: 13 additions & 0 deletions miles/utils/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -1107,6 +1107,14 @@ def add_debug_arguments(parser):
"This is useful for debugging the rollout generation function."
),
)
parser.add_argument(
"--debug-first-weight-sync",
type=str,
default=None,
help=(
"If set, save the first Megatron->SGLang HF weight sync to this directory, then compare it with --hf-checkpoint and report mismatched layers."
),
)
parser.add_argument(
"--save-debug-train-data",
type=str,
Expand Down Expand Up @@ -1638,6 +1646,11 @@ def miles_validate_args(args):
"debug_rollout_only and debug_train_only cannot be set at the same time, " "please set only one of them."
)

if args.debug_first_weight_sync and args.hf_checkpoint is None:
logger.warning("--debug-first-weight-sync set without --hf-checkpoint; compare will be skipped.")
if args.debug_first_weight_sync and (args.debug_rollout_only or args.debug_train_only):
logger.warning("--debug-first-weight-sync is set but weight sync is disabled in debug-only modes.")

# always true on offload for colocate at the moment.
if args.colocate:
if args.offload_train is None:
Expand Down
Loading