Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion docs/en/developer_guide/debug.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,8 @@ Specifically, miles currently provides the following parameters for separate deb

4. `--load-debug-rollout-data /your/saved/debug/data_{rollout_id}.pt`

When enabled, data will be loaded from `args.load_debug_rollout_data.format(rollout_id=rollout_id)`, and SGLang will not be initialized (automatically setting `debug_train_only=True`). This method allows you to fix the input for the training part to tune it, for example, by switching between different parallelization strategies.
When enabled, data will be loaded from `args.load_debug_rollout_data.format(rollout_id=rollout_id)`, and SGLang will not be initialized (automatically setting `debug_train_only=True`). This method allows you to fix the input for the training part to tune it, for example, by switching between different parallelization strategies.

5. `--debug-first-weight-sync /your/saved/debug/first_weight_sync/`

When enabled, miles saves the first Hugging Face checkpoint that Megatron syncs to SGLang into this directory, compares it against the original checkpoint given by `--hf-checkpoint`, and reports any layers that are not bitwise identical (including mismatch percentages and average relative differences). The run terminates after the first sync completes.
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ def __init__(
self.quantization_config = quantization_config
self.weight_version = 0
self._model_update_groups = None
self._debug_first_weight_sync = None
self._debug_first_weight_sync_done = False

def connect_rollout_engines(
self, rollout_engines: Sequence[ActorHandle], rollout_engine_lock: ActorHandle
Expand Down Expand Up @@ -76,6 +78,17 @@ def update_weights(self) -> None:
Pause → flush → non-expert (TP) → expert (EP) → continue. Progress on PP source.
"""
self.weight_version += 1
debug_first_weight_sync = (
self.args.debug_first_weight_sync and not self._debug_first_weight_sync_done and self.weight_version == 1
)
if debug_first_weight_sync:
from miles.utils.hf_checkpoint_debug import DebugFirstWeightSync

self._debug_first_weight_sync = DebugFirstWeightSync(
output_dir=self.args.debug_first_weight_sync,
source_checkpoint=self.args.hf_checkpoint,
write_rank=getattr(self, "_is_pp_src_rank", False),
)

if dist.get_rank() == 0:
ray.get([engine.pause_generation.remote() for engine in self.rollout_engines])
Expand Down Expand Up @@ -131,6 +144,11 @@ def update_weights(self) -> None:
rollout_engines=self.rollout_engines,
)
dist.barrier(group=get_gloo_group())
if debug_first_weight_sync and self._debug_first_weight_sync is not None:
self._debug_first_weight_sync.finalize_and_compare(group=get_gloo_group())
self._debug_first_weight_sync_done = True
self._debug_first_weight_sync = None
raise RuntimeError("debug-first-weight-sync complete; terminating as requested.")

def _update_weight_from_distributed(
self,
Expand Down Expand Up @@ -224,6 +242,8 @@ def _update_bucket_weights_from_distributed(
"""
Lock → broadcast → clear → unlock → pbar++. Lock prevents NCCL deadlock.
"""
if self._debug_first_weight_sync is not None:
self._debug_first_weight_sync.write_chunk(converted_named_tensors)
# lock the rollout engines to prevent dead lock on broadcast.
while not ray.get(self.rollout_engine_lock.acquire.remote()):
time.sleep(0.1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ def __init__(
self.model_name = model_name
self.quantization_config = quantization_config
self.weight_version = 0
self._debug_first_weight_sync = None
self._debug_first_weight_sync_done = False

self._hf_weight_iterator = HfWeightIteratorBase.create(
args=args, model=model, model_name=model_name, quantization_config=quantization_config
Expand Down Expand Up @@ -109,6 +111,17 @@ def update_weights(self) -> None:
version++, flush caches, process buckets. Progress on rank 0.
"""
self.weight_version += 1
debug_first_weight_sync = (
self.args.debug_first_weight_sync and not self._debug_first_weight_sync_done and self.weight_version == 1
)
if debug_first_weight_sync:
from miles.utils.hf_checkpoint_debug import DebugFirstWeightSync

self._debug_first_weight_sync = DebugFirstWeightSync(
output_dir=self.args.debug_first_weight_sync,
source_checkpoint=self.args.hf_checkpoint,
write_rank=dist.get_rank() == 0,
)

rank = dist.get_rank()
if rank == 0:
Expand All @@ -124,6 +137,8 @@ def update_weights(self) -> None:
megatron_local_weights = self.weights_getter()

for hf_named_tensors in self._hf_weight_iterator.get_hf_weight_chunks(megatron_local_weights):
if debug_first_weight_sync and self._debug_first_weight_sync is not None:
self._debug_first_weight_sync.write_chunk(hf_named_tensors)
refs, long_lived_tensors = self._send_hf_params(hf_named_tensors)
ray.get(refs)
del long_lived_tensors
Expand All @@ -140,6 +155,11 @@ def update_weights(self) -> None:
rollout_engines=self.rollout_engines,
)
dist.barrier(group=get_gloo_group())
if debug_first_weight_sync and self._debug_first_weight_sync is not None:
self._debug_first_weight_sync.finalize_and_compare(group=get_gloo_group())
self._debug_first_weight_sync_done = True
self._debug_first_weight_sync = None
raise RuntimeError("debug-first-weight-sync complete; terminating as requested.")

def _send_hf_params(self, hf_named_tensors) -> tuple[list[ObjectRef], Any]:
all_refs = []
Expand Down
13 changes: 13 additions & 0 deletions miles/utils/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -1107,6 +1107,14 @@ def add_debug_arguments(parser):
"This is useful for debugging the rollout generation function."
),
)
parser.add_argument(
"--debug-first-weight-sync",
type=str,
default=None,
help=(
"If set, save the first Megatron->SGLang HF weight sync to this directory, then compare it with --hf-checkpoint and report mismatched layers."
),
)
parser.add_argument(
"--save-debug-train-data",
type=str,
Expand Down Expand Up @@ -1638,6 +1646,11 @@ def miles_validate_args(args):
"debug_rollout_only and debug_train_only cannot be set at the same time, " "please set only one of them."
)

if args.debug_first_weight_sync and args.hf_checkpoint is None:
logger.warning("--debug-first-weight-sync set without --hf-checkpoint; compare will be skipped.")
if args.debug_first_weight_sync and (args.debug_rollout_only or args.debug_train_only):
logger.warning("--debug-first-weight-sync is set but weight sync is disabled in debug-only modes.")

# always true on offload for colocate at the moment.
if args.colocate:
if args.offload_train is None:
Expand Down
Loading