From f7235944ac3239d023ac8dcdce580fda7d0a5322 Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Thu, 29 Jan 2026 13:30:05 -0800 Subject: [PATCH] [doc] refactor: Restructure examples folder - move recipes to models, distillation, decentralized_pg Signed-off-by: yaoyu-33 --- README.md | 2 +- docs/megatron-lm-to-megatron-bridge.md | 2 +- docs/models/llm/nemotron3.md | 6 +++--- docs/models/llm/nemotronh.md | 2 +- docs/models/vlm/ministral3.md | 8 ++++---- docs/models/vlm/nemotron-nano-v2-vl.md | 14 +++++++------- docs/models/vlm/qwen2.5-vl.md | 8 ++++---- docs/models/vlm/qwen3-vl.md | 6 +++--- docs/recipe-usage.md | 4 ++-- docs/training/distillation.md | 8 ++++---- examples/{recipes => }/decentralized_pg/README.md | 10 +++++----- .../decentralized_pg/pretrain_qwen3_simple.py | 4 ++-- .../decentralized_pg/pretrain_qwen3_vl_simple.py | 2 +- .../pretrain_qwen3_with_decentralized_pg.py | 6 +++--- .../llama32_3b-1b_distill_override_example.yaml | 0 .../llama/distill_llama32_3b-1b.py | 2 +- .../nemotron_3/finetune_nemotron_3_nano.py | 0 .../nemotron_3/pretrain_nemotron_3_nano.py | 0 ...en3_next_80b_a3b_finetune_override_example.yaml | 0 .../qwen3_next/finetune_qwen3_next_80b_a3b.py | 2 +- .../conf/nemotron_nano_v2_vl_override_example.yaml | 0 .../conf/nemotron_nano_v2_vl_video.yaml | 0 .../nemotron_vl/finetune_nemotron_nano_v2_vl.py | 0 .../conf/qwen25_vl_pretrain_override_example.yaml | 0 .../conf/qwen3_vl_pretrain_override_example.yaml | 0 .../vlm}/qwen_vl/data/convert_to_qwenvl_wds.py | 2 +- .../vlm}/qwen_vl/finetune_qwen_vl.py | 10 +++++----- 27 files changed, 49 insertions(+), 49 deletions(-) rename examples/{recipes => }/decentralized_pg/README.md (93%) rename examples/{recipes => }/decentralized_pg/pretrain_qwen3_simple.py (96%) rename examples/{recipes => }/decentralized_pg/pretrain_qwen3_vl_simple.py (98%) rename examples/{recipes => }/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py (99%) rename examples/{recipes => distillation}/llama/conf/llama32_3b-1b_distill_override_example.yaml (100%) rename examples/{recipes => distillation}/llama/distill_llama32_3b-1b.py (99%) rename examples/{recipes => models}/nemotron_3/finetune_nemotron_3_nano.py (100%) rename examples/{recipes => models}/nemotron_3/pretrain_nemotron_3_nano.py (100%) rename examples/{recipes => models}/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml (100%) rename examples/{recipes => models}/qwen3_next/finetune_qwen3_next_80b_a3b.py (98%) rename examples/{recipes => models/vlm}/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml (100%) rename examples/{recipes => models/vlm}/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml (100%) rename examples/{recipes => models/vlm}/nemotron_vl/finetune_nemotron_nano_v2_vl.py (100%) rename examples/{recipes => models/vlm}/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml (100%) rename examples/{recipes => models/vlm}/qwen_vl/conf/qwen3_vl_pretrain_override_example.yaml (100%) rename examples/{recipes => models/vlm}/qwen_vl/data/convert_to_qwenvl_wds.py (98%) rename examples/{recipes => models/vlm}/qwen_vl/finetune_qwen_vl.py (97%) diff --git a/README.md b/README.md index 677f8d7cd8..8c870441e3 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ For a deeper dive into conversion design and advanced usage, see the [models REA - Optimized paths when Transformer Engine is available - **Flexible to Customize**: Lightweight custom training loop making it easy to configure custom logic in data loading, distributed training, checkpointing, evaluation and logging ([training framework](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/src/megatron/bridge/training), [training utilities](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/src/megatron/bridge/training/utils)) - **Supervised & Parameter-Efficient Finetuning**: SFT & PEFT implementation tailored for Megatron-based models that supports LoRA, DoRA, and user-defined PEFT methods ([PEFT implementations](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/src/megatron/bridge/peft), [finetune module](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/src/megatron/bridge/training/finetune.py), [SFT dataset](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/src/megatron/bridge/data/datasets/sft.py)) -- **SOTA Training Recipes**: Pre-configured production-ready training recipes for popular models like Llama 3, with optimized hyperparameters and distributed training configuration ([Llama recipes](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/src/megatron/bridge/recipes/llama), [recipe examples](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/examples/recipes)) +- **SOTA Training Recipes**: Pre-configured production-ready training recipes for popular models like Llama 3, with optimized hyperparameters and distributed training configuration ([Llama recipes](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/src/megatron/bridge/recipes/llama), [recipe examples](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/examples/models)) - **Performance Optimization**: Built-in support for FP8 training, model parallelism, and memory-efficient techniques to offer high utilization and near-linear scalability to thousands of nodes. ([mixed precision](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/src/megatron/bridge/training/mixed_precision.py), [communication overlap](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/src/megatron/bridge/training/comm_overlap.py), [optimizer utilities](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/src/megatron/bridge/recipes/utils/optimizer_utils.py)) ## Supported Models diff --git a/docs/megatron-lm-to-megatron-bridge.md b/docs/megatron-lm-to-megatron-bridge.md index 34871e34f8..b781dc65ba 100644 --- a/docs/megatron-lm-to-megatron-bridge.md +++ b/docs/megatron-lm-to-megatron-bridge.md @@ -7,7 +7,7 @@ Megatron Bridge is Python-first: configure models, data, and training via typed Run your example training entrypoint and override config keys directly: ```bash -python examples/recipes/llama/pretrain_llama3_8b.py \ +python examples/models/llama/pretrain_llama3_8b.py \ train.micro_batch_size=2 \ train.global_batch_size=128 \ model.num_layers=32 model.hidden_size=4096 model.num_attention_heads=32 \ diff --git a/docs/models/llm/nemotron3.md b/docs/models/llm/nemotron3.md index 9ec89b84bc..bc6c8b6536 100644 --- a/docs/models/llm/nemotron3.md +++ b/docs/models/llm/nemotron3.md @@ -40,7 +40,7 @@ python examples/conversion/convert_checkpoints.py export \ BLEND_PATH=/path/to/dataset/blend TOKENIZER_MODEL=/path/to/tiktok/tokenizer/model -torchrun --nproc-per-node=8 examples/recipes/nemotron_3/pretrain_nemotron_3_nano.py \ +torchrun --nproc-per-node=8 examples/models/nemotron_3/pretrain_nemotron_3_nano.py \ --per-split-data-args-path=${BLEND_PATH} \ --tokenizer-model=${TOKENIZER_MODEL} \ train.global_batch_size=3072 \ @@ -58,7 +58,7 @@ Notes: ### Full Parameter Fine-Tuning ```bash -torchrun --nproc-per-node=8 examples/recipes/nemotron_3/finetune_nemotron_3_nano.py \ +torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_nano.py \ train.global_batch_size=128 \ train.train_iters=100 \ scheduler.lr_warmup_iters=10 \ @@ -74,7 +74,7 @@ Notes: ### LoRA Fine-Tuning To enable LoRA fine-tuning, pass `--peft lora` to script ```bash -torchrun --nproc-per-node=8 examples/recipes/nemotron_3/finetune_nemotron_3_nano.py \ +torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_nano.py \ --peft lora \ train.global_batch_size=128 \ train.train_iters=100 \ diff --git a/docs/models/llm/nemotronh.md b/docs/models/llm/nemotronh.md index 783d6f4e2b..a880e9abe4 100644 --- a/docs/models/llm/nemotronh.md +++ b/docs/models/llm/nemotronh.md @@ -184,7 +184,7 @@ bridge.export_ckpt( ## Examples - Checkpoint conversion: [examples/conversion/convert_checkpoints.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py) -- Training scripts: [examples/recipes/train_any_basic.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/train_any_basic.py) +- Training scripts: [examples/models/train_any_basic.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/train_any_basic.py) ## Finetuning Recipes diff --git a/docs/models/vlm/ministral3.md b/docs/models/vlm/ministral3.md index b47a08df6c..8137359bd8 100644 --- a/docs/models/vlm/ministral3.md +++ b/docs/models/vlm/ministral3.md @@ -99,7 +99,7 @@ Before training, ensure the following environment variables are set: ### Full Finetuning ```bash -torchrun --nproc-per-node=8 examples/recipes/ministral3/finetune_ministral3_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/ministral3/finetune_ministral3_vl.py \ --pretrained-checkpoint /models/ministral3-3b \ --dataset-type hf \ train.global_batch_size=32 \ @@ -124,7 +124,7 @@ config = ministral3_3b_finetune_config( ### Parameter-Efficient Finetuning (PEFT) with LoRA ```bash -torchrun --nproc-per-node=8 examples/recipes/ministral3/finetune_ministral3_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/ministral3/finetune_ministral3_vl.py \ --pretrained-checkpoint /models/ministral3-3b \ --peft-scheme lora \ --dataset-type hf \ @@ -142,7 +142,7 @@ You can also combine PEFT with freeze options: Example with freeze options: ```bash -torchrun --nproc-per-node=8 examples/recipes/ministral3/finetune_ministral3_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/ministral3/finetune_ministral3_vl.py \ --pretrained-checkpoint /models/ministral3-3b \ --peft-scheme lora \ --freeze-vision-model \ @@ -199,7 +199,7 @@ To change the dataset, specify `dataset.maker_name=` in your command ## Examples - Checkpoint import/export: [examples/conversion/convert_checkpoints.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py) - Generate with VLM (HF→Megatron): [examples/conversion/hf_to_megatron_generate_vlm.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/hf_to_megatron_generate_vlm.py) -- Finetuning script: [examples/recipes/ministral3/finetune_ministral3_vl.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/ministral3/finetune_ministral3_vl.py) +- Finetuning script: [examples/models/vlm/ministral3/finetune_ministral3_vl.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/vlm/ministral3/finetune_ministral3_vl.py) ## Hugging Face Model Cards diff --git a/docs/models/vlm/nemotron-nano-v2-vl.md b/docs/models/vlm/nemotron-nano-v2-vl.md index 6dee739975..6921fef5c1 100644 --- a/docs/models/vlm/nemotron-nano-v2-vl.md +++ b/docs/models/vlm/nemotron-nano-v2-vl.md @@ -85,7 +85,7 @@ Example usage for full parameter finetuning using the [Raven dataset](https://huggingface.co/datasets/HuggingFaceM4/the_cauldron/viewer/raven): ```bash -torchrun --nproc-per-node=8 examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py \ --hf-model-path $HF_MODEL_PATH \ --pretrained-checkpoint \ dataset.maker_name=make_raven_dataset \ @@ -95,7 +95,7 @@ checkpoint.save=$SAVE_DIR/ ``` Note: -- The config file `examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml` contains a list of arguments +- The config file `examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml` contains a list of arguments that can be overridden in the command. For example, you can set `train.global_batch_size=` in the command. - To change the dataset, you only need to change `dataset.maker_name`. See the dataset section below for details. - After training, you can run inference with `hf_to_megatron_generate_vlm.py` by supplying the trained megatron checkpoint. @@ -110,7 +110,7 @@ settings out of the box in the example script: distribution is substantially different from pretrained.) ```bash -torchrun --nproc-per-node=8 examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py \ --hf-model-path $HF_MODEL_PATH \ --pretrained-checkpoint $MEGATRON_MODEL_PATH \ --lora-on-language-model \ @@ -126,7 +126,7 @@ model.freeze_vision_projection=False 2. Apply LoRA to all linear layers in attention and MLP modules of the vision model, vision projection, and the language model. ```bash -torchrun --nproc-per-node=8 examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py \ --hf-model-path $HF_MODEL_PATH \ --pretrained-checkpoint $MEGATRON_MODEL_PATH \ --lora-on-language-model \ @@ -169,7 +169,7 @@ Megatron Bridge supports various vision-language dataset examples which can be u Note on video training example: - We provide a video config yaml file instead of the default config yaml file that overwrites a few commands. Please - pass in `--config-file "examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml"`. + pass in `--config-file "examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml"`. - The LLaVA video dataset requires manual download beforehand. Please place the downloaded and extracted video files in a folder `VIDEO_ROOT` and pass it in to the maker with `dataset.maker_kwargs={"video_root_path":$VIDEO_ROOT}`. In the nextqa subset example, `VIDEO_ROOT` should look like @@ -186,10 +186,10 @@ Note on video training example: Full video training example command: ```bash -torchrun --nproc-per-node=8 examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py \ --hf-model-path $HF_MODEL_PATH \ --pretrained-checkpoint $MEGATRON_MODEL_PATH \ ---config-file "examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml" \ +--config-file "examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml" \ logger.wandb_project= \ logger.wandb_save_dir=$SAVE_DIR \ checkpoint.save=$SAVE_DIR/ \ diff --git a/docs/models/vlm/qwen2.5-vl.md b/docs/models/vlm/qwen2.5-vl.md index 5cbd38b8e3..8825fdcb22 100644 --- a/docs/models/vlm/qwen2.5-vl.md +++ b/docs/models/vlm/qwen2.5-vl.md @@ -64,7 +64,7 @@ Before training, ensure the following environment variables are set. Example usage for full parameter finetuning: ```bash -torchrun --nproc-per-node=8 examples/recipes/qwen_vl/finetune_qwen25_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/qwen_vl/finetune_qwen25_vl.py \ --pretrained-checkpoint $MEGATRON_MODEL_PATH \ --recipe qwen25_vl_3b_finetune_config \ --dataset-type hf \ @@ -82,7 +82,7 @@ Note: - `qwen25_vl_7b_finetune_config` - for 7B model - `qwen25_vl_32b_finetune_config` - for 32B model - `qwen25_vl_72b_finetune_config` - for 72B model -- The config file `examples/recipes/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml` contains a list of arguments +- The config file `examples/models/vlm/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml` contains a list of arguments that can be overridden in the command. For example, you can set `train.global_batch_size=` in the command. - The dataset format should be JSONL with conversation format (see dataset section below). - After training, you can run inference with `hf_to_megatron_generate_vlm.py` by supplying the trained megatron checkpoint. @@ -92,7 +92,7 @@ Note: Parameter-efficient finetuning (PEFT) using LoRA or DoRA is supported. You can use the `--peft_scheme` argument to enable PEFT training: ```bash -torchrun --nproc-per-node=8 examples/recipes/qwen_vl/finetune_qwen25_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/qwen_vl/finetune_qwen25_vl.py \ --pretrained-checkpoint $MEGATRON_MODEL_PATH \ --recipe qwen25_vl_3b_finetune_config \ --peft_scheme lora \ @@ -112,7 +112,7 @@ You can also combine PEFT with freeze options to control which components are tr Example with LoRA and freeze options: ```bash -torchrun --nproc-per-node=8 examples/recipes/qwen_vl/finetune_qwen25_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/qwen_vl/finetune_qwen25_vl.py \ --pretrained-checkpoint $MEGATRON_MODEL_PATH \ --recipe qwen25_vl_3b_finetune_config \ --peft_scheme lora \ diff --git a/docs/models/vlm/qwen3-vl.md b/docs/models/vlm/qwen3-vl.md index fa6eed8079..454d69ab70 100644 --- a/docs/models/vlm/qwen3-vl.md +++ b/docs/models/vlm/qwen3-vl.md @@ -55,7 +55,7 @@ Before training, ensure the following environment variables are set: Example usage for full parameter finetuning: ```bash -torchrun --nproc-per-node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \ --pretrained-checkpoint $MEGATRON_MODEL_PATH \ --recipe qwen3_vl_8b_finetune_config \ --dataset-type hf \ @@ -69,7 +69,7 @@ checkpoint.save=$SAVE_DIR/ For MoE models with expert parallelism: ```bash -torchrun --nproc-per-node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \ +torchrun --nproc-per-node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \ --pretrained-checkpoint $MEGATRON_MODEL_PATH \ --recipe qwen3_vl_30b_a3b_finetune_config \ --dataset-type hf \ @@ -84,7 +84,7 @@ Note: - `qwen3_vl_8b_finetune_config` - for 8B dense model - `qwen3_vl_30b_a3b_finetune_config` - for 30B MoE model - For dataset formats and additional information, refer to the [Qwen2.5-VL documentation] -- See the full script with examples at [`examples/recipes/qwen_vl/finetune_qwen_vl.py`](../../../examples/recipes/qwen_vl/finetune_qwen_vl.py) +- See the full script with examples at [`examples/models/vlm/qwen_vl/finetune_qwen_vl.py`](../../../examples/models/vlm/qwen_vl/finetune_qwen_vl.py) ## Hugging Face Model Cards - Qwen3-VL-8B: `https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct` diff --git a/docs/recipe-usage.md b/docs/recipe-usage.md index a115c66573..bf0efa46e5 100644 --- a/docs/recipe-usage.md +++ b/docs/recipe-usage.md @@ -15,7 +15,7 @@ This guide will cover the next steps to make use of a training recipe, including Recipes are provided through a {py:class}`~bridge.training.config.ConfigContainer` object. This is a dataclass that holds all configuration objects needed for training. You can find a more detailed overview of the `ConfigContainer` [here](training/config-container-overview.md). The benefit of providing the full recipe through a pythonic structure is that it is agnostic to any configuration approach that a user may prefer, whether that's YAML, `argparse` or something else. In other words, the user may override the recipe however they see fit. -The following sections detail a few different ways to override the configuration recipe. For a complete training script, please see [this example](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b.py). +The following sections detail a few different ways to override the configuration recipe. For a complete training script, please see [this example](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b.py). ### Python @@ -184,7 +184,7 @@ if __name__ == "__main__": train_script = run.Script(..., args=args_to_fwd) ``` -For a complete example of the `run.Script` API, including argument forwarding, please see [this script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py). +For a complete example of the `run.Script` API, including argument forwarding, please see [this script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b_nemo_run_script.py). #### Plugins diff --git a/docs/training/distillation.md b/docs/training/distillation.md index 307f0672c0..545a6f132e 100644 --- a/docs/training/distillation.md +++ b/docs/training/distillation.md @@ -49,7 +49,7 @@ logit_kl_temperature: 2.0 The simplest way to run knowledge distillation is to use or adapt one of the provided recipe scripts. Here's an example for distilling Llama3.2-3B into Llama3.2-1B: ```bash -torchrun --nproc_per_node=1 examples/recipes/llama/distill_llama32_3b-1b.py +torchrun --nproc_per_node=1 examples/distillation/llama/distill_llama32_3b-1b.py ``` ### Using a Custom YAML Config File @@ -57,7 +57,7 @@ torchrun --nproc_per_node=1 examples/recipes/llama/distill_llama32_3b-1b.py You can provide a custom YAML configuration file to override default settings: ```bash -torchrun --nproc_per_node=1 examples/recipes/llama/distill_llama32_3b-1b.py \ +torchrun --nproc_per_node=1 examples/distillation/llama/distill_llama32_3b-1b.py \ --config-file my_custom_config.yaml ``` @@ -66,7 +66,7 @@ torchrun --nproc_per_node=1 examples/recipes/llama/distill_llama32_3b-1b.py \ Megatron Bridge supports Hydra-style CLI overrides for flexible configuration: ```bash -torchrun --nproc_per_node=2 examples/recipes/llama/distill_llama32_3b-1b.py \ +torchrun --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py \ model.tensor_model_parallel_size=2 \ model.teacher.tensor_model_parallel_size=2 ``` @@ -76,7 +76,7 @@ torchrun --nproc_per_node=2 examples/recipes/llama/distill_llama32_3b-1b.py \ CLI overrides take precedence over YAML configuration: ```bash -torchrun --nproc_per_node=2 examples/recipes/llama/distill_llama32_3b-1b.py \ +torchrun --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py \ --config-file conf/my_config.yaml \ train.global_batch_size=512 ``` diff --git a/examples/recipes/decentralized_pg/README.md b/examples/decentralized_pg/README.md similarity index 93% rename from examples/recipes/decentralized_pg/README.md rename to examples/decentralized_pg/README.md index 7849e1ee5f..8bc02b7d11 100755 --- a/examples/recipes/decentralized_pg/README.md +++ b/examples/decentralized_pg/README.md @@ -25,10 +25,10 @@ Just use an existing recipe and enable decentralized process groups: ```bash # 8 GPUs: TP2 x PP2 x DP2 -uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py +uv run python -m torch.distributed.run --nproc_per_node=8 examples/decentralized_pg/pretrain_qwen3_simple.py # 4 GPUs: TP2 x PP2 x DP1 -uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py +uv run python -m torch.distributed.run --nproc_per_node=4 examples/decentralized_pg/pretrain_qwen3_simple.py ``` The key is just two lines: @@ -53,14 +53,14 @@ For full control over process groups: ```bash # 8 GPUs: TP2 x PP2 x DP2 -uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py +uv run python -m torch.distributed.run --nproc_per_node=8 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py # 4 GPUs: TP2 x PP2 x DP1 -uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \ +uv run python -m torch.distributed.run --nproc_per_node=4 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \ --tp-size 2 --pp-size 2 # 2 GPUs: TP2 x PP1 x DP1 -uv run python -m torch.distributed.run --nproc_per_node=2 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \ +uv run python -m torch.distributed.run --nproc_per_node=2 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \ --tp-size 2 --pp-size 1 ``` diff --git a/examples/recipes/decentralized_pg/pretrain_qwen3_simple.py b/examples/decentralized_pg/pretrain_qwen3_simple.py similarity index 96% rename from examples/recipes/decentralized_pg/pretrain_qwen3_simple.py rename to examples/decentralized_pg/pretrain_qwen3_simple.py index 5a62680e91..454a1fc644 100644 --- a/examples/recipes/decentralized_pg/pretrain_qwen3_simple.py +++ b/examples/decentralized_pg/pretrain_qwen3_simple.py @@ -27,10 +27,10 @@ How to Run ---------- # 8 GPUs: TP2 x PP2 x DP2 -uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py +uv run python -m torch.distributed.run --nproc_per_node=8 examples/decentralized_pg/pretrain_qwen3_simple.py # 4 GPUs: TP2 x PP2 x DP1 -uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py +uv run python -m torch.distributed.run --nproc_per_node=4 examples/decentralized_pg/pretrain_qwen3_simple.py """ import torch diff --git a/examples/recipes/decentralized_pg/pretrain_qwen3_vl_simple.py b/examples/decentralized_pg/pretrain_qwen3_vl_simple.py similarity index 98% rename from examples/recipes/decentralized_pg/pretrain_qwen3_vl_simple.py rename to examples/decentralized_pg/pretrain_qwen3_vl_simple.py index bb6c9d9c71..efafe05422 100644 --- a/examples/recipes/decentralized_pg/pretrain_qwen3_vl_simple.py +++ b/examples/decentralized_pg/pretrain_qwen3_vl_simple.py @@ -27,7 +27,7 @@ How to Run ---------- # 8 GPUs: EP8 -uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_vl_simple.py +uv run python -m torch.distributed.run --nproc_per_node=8 examples/decentralized_pg/pretrain_qwen3_vl_simple.py """ import torch diff --git a/examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py b/examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py similarity index 99% rename from examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py rename to examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py index 1f63955d0a..4dec03307e 100644 --- a/examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py +++ b/examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py @@ -37,14 +37,14 @@ How to Run ---------- # 8 GPUs: TP2 x PP2 x DP2 -uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py +uv run python -m torch.distributed.run --nproc_per_node=8 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py # 4 GPUs: TP2 x PP2 x DP1 -uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \ +uv run python -m torch.distributed.run --nproc_per_node=4 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \ --tp-size 2 --pp-size 2 # 2 GPUs: TP2 x PP1 x DP1 -uv run python -m torch.distributed.run --nproc_per_node=2 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \ +uv run python -m torch.distributed.run --nproc_per_node=2 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \ --tp-size 2 --pp-size 1 """ diff --git a/examples/recipes/llama/conf/llama32_3b-1b_distill_override_example.yaml b/examples/distillation/llama/conf/llama32_3b-1b_distill_override_example.yaml similarity index 100% rename from examples/recipes/llama/conf/llama32_3b-1b_distill_override_example.yaml rename to examples/distillation/llama/conf/llama32_3b-1b_distill_override_example.yaml diff --git a/examples/recipes/llama/distill_llama32_3b-1b.py b/examples/distillation/llama/distill_llama32_3b-1b.py similarity index 99% rename from examples/recipes/llama/distill_llama32_3b-1b.py rename to examples/distillation/llama/distill_llama32_3b-1b.py index 504d933806..3835eabfe0 100644 --- a/examples/recipes/llama/distill_llama32_3b-1b.py +++ b/examples/distillation/llama/distill_llama32_3b-1b.py @@ -80,7 +80,7 @@ # Define paths relative to this script's location -# Assumes this script (distill_llama32_3b-1b.py) is in Megatron-Bridge/examples/recipes/llama/ +# Assumes this script (distill_llama32_3b-1b.py) is in Megatron-Bridge/examples/distillation/llama/ # and the config is in a 'conf' subdirectory. SCRIPT_DIR: Path = Path(__file__).parent.resolve() DEFAULT_CONFIG_FILENAME: str = "llama32_3b-1b_distill_override_example.yaml" diff --git a/examples/recipes/nemotron_3/finetune_nemotron_3_nano.py b/examples/models/nemotron_3/finetune_nemotron_3_nano.py similarity index 100% rename from examples/recipes/nemotron_3/finetune_nemotron_3_nano.py rename to examples/models/nemotron_3/finetune_nemotron_3_nano.py diff --git a/examples/recipes/nemotron_3/pretrain_nemotron_3_nano.py b/examples/models/nemotron_3/pretrain_nemotron_3_nano.py similarity index 100% rename from examples/recipes/nemotron_3/pretrain_nemotron_3_nano.py rename to examples/models/nemotron_3/pretrain_nemotron_3_nano.py diff --git a/examples/recipes/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml b/examples/models/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml similarity index 100% rename from examples/recipes/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml rename to examples/models/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml diff --git a/examples/recipes/qwen3_next/finetune_qwen3_next_80b_a3b.py b/examples/models/qwen3_next/finetune_qwen3_next_80b_a3b.py similarity index 98% rename from examples/recipes/qwen3_next/finetune_qwen3_next_80b_a3b.py rename to examples/models/qwen3_next/finetune_qwen3_next_80b_a3b.py index 7362f68f2d..58947e4f11 100644 --- a/examples/recipes/qwen3_next/finetune_qwen3_next_80b_a3b.py +++ b/examples/models/qwen3_next/finetune_qwen3_next_80b_a3b.py @@ -26,7 +26,7 @@ --megatron-path /path/to/megatron_ckpt 2) Run finetune using the imported checkpoint: - $ torchrun --nproc_per_node=8 examples/recipes/qwen/finetune_qwen3_next_80b_a3b.py \ + $ torchrun --nproc_per_node=8 examples/models/qwen3_next/finetune_qwen3_next_80b_a3b.py \ --pretrained-checkpoint /path/to/megatron_ckpt Using a custom YAML config file: diff --git a/examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml b/examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml similarity index 100% rename from examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml rename to examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml diff --git a/examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml b/examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml similarity index 100% rename from examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml rename to examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml diff --git a/examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py b/examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py similarity index 100% rename from examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py rename to examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py diff --git a/examples/recipes/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml b/examples/models/vlm/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml similarity index 100% rename from examples/recipes/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml rename to examples/models/vlm/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml diff --git a/examples/recipes/qwen_vl/conf/qwen3_vl_pretrain_override_example.yaml b/examples/models/vlm/qwen_vl/conf/qwen3_vl_pretrain_override_example.yaml similarity index 100% rename from examples/recipes/qwen_vl/conf/qwen3_vl_pretrain_override_example.yaml rename to examples/models/vlm/qwen_vl/conf/qwen3_vl_pretrain_override_example.yaml diff --git a/examples/recipes/qwen_vl/data/convert_to_qwenvl_wds.py b/examples/models/vlm/qwen_vl/data/convert_to_qwenvl_wds.py similarity index 98% rename from examples/recipes/qwen_vl/data/convert_to_qwenvl_wds.py rename to examples/models/vlm/qwen_vl/data/convert_to_qwenvl_wds.py index 0d6fb780e4..769549fa65 100644 --- a/examples/recipes/qwen_vl/data/convert_to_qwenvl_wds.py +++ b/examples/models/vlm/qwen_vl/data/convert_to_qwenvl_wds.py @@ -17,7 +17,7 @@ # Example dataset from https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain -python examples/recipes/qwen_vl/data/convert_to_qwenvl_wds.py \\ +python examples/models/vlm/qwen_vl/data/convert_to_qwenvl_wds.py \\ --dataset-root=/path/to/LLaVA-Pretrain-LCS-558K \ --json=blip_laion_cc_sbu_558k.json \ --mediate-path=images \ diff --git a/examples/recipes/qwen_vl/finetune_qwen_vl.py b/examples/models/vlm/qwen_vl/finetune_qwen_vl.py similarity index 97% rename from examples/recipes/qwen_vl/finetune_qwen_vl.py rename to examples/models/vlm/qwen_vl/finetune_qwen_vl.py index 73cc3517fc..0344cd2d79 100644 --- a/examples/recipes/qwen_vl/finetune_qwen_vl.py +++ b/examples/models/vlm/qwen_vl/finetune_qwen_vl.py @@ -43,27 +43,27 @@ Finetune using the imported checkpoint: Qwen2.5-VL 3B: - $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \\ + $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \\ --recipe qwen25_vl_3b_finetune_config \\ --pretrained-checkpoint ./logs/checkpoints/qwen25vl3b Qwen2.5-VL 7B: - $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \\ + $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \\ --recipe qwen25_vl_7b_finetune_config \\ --pretrained-checkpoint ./logs/checkpoints/qwen25_vl_7b Qwen3-VL 8B (dense): - $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \\ + $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \\ --recipe qwen3_vl_8b_finetune_config \\ --pretrained-checkpoint ./logs/checkpoints/qwen3_vl_8b Qwen3-VL 30B (MoE): - $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \\ + $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \\ --recipe qwen3_vl_30b_a3b_finetune_config \\ --pretrained-checkpoint ./logs/checkpoints/qwen3_vl_30b_a3b Qwen3-VL 235B (MoE): - $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \\ + $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \\ --recipe qwen3_vl_235b_a22b_finetune_config \\ --pretrained-checkpoint ./logs/checkpoints/qwen3_vl_235b_a22b