From f7235944ac3239d023ac8dcdce580fda7d0a5322 Mon Sep 17 00:00:00 2001
From: yaoyu-33 <yaoyu.094@gmail.com>
Date: Thu, 29 Jan 2026 13:30:05 -0800
Subject: [PATCH] [doc] refactor: Restructure examples folder - move recipes to
 models, distillation, decentralized_pg

Signed-off-by: yaoyu-33 <yaoyu.094@gmail.com>
---
 README.md                                          |  2 +-
 docs/megatron-lm-to-megatron-bridge.md             |  2 +-
 docs/models/llm/nemotron3.md                       |  6 +++---
 docs/models/llm/nemotronh.md                       |  2 +-
 docs/models/vlm/ministral3.md                      |  8 ++++----
 docs/models/vlm/nemotron-nano-v2-vl.md             | 14 +++++++-------
 docs/models/vlm/qwen2.5-vl.md                      |  8 ++++----
 docs/models/vlm/qwen3-vl.md                        |  6 +++---
 docs/recipe-usage.md                               |  4 ++--
 docs/training/distillation.md                      |  8 ++++----
 examples/{recipes => }/decentralized_pg/README.md  | 10 +++++-----
 .../decentralized_pg/pretrain_qwen3_simple.py      |  4 ++--
 .../decentralized_pg/pretrain_qwen3_vl_simple.py   |  2 +-
 .../pretrain_qwen3_with_decentralized_pg.py        |  6 +++---
 .../llama32_3b-1b_distill_override_example.yaml    |  0
 .../llama/distill_llama32_3b-1b.py                 |  2 +-
 .../nemotron_3/finetune_nemotron_3_nano.py         |  0
 .../nemotron_3/pretrain_nemotron_3_nano.py         |  0
 ...en3_next_80b_a3b_finetune_override_example.yaml |  0
 .../qwen3_next/finetune_qwen3_next_80b_a3b.py      |  2 +-
 .../conf/nemotron_nano_v2_vl_override_example.yaml |  0
 .../conf/nemotron_nano_v2_vl_video.yaml            |  0
 .../nemotron_vl/finetune_nemotron_nano_v2_vl.py    |  0
 .../conf/qwen25_vl_pretrain_override_example.yaml  |  0
 .../conf/qwen3_vl_pretrain_override_example.yaml   |  0
 .../vlm}/qwen_vl/data/convert_to_qwenvl_wds.py     |  2 +-
 .../vlm}/qwen_vl/finetune_qwen_vl.py               | 10 +++++-----
 27 files changed, 49 insertions(+), 49 deletions(-)
 rename examples/{recipes => }/decentralized_pg/README.md (93%)
 rename examples/{recipes => }/decentralized_pg/pretrain_qwen3_simple.py (96%)
 rename examples/{recipes => }/decentralized_pg/pretrain_qwen3_vl_simple.py (98%)
 rename examples/{recipes => }/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py (99%)
 rename examples/{recipes => distillation}/llama/conf/llama32_3b-1b_distill_override_example.yaml (100%)
 rename examples/{recipes => distillation}/llama/distill_llama32_3b-1b.py (99%)
 rename examples/{recipes => models}/nemotron_3/finetune_nemotron_3_nano.py (100%)
 rename examples/{recipes => models}/nemotron_3/pretrain_nemotron_3_nano.py (100%)
 rename examples/{recipes => models}/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml (100%)
 rename examples/{recipes => models}/qwen3_next/finetune_qwen3_next_80b_a3b.py (98%)
 rename examples/{recipes => models/vlm}/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml (100%)
 rename examples/{recipes => models/vlm}/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml (100%)
 rename examples/{recipes => models/vlm}/nemotron_vl/finetune_nemotron_nano_v2_vl.py (100%)
 rename examples/{recipes => models/vlm}/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml (100%)
 rename examples/{recipes => models/vlm}/qwen_vl/conf/qwen3_vl_pretrain_override_example.yaml (100%)
 rename examples/{recipes => models/vlm}/qwen_vl/data/convert_to_qwenvl_wds.py (98%)
 rename examples/{recipes => models/vlm}/qwen_vl/finetune_qwen_vl.py (97%)

diff --git a/README.md b/README.md
index 677f8d7cd8..8c870441e3 100644
--- a/README.md
+++ b/README.md
@@ -121,7 +121,7 @@ For a deeper dive into conversion design and advanced usage, see the [models REA
   - Optimized paths when Transformer Engine is available
 - **Flexible to Customize**: Lightweight custom training loop making it easy to configure custom logic in data loading, distributed training, checkpointing, evaluation and logging ([training framework](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/src/megatron/bridge/training), [training utilities](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/src/megatron/bridge/training/utils))
 - **Supervised & Parameter-Efficient Finetuning**: SFT & PEFT implementation tailored for Megatron-based models that supports LoRA, DoRA, and user-defined PEFT methods ([PEFT implementations](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/src/megatron/bridge/peft), [finetune module](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/src/megatron/bridge/training/finetune.py), [SFT dataset](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/src/megatron/bridge/data/datasets/sft.py))
-- **SOTA Training Recipes**: Pre-configured production-ready training recipes for popular models like Llama 3, with optimized hyperparameters and distributed training configuration ([Llama recipes](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/src/megatron/bridge/recipes/llama), [recipe examples](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/examples/recipes))
+- **SOTA Training Recipes**: Pre-configured production-ready training recipes for popular models like Llama 3, with optimized hyperparameters and distributed training configuration ([Llama recipes](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/src/megatron/bridge/recipes/llama), [recipe examples](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/examples/models))
 - **Performance Optimization**: Built-in support for FP8 training, model parallelism, and memory-efficient techniques to offer high utilization and near-linear scalability to thousands of nodes. ([mixed precision](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/src/megatron/bridge/training/mixed_precision.py), [communication overlap](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/src/megatron/bridge/training/comm_overlap.py), [optimizer utilities](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/src/megatron/bridge/recipes/utils/optimizer_utils.py))
 
 ## Supported Models
diff --git a/docs/megatron-lm-to-megatron-bridge.md b/docs/megatron-lm-to-megatron-bridge.md
index 34871e34f8..b781dc65ba 100644
--- a/docs/megatron-lm-to-megatron-bridge.md
+++ b/docs/megatron-lm-to-megatron-bridge.md
@@ -7,7 +7,7 @@ Megatron Bridge is Python-first: configure models, data, and training via typed
 Run your example training entrypoint and override config keys directly:
 
 ```bash
-python examples/recipes/llama/pretrain_llama3_8b.py \
+python examples/models/llama/pretrain_llama3_8b.py \
   train.micro_batch_size=2 \
   train.global_batch_size=128 \
   model.num_layers=32 model.hidden_size=4096 model.num_attention_heads=32 \
diff --git a/docs/models/llm/nemotron3.md b/docs/models/llm/nemotron3.md
index 9ec89b84bc..bc6c8b6536 100644
--- a/docs/models/llm/nemotron3.md
+++ b/docs/models/llm/nemotron3.md
@@ -40,7 +40,7 @@ python examples/conversion/convert_checkpoints.py export  \
 BLEND_PATH=/path/to/dataset/blend
 TOKENIZER_MODEL=/path/to/tiktok/tokenizer/model
 
-torchrun --nproc-per-node=8 examples/recipes/nemotron_3/pretrain_nemotron_3_nano.py \
+torchrun --nproc-per-node=8 examples/models/nemotron_3/pretrain_nemotron_3_nano.py \
 --per-split-data-args-path=${BLEND_PATH} \
 --tokenizer-model=${TOKENIZER_MODEL} \
 train.global_batch_size=3072 \
@@ -58,7 +58,7 @@ Notes:
 
 ### Full Parameter Fine-Tuning
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/nemotron_3/finetune_nemotron_3_nano.py \
+torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_nano.py \
 train.global_batch_size=128 \
 train.train_iters=100 \
 scheduler.lr_warmup_iters=10 \
@@ -74,7 +74,7 @@ Notes:
 ### LoRA Fine-Tuning
 To enable LoRA fine-tuning, pass `--peft lora` to script
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/nemotron_3/finetune_nemotron_3_nano.py \
+torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_nano.py \
 --peft lora \
 train.global_batch_size=128 \
 train.train_iters=100 \
diff --git a/docs/models/llm/nemotronh.md b/docs/models/llm/nemotronh.md
index 783d6f4e2b..a880e9abe4 100644
--- a/docs/models/llm/nemotronh.md
+++ b/docs/models/llm/nemotronh.md
@@ -184,7 +184,7 @@ bridge.export_ckpt(
 ## Examples
 
 - Checkpoint conversion: [examples/conversion/convert_checkpoints.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py)
-- Training scripts: [examples/recipes/train_any_basic.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/train_any_basic.py)
+- Training scripts: [examples/models/train_any_basic.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/train_any_basic.py)
 
 ## Finetuning Recipes
 
diff --git a/docs/models/vlm/ministral3.md b/docs/models/vlm/ministral3.md
index b47a08df6c..8137359bd8 100644
--- a/docs/models/vlm/ministral3.md
+++ b/docs/models/vlm/ministral3.md
@@ -99,7 +99,7 @@ Before training, ensure the following environment variables are set:
 ### Full Finetuning
 
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/ministral3/finetune_ministral3_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/ministral3/finetune_ministral3_vl.py \
 --pretrained-checkpoint /models/ministral3-3b \
 --dataset-type hf \
 train.global_batch_size=32 \
@@ -124,7 +124,7 @@ config = ministral3_3b_finetune_config(
 ### Parameter-Efficient Finetuning (PEFT) with LoRA
 
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/ministral3/finetune_ministral3_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/ministral3/finetune_ministral3_vl.py \
 --pretrained-checkpoint /models/ministral3-3b \
 --peft-scheme lora \
 --dataset-type hf \
@@ -142,7 +142,7 @@ You can also combine PEFT with freeze options:
 
 Example with freeze options:
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/ministral3/finetune_ministral3_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/ministral3/finetune_ministral3_vl.py \
 --pretrained-checkpoint /models/ministral3-3b \
 --peft-scheme lora \
 --freeze-vision-model \
@@ -199,7 +199,7 @@ To change the dataset, specify `dataset.maker_name=<maker_name>` in your command
 ## Examples
 - Checkpoint import/export: [examples/conversion/convert_checkpoints.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py)
 - Generate with VLM (HF→Megatron): [examples/conversion/hf_to_megatron_generate_vlm.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/hf_to_megatron_generate_vlm.py)
-- Finetuning script: [examples/recipes/ministral3/finetune_ministral3_vl.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/ministral3/finetune_ministral3_vl.py)
+- Finetuning script: [examples/models/vlm/ministral3/finetune_ministral3_vl.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/vlm/ministral3/finetune_ministral3_vl.py)
 
 ## Hugging Face Model Cards
 
diff --git a/docs/models/vlm/nemotron-nano-v2-vl.md b/docs/models/vlm/nemotron-nano-v2-vl.md
index 6dee739975..6921fef5c1 100644
--- a/docs/models/vlm/nemotron-nano-v2-vl.md
+++ b/docs/models/vlm/nemotron-nano-v2-vl.md
@@ -85,7 +85,7 @@ Example usage for full parameter finetuning using the
 [Raven dataset](https://huggingface.co/datasets/HuggingFaceM4/the_cauldron/viewer/raven):
 
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py \
 --hf-model-path $HF_MODEL_PATH \
 --pretrained-checkpoint <megatron model path> \
 dataset.maker_name=make_raven_dataset \
@@ -95,7 +95,7 @@ checkpoint.save=$SAVE_DIR/<experiment name>
 ```
 
 Note:
-- The config file `examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml` contains a list of arguments 
+- The config file `examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml` contains a list of arguments 
   that can be overridden in the command. For example, you can set `train.global_batch_size=<batch size>` in the command. 
 - To change the dataset, you only need to change `dataset.maker_name`. See the dataset section below for details.
 - After training, you can run inference with `hf_to_megatron_generate_vlm.py` by supplying the trained megatron checkpoint. 
@@ -110,7 +110,7 @@ settings out of the box in the example script:
    distribution is substantially different from pretrained.)
 
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py \
 --hf-model-path $HF_MODEL_PATH \
 --pretrained-checkpoint $MEGATRON_MODEL_PATH \
 --lora-on-language-model \
@@ -126,7 +126,7 @@ model.freeze_vision_projection=False
 2. Apply LoRA to all linear layers in attention and MLP modules of the vision model, vision projection, and the language model.
 
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py \
 --hf-model-path $HF_MODEL_PATH \
 --pretrained-checkpoint $MEGATRON_MODEL_PATH \
 --lora-on-language-model \
@@ -169,7 +169,7 @@ Megatron Bridge supports various vision-language dataset examples which can be u
 
 Note on video training example:
 - We provide a video config yaml file instead of the default config yaml file that overwrites a few commands. Please
-  pass in `--config-file "examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml"`.
+  pass in `--config-file "examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml"`.
 - The LLaVA video dataset requires manual download beforehand. Please place the downloaded and extracted video files
   in a folder `VIDEO_ROOT` and pass it in to the maker with `dataset.maker_kwargs={"video_root_path":$VIDEO_ROOT}`. 
   In the nextqa subset example, `VIDEO_ROOT` should look like
@@ -186,10 +186,10 @@ Note on video training example:
 
 Full video training example command:
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py \
 --hf-model-path $HF_MODEL_PATH \
 --pretrained-checkpoint $MEGATRON_MODEL_PATH \
---config-file "examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml" \
+--config-file "examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml" \
 logger.wandb_project=<optional wandb project name> \
 logger.wandb_save_dir=$SAVE_DIR \
 checkpoint.save=$SAVE_DIR/<experiment name> \
diff --git a/docs/models/vlm/qwen2.5-vl.md b/docs/models/vlm/qwen2.5-vl.md
index 5cbd38b8e3..8825fdcb22 100644
--- a/docs/models/vlm/qwen2.5-vl.md
+++ b/docs/models/vlm/qwen2.5-vl.md
@@ -64,7 +64,7 @@ Before training, ensure the following environment variables are set.
 Example usage for full parameter finetuning:
 
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/qwen_vl/finetune_qwen25_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \
 --pretrained-checkpoint $MEGATRON_MODEL_PATH \
 --recipe qwen25_vl_3b_finetune_config \
 --dataset-type hf \
@@ -82,7 +82,7 @@ Note:
   - `qwen25_vl_7b_finetune_config` - for 7B model  
   - `qwen25_vl_32b_finetune_config` - for 32B model
   - `qwen25_vl_72b_finetune_config` - for 72B model
-- The config file `examples/recipes/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml` contains a list of arguments 
+- The config file `examples/models/vlm/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml` contains a list of arguments 
   that can be overridden in the command. For example, you can set `train.global_batch_size=<batch size>` in the command. 
 - The dataset format should be JSONL with conversation format (see dataset section below).
 - After training, you can run inference with `hf_to_megatron_generate_vlm.py` by supplying the trained megatron checkpoint. 
@@ -92,7 +92,7 @@ Note:
 Parameter-efficient finetuning (PEFT) using LoRA or DoRA is supported. You can use the `--peft_scheme` argument to enable PEFT training:
 
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/qwen_vl/finetune_qwen25_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \
 --pretrained-checkpoint $MEGATRON_MODEL_PATH \
 --recipe qwen25_vl_3b_finetune_config \
 --peft_scheme lora \
@@ -112,7 +112,7 @@ You can also combine PEFT with freeze options to control which components are tr
 
 Example with LoRA and freeze options:
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/qwen_vl/finetune_qwen25_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \
 --pretrained-checkpoint $MEGATRON_MODEL_PATH \
 --recipe qwen25_vl_3b_finetune_config \
 --peft_scheme lora \
diff --git a/docs/models/vlm/qwen3-vl.md b/docs/models/vlm/qwen3-vl.md
index fa6eed8079..454d69ab70 100644
--- a/docs/models/vlm/qwen3-vl.md
+++ b/docs/models/vlm/qwen3-vl.md
@@ -55,7 +55,7 @@ Before training, ensure the following environment variables are set:
 Example usage for full parameter finetuning:
 
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \
 --pretrained-checkpoint $MEGATRON_MODEL_PATH \
 --recipe qwen3_vl_8b_finetune_config \
 --dataset-type hf \
@@ -69,7 +69,7 @@ checkpoint.save=$SAVE_DIR/<experiment name>
 
 For MoE models with expert parallelism:
 ```bash
-torchrun --nproc-per-node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \
+torchrun --nproc-per-node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \
 --pretrained-checkpoint $MEGATRON_MODEL_PATH \
 --recipe qwen3_vl_30b_a3b_finetune_config \
 --dataset-type hf \
@@ -84,7 +84,7 @@ Note:
   - `qwen3_vl_8b_finetune_config` - for 8B dense model
   - `qwen3_vl_30b_a3b_finetune_config` - for 30B MoE model
 - For dataset formats and additional information, refer to the [Qwen2.5-VL documentation]
-- See the full script with examples at [`examples/recipes/qwen_vl/finetune_qwen_vl.py`](../../../examples/recipes/qwen_vl/finetune_qwen_vl.py)
+- See the full script with examples at [`examples/models/vlm/qwen_vl/finetune_qwen_vl.py`](../../../examples/models/vlm/qwen_vl/finetune_qwen_vl.py)
 
 ## Hugging Face Model Cards
 - Qwen3-VL-8B: `https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct`
diff --git a/docs/recipe-usage.md b/docs/recipe-usage.md
index a115c66573..bf0efa46e5 100644
--- a/docs/recipe-usage.md
+++ b/docs/recipe-usage.md
@@ -15,7 +15,7 @@ This guide will cover the next steps to make use of a training recipe, including
 Recipes are provided through a {py:class}`~bridge.training.config.ConfigContainer` object. This is a dataclass that holds all configuration objects needed for training. You can find a more detailed overview of the `ConfigContainer` [here](training/config-container-overview.md).
 The benefit of providing the full recipe through a pythonic structure is that it is agnostic to any configuration approach that a user may prefer, whether that's YAML, `argparse` or something else. In other words, the user may override the recipe however they see fit.
 
-The following sections detail a few different ways to override the configuration recipe. For a complete training script, please see [this example](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b.py).
+The following sections detail a few different ways to override the configuration recipe. For a complete training script, please see [this example](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b.py).
 
 
 ### Python
@@ -184,7 +184,7 @@ if __name__ == "__main__":
     train_script = run.Script(..., args=args_to_fwd)
 ```
 
-For a complete example of the `run.Script` API, including argument forwarding, please see [this script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py).
+For a complete example of the `run.Script` API, including argument forwarding, please see [this script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b_nemo_run_script.py).
 
 #### Plugins
 
diff --git a/docs/training/distillation.md b/docs/training/distillation.md
index 307f0672c0..545a6f132e 100644
--- a/docs/training/distillation.md
+++ b/docs/training/distillation.md
@@ -49,7 +49,7 @@ logit_kl_temperature: 2.0
 The simplest way to run knowledge distillation is to use or adapt one of the provided recipe scripts. Here's an example for distilling Llama3.2-3B into Llama3.2-1B:
 
 ```bash
-torchrun --nproc_per_node=1 examples/recipes/llama/distill_llama32_3b-1b.py
+torchrun --nproc_per_node=1 examples/distillation/llama/distill_llama32_3b-1b.py
 ```
 
 ### Using a Custom YAML Config File
@@ -57,7 +57,7 @@ torchrun --nproc_per_node=1 examples/recipes/llama/distill_llama32_3b-1b.py
 You can provide a custom YAML configuration file to override default settings:
 
 ```bash
-torchrun --nproc_per_node=1 examples/recipes/llama/distill_llama32_3b-1b.py \
+torchrun --nproc_per_node=1 examples/distillation/llama/distill_llama32_3b-1b.py \
     --config-file my_custom_config.yaml
 ```
 
@@ -66,7 +66,7 @@ torchrun --nproc_per_node=1 examples/recipes/llama/distill_llama32_3b-1b.py \
 Megatron Bridge supports Hydra-style CLI overrides for flexible configuration:
 
 ```bash
-torchrun --nproc_per_node=2 examples/recipes/llama/distill_llama32_3b-1b.py \
+torchrun --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py \
     model.tensor_model_parallel_size=2 \
     model.teacher.tensor_model_parallel_size=2
 ```
@@ -76,7 +76,7 @@ torchrun --nproc_per_node=2 examples/recipes/llama/distill_llama32_3b-1b.py \
 CLI overrides take precedence over YAML configuration:
 
 ```bash
-torchrun --nproc_per_node=2 examples/recipes/llama/distill_llama32_3b-1b.py \
+torchrun --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py \
     --config-file conf/my_config.yaml \
     train.global_batch_size=512
 ```
diff --git a/examples/recipes/decentralized_pg/README.md b/examples/decentralized_pg/README.md
similarity index 93%
rename from examples/recipes/decentralized_pg/README.md
rename to examples/decentralized_pg/README.md
index 7849e1ee5f..8bc02b7d11 100755
--- a/examples/recipes/decentralized_pg/README.md
+++ b/examples/decentralized_pg/README.md
@@ -25,10 +25,10 @@ Just use an existing recipe and enable decentralized process groups:
 
 ```bash
 # 8 GPUs: TP2 x PP2 x DP2
-uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
+uv run python -m torch.distributed.run --nproc_per_node=8 examples/decentralized_pg/pretrain_qwen3_simple.py
 
 # 4 GPUs: TP2 x PP2 x DP1
-uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
+uv run python -m torch.distributed.run --nproc_per_node=4 examples/decentralized_pg/pretrain_qwen3_simple.py
 ```
 
 The key is just two lines:
@@ -53,14 +53,14 @@ For full control over process groups:
 
 ```bash
 # 8 GPUs: TP2 x PP2 x DP2
-uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
+uv run python -m torch.distributed.run --nproc_per_node=8 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
 
 # 4 GPUs: TP2 x PP2 x DP1
-uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
+uv run python -m torch.distributed.run --nproc_per_node=4 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
     --tp-size 2 --pp-size 2
 
 # 2 GPUs: TP2 x PP1 x DP1
-uv run python -m torch.distributed.run --nproc_per_node=2 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
+uv run python -m torch.distributed.run --nproc_per_node=2 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
     --tp-size 2 --pp-size 1
 ```
 
diff --git a/examples/recipes/decentralized_pg/pretrain_qwen3_simple.py b/examples/decentralized_pg/pretrain_qwen3_simple.py
similarity index 96%
rename from examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
rename to examples/decentralized_pg/pretrain_qwen3_simple.py
index 5a62680e91..454a1fc644 100644
--- a/examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
+++ b/examples/decentralized_pg/pretrain_qwen3_simple.py
@@ -27,10 +27,10 @@
 How to Run
 ----------
 # 8 GPUs: TP2 x PP2 x DP2
-uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
+uv run python -m torch.distributed.run --nproc_per_node=8 examples/decentralized_pg/pretrain_qwen3_simple.py
 
 # 4 GPUs: TP2 x PP2 x DP1
-uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
+uv run python -m torch.distributed.run --nproc_per_node=4 examples/decentralized_pg/pretrain_qwen3_simple.py
 """
 
 import torch
diff --git a/examples/recipes/decentralized_pg/pretrain_qwen3_vl_simple.py b/examples/decentralized_pg/pretrain_qwen3_vl_simple.py
similarity index 98%
rename from examples/recipes/decentralized_pg/pretrain_qwen3_vl_simple.py
rename to examples/decentralized_pg/pretrain_qwen3_vl_simple.py
index bb6c9d9c71..efafe05422 100644
--- a/examples/recipes/decentralized_pg/pretrain_qwen3_vl_simple.py
+++ b/examples/decentralized_pg/pretrain_qwen3_vl_simple.py
@@ -27,7 +27,7 @@
 How to Run
 ----------
 # 8 GPUs: EP8
-uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_vl_simple.py
+uv run python -m torch.distributed.run --nproc_per_node=8 examples/decentralized_pg/pretrain_qwen3_vl_simple.py
 """
 
 import torch
diff --git a/examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py b/examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
similarity index 99%
rename from examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
rename to examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
index 1f63955d0a..4dec03307e 100644
--- a/examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
+++ b/examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
@@ -37,14 +37,14 @@
 How to Run
 ----------
 # 8 GPUs: TP2 x PP2 x DP2
-uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
+uv run python -m torch.distributed.run --nproc_per_node=8 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
 
 # 4 GPUs: TP2 x PP2 x DP1
-uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
+uv run python -m torch.distributed.run --nproc_per_node=4 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
     --tp-size 2 --pp-size 2
 
 # 2 GPUs: TP2 x PP1 x DP1
-uv run python -m torch.distributed.run --nproc_per_node=2 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
+uv run python -m torch.distributed.run --nproc_per_node=2 examples/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
     --tp-size 2 --pp-size 1
 """
 
diff --git a/examples/recipes/llama/conf/llama32_3b-1b_distill_override_example.yaml b/examples/distillation/llama/conf/llama32_3b-1b_distill_override_example.yaml
similarity index 100%
rename from examples/recipes/llama/conf/llama32_3b-1b_distill_override_example.yaml
rename to examples/distillation/llama/conf/llama32_3b-1b_distill_override_example.yaml
diff --git a/examples/recipes/llama/distill_llama32_3b-1b.py b/examples/distillation/llama/distill_llama32_3b-1b.py
similarity index 99%
rename from examples/recipes/llama/distill_llama32_3b-1b.py
rename to examples/distillation/llama/distill_llama32_3b-1b.py
index 504d933806..3835eabfe0 100644
--- a/examples/recipes/llama/distill_llama32_3b-1b.py
+++ b/examples/distillation/llama/distill_llama32_3b-1b.py
@@ -80,7 +80,7 @@
 
 
 # Define paths relative to this script's location
-# Assumes this script (distill_llama32_3b-1b.py) is in Megatron-Bridge/examples/recipes/llama/
+# Assumes this script (distill_llama32_3b-1b.py) is in Megatron-Bridge/examples/distillation/llama/
 # and the config is in a 'conf' subdirectory.
 SCRIPT_DIR: Path = Path(__file__).parent.resolve()
 DEFAULT_CONFIG_FILENAME: str = "llama32_3b-1b_distill_override_example.yaml"
diff --git a/examples/recipes/nemotron_3/finetune_nemotron_3_nano.py b/examples/models/nemotron_3/finetune_nemotron_3_nano.py
similarity index 100%
rename from examples/recipes/nemotron_3/finetune_nemotron_3_nano.py
rename to examples/models/nemotron_3/finetune_nemotron_3_nano.py
diff --git a/examples/recipes/nemotron_3/pretrain_nemotron_3_nano.py b/examples/models/nemotron_3/pretrain_nemotron_3_nano.py
similarity index 100%
rename from examples/recipes/nemotron_3/pretrain_nemotron_3_nano.py
rename to examples/models/nemotron_3/pretrain_nemotron_3_nano.py
diff --git a/examples/recipes/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml b/examples/models/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml
similarity index 100%
rename from examples/recipes/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml
rename to examples/models/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml
diff --git a/examples/recipes/qwen3_next/finetune_qwen3_next_80b_a3b.py b/examples/models/qwen3_next/finetune_qwen3_next_80b_a3b.py
similarity index 98%
rename from examples/recipes/qwen3_next/finetune_qwen3_next_80b_a3b.py
rename to examples/models/qwen3_next/finetune_qwen3_next_80b_a3b.py
index 7362f68f2d..58947e4f11 100644
--- a/examples/recipes/qwen3_next/finetune_qwen3_next_80b_a3b.py
+++ b/examples/models/qwen3_next/finetune_qwen3_next_80b_a3b.py
@@ -26,7 +26,7 @@
                --megatron-path /path/to/megatron_ckpt
 
         2) Run finetune using the imported checkpoint:
-           $ torchrun --nproc_per_node=8 examples/recipes/qwen/finetune_qwen3_next_80b_a3b.py \
+           $ torchrun --nproc_per_node=8 examples/models/qwen3_next/finetune_qwen3_next_80b_a3b.py \
                --pretrained-checkpoint /path/to/megatron_ckpt
 
     Using a custom YAML config file:
diff --git a/examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml b/examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml
similarity index 100%
rename from examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml
rename to examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_override_example.yaml
diff --git a/examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml b/examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml
similarity index 100%
rename from examples/recipes/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml
rename to examples/models/vlm/nemotron_vl/conf/nemotron_nano_v2_vl_video.yaml
diff --git a/examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py b/examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py
similarity index 100%
rename from examples/recipes/nemotron_vl/finetune_nemotron_nano_v2_vl.py
rename to examples/models/vlm/nemotron_vl/finetune_nemotron_nano_v2_vl.py
diff --git a/examples/recipes/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml b/examples/models/vlm/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml
similarity index 100%
rename from examples/recipes/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml
rename to examples/models/vlm/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml
diff --git a/examples/recipes/qwen_vl/conf/qwen3_vl_pretrain_override_example.yaml b/examples/models/vlm/qwen_vl/conf/qwen3_vl_pretrain_override_example.yaml
similarity index 100%
rename from examples/recipes/qwen_vl/conf/qwen3_vl_pretrain_override_example.yaml
rename to examples/models/vlm/qwen_vl/conf/qwen3_vl_pretrain_override_example.yaml
diff --git a/examples/recipes/qwen_vl/data/convert_to_qwenvl_wds.py b/examples/models/vlm/qwen_vl/data/convert_to_qwenvl_wds.py
similarity index 98%
rename from examples/recipes/qwen_vl/data/convert_to_qwenvl_wds.py
rename to examples/models/vlm/qwen_vl/data/convert_to_qwenvl_wds.py
index 0d6fb780e4..769549fa65 100644
--- a/examples/recipes/qwen_vl/data/convert_to_qwenvl_wds.py
+++ b/examples/models/vlm/qwen_vl/data/convert_to_qwenvl_wds.py
@@ -17,7 +17,7 @@
 
 # Example dataset from https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain
 
-python examples/recipes/qwen_vl/data/convert_to_qwenvl_wds.py \\
+python examples/models/vlm/qwen_vl/data/convert_to_qwenvl_wds.py \\
     --dataset-root=/path/to/LLaVA-Pretrain-LCS-558K \
     --json=blip_laion_cc_sbu_558k.json \
     --mediate-path=images \
diff --git a/examples/recipes/qwen_vl/finetune_qwen_vl.py b/examples/models/vlm/qwen_vl/finetune_qwen_vl.py
similarity index 97%
rename from examples/recipes/qwen_vl/finetune_qwen_vl.py
rename to examples/models/vlm/qwen_vl/finetune_qwen_vl.py
index 73cc3517fc..0344cd2d79 100644
--- a/examples/recipes/qwen_vl/finetune_qwen_vl.py
+++ b/examples/models/vlm/qwen_vl/finetune_qwen_vl.py
@@ -43,27 +43,27 @@
 
     Finetune using the imported checkpoint:
         Qwen2.5-VL 3B:
-            $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \\
+            $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \\
                 --recipe qwen25_vl_3b_finetune_config \\
                 --pretrained-checkpoint ./logs/checkpoints/qwen25vl3b
 
         Qwen2.5-VL 7B:
-            $  uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \\
+            $  uv run python -m torch.distributed.run --nproc_per_node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \\
                 --recipe qwen25_vl_7b_finetune_config \\
                 --pretrained-checkpoint ./logs/checkpoints/qwen25_vl_7b
 
         Qwen3-VL 8B (dense):
-            $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \\
+            $ uv run python -m torch.distributed.run --nproc_per_node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \\
                 --recipe qwen3_vl_8b_finetune_config \\
                 --pretrained-checkpoint ./logs/checkpoints/qwen3_vl_8b
 
         Qwen3-VL 30B (MoE):
-            $  uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \\
+            $  uv run python -m torch.distributed.run --nproc_per_node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \\
                 --recipe qwen3_vl_30b_a3b_finetune_config \\
                 --pretrained-checkpoint ./logs/checkpoints/qwen3_vl_30b_a3b
 
         Qwen3-VL 235B (MoE):
-            $  uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/qwen_vl/finetune_qwen_vl.py \\
+            $  uv run python -m torch.distributed.run --nproc_per_node=8 examples/models/vlm/qwen_vl/finetune_qwen_vl.py \\
                 --recipe qwen3_vl_235b_a22b_finetune_config \\
                 --pretrained-checkpoint ./logs/checkpoints/qwen3_vl_235b_a22b