diff --git a/3rdparty/Megatron-LM b/3rdparty/Megatron-LM index 3fb6006ed7..347ad215a8 160000 --- a/3rdparty/Megatron-LM +++ b/3rdparty/Megatron-LM @@ -1 +1 @@ -Subproject commit 3fb6006ed735b627d08fd0084d2f456e929c7961 +Subproject commit 347ad215a8ca2f46c9a599666b03465c475bf4eb diff --git a/GIT_COMMIT_SKILLS.md b/GIT_COMMIT_SKILLS.md new file mode 100644 index 0000000000..39e5ca2c67 --- /dev/null +++ b/GIT_COMMIT_SKILLS.md @@ -0,0 +1,143 @@ +# Git Commit Workflow + +## Standard Workflow for Committing Changes + +### 1. Check Current Branch +```bash +git branch --show-current +``` + +### 2. Create Feature Branch (REQUIRED if on main) +**Important**: You must create a feature branch before committing if you're on `main`. Never commit directly to `main`. + +Check your current branch: +```bash +CURRENT_BRANCH=$(git branch --show-current) +if [ "$CURRENT_BRANCH" = "main" ]; then + git checkout -b feature/your-feature-name +fi +``` + +Or manually: +```bash +git checkout -b feature/your-feature-name +``` + +### 3. Stage Your Changes +```bash +git add ... +# Or stage all modified files: +git add -u +``` + +### 4. Run Pre-commit Hooks +```bash +export PATH="/Users/yuya/Library/Python/3.9/bin:$PATH" +pre-commit run +``` +This will run all pre-commit hooks (formatting, linting, etc.) before committing. + +### 5. Commit with Sign-off +```bash +git commit -s -m "[module] type: Your descriptive commit message" +``` +The `-s` flag adds a Signed-off-by line to the commit message. See [Commit and PR Title Format](#commit-and-pr-title-format) for proper formatting. + +### 6. Push to Remote +```bash +git push +``` + +### 7. Check for Existing PR +```bash +gh pr list --head "$(git branch --show-current)" --json number --jq '.[0].number' +``` + +### 8. Trigger CI Testing +If a PR exists, comment on it to trigger CI: +```bash +COMMIT_HASH=$(git rev-parse HEAD) +gh pr comment --body "/ok to test $COMMIT_HASH" +``` + +## Example Workflow + +```bash +# 1. 
Check branch
CURRENT_BRANCH=$(git branch --show-current)
echo "Current branch: $CURRENT_BRANCH"

# 2. Create feature branch if on main (REQUIRED)
if [ "$CURRENT_BRANCH" = "main" ]; then
    git checkout -b feature/your-feature-name
fi

# 3. Stage changes
git add tests/unit_tests/models/gemma_vl/test_gemma3_vl_bridge.py

# 4. Run pre-commit
export PATH="/Users/yuya/Library/Python/3.9/bin:$PATH"
pre-commit run

# 5. Commit with sign-off
git commit -s -m "[test] fix: Fix gemma3_vl bridge test for image_token_id default"

# 6. Push
git push

# 7. Check for PR and trigger CI
PR_NUMBER=$(gh pr list --head feature/your-feature-name --json number --jq '.[0].number')
COMMIT_HASH=$(git rev-parse HEAD)
gh pr comment $PR_NUMBER --body "/ok to test $COMMIT_HASH"
```

## Commit and PR Title Format

Format your commit messages and PR titles as:

```text
[{modules}] {type}: {description}
```

### Modules
Use the most relevant ones, separate multiple with `,`:
- `model` - Model implementations and bridges
- `recipe` - Training recipes
- `training` - Training loop and utilities
- `data` - Data loading and processing
- `ckpt` - Checkpoint conversion and saving
- `peft` - Parameter-efficient fine-tuning (LoRA, etc.)
- `perf` - Performance optimizations
- `ci` - CI/CD configuration
- `doc` - Documentation
- `test` - Tests
- `build` - Build system and dependencies
- `misc` - Other changes

### Types
- `feat` - New feature
- `fix` - Bug fix
- `refactor` - Code refactoring without changing functionality
- `chore` - Maintenance tasks
- `test` - Adding or updating tests

### Breaking Changes
If your PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title. 
+ +### Examples +```text +[model] feat: Add Qwen3 model bridge +[recipe, doc] feat: Add Llama 3.1 70B recipe with documentation +[ckpt] fix: Handle missing keys in HF checkpoint conversion +[BREAKING][training] refactor: Change optimizer config structure +[ci, build] chore: Update ruff version +[test] fix: Fix gemma3_vl bridge test for image_token_id default +``` + +## Notes + +- **Never commit directly to `main`** - Always create a feature branch first +- Always run `pre-commit run` before committing to catch formatting/linting issues early +- Use descriptive commit messages following the format above +- The `-s` flag is required for DCO (Developer Certificate of Origin) compliance +- If pre-commit modifies files, you may need to stage them again before committing diff --git a/REMOTE_DEBUG_SKILLS.md b/REMOTE_DEBUG_SKILLS.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/rl/rlhf_with_bridge.py b/examples/rl/rlhf_with_bridge.py index 883c57f700..3ea4f2b0eb 100644 --- a/examples/rl/rlhf_with_bridge.py +++ b/examples/rl/rlhf_with_bridge.py @@ -33,8 +33,7 @@ Run (single GPU) ```bash -export CUDA_VISIBLE_DEVICES=0 -python examples/rl/rlhf_with_bridge.py \ +uv run python examples/rl/rlhf_with_bridge.py \ --hf-policy-model Qwen/Qwen3-0.6B \ --hf-reward-model distilbert-base-uncased-finetuned-sst-2-english \ --train-iters 5 --mbs 1 --gbs 1 --seq-length 256 --max-new-tokens 32 @@ -42,7 +41,7 @@ Run (multi-GPU) ```bash -torchrun --nproc_per_node=2 examples/rl/rlhf_with_bridge.py \ +uv run python -m torch.distributed.run --nproc_per_node=2 examples/rl/rlhf_with_bridge.py \ --hf-policy-model Qwen/Qwen3-0.6B \ --hf-reward-model distilbert-base-uncased-finetuned-sst-2-english \ --train-iters 20 --mbs 1 --gbs 2 --seq-length 256 --max-new-tokens 32 @@ -62,6 +61,7 @@ import torch import torch.nn.functional as F from megatron.core.pipeline_parallel import get_forward_backward_func +from megatron.core.process_groups_config import ProcessGroupCollection from 
transformers import AutoModelForCausalLM, AutoTokenizer, pipeline from megatron.bridge import AutoBridge @@ -94,6 +94,7 @@ class Args: global_batch_size: int train_iters: int seq_length: int + trust_remote_code: bool = False def build_config(provider, args: Args) -> ConfigContainer: @@ -233,6 +234,7 @@ def main() -> None: global_batch_size=ns.gbs, train_iters=ns.train_iters, seq_length=ns.seq_length, + trust_remote_code=ns.trust_remote_code, ) # Resolve per-rank device up front for multi-GPU runs @@ -296,6 +298,9 @@ def main() -> None: initialize_megatron(cfg=cfg) set_jit_fusion_options(cfg.model, cfg.train.micro_batch_size) + # Get process group collection after initialization + pg_collection = ProcessGroupCollection.use_mpu_process_groups() + # Build model + optimizer + scheduler model_list = get_model( cfg.model, @@ -303,6 +308,7 @@ def main() -> None: overlap_param_gather_with_optimizer_step=False, use_torch_fsdp2=cfg.dist.use_torch_fsdp2, data_parallel_random_init=cfg.rng.data_parallel_random_init, + pg_collection=pg_collection, ) model = model_list[0] optimizer, scheduler = setup_optimizer( diff --git a/tutorials/training/reduced_precision_training.ipynb b/tutorials/training/reduced_precision_training.ipynb index 47f8cc2258..3d6124761e 100644 --- a/tutorials/training/reduced_precision_training.ipynb +++ b/tutorials/training/reduced_precision_training.ipynb @@ -119,7 +119,7 @@ "# - train_text_document.bin (document/sequence data)\n", "# - train_text_document.idx (document/sequence metadata)\n", "\n", - "MEGATRON_LM_PATH=/opt/megatron-lm/\n", + "MEGATRON_LM_PATH=/opt/Megatron-Bridge/3rdparty/Megatron-LM/\n", "\n", "echo \"Tokenizing training data...\"\n", "python3 $MEGATRON_LM_PATH/tools/preprocess_data.py \\\n",