Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .main.commit
Original file line number Diff line number Diff line change
@@ -1 +1 @@
8318b8093e7d49bce2529a29a14aeb50e5840ee0
07e512a344e9635de3596c45e5a21938ea0e3c4b
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-LM
Submodule Megatron-LM updated 75 files
+1 −1 .github/copy-pr-bot.yaml
+4 −4 .github/oncall_schedule.json
+2 −1 .github/workflows/claude-complexity-label.yml
+0 −54 .github/workflows/claude_review.yml
+18 −12 README.md
+82 −55 docs/get-started/install.md
+14 −28 docs/get-started/quickstart.md
+1 −1 docs/index.md
+1 −1 docs/user-guide/training-examples.md
+302 −27 examples/mimo/data/energon_vlm_task_encoder.py
+4 −1 examples/mimo/model_providers/llava_avlm.py
+19 −4 examples/mimo/model_providers/llava_vlm.py
+4 −1 examples/mimo/model_providers/mock.py
+20 −10 examples/mimo/scripts/run_vlm_train.sh
+47 −16 examples/mimo/train.py
+10 −7 examples/post_training/modelopt/convert_model.py
+7 −6 examples/post_training/modelopt/finetune.py
+1 −1 examples/rl/environments/math/math_agent.py
+3 −1 examples/rl/model_configs/nemotron6_3b_moe.sh
+18 −19 gpt_builders.py
+2 −2 megatron/core/QuickStart.md
+2 −2 megatron/core/README.md
+5 −0 megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py
+1 −1 megatron/core/inference/quantization/mxfp8_tensor.py
+112 −0 megatron/core/inference/quantization/utils.py
+33 −6 megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/chat_completions.py
+1 −0 megatron/core/models/gpt/experimental_attention_variant_module_specs.py
+46 −0 megatron/core/models/gpt/gpt_layer_specs.py
+4 −0 megatron/core/models/mimo/config/base_configs.py
+100 −31 megatron/core/models/mimo/model/base.py
+260 −0 megatron/core/models/mimo/partition/utils.py
+3 −3 megatron/core/pipeline_parallel/schedules.py
+3 −0 megatron/core/resharding/__init__.py
+93 −21 megatron/core/resharding/execution.py
+12 −12 megatron/core/resharding/nvshmem_copy_service/core/gpu_resource_manager.py
+6 −6 megatron/core/resharding/nvshmem_copy_service/core/kernel_launcher.py
+40 −32 megatron/core/resharding/nvshmem_copy_service/core/pipeline_executor.py
+11 −4 megatron/core/resharding/nvshmem_copy_service/service.py
+197 −13 megatron/core/resharding/refit.py
+261 −0 megatron/core/resharding/transforms.py
+60 −6 megatron/core/tensor_parallel/random.py
+8 −0 megatron/core/transformer/attention.py
+957 −0 megatron/core/transformer/experimental_attention_variant/absorbed_mla.py
+259 −26 megatron/core/transformer/multi_latent_attention.py
+5 −0 megatron/core/transformer/transformer_config.py
+5 −3 megatron/rl/rl_utils.py
+21 −13 megatron/training/arguments.py
+2 −21 megatron/training/checkpointing.py
+100 −0 tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_gb200/model_config.yaml
+13,042 −0 tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm_gb200/golden_values_dev_dgx_gb200.json
+1 −0 tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm_gb200/golden_values_dev_dgx_h100.json
+1 −0 tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm_gb200/golden_values_lts_dgx_a100.json
+100 −0 tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm_gb200/model_config.yaml
+142 −0 ...onal_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8_seq_packing/golden_values_dev.json
+537 −0 ...s/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8_seq_packing/golden_values_dev_dgx_h100.json
+65 −0 ...unctional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8_seq_packing/model_config.yaml
+142 −0 tests/functional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp2_dp8/golden_values_dev.json
+537 −0 ...ctional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp2_dp8/golden_values_dev_dgx_h100.json
+65 −0 tests/functional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp2_dp8/model_config.yaml
+3 −3 ...nctional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp2pp2emp16etp1cp1_gb_200_release/model_config.yaml
+0 −0 ...t_cases/mixtral/deepseekv3_proxy_flex_tp2pp2emp16etp1cp1_gb_200_release_sm/golden_values_dev_dgx_gb200.json
+4 −4 ...ional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp2pp2emp16etp1cp1_gb_200_release_sm/model_config.yaml
+88 −47 tests/test_utils/python_scripts/swap_pr_labels.py
+8 −0 tests/test_utils/recipes/h100/mimo.yaml
+8 −1 tests/unit_tests/models/test_mimo_embedding_alignment.py
+172 −0 tests/unit_tests/models/test_mimo_model.py
+434 −0 tests/unit_tests/models/test_mimo_partition.py
+11 −12 tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py
+242 −0 tests/unit_tests/resharding/test_mxfp8_refit.py
+59 −1 tests/unit_tests/tensor_parallel/test_random.py
+399 −0 tests/unit_tests/transformer/experimental_attention_variant/test_absorbed_mla.py
+0 −0 tests/unit_tests/transformer/experimental_attention_variant/test_attention_variant_dsa.py
+275 −0 tests/unit_tests/transformer/test_attention.py
+282 −0 tests/unit_tests/transformer/test_multi_latent_attention.py
+288 −274 uv.lock
Loading
Loading