
Commit cac60ce

Merge branch 'fix_overlap_param_gather' into 'main'

fix EP distopt with overlap param gather

See merge request ADLR/megatron-lm!1345 (cherry picked from commit ccfeda4)

ac93d847 fix EP distopt with overlap param gather
bb7b4307 change golden metrics
0ff731ff Minor fix to thrown value error

1 parent b26d3e3

File tree

3 files changed: +18, -12 lines

megatron/core/optimizer/optimizer.py (+16, -10)
@@ -754,21 +754,27 @@ def load_state_dict(self, state_dict):
             self.param_groups += optimizer.param_groups
 
     def disable_pre_hook(self):
-        if not self.config.use_distributed_optimizer or not self.config.overlap_param_gather:
-            raise ValueError(
-                "disable_pre_hook should only be called with 'use_distributed_optimizer' "
-                "and 'overlap_param_gather' are both enabled."
-            )
         for optimizer in self.chained_optimizers:
+            if (
+                not optimizer.config.use_distributed_optimizer
+                or not optimizer.config.overlap_param_gather
+            ):
+                raise ValueError(
+                    "disable_pre_hook should only be called with 'use_distributed_optimizer' "
+                    "and 'overlap_param_gather' both enabled."
+                )
             optimizer.disable_pre_hook()
 
     def enable_pre_hook(self):
-        if not self.config.use_distributed_optimizer or not self.config.overlap_param_gather:
-            raise ValueError(
-                "enable_pre_hook should only be called with 'use_distributed_optimizer' "
-                "and 'overlap_param_gather' are both enabled."
-            )
         for optimizer in self.chained_optimizers:
+            if (
+                not optimizer.config.use_distributed_optimizer
+                or not optimizer.config.overlap_param_gather
+            ):
+                raise ValueError(
+                    "enable_pre_hook should only be called with 'use_distributed_optimizer' "
+                    "and 'overlap_param_gather' both enabled."
+                )
             optimizer.enable_pre_hook()
 
     def step(self):
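
The change above reads most naturally as: with expert parallelism, the distributed optimizer is wrapped in a chained optimizer, so the enable/disable pre-hook guards now validate each wrapped optimizer's own config instead of a single top-level one. A minimal runnable sketch of that per-optimizer guard, using simplified stand-in classes rather than the actual Megatron-LM ChainedOptimizer and OptimizerConfig:

# Minimal sketch of the corrected guard pattern, with simplified stand-ins
# (not the real Megatron-LM ChainedOptimizer / DistributedOptimizer classes).
from dataclasses import dataclass


@dataclass
class OptimizerConfig:
    use_distributed_optimizer: bool = False
    overlap_param_gather: bool = False


class SubOptimizer:
    """Stand-in for one wrapped optimizer (e.g. dense or expert-parallel params)."""

    def __init__(self, config: OptimizerConfig):
        self.config = config

    def disable_pre_hook(self):
        print("pre-hook disabled")


class ChainedOptimizerSketch:
    """Checks each wrapped optimizer's own config, mirroring the fixed code."""

    def __init__(self, optimizers):
        self.chained_optimizers = optimizers

    def disable_pre_hook(self):
        for optimizer in self.chained_optimizers:
            if (
                not optimizer.config.use_distributed_optimizer
                or not optimizer.config.overlap_param_gather
            ):
                raise ValueError(
                    "disable_pre_hook should only be called with "
                    "'use_distributed_optimizer' and 'overlap_param_gather' both enabled."
                )
            optimizer.disable_pre_hook()


if __name__ == "__main__":
    good = OptimizerConfig(use_distributed_optimizer=True, overlap_param_gather=True)
    chained = ChainedOptimizerSketch([SubOptimizer(good), SubOptimizer(good)])
    chained.disable_pre_hook()  # prints "pre-hook disabled" twice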

tests/functional_tests/jet_recipes/MR-gpt.yaml (+1, -1)
@@ -70,7 +70,7 @@ products:
 # - {tp_size: [2], pp_size: [1,2], extra_args: ['"--context-parallel-size 2 --sequence-parallel --hidden-dropout 0.0 --attention-dropout 0.0"']} # TODO: need updated container with TE > 1.0.0
 - {tp_size: [2], pp_size: [1], extra_args: ['"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["te_8experts2parallel"]}
 - {tp_size: [2], pp_size: [1], extra_args: ['"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["te_8experts2parallel_dist_optimizer"]}
-- {tp_size: [2], pp_size: [1], extra_args: ['"--moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --overlap-grad-reduce"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_overlap_grad_reduce_groupedGEMM"]}
+- {tp_size: [2], pp_size: [1], extra_args: ['"--moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --overlap-grad-reduce --overlap-param-gather"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM"]}
 - {tp_size: [2], pp_size: [1], extra_args: ['"--moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_groupedGEMM"]}
 - {tp_size: [2], pp_size: [1], extra_args: ['"--disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type aux_loss --moe-router-topk 2 --moe-aux-loss-coeff 1e-2"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_top2router"]}
 - {tp_size: [1], pp_size: [1], extra_args: ["--use-distributed-optimizer"], args_meta: ["dist_optimizer"]}
Golden values JSON (file path not shown): +1, -1

@@ -1 +1 @@
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80961, 10.86088, 10.86703, 10.80386, 10.71988, 10.64698, 10.21161, 10.32003, 10.22052, 9.92363]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31228.0, 37860.0, 38327.0, 36135.0, 33138.0, 34687.0, 30217.0, 34984.0, 35952.0, 37036.0]}, "iteration_timing_avg": 0.18751352941176463}
+{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80961, 10.86088, 10.86703, 10.80386, 10.71988, 10.64698, 10.21161, 10.32003, 10.22052, 9.92363]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31228.0, 37860.0, 38327.0, 36135.0, 33138.0, 34687.0, 30217.0, 34984.0, 35952.0, 37036.0]}, "iteration_timing_avg": 0.17911029411764712}
