diff --git a/src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py b/src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py index 0574ce7b57..ca0e0f2525 100644 --- a/src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py +++ b/src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py @@ -63,8 +63,8 @@ class GPTOSSProvider(GPTModelProvider): yarn_beta_fast: float = 32.0 yarn_beta_slow: float = 1.0 yarn_correction_range_round_to_int: bool = False - yarn_mscale: Optional[float] = None - yarn_mscale_all_dim: Optional[float] = None + yarn_mscale: Optional[float] = 1.0 # NOTE(yiakwy): default changed from None to 1.0 + yarn_mscale_all_dim: Optional[float] = 1.0 # NOTE(yiakwy): default changed from None to 1.0 moe_router_topk: int = 4 moe_router_pre_softmax: bool = False diff --git a/src/megatron/bridge/training/config.py b/src/megatron/bridge/training/config.py index 70420c8aa1..b5e84884ac 100644 --- a/src/megatron/bridge/training/config.py +++ b/src/megatron/bridge/training/config.py @@ -153,7 +153,7 @@ class DistributedInitConfig: Make sure EP and CP aren't used with this option enabled. """ - use_gloo_process_groups: bool = True + use_gloo_process_groups: bool = False # NOTE(yiakwy): default changed from True to False """If set, create Gloo process groups for communications.""" use_sharp: bool = False