diff --git a/scripts/performance/configs/kimi/__init__.py b/scripts/performance/configs/kimi/__init__.py
new file mode 100644
index 0000000000..2b0c728e2c
--- /dev/null
+++ b/scripts/performance/configs/kimi/__init__.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+try:
+    import megatron.bridge  # noqa: F401
+
+    HAVE_MEGATRON_BRIDGE = True
+except ModuleNotFoundError:
+    HAVE_MEGATRON_BRIDGE = False
+
+if HAVE_MEGATRON_BRIDGE:
+    from .kimi_llm_pretrain import (
+        kimi_k2_pretrain_config_b200,
+        kimi_k2_pretrain_config_gb200,
+        kimi_k2_pretrain_config_gb300,
+        kimi_k2_pretrain_config_h100,
+    )
+
+from .kimi_workload_base_configs import (
+    KIMI_K2_PRETRAIN_CONFIG_B200_BF16,
+    KIMI_K2_PRETRAIN_CONFIG_B200_FP8_CS,
+    KIMI_K2_PRETRAIN_CONFIG_B200_FP8_MX,
+    KIMI_K2_PRETRAIN_CONFIG_GB200_BF16,
+    KIMI_K2_PRETRAIN_CONFIG_GB200_FP8_CS,
+    KIMI_K2_PRETRAIN_CONFIG_GB200_FP8_MX,
+    KIMI_K2_PRETRAIN_CONFIG_GB300_BF16,
+    KIMI_K2_PRETRAIN_CONFIG_GB300_FP8_CS,
+    KIMI_K2_PRETRAIN_CONFIG_GB300_FP8_MX,
+    KIMI_K2_PRETRAIN_CONFIG_GB300_NVFP4,
+    KIMI_K2_PRETRAIN_CONFIG_H100_BF16,
+    KIMI_K2_PRETRAIN_CONFIG_H100_FP8_CS,
+    KIMI_K2_PRETRAIN_CONFIG_H100_FP8_SC,
+)
+
+
+__all__ = [
+    "KIMI_K2_PRETRAIN_CONFIG_B200_BF16",
+    "KIMI_K2_PRETRAIN_CONFIG_B200_FP8_CS",
+    "KIMI_K2_PRETRAIN_CONFIG_B200_FP8_MX",
+    "KIMI_K2_PRETRAIN_CONFIG_GB200_BF16",
+    "KIMI_K2_PRETRAIN_CONFIG_GB200_FP8_CS",
+    "KIMI_K2_PRETRAIN_CONFIG_GB200_FP8_MX",
+    "KIMI_K2_PRETRAIN_CONFIG_GB300_BF16",
+    "KIMI_K2_PRETRAIN_CONFIG_GB300_FP8_CS",
+    "KIMI_K2_PRETRAIN_CONFIG_GB300_FP8_MX",
+    "KIMI_K2_PRETRAIN_CONFIG_GB300_NVFP4",
+    "KIMI_K2_PRETRAIN_CONFIG_H100_BF16",
+    "KIMI_K2_PRETRAIN_CONFIG_H100_FP8_CS",
+    "KIMI_K2_PRETRAIN_CONFIG_H100_FP8_SC",
+]
+
+if HAVE_MEGATRON_BRIDGE:
+    __all__.extend(
+        [
+            "kimi_k2_pretrain_config_gb300",
+            "kimi_k2_pretrain_config_gb200",
+            "kimi_k2_pretrain_config_b200",
+            "kimi_k2_pretrain_config_h100",
+        ]
+    )
diff --git a/scripts/performance/configs/kimi/kimi_llm_pretrain.py b/scripts/performance/configs/kimi/kimi_llm_pretrain.py
new file mode 100644
index 0000000000..655d7c093e
--- /dev/null
+++ b/scripts/performance/configs/kimi/kimi_llm_pretrain.py
@@ -0,0 +1,201 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+from utils.overrides import set_workload_base_configs
+from utils.precision import get_precision_config
+from utils.utils import get_workload_base_config
+
+from megatron.bridge.recipes.kimi.kimi_k2 import _get_kimi_k2_pipeline_layout
+from megatron.bridge.recipes.kimi.kimi_k2 import kimi_k2_pretrain_config as pretrain_config
+from megatron.bridge.training.config import ConfigContainer
+
+
+logger = logging.getLogger(__name__)
+
+
+def set_kimi_k2_common_configs(cfg: ConfigContainer) -> None:
+    """Set common performance configurations for all Kimi-K2 configs."""
+    cfg.model.seq_length = 4096
+    cfg.dataset.sequence_length = 4096
+
+    cfg.model.moe_router_fusion = True
+    cfg.model.recompute_granularity = "selective"
+    cfg.dist.enable_megatron_core_experimental = True
+
+    cfg.mixed_precision.grad_reduce_in_fp32 = False
+    cfg.ddp.grad_reduce_in_fp32 = False
+
+    cfg.model.moe_router_force_load_balancing = True
+    cfg.model.qk_clip = True
+
+
+def kimi_k2_pretrain_config_gb300(
+    precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
+) -> ConfigContainer:
+    """GB300, baseline config."""
+    base_cfg = get_workload_base_config(
+        model_family_name="kimi",
+        model_recipe_name="kimi_k2",
+        gpu="gb300",
+        compute_dtype=precision.upper(),
+        task="pretrain",
+        config_variant=config_variant,
+    )
+
+    cfg = pretrain_config()
+    precision_config = get_precision_config(precision)
+    cfg.mixed_precision = precision_config
+
+    if base_cfg.moe_flex_dispatcher_backend is not None:
+        cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
+
+    if base_cfg.pp_layout:
+        cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout
+    else:
+        # Recompute layout based on updated PP/VP sizes
+        pp_size = base_cfg.pipeline_model_parallel_size
+        vp_size = base_cfg.virtual_pipeline_model_parallel_size
+        layout = _get_kimi_k2_pipeline_layout(pp_size, vp_size)
+        cfg.model.pipeline_model_parallel_layout = layout
+
+    set_kimi_k2_common_configs(cfg)
+    set_workload_base_configs(cfg, base_cfg)
+
+    cfg.comm_overlap.overlap_grad_reduce = True
+
+    # Setting num_workers to 0 and pin_memory to False gives better performance.
+    # We are still debugging this and may change it in the future.
+    cfg.dataset.num_workers = 0
+    cfg.dataset.pin_memory = False
+
+    return cfg
+
+
+def kimi_k2_pretrain_config_gb200(
+    precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
+) -> ConfigContainer:
+    """GB200, baseline config."""
+    base_cfg = get_workload_base_config(
+        model_family_name="kimi",
+        model_recipe_name="kimi_k2",
+        gpu="gb200",
+        compute_dtype=precision.upper(),
+        task="pretrain",
+        config_variant=config_variant,
+    )
+
+    cfg = pretrain_config()
+    precision_config = get_precision_config(precision)
+    cfg.mixed_precision = precision_config
+
+    if base_cfg.moe_flex_dispatcher_backend is not None:
+        cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
+
+    if base_cfg.pp_layout:
+        cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout
+    else:
+        # Recompute layout based on updated PP/VP sizes
+        pp_size = base_cfg.pipeline_model_parallel_size
+        vp_size = base_cfg.virtual_pipeline_model_parallel_size
+        layout = _get_kimi_k2_pipeline_layout(pp_size, vp_size)
+        cfg.model.pipeline_model_parallel_layout = layout
+
+    set_kimi_k2_common_configs(cfg)
+    set_workload_base_configs(cfg, base_cfg)
+
+    cfg.comm_overlap.overlap_grad_reduce = True
+
+    # Setting num_workers to 0 and pin_memory to False gives better performance.
+    # We are still debugging this and may change it in the future.
+    cfg.dataset.num_workers = 0
+    cfg.dataset.pin_memory = False
+
+    return cfg
+
+
+def kimi_k2_pretrain_config_b200(
+    precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
+) -> ConfigContainer:
+    """B200, baseline config."""
+    base_cfg = get_workload_base_config(
+        model_family_name="kimi",
+        model_recipe_name="kimi_k2",
+        gpu="b200",
+        compute_dtype=precision.upper(),
+        task="pretrain",
+        config_variant=config_variant,
+    )
+
+    cfg = pretrain_config()
+    precision_config = get_precision_config(precision)
+    cfg.mixed_precision = precision_config
+
+    if base_cfg.moe_flex_dispatcher_backend is not None:
+        cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
+
+    if base_cfg.pp_layout:
+        cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout
+    else:
+        # Recompute layout based on updated PP/VP sizes
+        pp_size = base_cfg.pipeline_model_parallel_size
+        vp_size = base_cfg.virtual_pipeline_model_parallel_size
+        layout = _get_kimi_k2_pipeline_layout(pp_size, vp_size)
+        cfg.model.pipeline_model_parallel_layout = layout
+
+    set_kimi_k2_common_configs(cfg)
+    set_workload_base_configs(cfg, base_cfg)
+
+    cfg.comm_overlap.overlap_grad_reduce = True
+
+    return cfg
+
+
+def kimi_k2_pretrain_config_h100(
+    precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
+) -> ConfigContainer:
+    """H100, baseline config."""
+    base_cfg = get_workload_base_config(
+        model_family_name="kimi",
+        model_recipe_name="kimi_k2",
+        gpu="h100",
+        compute_dtype=precision.upper(),
+        task="pretrain",
+        config_variant=config_variant,
+    )
+
+    cfg = pretrain_config()
+    precision_config = get_precision_config(precision)
+    cfg.mixed_precision = precision_config
+
+    if base_cfg.moe_flex_dispatcher_backend is not None:
+        cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
+
+    if base_cfg.pp_layout:
+        cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout
+    else:
+        # Recompute layout based on updated PP/VP sizes
+        pp_size = base_cfg.pipeline_model_parallel_size
+        vp_size = base_cfg.virtual_pipeline_model_parallel_size
+        layout = _get_kimi_k2_pipeline_layout(pp_size, vp_size)
+        cfg.model.pipeline_model_parallel_layout = layout
+
+    set_kimi_k2_common_configs(cfg)
+    set_workload_base_configs(cfg, base_cfg)
+
+    # Disabling to avoid functional errors. TODO: Test with it enabled and keep it enabled if it works.
+    cfg.comm_overlap.overlap_grad_reduce = False
+
+    return cfg
diff --git a/scripts/performance/configs/kimi/kimi_workload_base_configs.py b/scripts/performance/configs/kimi/kimi_workload_base_configs.py
new file mode 100644
index 0000000000..30415faea0
--- /dev/null
+++ b/scripts/performance/configs/kimi/kimi_workload_base_configs.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Workload base presets for Kimi-K2 performance configs."""
+
+from dataclasses import replace
+
+from utils.utils import WorkloadBaseConfig
+
+
+BASE_KIMI_K2_CONFIG = WorkloadBaseConfig(
+    expert_tensor_parallel_size=1,
+)
+
+
+KIMI_K2_PRETRAIN_CONFIG_GB300 = replace(
+    BASE_KIMI_K2_CONFIG,
+    num_gpus=256,
+    global_batch_size=2048,
+    pipeline_model_parallel_size=4,
+    virtual_pipeline_model_parallel_size=4,
+    expert_model_parallel_size=64,
+    moe_flex_dispatcher_backend="hybridep",
+    moe_a2a_overlap=False,
+    cuda_graph_impl="transformer_engine",
+    cuda_graph_scope=["attn", "moe_router", "moe_preprocess"],
+    recompute_modules=["moe_act"],
+)
+KIMI_K2_PRETRAIN_CONFIG_GB300_BF16 = KIMI_K2_PRETRAIN_CONFIG_GB300
+KIMI_K2_PRETRAIN_CONFIG_GB300_FP8_CS = KIMI_K2_PRETRAIN_CONFIG_GB300
+KIMI_K2_PRETRAIN_CONFIG_GB300_FP8_MX = KIMI_K2_PRETRAIN_CONFIG_GB300
+KIMI_K2_PRETRAIN_CONFIG_GB300_NVFP4 = KIMI_K2_PRETRAIN_CONFIG_GB300
+
+
+KIMI_K2_PRETRAIN_CONFIG_GB200 = replace(
+    BASE_KIMI_K2_CONFIG,
+    num_gpus=256,
+    global_batch_size=2048,
+    pipeline_model_parallel_size=4,
+    virtual_pipeline_model_parallel_size=4,
+    expert_model_parallel_size=64,
+    moe_flex_dispatcher_backend="hybridep",
+    moe_a2a_overlap=False,
+    recompute_modules=["mla_up_proj"],
+    cuda_graph_impl="transformer_engine",
+    cuda_graph_scope=["moe_router", "moe_preprocess"],
+)
+KIMI_K2_PRETRAIN_CONFIG_GB200_BF16 = KIMI_K2_PRETRAIN_CONFIG_GB200
+KIMI_K2_PRETRAIN_CONFIG_GB200_FP8_CS = KIMI_K2_PRETRAIN_CONFIG_GB200
+KIMI_K2_PRETRAIN_CONFIG_GB200_FP8_MX = KIMI_K2_PRETRAIN_CONFIG_GB200
+
+
+KIMI_K2_PRETRAIN_CONFIG_B200 = replace(
+    BASE_KIMI_K2_CONFIG,
+    num_gpus=256,
+    pipeline_model_parallel_size=16,
+    expert_model_parallel_size=16,
+    global_batch_size=2048,
+    recompute_modules=["mla_up_proj"],
+    moe_a2a_overlap=False,
+)
+KIMI_K2_PRETRAIN_CONFIG_B200_BF16 = KIMI_K2_PRETRAIN_CONFIG_B200
+KIMI_K2_PRETRAIN_CONFIG_B200_FP8_CS = KIMI_K2_PRETRAIN_CONFIG_B200
+KIMI_K2_PRETRAIN_CONFIG_B200_FP8_MX = KIMI_K2_PRETRAIN_CONFIG_B200
+
+
+KIMI_K2_PRETRAIN_CONFIG_H100 = replace(
+    BASE_KIMI_K2_CONFIG,
+    num_gpus=1024,
+    tensor_model_parallel_size=8,
+    pipeline_model_parallel_size=16,
+    virtual_pipeline_model_parallel_size=2,
+    expert_model_parallel_size=64,
+    global_batch_size=8192,
+    recompute_modules=["mla_up_proj", "mlp"],
+    moe_a2a_overlap=False,
+    pp_layout="Et|(tt|)*30L",
+)
+KIMI_K2_PRETRAIN_CONFIG_H100_BF16 = KIMI_K2_PRETRAIN_CONFIG_H100
+KIMI_K2_PRETRAIN_CONFIG_H100_FP8_CS = KIMI_K2_PRETRAIN_CONFIG_H100
+KIMI_K2_PRETRAIN_CONFIG_H100_FP8_SC = KIMI_K2_PRETRAIN_CONFIG_H100
+
+
+__all__ = [
+    "KIMI_K2_PRETRAIN_CONFIG_GB300_BF16",
+    "KIMI_K2_PRETRAIN_CONFIG_GB300_FP8_CS",
+    "KIMI_K2_PRETRAIN_CONFIG_GB300_FP8_MX",
+    "KIMI_K2_PRETRAIN_CONFIG_GB300_NVFP4",
+    "KIMI_K2_PRETRAIN_CONFIG_GB200_BF16",
+    "KIMI_K2_PRETRAIN_CONFIG_GB200_FP8_CS",
+    "KIMI_K2_PRETRAIN_CONFIG_GB200_FP8_MX",
+    "KIMI_K2_PRETRAIN_CONFIG_B200_BF16",
+    "KIMI_K2_PRETRAIN_CONFIG_B200_FP8_CS",
+    "KIMI_K2_PRETRAIN_CONFIG_B200_FP8_MX",
+    "KIMI_K2_PRETRAIN_CONFIG_H100_BF16",
+    "KIMI_K2_PRETRAIN_CONFIG_H100_FP8_CS",
+    "KIMI_K2_PRETRAIN_CONFIG_H100_FP8_SC",
+]
diff --git a/scripts/performance/utils/overrides.py b/scripts/performance/utils/overrides.py
index 7b4c08b5b1..24250e7321 100644
--- a/scripts/performance/utils/overrides.py
+++ b/scripts/performance/utils/overrides.py
@@ -19,6 +19,7 @@
 from omegaconf import OmegaConf
 
 from megatron.bridge.recipes.deepseek.deepseek_v3 import set_deepseek_v3_pipeline_model_parallel_layout
+from megatron.bridge.recipes.kimi.kimi_k2 import _get_kimi_k2_pipeline_layout
 from megatron.bridge.training.comm_overlap import *
 from megatron.bridge.training.config import ConfigContainer, TokenizerConfig
 from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop
@@ -394,6 +395,20 @@ def set_user_overrides(recipe: ConfigContainer, args: argparse.Namespace) -> Con
             pp_size is not None or vp_size != -1 or pipeline_model_parallel_layout is not None
         ):
             set_deepseek_v3_pipeline_model_parallel_layout(recipe.model, layout=pipeline_model_parallel_layout)
+    if model_recipe_name == "kimi_k2":
+        if pp_size is not None or vp_size != -1:
+            try:
+                layout = _get_kimi_k2_pipeline_layout(
+                    recipe.model.pipeline_model_parallel_size, recipe.model.virtual_pipeline_model_parallel_size
+                )
+                recipe.model.pipeline_model_parallel_layout = layout
+            except ValueError:
+                logger.warning(
+                    f"Cannot infer a Kimi-K2 PP layout for PP size {pp_size} and VP size {vp_size}. Using the default layout."
+                )
+                recipe.model.pipeline_model_parallel_layout = None
+        if pipeline_model_parallel_layout is not None:
+            recipe.model.pipeline_model_parallel_layout = pipeline_model_parallel_layout
 
     if args.pytorch_profiler:
         recipe.logger.tensorboard_dir = "/nemo_run/pytorch_profile"
@@ -417,7 +432,7 @@ def set_post_overrides(
         model_family_name, model_recipe_name, gpu, compute_dtype, task, config_variant
     )
 
-    if compute_dtype == "bf16":
+    if compute_dtype == "bf16" and recipe.optimizer.optimizer == "adam":
         recipe.optimizer.use_precision_aware_optimizer = True
 
     tp = recipe.model.tensor_model_parallel_size
@@ -429,9 +444,11 @@
     logger.info(f"DP: {dp}; TP: {tp}; PP: {pp}; CP: {cp}; VP: {vp}")
     ## NOTE: overlap_param_gather_with_optimizer_step causes NaN grad norm for fp8_mx. Disabling it until the issue is resolved.
     if dp > 1 and pp > 1 and vp > 1 and compute_dtype not in ("fp8_mx", "nvfp4"):
-        recipe.optimizer.overlap_param_gather_with_optimizer_step = True
-        if hasattr(recipe, "comm_overlap") and isinstance(recipe.comm_overlap, CommOverlapConfig):
-            recipe.comm_overlap.overlap_param_gather_with_optimizer_step = True
+        # Do not enable overlap_param_gather_with_optimizer_step for the muon optimizer.
+        if recipe.optimizer.optimizer != "dist_muon":
+            recipe.optimizer.overlap_param_gather_with_optimizer_step = True
+            if hasattr(recipe, "comm_overlap") and isinstance(recipe.comm_overlap, CommOverlapConfig):
+                recipe.comm_overlap.overlap_param_gather_with_optimizer_step = True
 
     default_num_gpus = workload_base_config.num_gpus
     if user_gbs is None:
diff --git a/src/megatron/bridge/recipes/kimi/kimi_k2.py b/src/megatron/bridge/recipes/kimi/kimi_k2.py
index 451c2e6acc..3450f91820 100644
--- a/src/megatron/bridge/recipes/kimi/kimi_k2.py
+++ b/src/megatron/bridge/recipes/kimi/kimi_k2.py
@@ -33,6 +33,8 @@ def _get_kimi_k2_pipeline_layout(pp_size: int, vp_size: int):
         (8, 2): [["embedding"] + ["decoder"] * 4] + [["decoder"] * 4] * 14 + [["decoder", "loss"]],
         (4, 4): [["embedding"] + ["decoder"] * 4] + [["decoder"] * 4] * 14 + [["decoder", "loss"]],
     }
+
+    vp_size = 1 if vp_size is None else vp_size
     if (pp_size, vp_size) not in map_pp_vp_to_layout:
         raise ValueError(
             f"Invalid PP and VP size: {pp_size} and {vp_size} to infer PP layout "
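
A minimal usage sketch (not part of the patch) showing how the new per-GPU factory functions and the workload base presets are intended to fit together. It assumes the scripts/performance directory is on the Python path (as the relative utils.* imports in the new files imply), that megatron.bridge is installed, and that the parallelism attributes printed below are the ones the presets resolve onto the model config.

    # Hypothetical example: build the GB200 Kimi-K2 pretrain config at BF16.
    # The import path "configs.kimi" assumes scripts/performance is on sys.path.
    from configs.kimi import kimi_k2_pretrain_config_gb200

    cfg = kimi_k2_pretrain_config_gb200(precision="bf16", config_variant="v1")

    # Inspect the parallelism sizes taken from KIMI_K2_PRETRAIN_CONFIG_GB200
    # (PP=4, VP=4, EP=64 in the preset above).
    print(
        cfg.model.pipeline_model_parallel_size,
        cfg.model.virtual_pipeline_model_parallel_size,
        cfg.model.expert_model_parallel_size,
    )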