Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions scripts/performance/configs/deepseek/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,26 @@
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_LARGE_SCALE,
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V1,
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V2,
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V1,
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V2,
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V1,
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V2,
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V1,
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V2,
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_LARGE_SCALE,
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V1,
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V2,
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V1,
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V2,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V1,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V2,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V1,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V2,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_LARGE_SCALE,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V1,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V2,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2,
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1,
Expand All @@ -60,12 +66,15 @@
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1",
Expand All @@ -77,12 +86,15 @@
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
# Every GB200 V1 precision-suffixed name (BF16, FP8_CS, FP8_MX, NVFP4) is bound
# to the same DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 object, so all of them share
# one set of workload settings.
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1


DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V1 = replace(
Expand All @@ -100,6 +101,7 @@
# All B300 V1 precision-suffixed names resolve to the same
# DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V1 object. FP8_MX reaches it through the
# FP8_CS alias; BF16, FP8_CS and NVFP4 are bound to it directly.
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V1


DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V1 = replace(
Expand All @@ -115,6 +117,7 @@
# All B200 V1 precision-suffixed names resolve to the same
# DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V1 object. FP8_MX reaches it through the
# FP8_CS alias; BF16, FP8_CS and NVFP4 are bound to it directly.
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V1


DEEPSEEK_V3_PRETRAIN_CONFIG_H100_V1 = replace(
Expand Down Expand Up @@ -159,6 +162,7 @@
# Every GB200 V2 precision-suffixed name (BF16, FP8_CS, FP8_MX, NVFP4) is bound
# to the same DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 object, so all of them share
# one set of workload settings.
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2


DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V2 = replace(
Expand All @@ -173,6 +177,7 @@
)
# The B300 V2 FP8_CS and NVFP4 names are bound directly to
# DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V2; FP8_MX is an alias of the FP8_CS name and
# therefore refers to that same object.
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V2


DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V2 = replace(
Expand All @@ -182,6 +187,7 @@
# All B200 V2 precision-suffixed names resolve to the same
# DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V2 object. FP8_MX reaches it through the
# FP8_CS alias; BF16, FP8_CS and NVFP4 are bound to it directly.
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V2


DEEPSEEK_V3_PRETRAIN_CONFIG_H100_V2 = replace(
Expand Down Expand Up @@ -242,12 +248,15 @@
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_H100_BF16_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_CS_V1",
"DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_SC_V1",
Expand All @@ -259,12 +268,15 @@
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_H100_BF16_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_CS_V2",
"DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_SC_V2",
Expand Down
20 changes: 20 additions & 0 deletions scripts/performance/configs/gpt_oss/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,30 +17,50 @@
from .gpt_oss_workload_base_configs import (
GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V1,
GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V2,
GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V1,
GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V2,
GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V1,
GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V2,
GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V1,
GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V2,
GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V1,
GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V2,
GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V1,
GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V2,
GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V1,
GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V2,
GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V1,
GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V2,
GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V1,
GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V2,
GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V1,
GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V2,
)


# Re-exported GPT-OSS 120B workload base config names, grouped by config
# variant. Within each group every hardware target contributes its BF16 entry
# followed by its FP8_MX entry.
__all__ = [
# V1 (GBS=512)
"GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V1",
# V2 (GBS=1280)
"GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V2",
]

if HAVE_MEGATRON_BRIDGE:
Expand Down
12 changes: 11 additions & 1 deletion scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from megatron.bridge.recipes.gpt_oss import gpt_oss_120b_pretrain_config
from megatron.bridge.training.config import ConfigContainer
from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend


logger = logging.getLogger(__name__)
Expand All @@ -38,7 +39,6 @@ def gpt_oss_120b_pretrain_config_gb300(
precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
) -> ConfigContainer:
"""GB300, baseline config."""
# GPT-OSS currently only has BF16 base configs enabled
base_cfg = get_workload_base_config(
model_family_name="gpt_oss",
model_recipe_name="gpt_oss_120b",
Expand All @@ -51,6 +51,8 @@ def gpt_oss_120b_pretrain_config_gb300(

cfg = gpt_oss_120b_pretrain_config()
cfg.mixed_precision = precision_config
if base_cfg.moe_flex_dispatcher_backend is not None:
apply_flex_dispatcher_backend(cfg.model, base_cfg.moe_flex_dispatcher_backend)
set_gpt_oss_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

Expand All @@ -73,6 +75,8 @@ def gpt_oss_120b_pretrain_config_gb200(

cfg = gpt_oss_120b_pretrain_config()
cfg.mixed_precision = precision_config
if base_cfg.moe_flex_dispatcher_backend is not None:
apply_flex_dispatcher_backend(cfg.model, base_cfg.moe_flex_dispatcher_backend)
set_gpt_oss_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

Expand All @@ -95,6 +99,8 @@ def gpt_oss_120b_pretrain_config_b300(

cfg = gpt_oss_120b_pretrain_config()
cfg.mixed_precision = precision_config
if base_cfg.moe_flex_dispatcher_backend is not None:
apply_flex_dispatcher_backend(cfg.model, base_cfg.moe_flex_dispatcher_backend)
set_gpt_oss_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

Expand All @@ -117,6 +123,8 @@ def gpt_oss_120b_pretrain_config_b200(

cfg = gpt_oss_120b_pretrain_config()
cfg.mixed_precision = precision_config
if base_cfg.moe_flex_dispatcher_backend is not None:
apply_flex_dispatcher_backend(cfg.model, base_cfg.moe_flex_dispatcher_backend)
set_gpt_oss_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

Expand All @@ -139,6 +147,8 @@ def gpt_oss_120b_pretrain_config_h100(

cfg = gpt_oss_120b_pretrain_config()
cfg.mixed_precision = precision_config
if base_cfg.moe_flex_dispatcher_backend is not None:
apply_flex_dispatcher_backend(cfg.model, base_cfg.moe_flex_dispatcher_backend)
set_gpt_oss_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@
recompute_modules=["layernorm", "moe_act"],
)

# The FP8_MX V1 names are plain aliases: each one is bound to the same object as
# the BF16 V1 config for that hardware target, so both precisions share
# identical workload settings.
# NOTE(review): H100 also receives an FP8_MX alias here; confirm that the
# FP8_MX precision is actually meant to be selectable on H100.
GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V1
GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V1
GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V1 = GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V1
GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V1 = GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V1
GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V1 = GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V1


# =============================================================================
# GPT-OSS 120B Pretrain - V2 (GBS=1280)
Expand Down Expand Up @@ -116,18 +122,34 @@
global_batch_size=1280,
)

# The FP8_MX V2 names are plain aliases: each one is bound to the same object as
# the BF16 V2 config for that hardware target, so both precisions share
# identical workload settings.
# NOTE(review): H100 also receives an FP8_MX alias here; confirm that the
# FP8_MX precision is actually meant to be selectable on H100.
GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V2
GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V2 = GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V2
GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V2 = GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V2
GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V2 = GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V2
GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V2 = GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V2


# Names exported by this module, grouped by config variant. Within each group
# every hardware target lists its BF16 name followed by its FP8_MX name.
__all__ = [
# V1 (GBS=512)
"GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V1",
"GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V1",
# V2 (GBS=1280)
"GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V2",
"GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V2",
]
16 changes: 16 additions & 0 deletions scripts/performance/configs/qwen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,35 @@
QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_LARGE_SCALE,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_LARGE_SCALE,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_LARGE_SCALE,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_BF16_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_BF16_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_LARGE_SCALE,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V1,
QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V2,
QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_FP8_CS_LARGE_SCALE,
Expand Down Expand Up @@ -105,30 +113,38 @@
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_BF16_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_BF16_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V1",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_FP8_CS_V1",
# Qwen3 235B A22B V2 (num_gpus=256 for Blackwell, GBS=8192 for all)
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_BF16_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_BF16_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V2",
"QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_FP8_CS_V2",
# Qwen3 Next 80B A3B V1
Expand Down
Loading
Loading