scripts/performance/configs/qwen/qwen3_llm_pretrain.py (2 changes: 1 addition & 1 deletion)

@@ -286,7 +286,7 @@ def qwen3_30b_a3b_pretrain_config_b200(
     cfg.mixed_precision = precision_config
     cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
     cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
-    cfg.model.moe_token_dispatcher_type = "alltoall"
+    cfg.model.moe_token_dispatcher_type = "flex"
 
     set_qwen3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
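Note for reviewers: the line above the change already assigns `cfg.model.moe_flex_dispatcher_backend`, and switching the dispatcher type to `"flex"` is what lets that backend setting take effect; with the type left at `"alltoall"`, a flex backend would be configured but never used. Below is a minimal sketch of that relationship, not the Megatron-LM implementation: the dataclass, the helper, and the set of valid dispatcher types are all hypothetical stand-ins.

```python
# Illustrative sketch only. It assumes (as the diff suggests) that
# moe_flex_dispatcher_backend is consumed only when
# moe_token_dispatcher_type == "flex". Names below are hypothetical.
from dataclasses import dataclass
from typing import Optional

VALID_DISPATCHERS = {"allgather", "alltoall", "flex"}  # assumed set

@dataclass
class MoEModelConfig:
    moe_token_dispatcher_type: str = "alltoall"
    moe_flex_dispatcher_backend: Optional[str] = None  # e.g. "hybridep"

def check_dispatcher_config(model: MoEModelConfig) -> None:
    """Reject the mismatch this PR fixes: a flex backend set while the
    dispatcher type still selects a non-flex dispatcher."""
    if model.moe_token_dispatcher_type not in VALID_DISPATCHERS:
        raise ValueError(f"unknown dispatcher type: {model.moe_token_dispatcher_type!r}")
    if model.moe_flex_dispatcher_backend is not None and model.moe_token_dispatcher_type != "flex":
        raise ValueError(
            "moe_flex_dispatcher_backend is set but moe_token_dispatcher_type "
            f"is {model.moe_token_dispatcher_type!r}; it must be 'flex'"
        )

check_dispatcher_config(MoEModelConfig("flex", "hybridep"))  # passes after this PR
```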
scripts/performance/configs/qwen/qwen3_workload_base_configs.py (13 changes: 5 additions & 8 deletions)

@@ -391,20 +391,17 @@
 QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_BF16_V1 = replace(
     BASE_QWEN3_30B_A3B_CONFIG,
     num_gpus=8,
+    micro_batch_size=4,
+    moe_flex_dispatcher_backend="hybridep",
     cuda_graph_impl="transformer_engine",
-    cuda_graph_scope=["moe_router", "moe_preprocess"],
+    cuda_graph_scope=["attn", "moe_router", "moe_preprocess"],
 )
 
 
-QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_CS_V1 = replace(
-    BASE_QWEN3_30B_A3B_CONFIG,
-    num_gpus=8,
-    cuda_graph_impl="transformer_engine",
-    cuda_graph_scope=["moe_router", "moe_preprocess"],
-)
+QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_CS_V1 = QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_BF16_V1
 
 
-QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_MX_V1 = QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_CS_V1
+QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_MX_V1 = QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_BF16_V1
 
 
 QWEN3_30B_A3B_PRETRAIN_CONFIG_H100_BF16_V1 = replace(
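Note for reviewers: this file swaps the standalone `replace(...)` definitions of the FP8 recipes for plain aliases of the BF16 recipe, so all three names now bind the same config object and any future BF16 tuning applies to the FP8 variants automatically. A self-contained sketch of the two binding styles, using a hypothetical `BaseConfig` dataclass rather than the real `BASE_QWEN3_30B_A3B_CONFIG`:

```python
# Minimal sketch of the aliasing pattern the diff adopts. BaseConfig and its
# fields are stand-ins, not the repo's actual config class.
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class BaseConfig:
    num_gpus: int = 8
    micro_batch_size: int = 1
    cuda_graph_scope: tuple = ()

BASE = BaseConfig()

# replace() builds a new, independent config with selected fields overridden:
BF16_V1 = replace(
    BASE,
    micro_batch_size=4,
    cuda_graph_scope=("attn", "moe_router", "moe_preprocess"),
)

# Plain assignment binds another name to the *same* object, so the FP8
# recipes inherit every BF16 tuning knob automatically:
FP8_CS_V1 = BF16_V1
FP8_MX_V1 = BF16_V1

assert FP8_CS_V1 is BF16_V1 and FP8_MX_V1 is BF16_V1
assert replace(BASE, num_gpus=8) is not BASE  # replace() always copies
```

The trade-off is that the FP8 recipes can no longer diverge from the BF16 one without being turned back into their own `replace(...)` calls.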