From 0a8447aa657932ff8ee181f68488f6a27143fb65 Mon Sep 17 00:00:00 2001 From: Dingqing Yang Date: Tue, 3 Feb 2026 21:21:35 -0800 Subject: [PATCH] update qwen3 235b mxfp8 gb recipe and resolves nan grad norm Signed-off-by: Dingqing Yang --- .../performance/configs/qwen/qwen3_workload_base_configs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py index b669c33651..0cd9f66c8b 100644 --- a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py +++ b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py @@ -198,8 +198,8 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V1, num_gpus=256, pipeline_model_parallel_size=4, - virtual_pipeline_model_parallel_size=12, - expert_model_parallel_size=16, + expert_model_parallel_size=32, + cuda_graph_scope=["attn", "moe_router", "moe_preprocess"], global_batch_size=8192, ) @@ -217,6 +217,7 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V2 = replace( QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V1, num_gpus=256, + expert_model_parallel_size=32, global_batch_size=8192, )