From 40353d5d92414e532939e2b3c07977fc1b426137 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Fri, 14 Nov 2025 14:15:00 -0800 Subject: [PATCH 1/3] tp2 to tp1 Signed-off-by: Youngeun Kwon --- .../llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml | 4 ++-- .../recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml index 02fb889e82..f6413ea975 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml @@ -10,10 +10,10 @@ checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g-async-1off policy: megatron_cfg: - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 expert_model_parallel_size: 8 - sequence_parallel: true + sequence_parallel: false generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml index 4ed5c404ab..053c703eee 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml @@ -17,10 +17,10 @@ policy: megatron_cfg: enabled: true empty_unused_memory_level: 1 - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 expert_model_parallel_size: 8 - sequence_parallel: true + sequence_parallel: false optimizer: lr: 3.0e-07 min_lr: 3.0e-08 From a8eed4170a0b57f99d00666cda357cf17d4b0e7c Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Fri, 14 Nov 2025 14:37:24 -0800 Subject: [PATCH 2/3] modify async logprob batchsize Signed-off-by: Youngeun Kwon --- .../llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml index f6413ea975..0cfc018263 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml @@ -9,6 +9,7 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g-async-1off policy: + logprob_batch_size: 2 megatron_cfg: tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 From af8250e3168f1daf8f994e1e985347dee5535f29 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Fri, 14 Nov 2025 17:02:00 -0800 Subject: [PATCH 3/3] async recipe Signed-off-by: Youngeun Kwon --- .../llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml index 0cfc018263..4cc5981460 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml @@ -9,10 +9,9 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g-async-1off policy: - logprob_batch_size: 2 megatron_cfg: tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 + pipeline_model_parallel_size: 2 expert_model_parallel_size: 8 sequence_parallel: false generation: @@ -23,7 +22,7 @@ policy: gpus_per_node: 8 vllm_cfg: async_engine: true - tensor_parallel_size: 4 + tensor_parallel_size: 2 gpu_memory_utilization: 0.8 logger: log_dir: logs/grpo-qwen3-30ba3b-4n8g-2T2G-async-1off