diff --git a/examples/conversion/hf_to_megatron_generate_text.py b/examples/conversion/hf_to_megatron_generate_text.py
index aff39e94a9..9d0077f201 100644
--- a/examples/conversion/hf_to_megatron_generate_text.py
+++ b/examples/conversion/hf_to_megatron_generate_text.py
@@ -167,6 +167,10 @@ def main(args) -> None:
     model_provider.initialize_model_parallel(seed=0)
     model = model_provider.provide_distributed_model(wrap_with_ddp=False)
 
+    # TEMP FIX for inference failure when mtp_num_layers is not None
+    for m in model:
+        m.config.mtp_num_layers = None
+
     model = [m.cuda() for m in model]
     for m in model:
         m.eval()
diff --git a/examples/conversion/hf_to_megatron_generate_vlm.py b/examples/conversion/hf_to_megatron_generate_vlm.py
index 5658b7dfef..131a00253d 100644
--- a/examples/conversion/hf_to_megatron_generate_vlm.py
+++ b/examples/conversion/hf_to_megatron_generate_vlm.py
@@ -253,6 +253,10 @@ def main(args) -> None:
     model_provider.initialize_model_parallel(seed=0)
     model = model_provider.provide_distributed_model(wrap_with_ddp=False)
 
+    # TEMP FIX for inference failure when mtp_num_layers is not None
+    for m in model:
+        m.config.mtp_num_layers = None
+
     model = [m.cuda() for m in model]
     for m in model:
         m.eval()
diff --git a/examples/models/vlm/glm_45v/inference.sh b/examples/models/vlm/glm_45v/inference.sh
index 497c18134a..c42b12c0cd 100755
--- a/examples/models/vlm/glm_45v/inference.sh
+++ b/examples/models/vlm/glm_45v/inference.sh
@@ -17,7 +17,7 @@
 WORKSPACE=${WORKSPACE:-/workspace}
 
 # GLM-4.5V is a large MoE model (106B parameters)
-# Using TP=1, PP=4, EP=2 for inference (8 GPUs minimum)
+# Using TP=1, PP=2, EP=4 for inference (8 GPUs minimum)
 
 # Inference with Hugging Face checkpoints
 uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_to_megatron_generate_vlm.py \
@@ -26,8 +26,8 @@ uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf
     --prompt "Describe this image." \
     --max_new_tokens 50 \
     --tp 1 \
-    --pp 4 \
-    --ep 2 \
+    --pp 2 \
+    --ep 4 \
     --trust_remote_code
 
 # Inference with imported Megatron checkpoints