From 0e94517bcbf1b00a2aa5fd50e7427ff0e38b4b13 Mon Sep 17 00:00:00 2001
From: c0de128
Date: Mon, 22 Dec 2025 13:52:45 -0600
Subject: [PATCH 1/2] [Bugfix][ROCm] Add device parameter to
 init_aiter_topK_meta_data

Add explicit device parameter to init_aiter_topK_meta_data() instead of
hardcoding 'cuda'. This improves multi-GPU support and makes device
handling explicit.

Changes:
- Add device parameter (default: 'cuda') to init_aiter_topK_meta_data()
- Use device parameter for all tensor creation in the function
- Update caller in layer.py to pass torch.cuda.current_device()

Signed-off-by: c0de128
---
 vllm/model_executor/layers/fused_moe/layer.py | 1 +
 .../layers/fused_moe/rocm_aiter_fused_moe.py  | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 2e7267d56d83..5af200a49b07 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1077,6 +1077,7 @@ def _init_aiter_shared_experts_topK_buffer(
                 max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens
                 * dp_size,
                 is_EP=self.use_ep,
+                device=torch.cuda.current_device(),
             )
             self.local_num_experts += self.num_fused_shared_experts

diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index ebd9e3a4a8f2..bc4cedbdc694 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -52,6 +52,7 @@ def init_aiter_topK_meta_data(
     shared_experts_score: float = 1.0,
     max_num_tokens: int = 32768,
     is_EP: bool = False,
+    device: int | str = "cuda",
 ):
     global aiter_topK_meta_data
     fake_expertid = n_routed_experts + n_shared_experts
@@ -64,7 +65,7 @@ def init_aiter_topK_meta_data(
     total_topk_ids = torch.empty(
         (max_num_tokens, top_k + n_shared_experts + is_EP),
         dtype=torch.int32,
-        device="cuda",
+        device=device,
     )
     ns_topk_ids, s_topk_ids = total_topk_ids.split(
         [top_k, n_shared_experts + is_EP], dim=1
@@ -80,12 +81,12 @@ def init_aiter_topK_meta_data(
         s_topk_ids_list = [
             list(range(n_routed_experts, fake_expertid))
         ] * max_num_tokens
-        s_topk_ids[:] = torch.tensor(s_topk_ids_list, dtype=torch.int32, device="cuda")
+        s_topk_ids[:] = torch.tensor(s_topk_ids_list, dtype=torch.int32, device=device)

     total_topk_weights = torch.empty(
         (max_num_tokens, top_k + n_shared_experts + is_EP),
         dtype=torch.float32,
-        device="cuda",
+        device=device,
     )
     ns_topk_weights, s_topk_weights = total_topk_weights.split(
         [top_k, n_shared_experts + is_EP], dim=1
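
[Example, not part of the patch] The device-threading pattern from
PATCH 1/2, sketched in isolation. The helper make_topk_buffers, its
arguments, and the example values are hypothetical; only the behavior
of passing an explicit device through to every allocation mirrors the
patch.

    import torch

    def make_topk_buffers(
        max_num_tokens: int,
        top_k: int,
        n_shared_experts: int,
        device: int | str = "cuda",
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Every allocation honors the caller-supplied device, so in a
        # multi-GPU setup the buffers land on the GPU bound to this
        # rank rather than on an implicit default device.
        topk_ids = torch.empty(
            (max_num_tokens, top_k + n_shared_experts),
            dtype=torch.int32, device=device,
        )
        topk_weights = torch.empty(
            (max_num_tokens, top_k + n_shared_experts),
            dtype=torch.float32, device=device,
        )
        return topk_ids, topk_weights

    # Mirrors the layer.py change: the caller pins the buffers to the
    # device that is current for this process.
    if torch.cuda.is_available():
        ids, weights = make_topk_buffers(
            32768, top_k=8, n_shared_experts=1,
            device=torch.cuda.current_device(),
        )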
From 7fae7ba1cf796a41cd4939a7a430861e8154b557 Mon Sep 17 00:00:00 2001
From: c0de128
Date: Fri, 26 Dec 2025 09:50:14 -0600
Subject: [PATCH 2/2] [Bugfix][Hardware][AMD] Guard AITER topK init with
 rocm_aiter_fmoe_enabled check

Add rocm_aiter_fmoe_enabled guard to _init_aiter_shared_experts_topK_buffer
to prevent torch.cuda.current_device() from being called during CPU tests.
The AITER-specific initialization should only run when AITER is enabled
(i.e., on ROCm systems). This fixes CI failures in CPU config tests where
no CUDA device is available.

Signed-off-by: Kevin McKay
Signed-off-by: c0de128
---
 vllm/model_executor/layers/fused_moe/layer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 5af200a49b07..10630a84e66f 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1066,7 +1066,7 @@ def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
     def _init_aiter_shared_experts_topK_buffer(
         self, vllm_config: VllmConfig, dp_size: int
     ):
-        if self.num_fused_shared_experts > 0:
+        if self.num_fused_shared_experts > 0 and self.rocm_aiter_fmoe_enabled:
             init_aiter_topK_meta_data(
                 n_routed_experts=self.global_num_experts,
                 n_shared_experts=self.num_fused_shared_experts,
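
[Example, not part of the patch] A minimal sketch of the guard from
PATCH 2/2 as a standalone function. The name init_shared_experts_buffer
and its parameters are invented for illustration; only the combined
condition mirrors the patch.

    import torch

    def init_shared_experts_buffer(
        num_fused_shared_experts: int, rocm_aiter_fmoe_enabled: bool
    ) -> None:
        # The AITER flag is checked together with the expert count, so
        # a CPU-only environment (flag False) never reaches the CUDA
        # call below; the function becomes a no-op instead of raising.
        if num_fused_shared_experts > 0 and rocm_aiter_fmoe_enabled:
            device = torch.cuda.current_device()  # index of the current GPU
            print(f"initializing AITER topK buffers on device {device}")

    # Safe on a CPU-only CI runner: the guard skips the CUDA call.
    init_shared_experts_buffer(
        num_fused_shared_experts=1, rocm_aiter_fmoe_enabled=False
    )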