Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ The following table lists additional configuration options available in vLLM Asc
| `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multi-stream shared expert. This option only takes effect on MoE models with shared experts. |
| `multistream_overlap_gate` | bool | `False` | Whether to enable multi-stream overlap gate. This option only takes effect on MoE models with shared experts. |
| `recompute_scheduler_enable` | bool | `False` | Whether to enable recompute scheduler. |
| `enable_cpu_binding` | bool | `False` | Whether to enable CPU Binding. |
| `enable_cpu_binding` | bool | `True` | Whether to enable CPU binding. Only takes effect on ARM CPUs; when enabled, A3 uses the NUMA-balanced binding strategy and other device types use the NUMA-affinity strategy. |
| `SLO_limits_for_dynamic_batch` | int | `-1` | SLO limits for dynamic batch. This is a new scheduler that supports the dynamic batch feature. |
| `enable_npugraph_ex` | bool | `False` | Whether to enable npugraph_ex graph mode. |
| `pa_shape_list` | list | `[]` | The custom shape list of page attention ops. |
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ pybind11
pyyaml
scipy
pandas
psutil
setuptools>=64
setuptools-scm>=8
torch==2.9.0
Expand Down
45 changes: 43 additions & 2 deletions tests/ut/device_allocator/test_cpu_binding.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import unittest
from unittest.mock import patch

from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo
from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo, bind_cpus, is_arm_cpu
from vllm_ascend.utils import AscendDeviceType


class TestDeviceInfo(unittest.TestCase):
Expand Down Expand Up @@ -103,6 +104,23 @@ def test_average_distribute(self):
2: [8, 9, 10, 11, 12, 13]
})

@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
def test_binding_mode_table(self, mock_get_device_type):
    """Each device type maps to its expected CPU binding strategy."""
    expected_modes = [
        (AscendDeviceType.A2, "affinity"),
        (AscendDeviceType.A3, "numa_balanced"),
    ]
    for device_type, mode in expected_modes:
        mock_get_device_type.return_value = device_type
        self.assertEqual(self.cpu_alloc._binding_mode(), mode)

@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
def test_build_cpu_pools_fallback_to_numa_balanced(self, mock_get_device_type):
    """An affinity-mode device with no NPU affinity info falls back to NUMA-balanced binding."""
    mock_get_device_type.return_value = AscendDeviceType.A2
    self.cpu_alloc.device_info.npu_affinity = {}
    with patch.object(self.cpu_alloc, "build_cpu_node_map") as mock_node_map:
        with patch.object(self.cpu_alloc, "handle_no_affinity") as mock_fallback:
            self.cpu_alloc.build_cpu_pools()
    # Mocks keep their call records after the context managers exit.
    mock_node_map.assert_called_once()
    mock_fallback.assert_called_once()

def test_extend_numa(self):
result = self.cpu_alloc.extend_numa([])
self.assertEqual(result, [])
Expand All @@ -128,8 +146,10 @@ def test_build_cpu_node_map(self, mock_execute_command):
self.assertEqual(self.cpu_alloc.numa_to_cpu_map,
expected_numa_to_cpu_map)

@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
@patch('vllm_ascend.cpu_binding.execute_command')
def test_handle_no_affinity(self, mock_execute_command):
def test_handle_no_affinity(self, mock_execute_command, mock_get_device_type):
mock_get_device_type.return_value = AscendDeviceType.A3
mock_execute_command.side_effect = [("0 0\n1 1", 0), ("0 0\n1 1", 0)]
self.cpu_alloc.device_info.running_npu_list = [0, 1]
self.cpu_alloc.device_info.allowed_cpus = [0, 1, 2, 3]
Expand Down Expand Up @@ -163,5 +183,26 @@ def test_bind_threads(self, mock_execute_command):
mock_execute_command.assert_called()


class TestBindingSwitch(unittest.TestCase):
    """Tests for the architecture-based CPU binding on/off switch."""

    @patch('vllm_ascend.cpu_binding.platform.machine')
    def test_is_arm_cpu(self, mock_machine):
        # (reported machine string, expected ARM verdict)
        cases = [
            ("x86_64", False),
            ("aarch64", True),
            ("armv8", True),
            ("mips64", False),
        ]
        for arch, expected in cases:
            mock_machine.return_value = arch
            self.assertEqual(is_arm_cpu(), expected)

    @patch('vllm_ascend.cpu_binding.CpuAlloc')
    @patch('vllm_ascend.cpu_binding.is_arm_cpu')
    def test_bind_cpus_skip_non_arm(self, mock_is_arm_cpu, mock_cpu_alloc):
        # On non-ARM hosts, bind_cpus must return before allocating a binder.
        mock_is_arm_cpu.return_value = False
        bind_cpus(0)
        mock_cpu_alloc.assert_not_called()


if __name__ == '__main__':
unittest.main()
6 changes: 3 additions & 3 deletions tests/ut/worker/test_worker_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_init_npu_worker_normal_case(
# Setup mock behavior
mock_ops.register_dummy_fusion_op.return_value = None
mock_ascend_config = MagicMock()
mock_ascend_config.enable_cpu_binding = False
mock_ascend_config.enable_cpu_binding = True
mock_get_ascend_config.return_value = mock_ascend_config

# Import and create NPUWorker instance
Expand Down Expand Up @@ -125,7 +125,7 @@ def test_init_npu_worker_with_trust_remote_code(
self.model_config_mock.trust_remote_code = True
mock_ops.register_dummy_fusion_op.return_value = None
mock_ascend_config = MagicMock()
mock_ascend_config.enable_cpu_binding = False
mock_ascend_config.enable_cpu_binding = True
mock_get_ascend_config.return_value = mock_ascend_config

# Create NPUWorker instance
Expand Down Expand Up @@ -168,7 +168,7 @@ def test_init_npu_worker_with_custom_cache_dtype(
self.cache_config_mock.cache_dtype = "float32"
mock_ops.register_dummy_fusion_op.return_value = None
mock_ascend_config = MagicMock()
mock_ascend_config.enable_cpu_binding = False
mock_ascend_config.enable_cpu_binding = True
mock_get_ascend_config.return_value = mock_ascend_config

# Create NPUWorker instance
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/ascend_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def __init__(self, vllm_config: "VllmConfig"):
self.multistream_overlap_shared_expert = additional_config.get("multistream_overlap_shared_expert", False)
self.multistream_overlap_gate = additional_config.get("multistream_overlap_gate", False)
self.recompute_scheduler_enable = additional_config.get("recompute_scheduler_enable", False)
self.enable_cpu_binding = additional_config.get("enable_cpu_binding", False)
self.enable_cpu_binding = additional_config.get("enable_cpu_binding", True)

self.pd_tp_ratio = 1
self.pd_head_ratio = 1
Expand Down
33 changes: 31 additions & 2 deletions vllm_ascend/cpu_binding.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,29 @@
#!/usr/bin/env python3

import os
import platform
import subprocess
from collections import defaultdict

import psutil
from vllm.logger import logger

from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

ALLOWED_CPUS_PATH = "/proc/self/status"
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")


def is_arm_cpu() -> bool:
    """Return True when the host CPU is an ARM architecture.

    Known x86 machine strings return False; ``aarch64``/``arm64`` and any
    ``arm``-prefixed string return True. Unknown architectures are treated
    as non-ARM (with a warning) so CPU binding stays disabled on them.
    """
    arch = platform.machine().lower()
    if arch in {"x86_64", "amd64", "i386", "i686"}:
        return False
    if arch in {"aarch64", "arm64"} or arch.startswith("arm"):
        return True
    # Lazy %-formatting: the message is only rendered if the record is emitted.
    logger.warning("Unknown CPU architecture '%s', CPU binding will be disabled.", arch)
    return False


def execute_command(cmd: list[str]) -> tuple[str, int]:
with subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
out, _ = p.communicate(timeout=1000)
Expand Down Expand Up @@ -77,7 +90,7 @@ def get_running_npus(self) -> list[int]:
devices_list = [int(x) for x in devices_str.split(",")]
running_npu_set = set(devices_list) & running_npu_set
if not running_npu_set:
raise RuntimeError("Can not get running npu info, you can use BIND_CPU=0 to skip.")
raise RuntimeError("Can not get running npu info.")
return sorted(running_npu_set)

def parse_allowed_cpus(self) -> list[int]:
Expand Down Expand Up @@ -202,7 +215,7 @@ def handle_no_affinity(self) -> None:
npu_num_this_node = min(npu_num_per_node, num_running_npu - index)
if npu_num_this_node <= 0:
break
# Evenly distribute the CPUs of this NUMA node among npu_num_this_node NPUs.
# Distribute this NUMA node's CPUs among npu_num_this_node NPUs in a NUMA-balanced way.
total_cpu_num = len(cpus)
base_cpu_num = total_cpu_num // npu_num_this_node
extra_cpu_num = total_cpu_num % npu_num_this_node
Expand All @@ -217,9 +230,22 @@ def handle_no_affinity(self) -> None:
index += 1
start_index = end_index

# Device types with a dedicated binding strategy; anything absent here
# falls back to the default "affinity" mode.
DEVICE_BINDING_MODE = {
    AscendDeviceType.A3: "numa_balanced",
}

@classmethod
def _binding_mode(cls) -> str:
    """Return the CPU binding strategy name for the current device type."""
    return cls.DEVICE_BINDING_MODE.get(get_ascend_device_type(), "affinity")

def build_cpu_pools(self) -> None:
self.build_cpu_node_map()
if self._binding_mode() == "numa_balanced":
self.handle_no_affinity()
return
if not self.device_info.npu_affinity:
logger.warning("NPU affinity info not found, fallback to NUMA-balanced CPU binding.")
self.handle_no_affinity()
return
for npu in self.device_info.running_npu_list:
Expand Down Expand Up @@ -282,5 +308,8 @@ def run_all(self) -> None:


def bind_cpus(rank_id: int) -> None:
    """Bind the current rank's process to its allotted CPUs (ARM hosts only)."""
    if is_arm_cpu():
        CpuAlloc(rank_id).run_all()
        return
    logger.info("CPU binding skipped: non-ARM CPU detected.")