Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ The following table lists additional configuration options available in vLLM Asc
| `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multi-stream shared expert. This option only takes effect on MoE models with shared experts. |
| `multistream_overlap_gate` | bool | `False` | Whether to enable multi-stream overlap gate. This option only takes effect on MoE models with shared experts. |
| `recompute_scheduler_enable` | bool | `False` | Whether to enable recompute scheduler. |
| `enable_cpu_binding` | bool | `False` | Whether to enable CPU Binding. |
| `enable_cpu_binding` | bool | `True` | Whether to enable CPU binding. Only takes effect on ARM CPUs; when enabled, A3 uses the NUMA-balanced binding strategy and other device types use the NUMA-affinity strategy. |
| `SLO_limits_for_dynamic_batch` | int | `-1` | SLO limits for dynamic batch. This is a new scheduler that supports the dynamic batch feature. |
| `enable_npugraph_ex` | bool | `False` | Whether to enable npugraph_ex graph mode. |
| `pa_shape_list` | list | `[]` | The custom shape list of page attention ops. |
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ pybind11
pyyaml
scipy
pandas
psutil
setuptools>=64
setuptools-scm>=8
torch==2.9.0
Expand Down
45 changes: 43 additions & 2 deletions tests/ut/device_allocator/test_cpu_binding.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import unittest
from unittest.mock import patch

from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo
from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo, bind_cpus, is_arm_cpu
from vllm_ascend.utils import AscendDeviceType


class TestDeviceInfo(unittest.TestCase):
Expand Down Expand Up @@ -103,6 +104,23 @@ def test_average_distribute(self):
2: [8, 9, 10, 11, 12, 13]
})

@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
def test_binding_mode_table(self, mock_get_device_type):
    """Each device type maps to its expected CPU binding strategy."""
    expected_modes = [
        (AscendDeviceType.A2, "affinity"),
        (AscendDeviceType.A3, "numa_balanced"),
    ]
    for device_type, mode in expected_modes:
        mock_get_device_type.return_value = device_type
        self.assertEqual(self.cpu_alloc._binding_mode(), mode)

@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
def test_build_cpu_pools_fallback_to_numa_balanced(self, mock_get_device_type):
    """An affinity-mode device with no NPU affinity info falls back to NUMA-balanced binding."""
    mock_get_device_type.return_value = AscendDeviceType.A2
    self.cpu_alloc.device_info.npu_affinity = {}
    with patch.object(self.cpu_alloc, "build_cpu_node_map") as mock_node_map:
        with patch.object(self.cpu_alloc, "handle_no_affinity") as mock_fallback:
            self.cpu_alloc.build_cpu_pools()
    # Mocks keep their call records after the context managers exit.
    mock_node_map.assert_called_once()
    mock_fallback.assert_called_once()

def test_extend_numa(self):
result = self.cpu_alloc.extend_numa([])
self.assertEqual(result, [])
Expand All @@ -128,8 +146,10 @@ def test_build_cpu_node_map(self, mock_execute_command):
self.assertEqual(self.cpu_alloc.numa_to_cpu_map,
expected_numa_to_cpu_map)

@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
@patch('vllm_ascend.cpu_binding.execute_command')
def test_handle_no_affinity(self, mock_execute_command):
def test_handle_no_affinity(self, mock_execute_command, mock_get_device_type):
mock_get_device_type.return_value = AscendDeviceType.A3
mock_execute_command.side_effect = [("0 0\n1 1", 0), ("0 0\n1 1", 0)]
self.cpu_alloc.device_info.running_npu_list = [0, 1]
self.cpu_alloc.device_info.allowed_cpus = [0, 1, 2, 3]
Expand Down Expand Up @@ -163,5 +183,26 @@ def test_bind_threads(self, mock_execute_command):
mock_execute_command.assert_called()


class TestBindingSwitch(unittest.TestCase):
    """Tests for the architecture-based CPU binding on/off switch."""

    @patch('vllm_ascend.cpu_binding.platform.machine')
    def test_is_arm_cpu(self, mock_machine):
        # (reported machine string, expected ARM verdict)
        cases = [
            ("x86_64", False),
            ("aarch64", True),
            ("armv8", True),
            ("mips64", False),
        ]
        for arch, expected in cases:
            mock_machine.return_value = arch
            self.assertEqual(is_arm_cpu(), expected)

    @patch('vllm_ascend.cpu_binding.CpuAlloc')
    @patch('vllm_ascend.cpu_binding.is_arm_cpu')
    def test_bind_cpus_skip_non_arm(self, mock_is_arm_cpu, mock_cpu_alloc):
        # On non-ARM hosts, bind_cpus must return before allocating a binder.
        mock_is_arm_cpu.return_value = False
        bind_cpus(0)
        mock_cpu_alloc.assert_not_called()


if __name__ == '__main__':
unittest.main()
6 changes: 3 additions & 3 deletions tests/ut/worker/test_worker_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_init_npu_worker_normal_case(
# Setup mock behavior
mock_ops.register_dummy_fusion_op.return_value = None
mock_ascend_config = MagicMock()
mock_ascend_config.enable_cpu_binding = False
mock_ascend_config.enable_cpu_binding = True
mock_get_ascend_config.return_value = mock_ascend_config

# Import and create NPUWorker instance
Expand Down Expand Up @@ -125,7 +125,7 @@ def test_init_npu_worker_with_trust_remote_code(
self.model_config_mock.trust_remote_code = True
mock_ops.register_dummy_fusion_op.return_value = None
mock_ascend_config = MagicMock()
mock_ascend_config.enable_cpu_binding = False
mock_ascend_config.enable_cpu_binding = True
mock_get_ascend_config.return_value = mock_ascend_config

# Create NPUWorker instance
Expand Down Expand Up @@ -168,7 +168,7 @@ def test_init_npu_worker_with_custom_cache_dtype(
self.cache_config_mock.cache_dtype = "float32"
mock_ops.register_dummy_fusion_op.return_value = None
mock_ascend_config = MagicMock()
mock_ascend_config.enable_cpu_binding = False
mock_ascend_config.enable_cpu_binding = True
mock_get_ascend_config.return_value = mock_ascend_config

# Create NPUWorker instance
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/ascend_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def __init__(self, vllm_config: "VllmConfig"):
self.multistream_overlap_shared_expert = additional_config.get("multistream_overlap_shared_expert", False)
self.multistream_overlap_gate = additional_config.get("multistream_overlap_gate", False)
self.recompute_scheduler_enable = additional_config.get("recompute_scheduler_enable", False)
self.enable_cpu_binding = additional_config.get("enable_cpu_binding", False)
self.enable_cpu_binding = additional_config.get("enable_cpu_binding", True)

self.pd_tp_ratio = 1
self.pd_head_ratio = 1
Expand Down
33 changes: 31 additions & 2 deletions vllm_ascend/cpu_binding.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,29 @@
#!/usr/bin/env python3

import os
import platform
import subprocess
from collections import defaultdict

import psutil
from vllm.logger import logger

from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

ALLOWED_CPUS_PATH = "/proc/self/status"
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")


def is_arm_cpu() -> bool:
    """Return True when the host CPU is an ARM architecture.

    Known x86 machine strings return False; ``aarch64``/``arm64`` and any
    ``arm``-prefixed string return True. Unknown architectures are treated
    as non-ARM (with a warning) so CPU binding stays disabled on them.
    """
    arch = platform.machine().lower()
    if arch in {"x86_64", "amd64", "i386", "i686"}:
        return False
    if arch in {"aarch64", "arm64"} or arch.startswith("arm"):
        return True
    # Lazy %-formatting: the message is only rendered if the record is emitted.
    logger.warning("Unknown CPU architecture '%s', CPU binding will be disabled.", arch)
    return False


def execute_command(cmd: list[str]) -> tuple[str, int]:
with subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
out, _ = p.communicate(timeout=1000)
Expand Down Expand Up @@ -77,7 +90,7 @@ def get_running_npus(self) -> list[int]:
devices_list = [int(x) for x in devices_str.split(",")]
running_npu_set = set(devices_list) & running_npu_set
if not running_npu_set:
raise RuntimeError("Can not get running npu info, you can use BIND_CPU=0 to skip.")
raise RuntimeError("Can not get running npu info.")
return sorted(running_npu_set)

def parse_allowed_cpus(self) -> list[int]:
Expand Down Expand Up @@ -202,7 +215,7 @@ def handle_no_affinity(self) -> None:
npu_num_this_node = min(npu_num_per_node, num_running_npu - index)
if npu_num_this_node <= 0:
break
# Evenly distribute the CPUs of this NUMA node among npu_num_this_node NPUs.
# Distribute this NUMA node's CPUs among npu_num_this_node NPUs in a NUMA-balanced way.
total_cpu_num = len(cpus)
base_cpu_num = total_cpu_num // npu_num_this_node
extra_cpu_num = total_cpu_num % npu_num_this_node
Expand All @@ -217,9 +230,22 @@ def handle_no_affinity(self) -> None:
index += 1
start_index = end_index

# Device types with a dedicated binding strategy; anything absent here
# falls back to the default "affinity" mode.
DEVICE_BINDING_MODE = {
    AscendDeviceType.A3: "numa_balanced",
}

@classmethod
def _binding_mode(cls) -> str:
    """Return the CPU binding strategy name for the current device type."""
    return cls.DEVICE_BINDING_MODE.get(get_ascend_device_type(), "affinity")

def build_cpu_pools(self) -> None:
self.build_cpu_node_map()
if self._binding_mode() == "numa_balanced":
self.handle_no_affinity()
return
if not self.device_info.npu_affinity:
logger.warning("NPU affinity info not found, fallback to NUMA-balanced CPU binding.")
self.handle_no_affinity()
return
for npu in self.device_info.running_npu_list:
Expand Down Expand Up @@ -282,5 +308,8 @@ def run_all(self) -> None:


def bind_cpus(rank_id: int) -> None:
    """Bind the current rank's process to its allotted CPUs (ARM hosts only)."""
    if is_arm_cpu():
        CpuAlloc(rank_id).run_all()
        return
    logger.info("CPU binding skipped: non-ARM CPU detected.")