From dbcc767fd3c96a5cb01e3ab49d1e69794a45547d Mon Sep 17 00:00:00 2001 From: leo-pony Date: Thu, 5 Feb 2026 14:47:31 +0800 Subject: [PATCH 1/4] Make UT not compile or run custom kernel examples Signed-off-by: leo-pony --- .github/workflows/_unit_test.yaml | 1 + setup.py | 8 +++++++- tests/ut/ops/test_layernorm.py | 3 ++- tests/ut/ops/test_token_dispatcher.py | 23 +++++++++++++++++++++++ tests/ut/test_utils.py | 3 +++ vllm_ascend/envs.py | 6 ++++++ vllm_ascend/utils.py | 9 +++++++++ vllm_ascend/worker/worker.py | 8 ++++++++ 8 files changed, 59 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_unit_test.yaml b/.github/workflows/_unit_test.yaml index 289180fbc26..fb5dab4cad6 100644 --- a/.github/workflows/_unit_test.yaml +++ b/.github/workflows/_unit_test.yaml @@ -27,6 +27,7 @@ jobs: VLLM_USE_MODELSCOPE: True SOC_VERSION: ascend910b1 MAX_JOBS: 4 + COMPILE_CUSTOM_KERNELS: 0 steps: - name: Install packages run: | diff --git a/setup.py b/setup.py index 3449282e473..1dea266f40e 100644 --- a/setup.py +++ b/setup.py @@ -163,6 +163,8 @@ def gen_build_info(): with open(package_dir, "w+") as f: f.write("# Auto-generated file\n") f.write(f"__device_type__ = '{device_type}'\n") + f.write( + f"__compile_custom_kernels__ = {envs.COMPILE_CUSTOM_KERNELS}\n") logging.info(f"Generated _build_info.py with SOC version: {soc_version}") @@ -334,6 +336,8 @@ def configure(self, ext: CMakeExtension) -> None: ) def build_extensions(self) -> None: + if not envs.COMPILE_CUSTOM_KERNELS: + return # Ensure that CMake is present and working try: subprocess.check_output(["cmake", "--version"]) @@ -423,7 +427,9 @@ def run(self): # only checks out the commit. In this case, we set a dummy version. 
VERSION = "0.0.0" -ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")] +ext_modules = [] +if envs.COMPILE_CUSTOM_KERNELS: + ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")] def get_path(*filepath) -> str: diff --git a/tests/ut/ops/test_layernorm.py b/tests/ut/ops/test_layernorm.py index a86c6736d70..2a2903825b1 100644 --- a/tests/ut/ops/test_layernorm.py +++ b/tests/ut/ops/test_layernorm.py @@ -39,7 +39,8 @@ def default_vllm_config(): with set_current_vllm_config(mock_config): yield mock_config - +@pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") @pytest.mark.skipif(is_310p_hw(), reason="non_310P device unittest case.") @pytest.mark.parametrize("residual", [None, torch.randn(4, 8, dtype=torch.float32)]) @patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm) diff --git a/tests/ut/ops/test_token_dispatcher.py b/tests/ut/ops/test_token_dispatcher.py index e7577c2a6cc..4844013b392 100644 --- a/tests/ut/ops/test_token_dispatcher.py +++ b/tests/ut/ops/test_token_dispatcher.py @@ -17,6 +17,7 @@ from unittest.mock import MagicMock, PropertyMock, patch +import pytest import torch from tests.ut.base import TestBase @@ -180,6 +181,8 @@ def tearDown(self): self.patcher_npu_moe_init_routing_custom.stop() self.patcher_npu_moe_token_unpermute.stop() + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_token_dispatch_without_expert_map(self): hidden_states = torch.randn(3, 128) topk_weights = torch.tensor([[0.7, 0.3], [0.6, 0.4], [0.5, 0.5]]) @@ -194,6 +197,8 @@ def test_token_dispatch_without_expert_map(self): self.assertEqual(results.group_list_type, 1) + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_token_dispatch_with_expert_map(self): self.dispatcher.expert_map = torch.tensor([0, 1, 2, 3]) hidden_states = torch.randn(3, 128) @@ -209,6 +214,8 @@ def test_token_dispatch_with_expert_map(self): 
self.assertEqual(results.group_list_type, 1) + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_token_dispatch_without_quant(self): kwargs = { "apply_router_weight_on_input": False, @@ -229,6 +236,8 @@ def test_token_dispatch_without_quant(self): self.assertEqual(results.group_list_type, 1) + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_token_dispatch_with_quant(self): kwargs = { "apply_router_weight_on_input": False, @@ -254,6 +263,8 @@ def test_token_dispatch_with_quant(self): self.assertIsNotNone(results.dynamic_scale) self.assertEqual(results.group_list_type, 1) + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_token_combine_with_expert_map(self): hidden_states = torch.randn(6, 128) context_metadata = { @@ -265,6 +276,8 @@ def test_token_combine_with_expert_map(self): hidden_states, context_metadata).routed_out self.assertEqual(final_hidden_states.shape, (6, 128)) + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_token_combine_without_expert_map(self): hidden_states = torch.randn(6, 128) context_metadata = { @@ -277,6 +290,8 @@ def test_token_combine_without_expert_map(self): self.mock_npu_moe_token_unpermute.assert_called_once() self.assertEqual(final_hidden_states.shape, (6, 128)) + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_token_dispatch_with_router_weight(self): self.dispatcher.apply_router_weight_on_input = True hidden_states = torch.randn(3, 128) @@ -381,6 +396,8 @@ def setUp(self): num_local_experts=2, with_quant=False) + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_token_dispatch(self): hidden_states = torch.randn(8, 16) topk_weights = torch.rand(8, 4) @@ -400,6 +417,8 @@ def test_token_dispatch(self): self.assertIsNotNone(result.group_list) 
self.assertEqual(result.group_list_type, 1) + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_token_combine(self): hidden_states = torch.randn(16, 16) context_metadata = { @@ -419,6 +438,8 @@ def test_token_combine(self): self.assertIsNotNone(output) self.assertEqual(output.routed_out.shape, (8, 16)) + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_token_dispatch_with_quant(self): self.dispatcher = TokenDispatcherWithAll2AllV(top_k=2, num_experts=4, @@ -444,6 +465,8 @@ def test_token_dispatch_with_quant(self): self.assertIsNotNone(result.dynamic_scale) self.assertEqual(result.group_list_type, 1) + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_token_dispatch_with_quant_no_active_tokens(self): self.dispatcher = TokenDispatcherWithAll2AllV(top_k=2, num_experts=4, diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py index 9e4bd623f6a..f90502c336e 100644 --- a/tests/ut/test_utils.py +++ b/tests/ut/test_utils.py @@ -18,6 +18,7 @@ from threading import Lock from unittest import mock +import pytest import torch from vllm.config import (CompilationConfig, ModelConfig, ParallelConfig, VllmConfig) @@ -104,6 +105,8 @@ def test_aligned_16(self): output_tensor = utils.aligned_16(input_tensor) self.assertEqual(output_tensor.shape[0], 32) + @pytest.mark.skip( + "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.") def test_enable_custom_op(self): result = utils.enable_custom_op() self.assertTrue(result) diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index 6fb90aeacd0..ec0993de2dd 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -35,6 +35,12 @@ # The build type of the package. It can be one of the following values: # Release, Debug, RelWithDebugInfo. If not set, the default value is Release. "CMAKE_BUILD_TYPE": lambda: os.getenv("CMAKE_BUILD_TYPE"), + # Whether to compile custom kernels. 
If not set, the default value is True. + # If set to False, the custom kernels will not be compiled. Please note that + # the sleep mode feature will be disabled as well if custom kernels are not + # compiled. + "COMPILE_CUSTOM_KERNELS": + lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))), # The CXX compiler used for compiling the package. If not set, the default # value is None, which means the system default CXX compiler will be used. "CXX_COMPILER": lambda: os.getenv("CXX_COMPILER", None), diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 9aadfb66da9..5f80d6aa604 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -53,6 +53,7 @@ ACL_FORMAT_FRACTAL_NZ = 29 _CUSTOM_OP_ENABLED = None +_COMPILE_CUSTOM_KERNELS = None _CURRENT_STREAM = None _PREFETCH_STREAM = None _WEIGHT_PREFETCH_METHOD = None @@ -151,6 +152,14 @@ def maybe_trans_nz(weight: torch.Tensor): return torch_npu.npu_format_cast(weight, ACL_FORMAT_FRACTAL_NZ) +def custom_kernels_compiled(): + global _COMPILE_CUSTOM_KERNELS + if _COMPILE_CUSTOM_KERNELS is None: + from vllm_ascend import _build_info # type: ignore + _COMPILE_CUSTOM_KERNELS = _build_info.__compile_custom_kernels__ + return _COMPILE_CUSTOM_KERNELS + + def _round_up(x: int, align: int): # round up x to align, for example, if align is 16, x will be rounded up to 16, 32, 48, etc. # input: 15, 16 -> output: 16 diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 27cc3ca50c0..36e77a1da77 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -183,6 +183,10 @@ def uninstall_static_kernel(self): def sleep(self, level: int = 1) -> None: + if not custom_kernels_compiled(): + raise ValueError( + "Sleep mode needs custom kernels. 
" + "Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1.") free_bytes_before_sleep = torch.npu.mem_get_info()[0] # Save the buffers before level 2 sleep if level == 2: @@ -203,6 +207,10 @@ def sleep(self, level: int = 1) -> None: used_bytes / GiB_bytes) def wake_up(self, tags: Optional[list[str]] = None) -> None: + if not custom_kernels_compiled(): + raise ValueError( + "Sleep mode needs custom kernels. " + "Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1.") if envs_ascend.VLLM_ASCEND_ENABLE_NZ: raise ValueError( "FRACTAL_NZ mode is enabled. This may cause model parameter precision issues " From 7c534ea7f9d452425f102c78e403addf80a1bcf8 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Thu, 5 Feb 2026 14:55:55 +0800 Subject: [PATCH 2/4] Update ut/lint cann version to 8.5.0 Signed-off-by: leo-pony --- .github/workflows/pr_test_light.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 7a5eb5b9dfe..74fb6eb6dcd 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -92,7 +92,7 @@ jobs: with: vllm: ${{ matrix.vllm_version }} runner: linux-amd64-cpu-8-hk - image: quay.nju.edu.cn/ascend/cann:8.2.rc2-910b-ubuntu22.04-py3.11 + image: quay.nju.edu.cn/ascend/cann:8.5.0-910b-ubuntu22.04-py3.11 type: pr e2e-light: From 60e5ecd2c09c39e84cbd35d0f73034209463394a Mon Sep 17 00:00:00 2001 From: leo-pony Date: Thu, 5 Feb 2026 16:08:25 +0800 Subject: [PATCH 3/4] Remove sleep mode custom kernel compile check Signed-off-by: leo-pony --- setup.py | 2 -- vllm_ascend/envs.py | 7 ++++--- vllm_ascend/utils.py | 9 --------- vllm_ascend/worker/worker.py | 14 ++++++-------- 4 files changed, 10 insertions(+), 22 deletions(-) diff --git a/setup.py b/setup.py index 1dea266f40e..8e7ffbd183e 100644 --- a/setup.py +++ b/setup.py @@ -163,8 +163,6 @@ def gen_build_info(): with open(package_dir, "w+") as f: f.write("# Auto-generated file\n") 
f.write(f"__device_type__ = '{device_type}'\n") - f.write( - f"__compile_custom_kernels__ = {envs.COMPILE_CUSTOM_KERNELS}\n") logging.info(f"Generated _build_info.py with SOC version: {soc_version}") diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index ec0993de2dd..1bb18de0cdc 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -36,9 +36,10 @@ # Release, Debug, RelWithDebugInfo. If not set, the default value is Release. "CMAKE_BUILD_TYPE": lambda: os.getenv("CMAKE_BUILD_TYPE"), # Whether to compile custom kernels. If not set, the default value is True. - # If set to False, the custom kernels will not be compiled. Please note that - # the sleep mode feature will be disabled as well if custom kernels are not - # compiled. + # If set to False, the custom kernels will not be compiled. + # This configuration option should only be set to False when running UT + # scenarios in an environment without an NPU. Do not set it to False in + # other scenarios. "COMPILE_CUSTOM_KERNELS": lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))), # The CXX compiler used for compiling the package. If not set, the default diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 5f80d6aa604..9aadfb66da9 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -53,7 +53,6 @@ ACL_FORMAT_FRACTAL_NZ = 29 _CUSTOM_OP_ENABLED = None -_COMPILE_CUSTOM_KERNELS = None _CURRENT_STREAM = None _PREFETCH_STREAM = None _WEIGHT_PREFETCH_METHOD = None @@ -152,14 +151,6 @@ def maybe_trans_nz(weight: torch.Tensor): return torch_npu.npu_format_cast(weight, ACL_FORMAT_FRACTAL_NZ) -def custom_kernels_compiled(): - global _COMPILE_CUSTOM_KERNELS - if _COMPILE_CUSTOM_KERNELS is None: - from vllm_ascend import _build_info # type: ignore - _COMPILE_CUSTOM_KERNELS = _build_info.__compile_custom_kernels__ - return _COMPILE_CUSTOM_KERNELS - - def _round_up(x: int, align: int): # round up x to align, for example, if align is 16, x will be rounded up to 16, 32, 48, etc. 
# input: 15, 16 -> output: 16 diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 36e77a1da77..e01fb41c7ed 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -88,6 +88,12 @@ def __init__( # Additional parameters for compatibility with vllm **kwargs): """Initialize the worker for Ascend.""" + if not envs_ascend.COMPILE_CUSTOM_KERNELS: + logger.warning( + "COMPILE_CUSTOM_KERNELS is set to False. " + "In most scenarios, without custom kernels, vllm-ascend will not function correctly." + ) + # register patch for vllm from vllm_ascend.utils import adapt_patch adapt_patch() @@ -183,10 +189,6 @@ def uninstall_static_kernel(self): def sleep(self, level: int = 1) -> None: - if not custom_kernels_compiled(): - raise ValueError( - "Sleep mode needs custom kernels. " - "Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1.") free_bytes_before_sleep = torch.npu.mem_get_info()[0] # Save the buffers before level 2 sleep if level == 2: @@ -207,10 +209,6 @@ def sleep(self, level: int = 1) -> None: used_bytes / GiB_bytes) def wake_up(self, tags: Optional[list[str]] = None) -> None: - if not custom_kernels_compiled(): - raise ValueError( - "Sleep mode needs custom kernels. " - "Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1.") if envs_ascend.VLLM_ASCEND_ENABLE_NZ: raise ValueError( "FRACTAL_NZ mode is enabled. This may cause model parameter precision issues " From 8111c21f389f3fc8dade7e0390422bf1a77aee09 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Thu, 5 Feb 2026 16:44:06 +0800 Subject: [PATCH 4/4] Fix format Signed-off-by: leo-pony --- vllm_ascend/envs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index 1bb18de0cdc..05e4131bf48 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -40,8 +40,7 @@ # This configuration option should only be set to False when running UT # scenarios in an environment without an NPU. 
Do not set it to False in # other scenarios. - "COMPILE_CUSTOM_KERNELS": - lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))), + "COMPILE_CUSTOM_KERNELS": lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))), # The CXX compiler used for compiling the package. If not set, the default # value is None, which means the system default CXX compiler will be used. "CXX_COMPILER": lambda: os.getenv("CXX_COMPILER", None),