From dbcc767fd3c96a5cb01e3ab49d1e69794a45547d Mon Sep 17 00:00:00 2001
From: leo-pony <nengjunma@outlook.com>
Date: Thu, 5 Feb 2026 14:47:31 +0800
Subject: [PATCH 1/4] Make UT does not complie and run custom kernels examples

Signed-off-by: leo-pony <nengjunma@outlook.com>
---
 .github/workflows/_unit_test.yaml     |  1 +
 setup.py                              |  8 +++++++-
 tests/ut/ops/test_layernorm.py        |  3 ++-
 tests/ut/ops/test_token_dispatcher.py | 23 +++++++++++++++++++++++
 tests/ut/test_utils.py                |  3 +++
 vllm_ascend/envs.py                   |  6 ++++++
 vllm_ascend/utils.py                  |  9 +++++++++
 vllm_ascend/worker/worker.py          |  8 ++++++++
 8 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_unit_test.yaml b/.github/workflows/_unit_test.yaml
index 289180fbc26..fb5dab4cad6 100644
--- a/.github/workflows/_unit_test.yaml
+++ b/.github/workflows/_unit_test.yaml
@@ -27,6 +27,7 @@ jobs:
         VLLM_USE_MODELSCOPE: True
         SOC_VERSION: ascend910b1
         MAX_JOBS: 4
+        COMPILE_CUSTOM_KERNELS: 0
     steps:
       - name: Install packages
         run: |
diff --git a/setup.py b/setup.py
index 3449282e473..1dea266f40e 100644
--- a/setup.py
+++ b/setup.py
@@ -163,6 +163,8 @@ def gen_build_info():
     with open(package_dir, "w+") as f:
         f.write("# Auto-generated file\n")
         f.write(f"__device_type__ = '{device_type}'\n")
+        f.write(
+            f"__compile_custom_kernels__ = {envs.COMPILE_CUSTOM_KERNELS}\n")
     logging.info(f"Generated _build_info.py with SOC version: {soc_version}")
 
 
@@ -334,6 +336,8 @@ def configure(self, ext: CMakeExtension) -> None:
         )
 
     def build_extensions(self) -> None:
+        if not envs.COMPILE_CUSTOM_KERNELS:
+            return
         # Ensure that CMake is present and working
         try:
             subprocess.check_output(["cmake", "--version"])
@@ -423,7 +427,9 @@ def run(self):
     # only checks out the commit. In this case, we set a dummy version.
     VERSION = "0.0.0"
 
-ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")]
+ext_modules = []
+if envs.COMPILE_CUSTOM_KERNELS:
+    ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")]
 
 
 def get_path(*filepath) -> str:
diff --git a/tests/ut/ops/test_layernorm.py b/tests/ut/ops/test_layernorm.py
index a86c6736d70..2a2903825b1 100644
--- a/tests/ut/ops/test_layernorm.py
+++ b/tests/ut/ops/test_layernorm.py
@@ -39,7 +39,8 @@ def default_vllm_config():
     with set_current_vllm_config(mock_config):
         yield mock_config
 
-
+@pytest.mark.skip(
+    "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
 @pytest.mark.skipif(is_310p_hw(), reason="non_310P device unittest case.")
 @pytest.mark.parametrize("residual", [None, torch.randn(4, 8, dtype=torch.float32)])
 @patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm)
diff --git a/tests/ut/ops/test_token_dispatcher.py b/tests/ut/ops/test_token_dispatcher.py
index e7577c2a6cc..4844013b392 100644
--- a/tests/ut/ops/test_token_dispatcher.py
+++ b/tests/ut/ops/test_token_dispatcher.py
@@ -17,6 +17,7 @@
 
 from unittest.mock import MagicMock, PropertyMock, patch
 
+import pytest
 import torch
 
 from tests.ut.base import TestBase
@@ -180,6 +181,8 @@ def tearDown(self):
         self.patcher_npu_moe_init_routing_custom.stop()
         self.patcher_npu_moe_token_unpermute.stop()
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_token_dispatch_without_expert_map(self):
         hidden_states = torch.randn(3, 128)
         topk_weights = torch.tensor([[0.7, 0.3], [0.6, 0.4], [0.5, 0.5]])
@@ -194,6 +197,8 @@ def test_token_dispatch_without_expert_map(self):
 
         self.assertEqual(results.group_list_type, 1)
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_token_dispatch_with_expert_map(self):
         self.dispatcher.expert_map = torch.tensor([0, 1, 2, 3])
         hidden_states = torch.randn(3, 128)
@@ -209,6 +214,8 @@ def test_token_dispatch_with_expert_map(self):
 
         self.assertEqual(results.group_list_type, 1)
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_token_dispatch_without_quant(self):
         kwargs = {
             "apply_router_weight_on_input": False,
@@ -229,6 +236,8 @@ def test_token_dispatch_without_quant(self):
 
         self.assertEqual(results.group_list_type, 1)
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_token_dispatch_with_quant(self):
         kwargs = {
             "apply_router_weight_on_input": False,
@@ -254,6 +263,8 @@ def test_token_dispatch_with_quant(self):
         self.assertIsNotNone(results.dynamic_scale)
         self.assertEqual(results.group_list_type, 1)
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_token_combine_with_expert_map(self):
         hidden_states = torch.randn(6, 128)
         context_metadata = {
@@ -265,6 +276,8 @@ def test_token_combine_with_expert_map(self):
             hidden_states, context_metadata).routed_out
         self.assertEqual(final_hidden_states.shape, (6, 128))
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_token_combine_without_expert_map(self):
         hidden_states = torch.randn(6, 128)
         context_metadata = {
@@ -277,6 +290,8 @@ def test_token_combine_without_expert_map(self):
         self.mock_npu_moe_token_unpermute.assert_called_once()
         self.assertEqual(final_hidden_states.shape, (6, 128))
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_token_dispatch_with_router_weight(self):
         self.dispatcher.apply_router_weight_on_input = True
         hidden_states = torch.randn(3, 128)
@@ -381,6 +396,8 @@ def setUp(self):
                                                       num_local_experts=2,
                                                       with_quant=False)
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_token_dispatch(self):
         hidden_states = torch.randn(8, 16)
         topk_weights = torch.rand(8, 4)
@@ -400,6 +417,8 @@ def test_token_dispatch(self):
         self.assertIsNotNone(result.group_list)
         self.assertEqual(result.group_list_type, 1)
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_token_combine(self):
         hidden_states = torch.randn(16, 16)
         context_metadata = {
@@ -419,6 +438,8 @@ def test_token_combine(self):
         self.assertIsNotNone(output)
         self.assertEqual(output.routed_out.shape, (8, 16))
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_token_dispatch_with_quant(self):
         self.dispatcher = TokenDispatcherWithAll2AllV(top_k=2,
                                                       num_experts=4,
@@ -444,6 +465,8 @@ def test_token_dispatch_with_quant(self):
         self.assertIsNotNone(result.dynamic_scale)
         self.assertEqual(result.group_list_type, 1)
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_token_dispatch_with_quant_no_active_tokens(self):
         self.dispatcher = TokenDispatcherWithAll2AllV(top_k=2,
                                                       num_experts=4,
diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py
index 9e4bd623f6a..f90502c336e 100644
--- a/tests/ut/test_utils.py
+++ b/tests/ut/test_utils.py
@@ -18,6 +18,7 @@
 from threading import Lock
 from unittest import mock
 
+import pytest
 import torch
 from vllm.config import (CompilationConfig, ModelConfig, ParallelConfig,
                          VllmConfig)
@@ -104,6 +105,8 @@ def test_aligned_16(self):
         output_tensor = utils.aligned_16(input_tensor)
         self.assertEqual(output_tensor.shape[0], 32)
 
+    @pytest.mark.skip(
+        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
     def test_enable_custom_op(self):
         result = utils.enable_custom_op()
         self.assertTrue(result)
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 6fb90aeacd0..ec0993de2dd 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -35,6 +35,12 @@
     # The build type of the package. It can be one of the following values:
     # Release, Debug, RelWithDebugInfo. If not set, the default value is Release.
     "CMAKE_BUILD_TYPE": lambda: os.getenv("CMAKE_BUILD_TYPE"),
+    # Whether to compile custom kernels. If not set, the default value is True.
+    # If set to False, the custom kernels will not be compiled. Please note that
+    # the sleep mode feature will be disabled as well if custom kernels are not
+    # compiled.
+    "COMPILE_CUSTOM_KERNELS":
+    lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
     # The CXX compiler used for compiling the package. If not set, the default
     # value is None, which means the system default CXX compiler will be used.
     "CXX_COMPILER": lambda: os.getenv("CXX_COMPILER", None),
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 9aadfb66da9..5f80d6aa604 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -53,6 +53,7 @@
 ACL_FORMAT_FRACTAL_NZ = 29
 
 _CUSTOM_OP_ENABLED = None
+_COMPILE_CUSTOM_KERNELS = None
 _CURRENT_STREAM = None
 _PREFETCH_STREAM = None
 _WEIGHT_PREFETCH_METHOD = None
@@ -151,6 +152,14 @@ def maybe_trans_nz(weight: torch.Tensor):
         return torch_npu.npu_format_cast(weight, ACL_FORMAT_FRACTAL_NZ)
 
 
+def custom_kernels_compiled():
+    global _COMPILE_CUSTOM_KERNELS
+    if _COMPILE_CUSTOM_KERNELS is None:
+        from vllm_ascend import _build_info  # type: ignore
+        _COMPILE_CUSTOM_KERNELS = _build_info.__compile_custom_kernels__
+    return _COMPILE_CUSTOM_KERNELS
+
+
 def _round_up(x: int, align: int):
     # round up x to align, for example, if align is 16, x will be rounded up to 16, 32, 48, etc.
     # input: 15, 16 -> output: 16
diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index 27cc3ca50c0..36e77a1da77 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -183,6 +183,10 @@ def uninstall_static_kernel(self):
 
 
     def sleep(self, level: int = 1) -> None:
+        if not custom_kernels_compiled():
+            raise ValueError(
+                "Sleep mode needs custom kernels. "
+                "Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1.")
         free_bytes_before_sleep = torch.npu.mem_get_info()[0]
         # Save the buffers before level 2 sleep
         if level == 2:
@@ -203,6 +207,10 @@ def sleep(self, level: int = 1) -> None:
             used_bytes / GiB_bytes)
 
     def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        if not custom_kernels_compiled():
+            raise ValueError(
+                "Sleep mode needs custom kernels. "
+                "Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1.")
         if envs_ascend.VLLM_ASCEND_ENABLE_NZ:
             raise ValueError(
                 "FRACTAL_NZ mode is enabled. This may cause model parameter precision issues "

From 7c534ea7f9d452425f102c78e403addf80a1bcf8 Mon Sep 17 00:00:00 2001
From: leo-pony <nengjunma@outlook.com>
Date: Thu, 5 Feb 2026 14:55:55 +0800
Subject: [PATCH 2/4] Update ut/lint cann version to 8.5.0

Signed-off-by: leo-pony <nengjunma@outlook.com>
---
 .github/workflows/pr_test_light.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 7a5eb5b9dfe..74fb6eb6dcd 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -92,7 +92,7 @@ jobs:
     with:
       vllm: ${{ matrix.vllm_version }}
       runner: linux-amd64-cpu-8-hk
-      image: quay.nju.edu.cn/ascend/cann:8.2.rc2-910b-ubuntu22.04-py3.11
+      image: quay.nju.edu.cn/ascend/cann:8.5.0-910b-ubuntu22.04-py3.11
       type: pr
 
   e2e-light:

From 60e5ecd2c09c39e84cbd35d0f73034209463394a Mon Sep 17 00:00:00 2001
From: leo-pony <nengjunma@outlook.com>
Date: Thu, 5 Feb 2026 16:08:25 +0800
Subject: [PATCH 3/4] Remove sleep model custom kernel compile check

Signed-off-by: leo-pony <nengjunma@outlook.com>
---
 setup.py                     |  2 --
 vllm_ascend/envs.py          |  7 ++++---
 vllm_ascend/utils.py         |  9 ---------
 vllm_ascend/worker/worker.py | 14 ++++++--------
 4 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/setup.py b/setup.py
index 1dea266f40e..8e7ffbd183e 100644
--- a/setup.py
+++ b/setup.py
@@ -163,8 +163,6 @@ def gen_build_info():
     with open(package_dir, "w+") as f:
         f.write("# Auto-generated file\n")
         f.write(f"__device_type__ = '{device_type}'\n")
-        f.write(
-            f"__compile_custom_kernels__ = {envs.COMPILE_CUSTOM_KERNELS}\n")
     logging.info(f"Generated _build_info.py with SOC version: {soc_version}")
 
 
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index ec0993de2dd..1bb18de0cdc 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -36,9 +36,10 @@
     # Release, Debug, RelWithDebugInfo. If not set, the default value is Release.
     "CMAKE_BUILD_TYPE": lambda: os.getenv("CMAKE_BUILD_TYPE"),
     # Whether to compile custom kernels. If not set, the default value is True.
-    # If set to False, the custom kernels will not be compiled. Please note that
-    # the sleep mode feature will be disabled as well if custom kernels are not
-    # compiled.
+    # If set to False, the custom kernels will not be compiled.
+    # This configuration option should only be set to False when running UT
+    # scenarios in an environment without an NPU. Do not set it to False in
+    # other scenarios.
     "COMPILE_CUSTOM_KERNELS":
     lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
     # The CXX compiler used for compiling the package. If not set, the default
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 5f80d6aa604..9aadfb66da9 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -53,7 +53,6 @@
 ACL_FORMAT_FRACTAL_NZ = 29
 
 _CUSTOM_OP_ENABLED = None
-_COMPILE_CUSTOM_KERNELS = None
 _CURRENT_STREAM = None
 _PREFETCH_STREAM = None
 _WEIGHT_PREFETCH_METHOD = None
@@ -152,14 +151,6 @@ def maybe_trans_nz(weight: torch.Tensor):
         return torch_npu.npu_format_cast(weight, ACL_FORMAT_FRACTAL_NZ)
 
 
-def custom_kernels_compiled():
-    global _COMPILE_CUSTOM_KERNELS
-    if _COMPILE_CUSTOM_KERNELS is None:
-        from vllm_ascend import _build_info  # type: ignore
-        _COMPILE_CUSTOM_KERNELS = _build_info.__compile_custom_kernels__
-    return _COMPILE_CUSTOM_KERNELS
-
-
 def _round_up(x: int, align: int):
     # round up x to align, for example, if align is 16, x will be rounded up to 16, 32, 48, etc.
     # input: 15, 16 -> output: 16
diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index 36e77a1da77..e01fb41c7ed 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -88,6 +88,12 @@ def __init__(
             # Additional parameters for compatibility with vllm
             **kwargs):
         """Initialize the worker for Ascend."""
+        if not envs_ascend.COMPILE_CUSTOM_KERNELS:
+            logger.warning(
+                "COMPILE_CUSTOM_KERNELS is set to False. "
+                "In most scenarios, without custom kernels, vllm-ascend will not function correctly."
+            )
+
         # register patch for vllm
         from vllm_ascend.utils import adapt_patch
         adapt_patch()
@@ -183,10 +189,6 @@ def uninstall_static_kernel(self):
 
 
     def sleep(self, level: int = 1) -> None:
-        if not custom_kernels_compiled():
-            raise ValueError(
-                "Sleep mode needs custom kernels. "
-                "Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1.")
         free_bytes_before_sleep = torch.npu.mem_get_info()[0]
         # Save the buffers before level 2 sleep
         if level == 2:
@@ -207,10 +209,6 @@ def sleep(self, level: int = 1) -> None:
             used_bytes / GiB_bytes)
 
     def wake_up(self, tags: Optional[list[str]] = None) -> None:
-        if not custom_kernels_compiled():
-            raise ValueError(
-                "Sleep mode needs custom kernels. "
-                "Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1.")
         if envs_ascend.VLLM_ASCEND_ENABLE_NZ:
             raise ValueError(
                 "FRACTAL_NZ mode is enabled. This may cause model parameter precision issues "

From 8111c21f389f3fc8dade7e0390422bf1a77aee09 Mon Sep 17 00:00:00 2001
From: leo-pony <nengjunma@outlook.com>
Date: Thu, 5 Feb 2026 16:44:06 +0800
Subject: [PATCH 4/4] Fix format

Signed-off-by: leo-pony <nengjunma@outlook.com>
---
 vllm_ascend/envs.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 1bb18de0cdc..05e4131bf48 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -40,8 +40,7 @@
     # This configuration option should only be set to False when running UT
     # scenarios in an environment without an NPU. Do not set it to False in
     # other scenarios.
-    "COMPILE_CUSTOM_KERNELS":
-    lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
+    "COMPILE_CUSTOM_KERNELS": lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
     # The CXX compiler used for compiling the package. If not set, the default
     # value is None, which means the system default CXX compiler will be used.
     "CXX_COMPILER": lambda: os.getenv("CXX_COMPILER", None),