5 changes: 1 addition & 4 deletions tests/ut/test_platform.py
@@ -230,15 +230,12 @@ def test_check_and_update_config_enforce_eager_mode(
 
     @patch("vllm_ascend.utils.get_ascend_device_type",
            return_value=AscendDeviceType.A3)
-    @patch("vllm_ascend.utils.update_default_aclgraph_sizes")
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch(
         "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
     )
     def test_check_and_update_config_unsupported_compilation_level(
-            self, mock_init_recompute, mock_init_ascend, mock_update_default,
-            mock_soc_version):
-        mock_update_default.return_value = MagicMock()
+            self, mock_init_recompute, mock_init_ascend, mock_soc_version):
         mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config(
         )
         vllm_config = TestNPUPlatform.mock_vllm_config()
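Reviewer note: stacked `@patch` decorators feed mocks to the test method bottom-up, which is why dropping the `update_default_aclgraph_sizes` patch removes exactly one parameter (`mock_update_default`) from the signature above. A minimal, self-contained sketch of that ordering rule (the patch targets here are arbitrary stdlib examples, not from this repo):

```python
import os.path
from unittest import TestCase, main
from unittest.mock import MagicMock, patch


class TestPatchOrdering(TestCase):

    @patch("os.path.exists")    # outermost decorator -> last mock argument
    @patch("os.path.basename")  # innermost decorator -> first mock argument
    def test_order(self, mock_basename: MagicMock,
                   mock_exists: MagicMock) -> None:
        # Each mock replaces its target only for the duration of this test.
        mock_basename.return_value = "stub"
        mock_exists.return_value = True
        self.assertTrue(os.path.exists("/nonexistent"))
        self.assertEqual(os.path.basename("/any/where"), "stub")


if __name__ == "__main__":
    main()
```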
15 changes: 6 additions & 9 deletions vllm_ascend/platform.py
@@ -30,11 +30,12 @@
 from vllm_ascend.utils import refresh_block_size
 
 # isort: off
-from vllm_ascend.utils import (
-    ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD,
-    COMPILATION_PASS_KEY, AscendDeviceType, enable_sp, get_ascend_device_type,
-    update_aclgraph_sizes, update_cudagraph_capture_sizes,
-    update_default_aclgraph_sizes, check_kv_extra_config)
+from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
+                               COMPRESSED_TENSORS_METHOD, COMPILATION_PASS_KEY,
+                               AscendDeviceType, enable_sp,
+                               get_ascend_device_type, update_aclgraph_sizes,
+                               update_cudagraph_capture_sizes,
+                               check_kv_extra_config)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -216,10 +217,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:

         # set cudagraph sizes before extending `compilation_config.splitting_ops`
         vllm_config._set_cudagraph_sizes()
-        # There are cases where the default cudagraph_capture_sizes are not friendly
-        # to Ascend ops and hardware. We update these sizes here to improve
-        # default performance.
-        update_default_aclgraph_sizes(vllm_config)
         # TODO delete graph size update here when compilation_config.pass_config.enable_sp
         # is supported by vllm-ascend.
         if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \
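Context for the deletion above: `update_default_aclgraph_sizes` (removed from `vllm_ascend/utils.py` below) only rewrote the capture sizes when they still matched vLLM's defaults, as checked by `_is_default_capture_sizes`. That default ladder can be reproduced standalone for a quick sanity check; a minimal sketch using only the rules visible in the deleted code:

```python
def vllm_default_capture_sizes(max_size: int) -> list[int]:
    """Default ladder mirrored by the deleted _is_default_capture_sizes:
    1/2/4, then step 8 up to 256 (exclusive), then step 16, ascending."""
    sizes = [i for i in [1, 2, 4] if i <= max_size]
    if max_size >= 8:
        # Step size 8 for small batch sizes, up to 256 (not included)
        sizes += list(range(8, min(max_size + 1, 256), 8))
    if max_size >= 256:
        # Step size 16 for larger batch sizes
        sizes += list(range(256, max_size + 1, 16))
    return sorted(sizes)


print(vllm_default_capture_sizes(48))
# [1, 2, 4, 8, 16, 24, 32, 40, 48]
```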
55 changes: 0 additions & 55 deletions vllm_ascend/utils.py
@@ -429,61 +429,6 @@ def update_cudagraph_capture_sizes(vllm_config: VllmConfig,
     vllm_config.compilation_config.post_init_cudagraph_sizes()
 
 
-def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
-    """
-    Check whether the configured sizes are vLLM's default capture sizes.
-    """
-
-    max_cudagraph_capture_size = \
-        vllm_config.compilation_config.max_cudagraph_capture_size
-    cudagraph_capture_sizes = [
-        i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
-    ]
-    if max_cudagraph_capture_size >= 8:
-        # Step size 8 for small batch sizes, up to 256 (not included)
-        cudagraph_capture_sizes += list(
-            range(8, min(max_cudagraph_capture_size + 1, 256), 8))
-    if max_cudagraph_capture_size >= 256:
-        # Step size 16 for larger batch sizes
-        cudagraph_capture_sizes += list(
-            range(256, max_cudagraph_capture_size + 1, 16))
-    # In newer versions, vLLM uses ascending order for cudagraph_capture_sizes.
-    target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
-    if target_cudagraph_capture_sizes == \
-            vllm_config.compilation_config.cudagraph_capture_sizes:
-        return True
-
-    return False
-
-
-def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
-    """
-    Update the default ACL graph capture sizes, so that the new sizes
-    are more friendly to Ascend ops and hardware.
-    """
-
-    if vllm_config.model_config is None or \
-            vllm_config.model_config.enforce_eager or \
-            not _is_default_capture_sizes(vllm_config):
-        return
-
-    # Modify the default capture_sizes for Qwen3-MoE models on DP settings.
-    # This is mainly because the performance of _npu_paged_attention might degrade
-    # on special shapes.
-    # TODO(Angazenn): we will remove this once _npu_paged_attention is fully
-    # replaced by npu_fused_infer_attention_score, which does not contain such bugs.
-    if vllm_config.model_config and vllm_config.model_config.hf_text_config.model_type == "qwen3_moe" \
-            and vllm_config.parallel_config.tensor_parallel_size == 1 \
-            and vllm_config.parallel_config.data_parallel_size > 1:
-
-        max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size
-        new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [
-            i for i in range(24, max_capture_size + 1, 8)
-        ]
-        update_cudagraph_capture_sizes(vllm_config,
-                                       new_cudagraph_capture_sizes)
-
-
 def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     """Update ACL graph capture sizes based on hardware limitations"""
     # NOTE: Currently, we can only capture 1800 graphs at most,
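For the record, the removed Qwen3-MoE special case (TP=1, DP>1) swapped that default ladder for denser small sizes before stepping by 8. A worked example, assuming an illustrative `max_cudagraph_capture_size` of 64 (not a value taken from this diff):

```python
max_capture_size = 64  # illustrative value, not from the diff

# Override built by the removed update_default_aclgraph_sizes:
# dense small sizes first, then step 8 from 24 up to the maximum.
override = [1, 2, 5, 10, 15, 20] + list(range(24, max_capture_size + 1, 8))
print(override)
# [1, 2, 5, 10, 15, 20, 24, 32, 40, 48, 56, 64]
```

Per the deleted TODO, these sizes worked around shapes on which `_npu_paged_attention` degraded; with the helper gone, the stock vLLM defaults presumably apply again.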