diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index 4ecef1f1bd4..48d1951591e 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -230,15 +230,12 @@ def test_check_and_update_config_enforce_eager_mode( @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3) - @patch("vllm_ascend.utils.update_default_aclgraph_sizes") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch( "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config" ) def test_check_and_update_config_unsupported_compilation_level( - self, mock_init_recompute, mock_init_ascend, mock_update_default, - mock_soc_version): - mock_update_default.return_value = MagicMock() + self, mock_init_recompute, mock_init_ascend, mock_soc_version): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( ) vllm_config = TestNPUPlatform.mock_vllm_config() diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 650cb4f0412..bda9c6cb238 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -30,11 +30,12 @@ from vllm_ascend.utils import refresh_block_size # isort: off -from vllm_ascend.utils import ( - ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD, - COMPILATION_PASS_KEY, AscendDeviceType, enable_sp, get_ascend_device_type, - update_aclgraph_sizes, update_cudagraph_capture_sizes, - update_default_aclgraph_sizes, check_kv_extra_config) +from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, + COMPRESSED_TENSORS_METHOD, COMPILATION_PASS_KEY, + AscendDeviceType, enable_sp, + get_ascend_device_type, update_aclgraph_sizes, + update_cudagraph_capture_sizes, + check_kv_extra_config) if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig @@ -216,10 +217,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: # set cudaprah sizes before extending `compilation_config.splitting_ops` vllm_config._set_cudagraph_sizes() - # There are cases where default cudagraph_capture_sizes are not friendly - # to ascend ops && hardwares. We update these sizes here to improve - # default performance. - update_default_aclgraph_sizes(vllm_config) # TODO delete graph size update here when compilation_config.pass_config.enable_sp # is supported by vllm-ascend. if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \ diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 0f115da8970..2a9326eddd7 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -429,61 +429,6 @@ def update_cudagraph_capture_sizes(vllm_config: VllmConfig, vllm_config.compilation_config.post_init_cudagraph_sizes() -def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool: - """ - Check whether it is vLLM default capture sizes. - """ - - max_cudagraph_capture_size = \ - vllm_config.compilation_config.max_cudagraph_capture_size - cudagraph_capture_sizes = [ - i for i in [1, 2, 4] if i <= max_cudagraph_capture_size - ] - if max_cudagraph_capture_size >= 8: - # Step size 8 for small batch sizes, up to 256(not included) - cudagraph_capture_sizes += list( - range(8, min(max_cudagraph_capture_size + 1, 256), 8)) - if max_cudagraph_capture_size >= 256: - # Step size 16 for larger batch sizes - cudagraph_capture_sizes += list( - range(256, max_cudagraph_capture_size + 1, 16)) - # in newer version, vLLM use ascending order of cudagraph_capture_sizes. - target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes) - if target_cudagraph_capture_sizes == \ - vllm_config.compilation_config.cudagraph_capture_sizes: - return True - - return False - - -def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None: - """ - Update ACL graph default capture sizes, so that new sizes - are more friendly to ascend ops && hardware. - """ - - if vllm_config.model_config is None or \ - vllm_config.model_config.enforce_eager or \ - not _is_default_capture_sizes(vllm_config): - return - - # modify the default capture_sizes for Qwen3-MoE models on dp settings. - # this is mainly because performance of _npu_paged_attention might degrades - # on special shapes. - # TODO(Angazenn): we will remove this once _npu_paged_attention is fully - # replaced by npu_fused_infer_attention_score which does not contain such bugs. - if vllm_config.model_config and vllm_config.model_config.hf_text_config.model_type == "qwen3_moe" \ - and vllm_config.parallel_config.tensor_parallel_size == 1 \ - and vllm_config.parallel_config.data_parallel_size > 1 : - - max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size - new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [ - i for i in range(24, max_capture_size + 1, 8) - ] - update_cudagraph_capture_sizes(vllm_config, - new_cudagraph_capture_sizes) - - def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: """Update ACL graph capture sizes based on hardware limitations""" # NOTE: Currently, we can only capture 1800 graphs at most,