vllm-project · wangxiyuan · Dec 19, 2025 · Dec 17, 2025 · Dec 18, 2025
@@ -30,8 +30,7 @@ The following table lists additional configuration options available in vLLM Asc
 | `finegrained_tp_config`                  | dict | `{}`    | Configuration options for module tensor parallelism                                                                                             |
 | `weight_prefetch_config`            | dict | `{}`    | Configuration options for weight prefetch                                                                                                        |
 | `refresh`                           | bool | `false` | Whether to refresh global Ascend configuration content. This is usually used by rlhf or ut/e2e test case.                                      |
-| `expert_map_path`                   | str  | `None`  | When using expert load balancing for an MoE model, an expert map path needs to be passed in.                                                 |
-| `kv_cache_dtype`                    | str  | `None`  | When using the KV cache quantization method, KV cache dtype needs to be set, currently only int8 is supported.                                |
+| `expert_map_path`                   | str  | `None`  | When using expert load balancing for an MoE model, an expert map path needs to be passed in.                                                 |                              |
 | `enable_shared_expert_dp`           | bool | `False` | When the expert is shared in DP, it delivers better performance but consumes more memory. Currently only DeepSeek series models are supported. |
 | `lmhead_tensor_parallel_size`       | int  | `None`  | The custom tensor parallel size of lmhead. Restriction: Can only be used when tensor_parallel=1                                                                                                   |
 | `oproj_tensor_parallel_size`        | int  | `None`  | The custom tensor parallel size of oproj.                                                                                                     |

@@ -31,10 +31,6 @@ def test_get_kv_cache_shape_not_310p(self, mock_soc_version):
         result = AscendAttentionBackend.get_kv_cache_shape(10, 20, 30, 40)
         self.assertEqual(result, (2, 10, 20, 30, 40))
 
-    def test_get_bsh_kv_cache_shape(self):
-        result = AscendAttentionBackend.get_bsh_kv_cache_shape(10, 20, 30, 40)
-        self.assertEqual(result, (2, 10, 20, 30 * 40))
-
     def test_swap_blocks(self):
         src_kv_cache = [torch.zeros((10, 20)), torch.zeros((10, 20))]
         dst_kv_cache = [torch.zeros((10, 20)), torch.zeros((10, 20))]

@@ -1,15 +1,13 @@
 from unittest.mock import MagicMock, patch
 
-import torch
 from vllm.attention.layer import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
 from vllm.model_executor.layers.linear import LinearBase
 
 from tests.ut.base import TestBase
 from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
-from vllm_ascend.quantization.quant_config import (AscendKVCacheMethod,
-                                                   AscendQuantConfig)
+from vllm_ascend.quantization.quant_config import AscendQuantConfig
 from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
 
 
@@ -19,7 +17,6 @@ def setUp(self):
         self.sample_config = {
             "weight": "INT8",
             "fa_quant_type": "C8",
-            "kv_quant_type": "C8",
             "layer1.weight": "INT8",
             "layer2.weight": "FLOAT",
             "fused_layer.weight": "FLOAT",
@@ -115,16 +112,6 @@ def test_get_quant_method_for_attention(self):
                 attention_layer, ".attn")
             self.assertIs(method, mock_ascend_kvcache.return_value)
 
-        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
-            patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
-                   return_value=MagicMock()) as mock_ascend_kvcache:
-            # Test with kv_quant_type
-            modified_config = {"kv_quant_type": "C8"}
-            config = AscendQuantConfig(modified_config)
-            config.packed_modules_mapping = None
-            method = config.get_quant_method(attention_layer, "attn")
-            self.assertIs(method, mock_ascend_kvcache.return_value)
-
     def test_get_quant_method_for_fused_moe(self):
         fused_moe_layer = MagicMock(spec=FusedMoE)
         fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
@@ -169,73 +156,3 @@ def test_is_layer_skipped_ascend(self):
 
     def test_get_scaled_act_names(self):
         self.assertEqual(self.ascend_config.get_scaled_act_names(), [])
-
-
-class TestAscendKVCacheMethod(TestBase):
-
-    def setUp(self):
-        # Setup common test fixtures
-        self.mock_quant_config = MagicMock(spec=AscendQuantConfig)
-        self.mock_quant_config.quant_description = {"kv_quant_type": "C8"}
-        self.prefix = "layer.attn"
-
-        # Mock quant_method
-        self.mock_quant_method = MagicMock()
-        self.patcher = patch(
-            'vllm_ascend.quantization.quant_config.get_quant_method')
-        self.mock_get_quant_method = self.patcher.start()
-        self.mock_get_quant_method.return_value = self.mock_quant_method
-
-        # Create instance
-        self.kv_cache_method = AscendKVCacheMethod(self.mock_quant_config,
-                                                   self.prefix)
-
-    def tearDown(self):
-        self.patcher.stop()
-
-    def test_create_weights(self):
-        """Test create_weights delegates to quant_method."""
-        mock_layer = MagicMock()
-        self.kv_cache_method.create_weights(mock_layer)
-        self.mock_quant_method.create_weights.assert_called_once_with(
-            mock_layer)
-
-    def test_process_weights_after_loading_with_method(self):
-        """Test process_weights when quant_method has the method."""
-        mock_layer = MagicMock()
-        self.kv_cache_method.process_weights_after_loading(mock_layer)
-        self.mock_quant_method.process_weights_after_loading.assert_called_once_with(
-            mock_layer)
-
-    def test_process_weights_after_loading_without_method(self):
-        """Test process_weights when quant_method lacks the method."""
-        # Reset mock to remove the method
-        del self.mock_quant_method.process_weights_after_loading
-        mock_layer = MagicMock()
-
-        # Should not raise exception
-        self.kv_cache_method.process_weights_after_loading(mock_layer)
-
-    def test_apply_delegation(self):
-        """Test apply properly delegates to quant_method."""
-        mock_layer = MagicMock()
-        mock_query = torch.randn(1, 32, 128)
-        mock_key = torch.randn(1, 32, 128)
-        mock_value = torch.randn(1, 32, 128)
-        mock_kv_cache = MagicMock()
-        mock_attn_metadata = MagicMock()
-        mock_scale = 1.0
-        mock_output = torch.zeros(1, 32, 128)
-        mock_attn_type = MagicMock()
-        expected_result = torch.randn(1, 32, 128)
-        self.mock_quant_method.apply.return_value = expected_result
-
-        result = self.kv_cache_method.apply(mock_layer, mock_query, mock_key,
-                                            mock_value, mock_kv_cache,
-                                            mock_attn_metadata, mock_attn_type,
-                                            mock_scale, mock_output)
-
-        self.mock_quant_method.apply.assert_called_once_with(
-            mock_layer, mock_query, mock_key, mock_value, mock_kv_cache,
-            mock_attn_metadata, mock_attn_type, mock_scale, mock_output)
-        self.assertTrue(torch.equal(result, expected_result))
@@ -39,18 +39,6 @@ def test_moe_quant_methods(self):
                                           "moe")
                 self.assertIsInstance(method, cls)
 
-    def test_with_fa_quant_type(self):
-        quant_description = {"fa_quant_type": "C8"}
-        method = get_quant_method(quant_description, ".attn", "attention")
-        self.assertIsInstance(
-            method, ASCEND_QUANTIZATION_METHOD_MAP["C8"]["attention"])
-
-    def test_with_kv_quant_type(self):
-        quant_description = {"kv_quant_type": "C8"}
-        method = get_quant_method(quant_description, ".attn", "attention")
-        self.assertIsInstance(
-            method, ASCEND_QUANTIZATION_METHOD_MAP["C8"]["attention"])
-
     def test_invalid_layer_type(self):
         quant_description = {"linear_layer.weight": "W8A8"}
         with self.assertRaises(NotImplementedError):