Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions docs/source/user_guide/configuration/additional_config.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ The following table lists additional configuration options available in vLLM Asc
| `finegrained_tp_config` | dict | `{}` | Configuration options for module tensor parallelism |
| `weight_prefetch_config` | dict | `{}` | Configuration options for weight prefetch |
| `refresh` | bool | `false` | Whether to refresh the global Ascend configuration content. This is typically used by RLHF or UT/E2E test cases. |
| `expert_map_path` | str | `None` | When using expert load balancing for an MoE model, an expert map path needs to be passed in. |
| `kv_cache_dtype` | str | `None` | When using KV cache quantization, the KV cache dtype must be set; currently only int8 is supported. |
| `expert_map_path` | str | `None` | When using expert load balancing for an MoE model, an expert map path needs to be passed in. |
| `enable_shared_expert_dp` | bool | `False` | When the expert is shared in DP, it delivers better performance but consumes more memory. Currently only DeepSeek series models are supported. |
| `lmhead_tensor_parallel_size` | int | `None` | The custom tensor parallel size of lmhead. Restriction: Can only be used when tensor_parallel=1 |
| `oproj_tensor_parallel_size` | int | `None` | The custom tensor parallel size of oproj. |
Expand Down
4 changes: 0 additions & 4 deletions tests/ut/attention/test_attention_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,6 @@ def test_get_kv_cache_shape_not_310p(self, mock_soc_version):
result = AscendAttentionBackend.get_kv_cache_shape(10, 20, 30, 40)
self.assertEqual(result, (2, 10, 20, 30, 40))

def test_get_bsh_kv_cache_shape(self):
result = AscendAttentionBackend.get_bsh_kv_cache_shape(10, 20, 30, 40)
self.assertEqual(result, (2, 10, 20, 30 * 40))

def test_swap_blocks(self):
src_kv_cache = [torch.zeros((10, 20)), torch.zeros((10, 20))]
dst_kv_cache = [torch.zeros((10, 20)), torch.zeros((10, 20))]
Expand Down
85 changes: 1 addition & 84 deletions tests/ut/quantization/test_quant_config.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
from unittest.mock import MagicMock, patch

import torch
from vllm.attention.layer import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.linear import LinearBase

from tests.ut.base import TestBase
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
from vllm_ascend.quantization.quant_config import (AscendKVCacheMethod,
AscendQuantConfig)
from vllm_ascend.quantization.quant_config import AscendQuantConfig
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD


Expand All @@ -19,7 +17,6 @@ def setUp(self):
self.sample_config = {
"weight": "INT8",
"fa_quant_type": "C8",
"kv_quant_type": "C8",
"layer1.weight": "INT8",
"layer2.weight": "FLOAT",
"fused_layer.weight": "FLOAT",
Expand Down Expand Up @@ -115,16 +112,6 @@ def test_get_quant_method_for_attention(self):
attention_layer, ".attn")
self.assertIs(method, mock_ascend_kvcache.return_value)

with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
return_value=MagicMock()) as mock_ascend_kvcache:
# Test with kv_quant_type
modified_config = {"kv_quant_type": "C8"}
config = AscendQuantConfig(modified_config)
config.packed_modules_mapping = None
method = config.get_quant_method(attention_layer, "attn")
self.assertIs(method, mock_ascend_kvcache.return_value)

def test_get_quant_method_for_fused_moe(self):
fused_moe_layer = MagicMock(spec=FusedMoE)
fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
Expand Down Expand Up @@ -169,73 +156,3 @@ def test_is_layer_skipped_ascend(self):

def test_get_scaled_act_names(self):
self.assertEqual(self.ascend_config.get_scaled_act_names(), [])


class TestAscendKVCacheMethod(TestBase):

def setUp(self):
# Setup common test fixtures
self.mock_quant_config = MagicMock(spec=AscendQuantConfig)
self.mock_quant_config.quant_description = {"kv_quant_type": "C8"}
self.prefix = "layer.attn"

# Mock quant_method
self.mock_quant_method = MagicMock()
self.patcher = patch(
'vllm_ascend.quantization.quant_config.get_quant_method')
self.mock_get_quant_method = self.patcher.start()
self.mock_get_quant_method.return_value = self.mock_quant_method

# Create instance
self.kv_cache_method = AscendKVCacheMethod(self.mock_quant_config,
self.prefix)

def tearDown(self):
self.patcher.stop()

def test_create_weights(self):
"""Test create_weights delegates to quant_method."""
mock_layer = MagicMock()
self.kv_cache_method.create_weights(mock_layer)
self.mock_quant_method.create_weights.assert_called_once_with(
mock_layer)

def test_process_weights_after_loading_with_method(self):
"""Test process_weights when quant_method has the method."""
mock_layer = MagicMock()
self.kv_cache_method.process_weights_after_loading(mock_layer)
self.mock_quant_method.process_weights_after_loading.assert_called_once_with(
mock_layer)

def test_process_weights_after_loading_without_method(self):
"""Test process_weights when quant_method lacks the method."""
# Reset mock to remove the method
del self.mock_quant_method.process_weights_after_loading
mock_layer = MagicMock()

# Should not raise exception
self.kv_cache_method.process_weights_after_loading(mock_layer)

def test_apply_delegation(self):
"""Test apply properly delegates to quant_method."""
mock_layer = MagicMock()
mock_query = torch.randn(1, 32, 128)
mock_key = torch.randn(1, 32, 128)
mock_value = torch.randn(1, 32, 128)
mock_kv_cache = MagicMock()
mock_attn_metadata = MagicMock()
mock_scale = 1.0
mock_output = torch.zeros(1, 32, 128)
mock_attn_type = MagicMock()
expected_result = torch.randn(1, 32, 128)
self.mock_quant_method.apply.return_value = expected_result

result = self.kv_cache_method.apply(mock_layer, mock_query, mock_key,
mock_value, mock_kv_cache,
mock_attn_metadata, mock_attn_type,
mock_scale, mock_output)

self.mock_quant_method.apply.assert_called_once_with(
mock_layer, mock_query, mock_key, mock_value, mock_kv_cache,
mock_attn_metadata, mock_attn_type, mock_scale, mock_output)
self.assertTrue(torch.equal(result, expected_result))
12 changes: 0 additions & 12 deletions tests/ut/quantization/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,18 +39,6 @@ def test_moe_quant_methods(self):
"moe")
self.assertIsInstance(method, cls)

def test_with_fa_quant_type(self):
quant_description = {"fa_quant_type": "C8"}
method = get_quant_method(quant_description, ".attn", "attention")
self.assertIsInstance(
method, ASCEND_QUANTIZATION_METHOD_MAP["C8"]["attention"])

def test_with_kv_quant_type(self):
quant_description = {"kv_quant_type": "C8"}
method = get_quant_method(quant_description, ".attn", "attention")
self.assertIsInstance(
method, ASCEND_QUANTIZATION_METHOD_MAP["C8"]["attention"])

def test_invalid_layer_type(self):
quant_description = {"linear_layer.weight": "W8A8"}
with self.assertRaises(NotImplementedError):
Expand Down
Loading
Loading