Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions tests/e2e/310p/multicard/test_dense_model_multicard.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,26 @@ def test_qwen3_dense_tp2_fp16():
]
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-8B",
tensor_parallel_size=2,
enforce_eager=True,
dtype="float16"
"Qwen/Qwen3-8B",
tensor_parallel_size=2,
enforce_eager=True,
dtype="float16",
max_model_len=16384,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)


def test_qwen3_dense_tp4_w8a8():
    """Smoke-test greedy decoding of the W8A8-quantized Qwen3-32B on 4 cards.

    Runs tensor-parallel size 4 in eager mode with Ascend quantization and a
    bounded ``max_model_len`` (NOTE(review): presumably to cap KV-cache
    allocation on 310P — confirm against runner defaults).
    """
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
        "vllm-ascend/Qwen3-32B-W8A8",
        tensor_parallel_size=4,
        enforce_eager=True,
        dtype="float16",
        quantization="ascend",
        max_model_len=16384,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
33 changes: 19 additions & 14 deletions tests/e2e/310p/multicard/test_moe_model_multicard.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,37 +24,42 @@ def test_qwen3_moe_tp4_fp16():
]
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=4,
enforce_eager=True,
dtype="float16"
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=4,
enforce_eager=True,
dtype="float16",
max_model_len=16384,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)


def test_qwen3_moe_ep4_fp16():
    """Smoke-test greedy decoding of Qwen3-30B-A3B (MoE) with expert parallelism.

    Runs tensor-parallel size 4 with ``enable_expert_parallel=True`` in eager
    mode; ``max_model_len`` is bounded (NOTE(review): presumably to cap
    KV-cache allocation on 310P — confirm).
    """
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
        "Qwen/Qwen3-30B-A3B",
        tensor_parallel_size=4,
        enforce_eager=True,
        dtype="float16",
        enable_expert_parallel=True,
        max_model_len=16384,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)


def test_qwen3_moe_tp2_w8a8():
    """Smoke-test greedy decoding of the W8A8-quantized Qwen3-30B-A3B (MoE).

    Runs tensor-parallel size 2 in eager mode with Ascend quantization and a
    bounded ``max_model_len`` (NOTE(review): presumably to cap KV-cache
    allocation on 310P — confirm).
    """
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
        "vllm-ascend/Qwen3-30B-A3B-W8A8",
        tensor_parallel_size=2,
        enforce_eager=True,
        dtype="float16",
        quantization="ascend",
        max_model_len=16384,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
21 changes: 12 additions & 9 deletions tests/e2e/310p/singlecard/test_dense_model_singlecard.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,26 @@ def test_qwen3_dense_tp1_fp16():
]
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-8B",
tensor_parallel_size=1,
enforce_eager=True,
dtype="float16"
"Qwen/Qwen3-8B",
tensor_parallel_size=1,
enforce_eager=True,
dtype="float16",
max_model_len=16384,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)


def test_qwen3_dense_tp1_w8a8():
    """Smoke-test greedy decoding of the W8A8-quantized Qwen3-8B on one card.

    Runs tensor-parallel size 1 in eager mode with Ascend quantization and a
    bounded ``max_model_len`` (NOTE(review): presumably to cap KV-cache
    allocation on 310P — confirm).
    """
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
        "vllm-ascend/Qwen3-8B-W8A8",
        tensor_parallel_size=1,
        enforce_eager=True,
        dtype="float16",
        quantization="ascend",
        max_model_len=16384,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
9 changes: 5 additions & 4 deletions tests/ut/_310p/attention/test_attention_mask_310.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@

class TestAttentionMaskBuilder310(TestBase):
def setUp(self):
    """Create the mask builder under test with a fixed maximum sequence length.

    ``self.max_seqlen`` is kept on the test case so the shape assertions in
    the individual tests can be derived from it rather than hard-coded.
    """
    self.max_seqlen = 4096
    self.attention_mask_builder = AttentionMaskBuilder310(torch.device("cpu"), self.max_seqlen)

def test_get_attention_mask_310_for_pooling_model(self):
model_config = MagicMock()
Expand All @@ -36,7 +37,7 @@ def test_get_attention_mask_310(self, mock_format_cast):
mock_format_cast.side_effect = lambda x, y: x
model_config = MagicMock()
attn_mask = self.attention_mask_builder.get_attention_mask(model_config)
self.assertEqual(attn_mask.shape, (1, 128, 2048, 16))
self.assertEqual(attn_mask.shape, (1, self.max_seqlen // 16, self.max_seqlen, 16))
self.assertEqual(attn_mask[0][-1][0][-1], torch.tensor(float("-inf"), dtype=torch.float16))

@patch("torch_npu.npu_format_cast")
Expand All @@ -47,7 +48,7 @@ def test_get_swa_mask_310(self, mock_format_cast):

sliding_window = 128
swa_mask = self.attention_mask_builder.get_swa_mask(torch.float16, sliding_window)
self.assertEqual(swa_mask.shape, (1, 128, 2048, 16))
self.assertEqual(swa_mask.shape, (1, self.max_seqlen // 16, self.max_seqlen, 16))
self.assertEqual(swa_mask[0][-1][0][-1], torch.tensor(float("-inf"), dtype=torch.float16))
self.assertEqual(swa_mask[0][0][-1][0], torch.tensor(float("-inf"), dtype=torch.float16))

Expand All @@ -58,4 +59,4 @@ def test_get_splitfuse_attn_mask_310(self, mock_format_cast):
attn_metadata.query_start_loc = torch.tensor([0, 1, 5])
attn_metadata.seq_lens = torch.tensor([7, 4])
attn_mask = self.attention_mask_builder.get_splitfuse_mask(attn_metadata, torch.device("cpu"))
self.assertEqual(attn_mask.shape, (1, 128, 16, 16))
self.assertEqual(attn_mask.shape, (1, self.max_seqlen // 16, 16, 16))
10 changes: 5 additions & 5 deletions vllm_ascend/_310p/attention/attention_mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,20 @@

class AttentionMaskBuilder310:
chunked_prefill_attn_mask = None
max_seqlen = 2048
max_seqlen = 16384

def __init__(self, device: torch.device, max_seqlen: int):
    """
    Initializes the AttentionMaskBuilder for the 310P device.

    Args:
        device (torch.device): The device on which tensors will be allocated.
        max_seqlen (int): Maximum length of a sequence (including prompt and generated text).
    """
    # NOTE(review): stored on the CLASS, not the instance — constructing a
    # second builder with a different max_seqlen silently changes it for every
    # builder in the process. Confirm only one instance exists per process, or
    # make this an instance attribute.
    AttentionMaskBuilder310.max_seqlen = max_seqlen
    # Lazily-built mask caches; populated on first use.
    self.attn_mask_cache = None
    self.device = device
    self.swa_mask = None

@staticmethod
def gen_causal_additive_mask(max_seq_len: int, device: torch.device):
Expand Down Expand Up @@ -147,8 +148,7 @@ def _get_causal_mask(self, max_seq_len: int) -> torch.Tensor:
Returns:
torch.Tensor: The cached causal mask in ACL_FORMAT_FRACTAL_NZ.
"""
if self.attn_mask_cache is None or max_seq_len > self._seq_len_cached:
if self.attn_mask_cache is None:
attn_mask = self.gen_causal_additive_mask(max_seq_len, self.device)
self.attn_mask_cache = torch_npu.npu_format_cast(nd_to_nz_2d(attn_mask), ACL_FORMAT_FRACTAL_NZ)
self._seq_len_cached = max_seq_len
return self.attn_mask_cache
1 change: 0 additions & 1 deletion vllm_ascend/_310p/attention/attention_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def __init__(self, *args, **kwargs):
Initializes the 310P backend and sets up the device-specific mask builder.
"""
super().__init__(*args, **kwargs)
self.attn_mask_builder = AttentionMaskBuilder310(self.device)

@staticmethod
def get_kv_cache_shape(num_blocks: int, block_size: int, num_kv_heads: int, head_size: int):
Expand Down
3 changes: 2 additions & 1 deletion vllm_ascend/_310p/attention/metadata_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,5 @@ def __init__(
super().__init__(kv_cache_spec, layer_names, vllm_config, device)

# Override the mask builder with the 310P-specific version
self.attn_mask_builder: Any = AttentionMaskBuilder310(self.device)
max_model_len = vllm_config.model_config.max_model_len
self.attn_mask_builder: Any = AttentionMaskBuilder310(self.device, max_model_len)
21 changes: 21 additions & 0 deletions vllm_ascend/_310p/fused_moe/fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute
from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods
from vllm_ascend.quantization.methods.base import QuantType
from vllm_ascend.utils import vllm_version_is

from .experts_selector import select_experts
from .moe_comm_method import AllGatherCommImpl310
Expand Down Expand Up @@ -153,6 +154,26 @@ def __init__(self, *args, **kwargs):

_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)

if not vllm_version_is("0.15.0"):
self.runner = self._init_runner()

if not vllm_version_is("0.15.0"):

def _init_runner(self):
    """Construct the AscendMoERunner wired to this layer's MoE components.

    The import is done locally inside the method (presumably to avoid an
    import cycle with ``vllm_ascend.ops.fused_moe.fused_moe`` — confirm).

    Returns:
        AscendMoERunner: Runner configured with this layer's router, gate,
        shared experts, quantization method and DBO setting.
    """
    from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner

    return AscendMoERunner(
        layer=self,
        moe_config=self.moe_config,
        router=self.router,
        routed_input_transform=self._routed_input_transform,
        gate=self.gate,
        shared_experts=self.shared_experts,
        quant_method=self.quant_method,
        reduce_results=self.reduce_results,
        # DBO (dual-batch overlap) flag comes from the engine's parallel config.
        enable_dbo=self.vllm_config.parallel_config.enable_dbo,
    )

def init_experts_map(self, moe_config):
"""
Initialize expert mapping for MoE (Mixture of Experts) model.
Expand Down