Merged
Changes from all commits
Commits
73 commits
bd906fb
set_weight_prefetch_method in model runner v2
Ronald1995 Feb 26, 2026
71db8cb
Merge branch 'main' into acl_graph
Ronald1995 Mar 4, 2026
00e2104
fix eagle import error
Ronald1995 Mar 4, 2026
d328742
update supported vllm version
Ronald1995 Mar 4, 2026
55692de
adapt to build_attn_metadata
Ronald1995 Mar 4, 2026
9dcc2f7
fix AclGraphManager args missing
Ronald1995 Mar 4, 2026
812083f
fix AscendSampler args missing
Ronald1995 Mar 4, 2026
955c3e8
fix sampler error
Ronald1995 Mar 4, 2026
7e391b9
fix add_request error
Ronald1995 Mar 4, 2026
7269858
fix prepare_prefill_inputs args error
Ronald1995 Mar 4, 2026
08b0843
fix set_stream error
Ronald1995 Mar 4, 2026
c76287b
support full decode mode
Ronald1995 Mar 5, 2026
bca6fe1
change calling method of forward_context attribute
Ronald1995 Mar 5, 2026
28cd2ae
fix flash_comm_v1_enabled
Ronald1995 Mar 5, 2026
f2ce655
refactor ExtraForwardContext
Ronald1995 Mar 5, 2026
631bac1
fix moe_comm_method import error
Ronald1995 Mar 5, 2026
7871920
patch mem_get_info
Ronald1995 Mar 5, 2026
16f5593
fix capture_graph args error
Ronald1995 Mar 5, 2026
d930945
fix prepare_capture_inputs_wrapper error
Ronald1995 Mar 6, 2026
266e02b
minor fix of prepare_capture_inputs_wrapper
Ronald1995 Mar 6, 2026
b893bd9
update full graph params
Ronald1995 Mar 6, 2026
0609590
fix cudagraph_manager args error
Ronald1995 Mar 6, 2026
b97fe83
fix get_forward_context error
Ronald1995 Mar 7, 2026
81bc2fc
add comment
Ronald1995 Mar 7, 2026
35c61c2
fix position error
Ronald1995 Mar 7, 2026
7cf1e92
fix positions error
Ronald1995 Mar 7, 2026
9404394
fix wrapper error
Ronald1995 Mar 7, 2026
7d55a62
fix wrapper error
Ronald1995 Mar 7, 2026
5e20140
fix wrapper error
Ronald1995 Mar 7, 2026
b318c05
fix attn_backend error
Ronald1995 Mar 7, 2026
779f37b
fix attn_backends error
Ronald1995 Mar 7, 2026
de977fa
fix error of getting is_draft_model
Ronald1995 Mar 7, 2026
2debe73
fix graph params error
Ronald1995 Mar 7, 2026
4e8c725
change need_capture to needs_capture
Ronald1995 Mar 7, 2026
ab0102f
_pad_query_start_loc_for_fia
Ronald1995 Mar 9, 2026
de05436
fix query_start_loc definition error
Ronald1995 Mar 9, 2026
77572d4
fix query_start_loc_np error
Ronald1995 Mar 9, 2026
a562840
fix arange_np error
Ronald1995 Mar 9, 2026
0a79ba0
fix error of num_reqs_pad
Ronald1995 Mar 9, 2026
29526bd
fix slot_mapping dtype error
Ronald1995 Mar 10, 2026
2eaeee0
change BlockTable to BlockTables
Ronald1995 Mar 10, 2026
24e40fb
fix seq_lens_np error
Ronald1995 Mar 10, 2026
ac065d1
add log
Ronald1995 Mar 10, 2026
bd5a82c
Merge branch 'main' into acl_graph
Ronald1995 Mar 10, 2026
8eecb60
resolve merge conflict and add e2e test for graph mode
Ronald1995 Mar 10, 2026
4e60fbc
remove use_mrope args in AclGraphManager
Ronald1995 Mar 10, 2026
ada08e4
fix is_draft_model missing error
Ronald1995 Mar 10, 2026
547d862
implement model_state
Ronald1995 Mar 10, 2026
816d3e9
remove extra input_embeds in AscendInputBatch
Ronald1995 Mar 10, 2026
dc40c5c
add dcp_local_seq_lens in AscendInputBatch
Ronald1995 Mar 10, 2026
d307a5c
add dcp_local_seq_lens arg in build_attn_metadata
Ronald1995 Mar 10, 2026
2c3e4bb
add model_state arg in capture_graph
Ronald1995 Mar 10, 2026
41765ef
remove input_embeds and mrope_positions in capture_graph
Ronald1995 Mar 10, 2026
80b668c
fix ruff error
Ronald1995 Mar 10, 2026
507ba2d
fix ruff error
Ronald1995 Mar 10, 2026
cf1df41
fix ruff and isort error
Ronald1995 Mar 11, 2026
13dcbab
fix markdownlint error
Ronald1995 Mar 11, 2026
7e1f701
remove unused forward_context
Ronald1995 Mar 11, 2026
fe40211
make attention_v1 same as main branch
Ronald1995 Mar 11, 2026
0c927f9
fix mypy error
Ronald1995 Mar 11, 2026
8e6cba3
fix mypy error
Ronald1995 Mar 11, 2026
043d1b1
set_mc2_tokens_capacity in model runner
Ronald1995 Mar 11, 2026
e75399e
fix isort error
Ronald1995 Mar 11, 2026
4373d98
fix mypy error
Ronald1995 Mar 11, 2026
ae6ea0d
mock ascend_forward_context.get_forward_context in ut
Ronald1995 Mar 11, 2026
9702fd8
fix ut of acl_graph
Ronald1995 Mar 11, 2026
a0993bb
fix ut error
Ronald1995 Mar 11, 2026
02131f8
fix ut error
Ronald1995 Mar 11, 2026
c59ea67
refine ExtraForwardContext
Ronald1995 Mar 12, 2026
ce07e8c
add comment
Ronald1995 Mar 12, 2026
1e09364
Merge branch 'main' into acl_graph
Ronald1995 Mar 12, 2026
95d1517
Merge branch 'main' into acl_graph
Ronald1995 Mar 12, 2026
e0fe30d
Merge branch 'main' into acl_graph
Ronald1995 Mar 12, 2026
@@ -184,7 +184,7 @@ def test_token_dispatcher_with_all_gather_quant(
):
context_mock = MagicMock()
context_mock.fused_moe_state = 0
with patch("vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context",
with patch("vllm_ascend.ascend_forward_context.get_forward_context",
return_value=context_mock):
a = torch.randn((m, k), device=device, dtype=dtype) / 10
w1 = torch.randn((e, k, 2 * n), device=device, dtype=torch.int8)
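The unit-test hunks in this PR all make the same change: get_forward_context is now patched at its defining module, vllm_ascend.ascend_forward_context, instead of at each import site (moe_mlp, attention_v1, mla_v1, and so on). A minimal sketch of the updated mocking pattern, using only the standard unittest.mock API; the helper name and the context field set below mirror the first hunk and are illustrative, not code from this PR:

from unittest.mock import MagicMock, patch

# Patch get_forward_context where it is defined, so modules that look it up
# through vllm_ascend.ascend_forward_context at call time see the same mock.
def with_mocked_forward_context(fn, *args, **kwargs):
    context_mock = MagicMock()
    context_mock.fused_moe_state = 0  # field taken from the hunk above; illustrative value
    with patch("vllm_ascend.ascend_forward_context.get_forward_context",
               return_value=context_mock):
        return fn(*args, **kwargs)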
26 changes: 26 additions & 0 deletions tests/e2e/singlecard/model_runner_v2/test_basic.py
@@ -85,3 +85,29 @@ def test_egale_spec_decoding(
},
) as runner:
runner.model.generate(prompts, sampling_params)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("compilation_config", [{"cudagraph_mode": "FULL_DECODE_ONLY"}, {}])
@patch.dict(os.environ, {"VLLM_USE_V2_MODEL_RUNNER": "1"})
def test_qwen3_dense_graph_mode(
    model: str,
    max_tokens: int,
    enforce_eager: bool,
    compilation_config: dict,
) -> None:
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
    with VllmRunner(
        model,
        max_model_len=1024,
        enforce_eager=enforce_eager,
        compilation_config=compilation_config,  # consume the parametrized graph-mode config
    ) as runner:
        runner.model.generate(prompts, sampling_params)
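For reference, the graph-mode switch that this test parametrizes can also be driven directly through vLLM's compilation_config. A hedged sketch, assuming vllm.LLM accepts a compilation_config dict (as VllmRunner appears to forward above) and that "FULL_DECODE_ONLY" is a valid cudagraph_mode in the targeted vLLM version; the model name is a placeholder, not taken from this PR:

import os

from vllm import LLM, SamplingParams

# Mirrors the patch.dict in the test above: opt in to model runner v2.
os.environ["VLLM_USE_V2_MODEL_RUNNER"] = "1"

# "FULL_DECODE_ONLY" restricts graph capture to decode-only batches;
# an empty dict (the second parametrize case) keeps the default mode.
llm = LLM(
    model="Qwen/Qwen3-0.6B",  # placeholder; MODELS in the test file defines the real list
    max_model_len=1024,
    enforce_eager=False,
    compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
)
outputs = llm.generate(
    ["Hello, my name is"],
    SamplingParams(max_tokens=32, temperature=0.0),
)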
8 changes: 4 additions & 4 deletions tests/ut/_310p/attention/test_attention_v1_310.py
@@ -74,7 +74,7 @@ def setUp(self):

@patch("torch_npu._npu_reshape_and_cache")
@patch("torch_npu._npu_flash_attention")
@patch("vllm_ascend.attention.attention_v1.get_forward_context")
@patch("vllm_ascend.ascend_forward_context.get_forward_context")
def test_forward_prefill_310(
self, mock_get_forward_context, mock_npu_npu_flash_attention, mock_npu_reshape_and_cache
):
@@ -105,7 +105,7 @@ def test_forward_prefill_310(
@patch("torch_npu.npu_format_cast", return_value=torch.randn((1, 128, 16, 16), dtype=torch.float16))
@patch("torch_npu._npu_reshape_and_cache")
@patch("torch_npu._npu_paged_attention_splitfuse")
@patch("vllm_ascend.attention.attention_v1.get_forward_context")
@patch("vllm_ascend.ascend_forward_context.get_forward_context")
def test_forward_chunked_prefill_310(
self,
mock_get_forward_context,
@@ -140,7 +140,7 @@ def test_forward_chunked_prefill_310(
@patch("torch_npu.npu_format_cast", return_value=torch.randn((1, 128, 16, 16), dtype=torch.float16))
@patch("torch_npu._npu_reshape_and_cache")
@patch("torch_npu._npu_paged_attention_splitfuse")
@patch("vllm_ascend.attention.attention_v1.get_forward_context")
@patch("vllm_ascend.ascend_forward_context.get_forward_context")
def test_forward_prefill_cache_hit_310(
self,
mock_get_forward_context,
@@ -175,7 +175,7 @@ def test_forward_prefill_cache_hit_310(
@patch("vllm_ascend.attention.attention_v1.using_paged_attention")
@patch("torch_npu._npu_paged_attention")
@patch("torch_npu._npu_reshape_and_cache")
@patch("vllm_ascend.attention.attention_v1.get_forward_context")
@patch("vllm_ascend.ascend_forward_context.get_forward_context")
def test_forward_paged_attention_310(
self, mock_get_forward_context, mock_npu_reshape_and_cache, mock_paged_attention, mock_using_paged_attention
):
2 changes: 1 addition & 1 deletion tests/ut/attention/test_attention_cp.py
@@ -95,7 +95,7 @@ def mock_attention_with_nomask_and_mask(q, k_mask, **kwargs):
@patch('torch_npu.npu_attention_update')
@patch("torch_npu.npu_fused_infer_attention_score")
@patch(
'vllm_ascend.attention.context_parallel.attention_cp.get_forward_context'
'vllm_ascend.ascend_forward_context.get_forward_context'
)
@patch_distributed_groups(dcp_size=2, pcp_size=2)
def test_forward_decode_pcp_dcp(self, mock_all2all, mock_dcp, mock_pcp,
8 changes: 4 additions & 4 deletions tests/ut/attention/test_attention_v1.py
@@ -212,7 +212,7 @@ def test_forward_no_attn_metadata(self):

@patch('torch_npu._npu_reshape_and_cache')
@patch('torch_npu.npu_fused_infer_attention_score')
@patch('vllm_ascend.attention.attention_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
def test_forward_fused_infer_attention(
self, mock_get_forward_context,
mock_npu_fused_infer_attention_score, mock_npu_reshape_and_cache):
@@ -248,7 +248,7 @@ def test_forward_fused_infer_attention(
@patch('vllm_ascend.attention.attention_v1.using_paged_attention')
@patch('torch_npu._npu_paged_attention')
@patch('torch_npu._npu_reshape_and_cache')
@patch('vllm_ascend.attention.attention_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
def test_forward_paged_attention(self, mock_get_forward_context,
mock_npu_reshape_and_cache,
mock_paged_attention,
@@ -279,7 +279,7 @@ def test_forward_paged_attention(self, mock_get_forward_context,
mock_paged_attention.assert_called_once()
assert output.shape == (4, 8 * 64)

@patch('vllm_ascend.attention.attention_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
@patch('torch_npu.npu_fused_infer_attention_score')
@patch('torch_npu._npu_reshape_and_cache')
def test_forward_decode_only_swa(self, mock_npu_reshape_and_cache,
@@ -311,7 +311,7 @@ def test_forward_decode_only_swa(self, mock_npu_reshape_and_cache,
mock_fused_infer_attention_score.assert_called_once()
assert output.shape == (10, 8, 64)

@patch('vllm_ascend.attention.attention_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
@patch('torch_npu._npu_paged_attention')
@patch('torch_npu.npu_fused_infer_attention_score')
@patch('torch_npu._npu_reshape_and_cache')
2 changes: 1 addition & 1 deletion tests/ut/attention/test_mla_cp.py
@@ -449,7 +449,7 @@ def test_process_attn_out_lse(self):
self.assertEqual(result.shape[1], N)
self.assertEqual(result.shape[2], self.impl.kv_lora_rank + 1)

@patch('vllm_ascend.attention.context_parallel.mla_cp.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
@patch("torch_npu.npu_fused_infer_attention_score")
@patch('torch_npu.npu_attention_update')
@patch_distributed_groups(dcp_size=2, pcp_size=2, needs_mocks=False)
4 changes: 2 additions & 2 deletions tests/ut/attention/test_mla_v1.py
@@ -929,7 +929,7 @@ def test_compute_prefill_context(self, mock_ring, mock_load):
self.assertEqual(out.shape, prefix_out.shape)
self.assertEqual(lse.shape, prefix_lse.shape)

@patch('vllm_ascend.attention.mla_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
@patch("vllm_ascend.attention.mla_v1.AscendMLAImpl._v_up_proj")
@patch("torch_npu.npu_fused_infer_attention_score")
def test_forward_decode_without_graph(self,
@@ -1095,7 +1095,7 @@ def test_exec_kv_decode(self, mock_kv_rmsnorm_rope_cache):
self.assertEqual(k_pe.shape[-1], self.impl.qk_rope_head_dim)
self.assertEqual(k_nope.shape[-1], self.impl.kv_lora_rank)

@patch('vllm_ascend.attention.mla_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
@patch("torch_npu.npu_fused_infer_attention_score")
def test_forward_decode(self, mock_npu_fused_infer_attention_score,
mock_get_forward_context):