Merged
Changes from all commits
Commits
73 commits
bd906fb
set_weight_prefetch_method in model runner v2
Ronald1995 Feb 26, 2026
71db8cb
Merge branch 'main' into acl_graph
Ronald1995 Mar 4, 2026
00e2104
fix eagle import error
Ronald1995 Mar 4, 2026
d328742
update supported vllm version
Ronald1995 Mar 4, 2026
55692de
adapt to build_attn_metadata
Ronald1995 Mar 4, 2026
9dcc2f7
fix AclGraphManager args missing
Ronald1995 Mar 4, 2026
812083f
fix AscendSampler args missing
Ronald1995 Mar 4, 2026
955c3e8
fix sampler error
Ronald1995 Mar 4, 2026
7e391b9
fix add_request error
Ronald1995 Mar 4, 2026
7269858
fix prepare_prefill_inputs args error
Ronald1995 Mar 4, 2026
08b0843
fix set_stream error
Ronald1995 Mar 4, 2026
c76287b
support full decode mode
Ronald1995 Mar 5, 2026
bca6fe1
change calling method of forward_context attribute
Ronald1995 Mar 5, 2026
28cd2ae
fix flash_comm_v1_enabled
Ronald1995 Mar 5, 2026
f2ce655
refactor ExtraForwardContext
Ronald1995 Mar 5, 2026
631bac1
fix moe_comm_method import error
Ronald1995 Mar 5, 2026
7871920
patch mem_get_info
Ronald1995 Mar 5, 2026
16f5593
fix capture_graph args error
Ronald1995 Mar 5, 2026
d930945
fix prepare_capture_inputs_wrapper error
Ronald1995 Mar 6, 2026
266e02b
minor fix of prepare_capture_inputs_wrapper
Ronald1995 Mar 6, 2026
b893bd9
update full graph params
Ronald1995 Mar 6, 2026
0609590
fix cudagraph_manager args error
Ronald1995 Mar 6, 2026
b97fe83
fix get_forward_context error
Ronald1995 Mar 7, 2026
81bc2fc
add comment
Ronald1995 Mar 7, 2026
35c61c2
fix position error
Ronald1995 Mar 7, 2026
7cf1e92
fix positions error
Ronald1995 Mar 7, 2026
9404394
fix wrapper error
Ronald1995 Mar 7, 2026
7d55a62
fix wrapper error
Ronald1995 Mar 7, 2026
5e20140
fix wrapper error
Ronald1995 Mar 7, 2026
b318c05
fix attn_backend error
Ronald1995 Mar 7, 2026
779f37b
fix attn_backends error
Ronald1995 Mar 7, 2026
de977fa
fix error of getting is_draft_model
Ronald1995 Mar 7, 2026
2debe73
fix graph params error
Ronald1995 Mar 7, 2026
4e8c725
change need_capture to needs_capture
Ronald1995 Mar 7, 2026
ab0102f
_pad_query_start_loc_for_fia
Ronald1995 Mar 9, 2026
de05436
fix query_start_loc definition error
Ronald1995 Mar 9, 2026
77572d4
fix query_start_loc_np error
Ronald1995 Mar 9, 2026
a562840
fix arange_np error
Ronald1995 Mar 9, 2026
0a79ba0
fix error of num_reqs_pad
Ronald1995 Mar 9, 2026
29526bd
fix slot_mapping dtype error
Ronald1995 Mar 10, 2026
2eaeee0
change BlockTable to BlockTables
Ronald1995 Mar 10, 2026
24e40fb
fix seq_lens_np error
Ronald1995 Mar 10, 2026
ac065d1
add log
Ronald1995 Mar 10, 2026
bd5a82c
Merge branch 'main' into acl_graph
Ronald1995 Mar 10, 2026
8eecb60
resolve merge conflict and add e2e test for graph mode
Ronald1995 Mar 10, 2026
4e60fbc
remove use_mrope args in AclGraphManager
Ronald1995 Mar 10, 2026
ada08e4
fix is_draft_model missing error
Ronald1995 Mar 10, 2026
547d862
implement model_state
Ronald1995 Mar 10, 2026
816d3e9
remove extra input_embeds in AscendInputBatch
Ronald1995 Mar 10, 2026
dc40c5c
add dcp_local_seq_lens in AscendInputBatch
Ronald1995 Mar 10, 2026
d307a5c
add dcp_local_seq_lens arg in build_attn_metadata
Ronald1995 Mar 10, 2026
2c3e4bb
add model_state arg in capture_graph
Ronald1995 Mar 10, 2026
41765ef
remove input_embeds and mrope_positions in capture_graph
Ronald1995 Mar 10, 2026
80b668c
fix ruff error
Ronald1995 Mar 10, 2026
507ba2d
fix ruff error
Ronald1995 Mar 10, 2026
cf1df41
fix ruff and isort error
Ronald1995 Mar 11, 2026
13dcbab
fix markdownlint error
Ronald1995 Mar 11, 2026
7e1f701
remove unused forward_context
Ronald1995 Mar 11, 2026
fe40211
make attention_v1 same as main branch
Ronald1995 Mar 11, 2026
0c927f9
fix mypy error
Ronald1995 Mar 11, 2026
8e6cba3
fix mypy error
Ronald1995 Mar 11, 2026
043d1b1
set_mc2_tokens_capacity in model runner
Ronald1995 Mar 11, 2026
e75399e
fix isort error
Ronald1995 Mar 11, 2026
4373d98
fix mypy error
Ronald1995 Mar 11, 2026
ae6ea0d
mock ascend_forward_context.get_forward_context in ut
Ronald1995 Mar 11, 2026
9702fd8
fix ut of acl_graph
Ronald1995 Mar 11, 2026
a0993bb
fix ut error
Ronald1995 Mar 11, 2026
02131f8
fix ut error
Ronald1995 Mar 11, 2026
c59ea67
refine ExtraForwardContext
Ronald1995 Mar 12, 2026
ce07e8c
add comment
Ronald1995 Mar 12, 2026
1e09364
Merge branch 'main' into acl_graph
Ronald1995 Mar 12, 2026
95d1517
Merge branch 'main' into acl_graph
Ronald1995 Mar 12, 2026
e0fe30d
Merge branch 'main' into acl_graph
Ronald1995 Mar 12, 2026
@@ -184,7 +184,7 @@ def test_token_dispatcher_with_all_gather_quant(
):
context_mock = MagicMock()
context_mock.fused_moe_state = 0
with patch("vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context",
with patch("vllm_ascend.ascend_forward_context.get_forward_context",
return_value=context_mock):
a = torch.randn((m, k), device=device, dtype=dtype) / 10
w1 = torch.randn((e, k, 2 * n), device=device, dtype=torch.int8)
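The unit-test hunks in this PR all make the same change: get_forward_context is now patched at its defining module, vllm_ascend.ascend_forward_context, instead of at each import site (moe_mlp, attention_v1, mla_v1, and so on). A minimal sketch of the updated mocking pattern, using only the standard unittest.mock API; the helper name and the context field set below mirror the first hunk and are illustrative, not code from this PR:

from unittest.mock import MagicMock, patch

# Patch get_forward_context where it is defined, so modules that look it up
# through vllm_ascend.ascend_forward_context at call time see the same mock.
def with_mocked_forward_context(fn, *args, **kwargs):
    context_mock = MagicMock()
    context_mock.fused_moe_state = 0  # field taken from the hunk above; illustrative value
    with patch("vllm_ascend.ascend_forward_context.get_forward_context",
               return_value=context_mock):
        return fn(*args, **kwargs)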
26 changes: 26 additions & 0 deletions tests/e2e/singlecard/model_runner_v2/test_basic.py
@@ -85,3 +85,29 @@ def test_egale_spec_decoding(
},
) as runner:
runner.model.generate(prompts, sampling_params)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("compilation_config", [{"cudagraph_mode": "FULL_DECODE_ONLY"}, {}])
@patch.dict(os.environ, {"VLLM_USE_V2_MODEL_RUNNER": "1"})
def test_qwen3_dense_graph_mode(
    model: str,
    max_tokens: int,
    enforce_eager: bool,
    compilation_config: dict,
) -> None:
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
    with VllmRunner(
        model,
        max_model_len=1024,
        enforce_eager=enforce_eager,
        compilation_config=compilation_config,  # consume the parametrized graph-mode config
    ) as runner:
        runner.model.generate(prompts, sampling_params)
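For reference, the graph-mode switch that this test parametrizes can also be driven directly through vLLM's compilation_config. A hedged sketch, assuming vllm.LLM accepts a compilation_config dict (as VllmRunner appears to forward above) and that "FULL_DECODE_ONLY" is a valid cudagraph_mode in the targeted vLLM version; the model name is a placeholder, not taken from this PR:

import os

from vllm import LLM, SamplingParams

# Mirrors the patch.dict in the test above: opt in to model runner v2.
os.environ["VLLM_USE_V2_MODEL_RUNNER"] = "1"

# "FULL_DECODE_ONLY" restricts graph capture to decode-only batches;
# an empty dict (the second parametrize case) keeps the default mode.
llm = LLM(
    model="Qwen/Qwen3-0.6B",  # placeholder; MODELS in the test file defines the real list
    max_model_len=1024,
    enforce_eager=False,
    compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
)
outputs = llm.generate(
    ["Hello, my name is"],
    SamplingParams(max_tokens=32, temperature=0.0),
)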
8 changes: 4 additions & 4 deletions tests/ut/_310p/attention/test_attention_v1_310.py
@@ -74,7 +74,7 @@ def setUp(self):

@patch("torch_npu._npu_reshape_and_cache")
@patch("torch_npu._npu_flash_attention")
@patch("vllm_ascend.attention.attention_v1.get_forward_context")
@patch("vllm_ascend.ascend_forward_context.get_forward_context")
def test_forward_prefill_310(
self, mock_get_forward_context, mock_npu_npu_flash_attention, mock_npu_reshape_and_cache
):
@@ -105,7 +105,7 @@ def test_forward_prefill_310(
@patch("torch_npu.npu_format_cast", return_value=torch.randn((1, 128, 16, 16), dtype=torch.float16))
@patch("torch_npu._npu_reshape_and_cache")
@patch("torch_npu._npu_paged_attention_splitfuse")
@patch("vllm_ascend.attention.attention_v1.get_forward_context")
@patch("vllm_ascend.ascend_forward_context.get_forward_context")
def test_forward_chunked_prefill_310(
self,
mock_get_forward_context,
@@ -140,7 +140,7 @@ def test_forward_chunked_prefill_310(
@patch("torch_npu.npu_format_cast", return_value=torch.randn((1, 128, 16, 16), dtype=torch.float16))
@patch("torch_npu._npu_reshape_and_cache")
@patch("torch_npu._npu_paged_attention_splitfuse")
@patch("vllm_ascend.attention.attention_v1.get_forward_context")
@patch("vllm_ascend.ascend_forward_context.get_forward_context")
def test_forward_prefill_cache_hit_310(
self,
mock_get_forward_context,
@@ -175,7 +175,7 @@ def test_forward_prefill_cache_hit_310(
@patch("vllm_ascend.attention.attention_v1.using_paged_attention")
@patch("torch_npu._npu_paged_attention")
@patch("torch_npu._npu_reshape_and_cache")
@patch("vllm_ascend.attention.attention_v1.get_forward_context")
@patch("vllm_ascend.ascend_forward_context.get_forward_context")
def test_forward_paged_attention_310(
self, mock_get_forward_context, mock_npu_reshape_and_cache, mock_paged_attention, mock_using_paged_attention
):
2 changes: 1 addition & 1 deletion tests/ut/attention/test_attention_cp.py
@@ -95,7 +95,7 @@ def mock_attention_with_nomask_and_mask(q, k_mask, **kwargs):
@patch('torch_npu.npu_attention_update')
@patch("torch_npu.npu_fused_infer_attention_score")
@patch(
'vllm_ascend.attention.context_parallel.attention_cp.get_forward_context'
'vllm_ascend.ascend_forward_context.get_forward_context'
)
@patch_distributed_groups(dcp_size=2, pcp_size=2)
def test_forward_decode_pcp_dcp(self, mock_all2all, mock_dcp, mock_pcp,
8 changes: 4 additions & 4 deletions tests/ut/attention/test_attention_v1.py
@@ -212,7 +212,7 @@ def test_forward_no_attn_metadata(self):

@patch('torch_npu._npu_reshape_and_cache')
@patch('torch_npu.npu_fused_infer_attention_score')
@patch('vllm_ascend.attention.attention_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
def test_forward_fused_infer_attention(
self, mock_get_forward_context,
mock_npu_fused_infer_attention_score, mock_npu_reshape_and_cache):
@@ -248,7 +248,7 @@ def test_forward_fused_infer_attention(
@patch('vllm_ascend.attention.attention_v1.using_paged_attention')
@patch('torch_npu._npu_paged_attention')
@patch('torch_npu._npu_reshape_and_cache')
@patch('vllm_ascend.attention.attention_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
def test_forward_paged_attention(self, mock_get_forward_context,
mock_npu_reshape_and_cache,
mock_paged_attention,
@@ -279,7 +279,7 @@ def test_forward_paged_attention(self, mock_get_forward_context,
mock_paged_attention.assert_called_once()
assert output.shape == (4, 8 * 64)

@patch('vllm_ascend.attention.attention_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
@patch('torch_npu.npu_fused_infer_attention_score')
@patch('torch_npu._npu_reshape_and_cache')
def test_forward_decode_only_swa(self, mock_npu_reshape_and_cache,
@@ -311,7 +311,7 @@ def test_forward_decode_only_swa(self, mock_npu_reshape_and_cache,
mock_fused_infer_attention_score.assert_called_once()
assert output.shape == (10, 8, 64)

@patch('vllm_ascend.attention.attention_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
@patch('torch_npu._npu_paged_attention')
@patch('torch_npu.npu_fused_infer_attention_score')
@patch('torch_npu._npu_reshape_and_cache')
2 changes: 1 addition & 1 deletion tests/ut/attention/test_mla_cp.py
@@ -449,7 +449,7 @@ def test_process_attn_out_lse(self):
self.assertEqual(result.shape[1], N)
self.assertEqual(result.shape[2], self.impl.kv_lora_rank + 1)

@patch('vllm_ascend.attention.context_parallel.mla_cp.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
@patch("torch_npu.npu_fused_infer_attention_score")
@patch('torch_npu.npu_attention_update')
@patch_distributed_groups(dcp_size=2, pcp_size=2, needs_mocks=False)
4 changes: 2 additions & 2 deletions tests/ut/attention/test_mla_v1.py
@@ -929,7 +929,7 @@ def test_compute_prefill_context(self, mock_ring, mock_load):
self.assertEqual(out.shape, prefix_out.shape)
self.assertEqual(lse.shape, prefix_lse.shape)

@patch('vllm_ascend.attention.mla_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
@patch("vllm_ascend.attention.mla_v1.AscendMLAImpl._v_up_proj")
@patch("torch_npu.npu_fused_infer_attention_score")
def test_forward_decode_without_graph(self,
@@ -1095,7 +1095,7 @@ def test_exec_kv_decode(self, mock_kv_rmsnorm_rope_cache):
self.assertEqual(k_pe.shape[-1], self.impl.qk_rope_head_dim)
self.assertEqual(k_nope.shape[-1], self.impl.kv_lora_rank)

@patch('vllm_ascend.attention.mla_v1.get_forward_context')
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
@patch("torch_npu.npu_fused_infer_attention_score")
def test_forward_decode(self, mock_npu_fused_infer_attention_score,
mock_get_forward_context):