Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 68 additions & 8 deletions tests/ut/spec_decode/test_eagle_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from vllm.config import CacheConfig, CompilationMode, CUDAGraphMode, VllmConfig

from tests.ut.base import TestBase
from vllm_ascend.ascend_config import init_ascend_config
from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
from vllm_ascend.spec_decode.interface import SpecDcodeType

Expand All @@ -25,13 +26,24 @@ def setUp(self):
self.vllm_config.scheduler_config.max_num_seqs = 32
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False
Copy link
Copy Markdown
Collaborator

@realliujiaxu realliujiaxu Dec 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is mrope disabled?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The related UT mainly focuses on testing language models rather than multimodal models. Since the following assertions involve positions, we have to disable uses_mrope here; otherwise mrope_positions will be initialized in place of positions. By the way, we should and will complement this UT in the near future.

self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2)
])
self.vllm_config.additional_config = None

self.mock_cpugpubuffer = patch(
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
"vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
)
self.mock_supports_multimodal_inputs.start()

def tearDown(self):
# Stop both patchers started in setUp (CpuGpuBuffer and
# MultiModalRegistry.supports_multimodal_inputs) so the patches do not
# stay active after this test finishes.
self.mock_cpugpubuffer.stop()
self.mock_supports_multimodal_inputs.stop()

def test_initialization_eagle_graph(self):
self.vllm_config.speculative_config.method = "eagle"
Expand All @@ -40,12 +52,12 @@ def test_initialization_eagle_graph(self):
self.vllm_config.model_config.enforce_eager = False
self.vllm_config.speculative_config.enforce_eager = False
self.vllm_config.scheduler_config.async_scheduling = False
init_ascend_config(self.vllm_config)

proposer = EagleProposer(vllm_config=self.vllm_config,
device=self.device,
runner=self.runner)

self.assertEqual(proposer.name, SpecDcodeType.EAGLE)
self.assertEqual(proposer.block_size, 16)
self.assertEqual(proposer.hidden_size, 4096)
self.assertTrue(proposer.use_cuda_graph)
Expand All @@ -60,12 +72,12 @@ def test_initialization_eagle3_enforce_eager(self):
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
self.vllm_config.compilation_config.mode = CompilationMode.NONE
self.vllm_config.model_config.enforce_eager = True
init_ascend_config(self.vllm_config)

proposer = EagleProposer(vllm_config=self.vllm_config,
device=self.device,
runner=self.runner)

self.assertEqual(proposer.name, SpecDcodeType.EAGLE3)
self.assertEqual(proposer.hidden_size, 2048)
self.assertFalse(proposer.use_cuda_graph)
self.assertEqual(proposer.hidden_states.shape, (1024, 2048))
Expand All @@ -77,12 +89,12 @@ def test_initialization_eagle3_full_graph_async(self):
self.vllm_config.model_config.enforce_eager = False
self.vllm_config.speculative_config.enforce_eager = False
self.vllm_config.scheduler_config.async_scheduling = True
init_ascend_config(self.vllm_config)

proposer = EagleProposer(vllm_config=self.vllm_config,
device=self.device,
runner=self.runner)

self.assertEqual(proposer.name, SpecDcodeType.EAGLE3)
self.assertEqual(proposer.hidden_size, 2048)
self.assertFalse(proposer.use_cuda_graph)
self.assertEqual(proposer.hidden_states.shape, (1024, 2048))
Expand All @@ -102,16 +114,28 @@ def setUp(self):
self.vllm_config.scheduler_config.max_num_seqs = 32
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False
self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2)
])
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch(
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
"vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
)
self.mock_supports_multimodal_inputs.start()
self.proposer = EagleProposer(vllm_config=self.vllm_config,
device=self.device,
runner=self.runner)

def tearDown(self):
# Stop both patchers started in setUp (CpuGpuBuffer and
# MultiModalRegistry.supports_multimodal_inputs) so the patches do not
# stay active after this test finishes.
self.mock_cpugpubuffer.stop()
self.mock_supports_multimodal_inputs.stop()

@patch(
"vllm_ascend.spec_decode.eagle_proposer.get_layers_from_vllm_config")
Expand Down Expand Up @@ -204,10 +228,20 @@ def setUp(self):
self.vllm_config.scheduler_config.max_num_seqs = 32
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False
self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(4)
])
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch(
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
"vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
)
self.mock_supports_multimodal_inputs.start()
self.proposer = EagleProposer(vllm_config=self.vllm_config,
device=self.device,
runner=self.runner)
Expand All @@ -216,6 +250,7 @@ def setUp(self):

def tearDown(self):
# Stop both patchers started in setUp (CpuGpuBuffer and
# MultiModalRegistry.supports_multimodal_inputs) so the patches do not
# stay active after this test finishes.
self.mock_cpugpubuffer.stop()
self.mock_supports_multimodal_inputs.stop()

@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context")
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
Expand Down Expand Up @@ -287,16 +322,28 @@ def setUp(self):
1: MagicMock(get_token_id=lambda x: 101),
2: MagicMock(get_token_id=lambda x: 102),
}
self.runner.pcp_size = 1

self.vllm_config.cache_config.block_size = 16
self.vllm_config.scheduler_config.max_num_batched_tokens = 1024
self.vllm_config.scheduler_config.max_num_seqs = 32
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False
self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2)
])
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch(
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
"vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
)
self.mock_supports_multimodal_inputs.start()
self.proposer = EagleProposer(vllm_config=self.vllm_config,
device=self.device,
runner=self.runner)
Expand All @@ -306,6 +353,7 @@ def setUp(self):

def tearDown(self):
# Stop both patchers started in setUp (CpuGpuBuffer and
# MultiModalRegistry.supports_multimodal_inputs) so the patches do not
# stay active after this test finishes.
self.mock_cpugpubuffer.stop()
self.mock_supports_multimodal_inputs.stop()

# TODO: This is equivalent to disable_padded_drafter_batch=True.
# We need to add some cases for disable_padded_drafter_batch=False in the future.
Expand Down Expand Up @@ -355,16 +403,28 @@ def setUp(self):
self.vllm_config.scheduler_config.max_num_seqs = 32
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False
self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2)
])
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch(
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
"vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
)
self.mock_supports_multimodal_inputs.start()
self.proposer = EagleProposer(vllm_config=self.vllm_config,
device=self.device,
runner=self.runner)

def tearDown(self):
# Stop both patchers started in setUp (CpuGpuBuffer and
# MultiModalRegistry.supports_multimodal_inputs) so the patches do not
# stay active after this test finishes.
self.mock_cpugpubuffer.stop()
self.mock_supports_multimodal_inputs.stop()

# TODO: This is equivalent to disable_padded_drafter_batch=True.
# We need to add a test_prepare_inputs_padded in the future.
Expand Down
30 changes: 19 additions & 11 deletions tests/ut/spec_decode/test_mtp_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,18 @@

from vllm_ascend.ascend_config import init_ascend_config
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.spec_decode.interface import SpecDcodeType
from vllm_ascend.spec_decode.mtp_proposer import MtpProposer


class TestMtpProposer:

@pytest.fixture(autouse=True)
def patch_supports_multimodal_inputs(self):
# Autouse fixture: replaces MultiModalRegistry.supports_multimodal_inputs
# with a mock while each test runs; the patch is removed when the `with`
# block exits after the test resumes past the `yield`.
with patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs"
):
yield

@pytest.fixture
def vllm_config(self):
config = MagicMock(spec=VllmConfig)
Expand All @@ -31,6 +37,9 @@ def vllm_config(self):
config.speculative_config.method = "deepseek_mtp"
config.speculative_config.draft_model_config = MagicMock()
config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2)
])

config.model_config = MagicMock(spec=ModelConfig)
config.model_config.dtype = torch.float16
Expand Down Expand Up @@ -68,15 +77,14 @@ def runner(self):
runner.reserved_mc2_mask = None
return runner

@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_init(self, mock_cpu_gpu_buffer, vllm_config, runner):
mock_buffer_instance = MagicMock()
mock_cpu_gpu_buffer.return_value = mock_buffer_instance

# Test basic initialization
proposer = MtpProposer(vllm_config, torch.device("cpu"), runner)

assert proposer.name == SpecDcodeType.MTP
assert proposer.vllm_config == vllm_config
assert proposer.device == torch.device("cpu")
assert proposer.dtype == torch.float16
Expand All @@ -89,7 +97,7 @@ def test_init(self, mock_cpu_gpu_buffer, vllm_config, runner):
assert not hasattr(proposer, "mrope_positions")
assert proposer.use_sparse is False

@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_init_with_aclgraph(self, mock_cpu_gpu_buffer, vllm_config,
runner):
mock_buffer_instance = MagicMock()
Expand All @@ -105,7 +113,7 @@ def test_init_with_aclgraph(self, mock_cpu_gpu_buffer, vllm_config,
"vllm_ascend.spec_decode.mtp_proposer.process_weights_after_loading")
@patch("vllm_ascend.spec_decode.mtp_proposer.set_default_torch_dtype")
@patch("vllm_ascend.spec_decode.mtp_proposer.set_current_vllm_config")
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_load_model(self, mock_cpu_gpu_buffer, mock_set_config,
mock_set_dtype, mock_process_weights, mock_get_loader,
mock_get_layers, vllm_config, runner):
Expand Down Expand Up @@ -148,7 +156,7 @@ def get_layers_side_effect(vllm_config, cache_cls):

@patch("vllm_ascend.spec_decode.mtp_proposer.get_forward_context")
@patch("vllm_ascend.spec_decode.mtp_proposer.set_ascend_forward_context")
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_dummy_run(self, mock_cpu_gpu_buffer, mock_set_context,
mock_get_forward_context, vllm_config, runner):
mock_buffer_instance = MagicMock()
Expand All @@ -173,7 +181,7 @@ def test_dummy_run(self, mock_cpu_gpu_buffer, mock_set_context,

@patch("vllm_ascend.spec_decode.mtp_proposer.get_forward_context")
@patch("vllm_ascend.spec_decode.mtp_proposer.set_ascend_forward_context")
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_dummy_run_full_graph(self, mock_cpu_gpu_buffer, mock_set_context,
mock_get_forward_context, vllm_config,
runner):
Expand Down Expand Up @@ -201,7 +209,7 @@ def test_dummy_run_full_graph(self, mock_cpu_gpu_buffer, mock_set_context,
# Check that model was called correct number of times
assert proposer.model.call_count == vllm_config.speculative_config.num_speculative_tokens

@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_generate_token_ids(self, mock_cpu_gpu_buffer):
mock_buffer_instance = MagicMock()
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
Expand Down Expand Up @@ -272,7 +280,7 @@ def test_generate_token_ids(self, mock_cpu_gpu_buffer):
proposer._propose.assert_called_once()
assert torch.equal(draft_token_ids, proposer._propose.return_value)

@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_prepare_next_token_ids_cpu(self, mock_cpu_gpu_buffer):
mock_buffer_instance = MagicMock()
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
Expand All @@ -295,7 +303,7 @@ def test_prepare_next_token_ids_cpu(self, mock_cpu_gpu_buffer):
assert torch.all(
result == torch.tensor([30, 50, 60], dtype=torch.int32))

@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_prepare_next_token_ids_padded(self, mock_cpu_gpu_buffer):
mock_common_attn_metadata = MagicMock(spec=CommonAttentionMetadata)
mock_common_attn_metadata.seq_lens_cpu = torch.tensor(
Expand Down Expand Up @@ -377,7 +385,7 @@ def test_prepare_next_token_ids_padded(self, mock_cpu_gpu_buffer):
device=torch.device("cpu"))
assert torch.equal(next_token_ids, expected_next_tokens)

@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
@patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
def test_prepare_inputs_padded(self, mock_cpu_gpu_buffer):
mock_buffer_instance = MagicMock()
mock_cpu_gpu_buffer.return_value = mock_buffer_instance
Expand Down
Loading
Loading