97 changes: 62 additions & 35 deletions tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
@@ -7,9 +7,10 @@
from typing import Any

import pytest
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

from tests.e2e.conftest import VllmRunner
from tests.e2e.conftest import VllmRunner, cleanup_dist_env_and_memory

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

@@ -115,41 +116,67 @@ def test_eagle_correctness(
Compare the outputs of an original LLM and a speculative LLM;
they should be the same when using eagle speculative decoding.
'''
pytest.skip("To be aligned with GPU")
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm

# NOTE: The eagle e2e test had many problems before.
# We first check whether it is functioning properly.
# The e2e should be fixed to use VllmRunner in the future.
spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
with VllmRunner(
model_name,
max_num_seqs=1,
max_num_batched_tokens=2048,
gpu_memory_utilization=0.6,
speculative_config={
"method": "eagle3" if use_eagle3 else "eagle",
"model": spec_model_name,
"num_speculative_tokens": 2,
"max_model_len": 128,
},
max_model_len=128,
enforce_eager=False,
) as runner:
spec_outputs = runner.model.chat(test_prompts, sampling_config)

matches = 0
misses = 0
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
if ref_output.outputs[0].text == spec_output.outputs[0].text:
matches += 1
else:
misses += 1
print(f"ref_output: {ref_output.outputs[0].text}")
print(f"spec_output: {spec_output.outputs[0].text}")

# Heuristic: expect at least 66% of the prompts to match exactly
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.66 * len(ref_outputs))
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
prompts = [{
"role": "user",
"content": "Hello, my name is"
}, {
"role": "user",
"content": "The president of the United States is"
}, {
"role": "user",
"content": "The capital of France is"
}, {
"role": "user",
"content": "The future of AI is"
}]
prompts = [
tokenizer.apply_chat_template(
[prompt],
tokenize=False,
add_generation_prompt=True,
) for prompt in prompts
]

sampling_params = SamplingParams(
max_tokens=300,
temperature=0.0,
ignore_eos=False,
)

# Create an LLM.
llm = LLM(
Collaborator review comment:
I just noticed you're using LLM in a test case; we must clean up NPU HBM when not using VllmRunner. Please modify this case to use VllmRunner, or clean up by hand if keeping it as is.
Please refer to https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/conftest.py#L81

model=model_name,
tensor_parallel_size=1,
pipeline_parallel_size=1,
data_parallel_size=1,
disable_log_stats=False,
max_model_len=4096,
seed=1024,
async_scheduling=True,
compilation_config={
"level": 3,
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_num_of_warmups": 1,
"cudagraph_capture_sizes": [12],
},
speculative_config={
"disable_padded_drafter_batch": False,
"method": "eagle3" if use_eagle3 else "eagle",
"model": spec_model_name,
"num_speculative_tokens": 2,
"max_model_len": 128,
"draft_vocab_size": 128256,
},
)
llm.generate(prompts, sampling_params)
del llm
cleanup_dist_env_and_memory()


@pytest.mark.skip(
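As a follow-up to the review comment above, here is a minimal sketch of the VllmRunner-based variant the reviewer suggests, so NPU HBM is released automatically when the context exits. The arguments mirror the ones already used in this diff; treat the exact set as an assumption rather than the final test code.

# Hypothetical sketch, not part of this PR: wrap the engine in VllmRunner
# so NPU HBM is released when the block exits.
with VllmRunner(
        model_name,
        max_model_len=4096,
        enforce_eager=False,
        speculative_config={
            "method": "eagle3" if use_eagle3 else "eagle",
            "model": spec_model_name,
            "num_speculative_tokens": 2,
        },
) as runner:
    runner.model.generate(prompts, sampling_params)
# On exit, VllmRunner tears down the engine; no manual del / cleanup call is needed.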
78 changes: 45 additions & 33 deletions tests/ut/spec_decode/test_eagle_proposer.py
@@ -26,6 +26,13 @@ def setUp(self):
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048

self.mock_cpugpubuffer = patch(
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
self.mock_cpugpubuffer.start()

def tearDown(self):
self.mock_cpugpubuffer.stop()

def test_initialization_eagle(self):
self.vllm_config.speculative_config.method = "eagle"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
@@ -44,7 +51,7 @@ def test_initialization_eagle(self):
self.assertEqual(proposer.input_ids.shape, (1024, ))
self.assertEqual(proposer.positions.shape, (1024, ))
self.assertEqual(proposer.hidden_states.shape, (1024, 4096))
self.assertEqual(proposer.arange.shape, (33, ))
self.assertEqual(proposer.arange.shape, (1024, ))

def test_initialization_eagle3(self):
self.vllm_config.speculative_config.method = "eagle3"
@@ -77,10 +84,16 @@ def setUp(self):
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048

self.mock_cpugpubuffer = patch(
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
self.mock_cpugpubuffer.start()
self.proposer = EagleProposer(vllm_config=self.vllm_config,
device=self.device,
runner=self.runner)

def tearDown(self):
self.mock_cpugpubuffer.stop()

@patch(
"vllm_ascend.spec_decode.eagle_proposer.get_layers_from_vllm_config")
@patch("vllm_ascend.spec_decode.eagle_proposer.get_model")
@@ -172,11 +185,17 @@ def setUp(self):
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048

self.mock_cpugpubuffer = patch(
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
self.mock_cpugpubuffer.start()
self.proposer = EagleProposer(vllm_config=self.vllm_config,
device=self.device,
runner=self.runner)
self.proposer.model = MagicMock()

def tearDown(self):
self.mock_cpugpubuffer.stop()

@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
def test_dummy_run_basic(self, mock_context):
num_tokens = 32
@@ -216,14 +235,22 @@ def setUp(self):
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048

self.mock_cpugpubuffer = patch(
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
self.mock_cpugpubuffer.start()
self.proposer = EagleProposer(vllm_config=self.vllm_config,
device=self.device,
runner=self.runner)
self.proposer.attn_layer_name = "layer_0"
self.proposer._propose = MagicMock(
return_value=torch.tensor([[1, 2], [3, 4], [5, 6]]))

def test_generate_token_ids_without_metadata(self):
def tearDown(self):
self.mock_cpugpubuffer.stop()

# TODO: This is equivalent to disable_padded_drafter_batch=True.
# We need to add cases for disable_padded_drafter_batch=False in the future.
def test_generate_token_ids(self):
valid_sampled = [[20, 30, 40]]
scheduler_output = MagicMock()
scheduler_output.num_scheduled_tokens = [2, 1, 3]
@@ -239,44 +266,21 @@ def test_generate_token_ids_without_metadata(self):
return_value={"layer_0": mock_attn_metadata})

result = self.proposer.generate_token_ids(
valid_sampled_token_ids=valid_sampled,
sampled_token_ids=valid_sampled,
scheduler_output=scheduler_output,
positions=positions,
num_scheduled_tokens=num_scheduled,
hidden_states=hidden_states,
)

self.proposer._propose.assert_called_once()
self.assertEqual(result, [[1, 2], [3, 4], [5, 6]])

def test_generate_token_ids_with_metadata(self):
valid_sampled = [[5], [6, 7], [8, 9, 10]]
spec_metadata = MagicMock()
spec_metadata.num_draft_tokens = [2, 3, 4]

mock_attn_metadata = MagicMock()
mock_attn_metadata.slot_mapping = torch.tensor([0, 1, 2, 3, 4, 5])
mock_attn_metadata.query_start_loc = torch.tensor([0, 1, 3, 6])
mock_attn_metadata.block_tables = MagicMock()
self.proposer._get_eagle_atten_dict = MagicMock(
return_value={"layer_0": mock_attn_metadata})
self.proposer._prepare_inputs = MagicMock(
return_value=(torch.tensor([0, 2, 5]), torch.tensor([1, 3, 5])))

result = self.proposer.generate_token_ids(
valid_sampled_token_ids=valid_sampled,
spec_decode_metadata=spec_metadata,
positions=torch.randn(6, 1),
hidden_states=torch.randn(6, 4096),
)

self.proposer._prepare_inputs.assert_called_once()
self.assertEqual(self.proposer._propose.call_count, 1)
self.assertEqual(len(result), 3)
self.assertEqual(result.numpy().tolist(), [[1, 2], [3, 4], [5, 6]])


class TestEagleProposerHelperMethods(TestBase):

# TODO: Add tests for prepare_next_token_ids in the future.

def setUp(self):
self.vllm_config = MagicMock(spec=VllmConfig)
self.vllm_config.scheduler_config = MagicMock(max_num_seqs=3)
@@ -293,21 +297,29 @@ def setUp(self):
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048

self.mock_cpugpubuffer = patch(
"vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
self.mock_cpugpubuffer.start()
self.proposer = EagleProposer(vllm_config=self.vllm_config,
device=self.device,
runner=self.runner)

def tearDown(self):
self.mock_cpugpubuffer.stop()

# TODO: This is equivalent to disable_padded_drafter_batch=True.
# We need to add a test_prepare_inputs_padded in the future.
def test_prepare_inputs(self):
self.proposer.token_arange_np = np.arange(10)
mock_attn = MagicMock()
mock_attn.slot_mapping = torch.tensor([0, 1, 2, 3, 4, 5])
num_rejected = torch.tensor([1, 0, 1], device=self.device)
mock_return_attn = MagicMock()

with patch.object(self.proposer,
'_prepare_inputs',
return_value=(torch.tensor([0, 2, 5]),
'prepare_inputs',
return_value=(mock_return_attn,
torch.tensor([1, 2, 4]))):
cu_num_tokens, indices = self.proposer._prepare_inputs(
return_attn, indices = self.proposer.prepare_inputs(
mock_attn, num_rejected)
self.assertEqual(cu_num_tokens.tolist(), [0, 2, 5])
self.assertEqual(indices.tolist(), [1, 2, 4])
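One note on the repeated CpuGpuBuffer patching added to the setUp methods above: below is a sketch of an equivalent pattern using unittest's addCleanup, which guarantees the patch is stopped even if setUp raises after starting it. This is only an illustration of a possible follow-up, not code from this PR.

from unittest import TestCase
from unittest.mock import patch

class TestEagleProposerExample(TestCase):
    def setUp(self):
        patcher = patch(
            "vllm_ascend.spec_decode.eagle_proposer.CpuGpuBuffer")
        patcher.start()
        # addCleanup runs even when setUp fails partway, so no tearDown is needed.
        self.addCleanup(patcher.stop)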
5 changes: 4 additions & 1 deletion vllm_ascend/attention/attention_v1.py
@@ -730,6 +730,9 @@ def reshape_and_cache(
self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
slots = attn_metadata.slot_mapping
if get_ascend_device_type() == AscendDeviceType._910_95:
# TODO: Once eagle runs through here, it may raise an error because of dim 0 of slot_mapping.
# Should check whether dim 0 of slot_mapping must equal dim 0 of key.
# If necessary, the slots should be sliced.
torch_npu.npu_scatter_pa_kv_cache(
key=key[:attn_metadata.num_actual_tokens],
value=value[:attn_metadata.num_actual_tokens].contiguous(),
@@ -742,7 +745,7 @@ def reshape_and_cache(
value=value[:attn_metadata.num_actual_tokens],
key_cache=self.key_cache,
value_cache=self.value_cache,
slot_indices=slots)
slot_indices=slots[:attn_metadata.num_actual_tokens])
return key, value

def forward_impl(
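The slicing change above makes the slot_indices length match the sliced key/value tensors. Below is a small shape sketch of that invariant; the sizes are illustrative, not taken from a real run.

import torch

num_actual_tokens = 6
padded_tokens = 8                      # slot_mapping may be padded for graph capture
key = torch.randn(padded_tokens, 8, 128)
slot_mapping = torch.arange(padded_tokens)

key_in = key[:num_actual_tokens]
slots_in = slot_mapping[:num_actual_tokens]
# Both dim-0 sizes must agree before scattering into the KV cache.
assert key_in.shape[0] == slots_in.shape[0] == num_actual_tokens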
29 changes: 29 additions & 0 deletions vllm_ascend/attention/utils.py
@@ -119,6 +119,35 @@ class AscendCommonAttentionMetadata:
prefill_context_parallel_metadata: Optional[
AscendPrefillContextParallelMetadata] = None

# TODO: Remove it when vLLM no longer uses this function.
def unpadded(self, num_actual_tokens: int,
num_actual_reqs: int) -> "AscendCommonAttentionMetadata":
# This is only used by eagle now. It will be used for enforce_eager in the future.
return AscendCommonAttentionMetadata(
query_start_loc=self.query_start_loc[:num_actual_reqs + 1],
query_start_loc_cpu=self.query_start_loc_cpu[:num_actual_reqs + 1],
seq_lens=self.seq_lens[:num_actual_reqs],
seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs],
num_computed_tokens_cpu=self.
num_computed_tokens_cpu[:num_actual_reqs],
num_reqs=num_actual_reqs,
num_actual_tokens=num_actual_tokens,
max_query_len=self.max_query_len,
decode_token_per_req=self.decode_token_per_req,
block_table_tensor=self.block_table_tensor[:num_actual_reqs],
slot_mapping=self.slot_mapping[:num_actual_tokens],
actual_seq_lengths_q=self.actual_seq_lengths_q[:num_actual_tokens],
positions=self.positions[:num_actual_tokens],
attn_mask=self.attn_mask,
spec_attn_mask=self.spec_attn_mask,
attn_state=self.attn_state,
is_only_prefill=self.is_only_prefill,
graph_pad_size=-1,  # It should be -1 when not running in fullgraph mode.
num_input_tokens=num_actual_tokens,
prefill_context_parallel_metadata=self.
prefill_context_parallel_metadata,
)


def filter_chunked_req_indices(
seq_len: torch.Tensor,
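A hypothetical usage sketch of the new unpadded() helper follows: slicing a graph-padded metadata object down to the real batch before the eagle drafter runs in eager mode. The helper and field names come from the code above; the padded_metadata and scheduler_output objects are assumptions for illustration only.

# padded_metadata is an AscendCommonAttentionMetadata built for graph capture
# (obtained elsewhere in the model runner; shown here only as an assumption).
num_actual_reqs = len(scheduler_output.num_scheduled_tokens)
num_actual_tokens = int(sum(scheduler_output.num_scheduled_tokens))

eager_metadata = padded_metadata.unpadded(
    num_actual_tokens=num_actual_tokens,
    num_actual_reqs=num_actual_reqs,
)
# The sliced fields now match the real batch, e.g.:
assert eager_metadata.slot_mapping.shape[0] == num_actual_tokens
assert eager_metadata.num_reqs == num_actual_reqs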