From 6f9bdb7d2437f26379ccaba198e1c436f4247bad Mon Sep 17 00:00:00 2001
From: "Chendi.Xue"
Date: Tue, 1 Jul 2025 01:17:06 +0300
Subject: [PATCH 1/2] Fix hpu_model_runner due to PR (#20232)

Signed-off-by: Chendi.Xue
---
 vllm_hpu/v1/worker/hpu_model_runner.py | 43 ++++++++++++--------------
 1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/vllm_hpu/v1/worker/hpu_model_runner.py b/vllm_hpu/v1/worker/hpu_model_runner.py
index 5530dd82c5..1300eb2c3e 100644
--- a/vllm_hpu/v1/worker/hpu_model_runner.py
+++ b/vllm_hpu/v1/worker/hpu_model_runner.py
@@ -768,35 +768,34 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool:
             req_ids_to_add.append(req_id)
 
         # Update the states of the running/resumed requests.
-        for req_data in scheduler_output.scheduled_cached_reqs:
-            req_id = req_data.req_id
+        req_data = scheduler_output.scheduled_cached_reqs
+        for i, req_id in enumerate(req_data.req_ids):
             req_state = self.requests[req_id]
-            # Update the cached states.
-            num_computed_tokens = req_data.num_computed_tokens
+            num_computed_tokens = req_data.num_computed_tokens[i]
+            new_token_ids = req_data.new_token_ids[i]
+            new_block_ids = req_data.new_block_ids[i]
+            resumed_from_preemption = req_data.resumed_from_preemption[i]
             req_state.num_computed_tokens = num_computed_tokens
+
+            # Update the cached states.
             # Add the sampled token(s) from the previous step (if any).
             # This doesn't include "unverified" tokens like spec decode tokens.
-            num_new_tokens = (num_computed_tokens +
-                              len(req_data.new_token_ids) -
+            num_new_tokens = (num_computed_tokens + len(new_token_ids) -
                               req_state.num_tokens)
             if num_new_tokens == 1:
                 # Avoid slicing list in most common case.
-                req_state.output_token_ids.append(req_data.new_token_ids[-1])
+                req_state.output_token_ids.append(new_token_ids[-1])
             elif num_new_tokens > 0:
                 req_state.output_token_ids.extend(
-                    req_data.new_token_ids[-num_new_tokens:])
+                    new_token_ids[-num_new_tokens:])
+
             # Update the block IDs.
-            if not req_data.resumed_from_preemption:
-                # Append the new blocks to the existing block IDs.
-                for block_ids, new_block_ids in zip(  # type: ignore[call-overload]
-                        req_state.block_ids,
-                        req_data.new_block_ids,
-                        strict=True):
-                    block_ids.extend(new_block_ids)
+            if not resumed_from_preemption:
+                for block_ids, new_ids in zip(req_state.block_ids,
+                                              new_block_ids):
+                    block_ids.extend(new_ids)
             else:
-                # The request is resumed from preemption.
-                # Replace the existing block IDs with the new ones.
-                req_state.block_ids = req_data.new_block_ids
+                req_state.block_ids = new_block_ids
 
             req_index = self.input_batch.req_id_to_index.get(req_id)
             if req_index is None:
@@ -809,14 +808,12 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool:
             # Update the persistent batch.
             self.input_batch.num_computed_tokens_cpu[req_index] = (
                 num_computed_tokens)
-            self.input_batch.block_table.append_row(req_data.new_block_ids,
-                                                    req_index)
+            self.input_batch.block_table.append_row(new_block_ids, req_index)
             # Add new_token_ids to token_ids_cpu.
             start_token_index = num_computed_tokens
-            end_token_index = num_computed_tokens + len(req_data.new_token_ids)
+            end_token_index = num_computed_tokens + len(new_token_ids)
             self.input_batch.token_ids_cpu[
-                req_index,
-                start_token_index:end_token_index] = req_data.new_token_ids
+                req_index, start_token_index:end_token_index] = new_token_ids
             self.input_batch.num_tokens_no_spec[req_index] = end_token_index
             # Add spec_token_ids to token_ids_cpu.
             spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(

From 890d67bc789880c7a1f1c41840fc7dc77bd4d6de Mon Sep 17 00:00:00 2001
From: "Chendi.Xue"
Date: Tue, 1 Jul 2025 01:17:56 +0300
Subject: [PATCH 2/2] add UT in plugin and will be used by upstream test

Signed-off-by: Chendi.Xue
---
 tests/upstream_tests/ci_tests.sh | 21 ++++++++++
 tests/upstream_tests/generate.py | 68 ++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 tests/upstream_tests/ci_tests.sh
 create mode 100644 tests/upstream_tests/generate.py

diff --git a/tests/upstream_tests/ci_tests.sh b/tests/upstream_tests/ci_tests.sh
new file mode 100644
index 0000000000..539fa7a0db
--- /dev/null
+++ b/tests/upstream_tests/ci_tests.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Abort on the first failing command so a failed run is never reported as passed.
+set -e
+
+# basic model
+echo "Testing basic model with vllm-hpu plugin v1"
+echo HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/upstream_tests/generate.py --model facebook/opt-125m
+HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/upstream_tests/generate.py --model facebook/opt-125m
+echo "Test with basic model passed"
+
+# tp=2
+echo "Testing tensor parallel size 2 with vllm-hpu plugin v1"
+echo HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/upstream_tests/generate.py --model facebook/opt-125m --tensor-parallel-size 2
+HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/upstream_tests/generate.py --model facebook/opt-125m --tensor-parallel-size 2
+echo "Test with tensor parallel size 2 passed"
+
+# mla and moe
+echo "Testing MLA and MoE with vllm-hpu plugin v1"
+echo HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/upstream_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code
+HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/upstream_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code
+echo "Test with deepseek v2 lite passed"
diff --git a/tests/upstream_tests/generate.py b/tests/upstream_tests/generate.py
new file mode 100644
index 0000000000..37e0a4289f
--- /dev/null
+++ b/tests/upstream_tests/generate.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def create_parser():
+    parser = FlexibleArgumentParser()
+    # Add engine args
+    EngineArgs.add_cli_args(parser)
+    parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
+    # Add sampling params
+    sampling_group = parser.add_argument_group("Sampling parameters")
+    sampling_group.add_argument("--max-tokens", type=int)
+    sampling_group.add_argument("--temperature", type=float)
+    sampling_group.add_argument("--top-p", type=float)
+    sampling_group.add_argument("--top-k", type=int)
+
+    return parser
+
+
+def main(args: dict):
+    # Pop arguments not used by LLM
+    max_tokens = args.pop("max_tokens")
+    temperature = args.pop("temperature")
+    top_p = args.pop("top_p")
+    top_k = args.pop("top_k")
+
+    # Create an LLM
+    llm = LLM(**args)
+
+    # Create a sampling params object
+    sampling_params = llm.get_default_sampling_params()
+    if max_tokens is not None:
+        sampling_params.max_tokens = max_tokens
+    if temperature is not None:
+        sampling_params.temperature = temperature
+    if top_p is not None:
+        sampling_params.top_p = top_p
+    if top_k is not None:
+        sampling_params.top_k = top_k
+
+    # Generate texts from the prompts. The output is a list of RequestOutput
+    # objects that contain the prompt, generated text, and other information.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+    os._exit(0)
+
+
+if __name__ == "__main__":
+    parser = create_parser()
+    args: dict = vars(parser.parse_args())
+    main(args)