diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py index f399ea652fe..941055cf724 100644 --- a/tests/multicard/test_offline_inference_distributed.py +++ b/tests/multicard/test_offline_inference_distributed.py @@ -22,7 +22,6 @@ """ import os -import pytest import vllm # noqa: F401 from tests.conftest import VllmRunner @@ -47,8 +46,6 @@ def test_models_distributed_QwQ(): vllm_model.generate_greedy(example_prompts, max_tokens) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1", - reason="deepseek v2 lite is not supported on v1") def test_models_distributed_DeepSeek(): example_prompts = [ "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index d39a1499fb2..054bd953c2a 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -239,10 +239,8 @@ def build(self, # it blocks on all previous kernels. device = self.runner.device - block_table = self.runner.input_batch.block_table[0].get_device_tensor( - ) - block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = ( - block_table[:num_reqs]) + block_table = (self.runner.input_batch.block_table[0]. + get_device_tensor()[:num_reqs]) slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to( device, non_blocking=True) input_positions = self.runner.positions_cpu[:num_actual_tokens].to(