diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml
index e2a0516fda6..64b63e246aa 100644
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=d68209402ddab3f54a09bc1f4de9a9495a283b60
+          VLLM_COMMIT=8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
 
       - name: Checkout repository
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index 03866afa616..5a35f5dc463 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0]
+        vllm_version: [8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5, v0.14.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index b044184dc66..638390f4c3f 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: d68209402ddab3f54a09bc1f4de9a9495a283b60
+      vllm: 8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5
   changes:
     runs-on: linux-aarch64-a2-0
     outputs:
@@ -84,7 +84,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0]
+        vllm_version: [8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5, v0.14.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -96,7 +96,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0]
+        vllm_version: [8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5, v0.14.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml
index 89f11f02f70..fc5220ec25f 100644
--- a/.github/workflows/schedule_codecov_refresh.yaml
+++ b/.github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
      matrix:
-        vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60]
+        vllm_version: [8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index 2f88e52c80a..161139dcac0 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -53,7 +53,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
 
 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5, v0.14.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
 
 ## Release cadence
diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py
index d82218e0958..85fc224b3a4 100755
--- a/tests/ut/attention/test_mla_v1.py
+++ b/tests/ut/attention/test_mla_v1.py
@@ -17,6 +17,7 @@
                                              AscendMLAPrefillMetadata,
                                              ChunkedContextMetadata)
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.utils import vllm_version_is
 
 
 class TestAscendMLABackend(TestBase):
@@ -226,7 +227,9 @@ def mock_parent_init(self, kv_cache_spec, layer_names, vllm_config,
             )
 
         self.parent_init_patcher = patch(
-            "vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__",
+            ("vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__"
+             if vllm_version_is('0.14.0') else
+             "vllm.model_executor.layers.attention.mla_attention.MLACommonMetadataBuilder.__init__"),
             mock_parent_init)
         self.parent_init_patcher.start()
 
@@ -452,7 +455,9 @@ def mock_parent_init(self, kv_cache_spec, layer_names, vllm_config,
             )
 
         self.parent_init_patcher = patch(
-            "vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__",
+            ("vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__"
+             if vllm_version_is('0.14.0') else
+             "vllm.model_executor.layers.attention.mla_attention.MLACommonMetadataBuilder.__init__"),
             mock_parent_init)
         self.parent_init_patcher.start()
diff --git a/tests/ut/attention/test_sfa_v1.py b/tests/ut/attention/test_sfa_v1.py
index aca95244a10..5edc2d0e895 100644
--- a/tests/ut/attention/test_sfa_v1.py
+++ b/tests/ut/attention/test_sfa_v1.py
@@ -12,7 +12,7 @@
 from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl,
                                           AscendSFAMetadata,
                                           AscendSFAMetadataBuilder)
-from vllm_ascend.utils import enable_dsa_cp
+from vllm_ascend.utils import enable_dsa_cp, vllm_version_is
 
 
 class TestAscendSFABackend(TestBase):
@@ -117,7 +119,9 @@ def mock_parent_init(self, kv_cache_spec, layer_names, vllm_config,
             )
 
         self.parent_init_patcher = patch(
-            "vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__",
+            ("vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__"
+             if vllm_version_is('0.14.0') else
+             "vllm.model_executor.layers.attention.mla_attention.MLACommonMetadataBuilder.__init__"),
             mock_parent_init)
         self.parent_init_patcher.start()
diff --git a/tests/ut/eplb/core/test_eplb_utils.py b/tests/ut/eplb/core/test_eplb_utils.py
index a20fa893da5..96f2d004bef 100644
--- a/tests/ut/eplb/core/test_eplb_utils.py
+++ b/tests/ut/eplb/core/test_eplb_utils.py
@@ -9,6 +9,7 @@
 
 from vllm_ascend.ascend_config import init_ascend_config
 from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
+from vllm_ascend.utils import vllm_version_is
 
 # isort: on
 
@@ -20,8 +21,24 @@ def setUp(self, mock_fix_incompatible_config):
             "refresh": True,
             "eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2},
         }
-        moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl")
-        moe_config = FusedMoEConfig(8, 8, 8192, 5, moe_parallel_config, torch.float16)
+        if vllm_version_is('0.14.0'):
+            moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl")
+            moe_config = FusedMoEConfig(8, 8, 8192, 5, moe_parallel_config, torch.float16)
+        else:
+            from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
+            moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
+            moe_config = FusedMoEConfig(
+                num_experts=8,
+                experts_per_token=8,
+                hidden_dim=8192,
+                intermediate_size_per_partition=5,
+                num_local_experts=8,
+                activation="silu",
+                device="npu",
+                routing_method=RoutingMethodType.Simulated,
+                moe_parallel_config=moe_parallel_config,
+                in_dtype=torch.float16,
+            )
         moe_config.supports_eplb = True
         self.vllm_config = vllm_config
         self.moe_config = moe_config
diff --git a/tests/ut/spec_decode/test_eagle_proposer.py b/tests/ut/spec_decode/test_eagle_proposer.py
index a6d6ef85218..e8c30e7b362 100644
--- a/tests/ut/spec_decode/test_eagle_proposer.py
+++ b/tests/ut/spec_decode/test_eagle_proposer.py
@@ -51,6 +51,7 @@ def tearDown(self):
     def test_initialization_eagle_graph(self):
         self.vllm_config.speculative_config.method = "eagle"
         self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
+        self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
         self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
         self.vllm_config.model_config.enforce_eager = False
         self.vllm_config.model_config.uses_mrope = False
@@ -65,10 +66,11 @@
         self.assertEqual(proposer.hidden_size, 4096)
         self.assertTrue(proposer.use_cuda_graph)
-        self.assertEqual(proposer.input_ids.shape, (1024, ))
-        self.assertEqual(proposer.positions.shape, (1024, ))
-        self.assertEqual(proposer.hidden_states.shape, (1024, 4096))
-        self.assertEqual(proposer.arange.shape, (1024, ))
+        expected_max_num_tokens = proposer.max_num_tokens
+        self.assertEqual(proposer.input_ids.shape, (expected_max_num_tokens, ))
+        self.assertEqual(proposer.positions.shape, (expected_max_num_tokens, ))
+        self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 4096))
+        self.assertEqual(proposer.arange.shape, (expected_max_num_tokens, ))
 
     def test_initialization_eagle3_enforce_eager(self):
         self.vllm_config.speculative_config.method = "eagle3"
@@ -83,7 +85,8 @@
         self.assertEqual(proposer.hidden_size, 2048)
         self.assertFalse(proposer.use_cuda_graph)
-        self.assertEqual(proposer.hidden_states.shape, (1024, 2048))
+        expected_max_num_tokens = proposer.max_num_tokens
+        self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))
 
     def test_initialization_eagle3_full_graph_async(self):
         self.vllm_config.speculative_config.method = "eagle3"
@@ -100,7 +103,8 @@ def test_initialization_eagle3_full_graph_async(self):
         self.assertEqual(proposer.hidden_size, 2048)
         self.assertTrue(proposer.use_cuda_graph)
-        self.assertEqual(proposer.hidden_states.shape, (1024, 2048))
+        expected_max_num_tokens = proposer.max_num_tokens
+        self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))
 
     def test_initialization_mtp_full_graph_async(self):
         self.vllm_config.speculative_config.method = "mtp"
@@ -117,7 +121,8 @@
         self.assertEqual(proposer.hidden_size, 2048)
         self.assertFalse(proposer.use_cuda_graph)
-        self.assertEqual(proposer.hidden_states.shape, (1024, 2048))
+        expected_max_num_tokens = proposer.max_num_tokens
+        self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))
 
 
 class TestEagleProposerLoadModel(TestBase):
diff --git a/tests/ut/spec_decode/test_mtp_proposer.py b/tests/ut/spec_decode/test_mtp_proposer.py
index e800a8d5c7c..29a55c06021 100644
--- a/tests/ut/spec_decode/test_mtp_proposer.py
+++ b/tests/ut/spec_decode/test_mtp_proposer.py
@@ -33,6 +33,7 @@ def vllm_config(self):
         config.speculative_config.method = "mtp"
         config.speculative_config.draft_model_config = MagicMock()
         config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
+        config.speculative_config.draft_model_config.uses_mrope = False
         config.speculative_config.speculative_token_tree = str([
             (i + 1) * (0, ) for i in range(2)
         ])
diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index 496934747c3..f3603ca8547 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -19,6 +19,7 @@
     is_drafter_moe_model,
     is_moe_model,
     speculative_enable_dispatch_gmm_combine_decode,
+    vllm_version_is,
 )
 
 
@@ -42,20 +43,26 @@ def set_ascend_forward_context(
     batch_descriptor: BatchDescriptor | None = None,
     model_instance: torch.nn.Module = None,
    is_draft_model=False,
+    skip_compiled: bool = False,
 ):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     We add some additional param into forward_context.
     """
-    with set_forward_context(
-        attn_metadata,
-        vllm_config,
-        virtual_engine=virtual_engine,
-        num_tokens=num_tokens,
-        num_tokens_across_dp=num_tokens_across_dp,
-        cudagraph_runtime_mode=aclgraph_runtime_mode,
-        batch_descriptor=batch_descriptor,
-    ):
+    forward_context_kwargs = {
+        "attn_metadata": attn_metadata,
+        "vllm_config": vllm_config,
+        "virtual_engine": virtual_engine,
+        "num_tokens": num_tokens,
+        "num_tokens_across_dp": num_tokens_across_dp,
+        "cudagraph_runtime_mode": aclgraph_runtime_mode,
+        "batch_descriptor": batch_descriptor,
+    }
+
+    if not vllm_version_is("0.14.0"):
+        forward_context_kwargs["skip_compiled"] = skip_compiled
+
+    with set_forward_context(**forward_context_kwargs):
         forward_context = get_forward_context()
 
         from vllm_ascend.ops.fused_moe.moe_comm_method import get_moe_comm_method
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index 9383a165c42..f8e8df1b411 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -12,7 +12,6 @@
 from vllm.utils.math_utils import cdiv, round_down
 from vllm.v1.attention.backend import (  # type: ignore
     AttentionBackend, AttentionCGSupport, MLAAttentionImpl)
-from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
 from vllm.v1.attention.backends.utils import PAD_SLOT_ID  # type: ignore
 from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec
 
@@ -38,12 +37,19 @@
 from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
 from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, maybe_trans_nz,
-                               weak_ref_tensors)
+                               weak_ref_tensors, vllm_version_is)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
+
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
 
+# isort: off
+if vllm_version_is('0.14.0'):
+    from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder  # type: ignore
+else:
+    from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
+# isort: on
 
 MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024
 BUILD_METADATA_STEP_PREFILL = 0
diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
index e36a928aa0d..9df4d0d5862 100644
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -14,7 +14,6 @@
 from vllm.triton_utils import HAS_TRITON
 from vllm.v1.attention.backend import (  # type: ignore
     AttentionBackend, AttentionCGSupport, MLAAttentionImpl)
-from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 from vllm_ascend import envs
@@ -37,11 +36,16 @@
 from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
 from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, _round_up, dispose_layer,
-                               enable_dsa_cp, enable_dsa_cp_with_layer_shard, maybe_trans_nz)
+                               enable_dsa_cp, enable_dsa_cp_with_layer_shard, maybe_trans_nz, vllm_version_is)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
+if vllm_version_is('0.14.0'):
+    from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder  # type: ignore
+else:
+    from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
+# isort: on
 
 # token count limits within bmm_transpose operator
 BMM_TRANS_MAX_SUPPORTED_TOKENS = 1024
diff --git a/vllm_ascend/patch/platform/patch_multiproc_executor.py b/vllm_ascend/patch/platform/patch_multiproc_executor.py
index 11c2a43343b..4a5a041f57c 100644
--- a/vllm_ascend/patch/platform/patch_multiproc_executor.py
+++ b/vllm_ascend/patch/platform/patch_multiproc_executor.py
@@ -16,7 +16,7 @@
 from vllm.v1.executor.multiproc_executor import (
     FutureWrapper, MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc,
     set_multiprocessing_worker_envs)
-
+from vllm_ascend.utils import vllm_version_is
 
 
 class AscendMultiprocExecutor(MultiprocExecutor):
@@ -28,15 +28,7 @@ def _init_executor(self) -> None:
         self.shutdown_event = threading.Event()
         self.failure_callback: FailureCallback | None = None
 
-        self.world_size = self.parallel_config.world_size
-        assert self.world_size % self.parallel_config.nnodes_within_dp == 0, (
-            f"global world_size ({self.parallel_config.world_size}) must be "
-            f"divisible by nnodes_within_dp "
-            f"({self.parallel_config.nnodes_within_dp}). ")
-        self.local_world_size = self.parallel_config.local_world_size
-        tensor_parallel_size = self.parallel_config.tensor_parallel_size
-        pp_parallel_size = self.parallel_config.pipeline_parallel_size
-        pcp_parallel_size = self.parallel_config.prefill_context_parallel_size
+        tensor_parallel_size, pp_parallel_size, pcp_parallel_size = self._get_parallel_sizes()
         assert self.world_size == tensor_parallel_size * pp_parallel_size * pcp_parallel_size, (
             f"world_size ({self.world_size}) must be equal to the "
             f"tensor_parallel_size ({tensor_parallel_size}) x pipeline"
@@ -76,6 +68,7 @@ def _init_executor(self) -> None:
                 self.parallel_config.node_rank_within_dp)
             for local_rank in range(self.local_world_size):
                 global_rank = global_start_rank + local_rank
+                is_driver_worker = self._is_driver_worker(global_rank)
                 unready_workers.append(
                     AscendWorkerProc.make_worker_process(
                         vllm_config=self.vllm_config,
@@ -84,6 +77,7 @@ def _init_executor(self) -> None:
                         distributed_init_method=distributed_init_method,
                         input_shm_handle=scheduler_output_handle,
                         shared_worker_lock=shared_worker_lock,
+                        is_driver_worker=is_driver_worker,
                     ))
 
             # Workers must be created before wait_for_ready to avoid
@@ -120,6 +114,9 @@ def _init_executor(self) -> None:
             # Wait for all remote response mqs to be ready.
             for response_mq in self.response_mqs:
                 response_mq.wait_until_ready()
+            self.futures_queue = deque[tuple[FutureWrapper, Callable]]()
+            self._post_init_executor()
+
             success = True
         finally:
             if not success:
@@ -131,10 +128,26 @@ def _init_executor(self) -> None:
                 self._ensure_worker_termination(
                     [uw.proc for uw in unready_workers])
 
-        self.futures_queue = deque[tuple[FutureWrapper, Callable]]()
-
         self.output_rank = self._get_output_rank()
 
+    def _get_parallel_sizes(self) -> tuple[int, int, int]:
+        self.world_size = self.parallel_config.world_size
+        assert self.world_size % self.parallel_config.nnodes_within_dp == 0, (
+            f"global world_size ({self.parallel_config.world_size}) must be "
+            f"divisible by nnodes_within_dp "
+            f"({self.parallel_config.nnodes_within_dp}). "
+        )
+        self.local_world_size = self.parallel_config.local_world_size
+        tp_size = self.parallel_config.tensor_parallel_size
+        pp_size = self.parallel_config.pipeline_parallel_size
+        pcp_size = self.parallel_config.prefill_context_parallel_size
+        return tp_size, pp_size, pcp_size
+
+    def _post_init_executor(self) -> None:
+        pass
+
+    def _is_driver_worker(self, rank: int) -> bool:
+        return rank % self.parallel_config.tensor_parallel_size == 0
 
 
 class AscendWorkerProc(WorkerProc):
@@ -146,6 +159,7 @@ def make_worker_process(
         distributed_init_method: str,
         input_shm_handle,  # Receive SchedulerOutput
         shared_worker_lock: LockType,
+        is_driver_worker: bool = False,
     ) -> UnreadyWorkerProcHandle:
         context = get_mp_context()
         # (reader, writer)
@@ -164,6 +178,8 @@ def make_worker_process(
             "death_pipe": death_reader,
             "shared_worker_lock": shared_worker_lock,
         }
+        if not vllm_version_is('0.14.0'):
+            process_kwargs["is_driver_worker"] = is_driver_worker
         # Run EngineCore busy loop in background process.
         proc = context.Process(
             target=WorkerProc.worker_main,
diff --git a/vllm_ascend/patch/worker/patch_v2_egale.py b/vllm_ascend/patch/worker/patch_v2_egale.py
index 108df8cc2a9..24470e63361 100644
--- a/vllm_ascend/patch/worker/patch_v2_egale.py
+++ b/vllm_ascend/patch/worker/patch_v2_egale.py
@@ -21,7 +21,7 @@
 import vllm
 from vllm.v1.worker.gpu.input_batch import InputBatch
 from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
-from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
+from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu.spec_decode.eagle import (prepare_eagle_decode,
                                                   prepare_eagle_inputs)
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 789dd091ce6..b22f240ad59 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -44,7 +44,7 @@
 from vllm_ascend.ops.triton.spec_decode.utils import \
     prepare_inputs_padded_kernel
 from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
-from vllm_ascend.utils import enable_sp, shared_expert_dp_enabled
+from vllm_ascend.utils import enable_sp, shared_expert_dp_enabled, vllm_version_is
 
 # Currently we will fix block size to a small one since `num_reqs` can't be too large
 _PREPARE_INPUTS_BLOCK_SIZE = 4
@@ -421,7 +421,11 @@ def _propose(
         self.input_ids[last_token_indices] = next_token_ids
         if self.use_cuda_graph and \
                 num_tokens <= self.runner.cudagraph_batch_sizes[-1]:
-            num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
+            if vllm_version_is('0.14.0'):
+                num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
+            else:
+                num_input_tokens = self.runner.cudagraph_dispatcher._bs_to_padded_graph_size[
+                    num_tokens]
         else:
             num_input_tokens = num_tokens
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index dac8b5d1c68..53cde944dbc 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -17,7 +17,7 @@
 from vllm_ascend.compilation.acl_graph import ACLGraphWrapper
 from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla
 from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
-from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
+from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable, vllm_version_is
 
 
 class MtpProposer(EagleProposer):
@@ -97,7 +97,7 @@ def dummy_run(self,
            attn_metadata = None
 
         input_ids = self.input_ids[:num_tokens]
-        positions = self.positions[:num_tokens]
+        positions = self._get_positions(num_tokens)
         previous_hidden_states = self.hidden_states[:num_tokens]
         for i in range(self.num_speculative_tokens):
             if i > 0 and not in_graph_capturing and aclgraph_runtime_mode == CUDAGraphMode.FULL:
@@ -244,14 +244,18 @@ def _propose(
         # Note(qcs): We may need to refactor these check logics.
         if self.use_cuda_graph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
                 -1]:
-            num_input_tokens = self.vllm_config.pad_for_cudagraph(
-                num_scheduled_tokens)
+            if vllm_version_is('0.14.0'):
+                num_input_tokens = self.vllm_config.pad_for_cudagraph(
+                    num_scheduled_tokens)
+            else:
+                num_input_tokens = self.runner.cudagraph_dispatcher._bs_to_padded_graph_size[
+                    num_scheduled_tokens]
         else:
             # Eager mode, no padding needed
             num_input_tokens = num_tokens
 
         # copy inputs to buffer for cudagraph
-        self.positions[:num_tokens] = target_positions
+        self._set_positions(num_tokens, target_positions)
         self.hidden_states[:num_tokens] = target_hidden_states
         # eager/acl piecewise mode need to update num_tokens_across_dp
         (num_input_tokens, num_tokens_across_dp,
@@ -311,7 +315,7 @@ def _propose(
         model_kwargs = {}
         model_kwargs["attn_metadata"] = attn_metadata
         input_ids = self.input_ids[:num_input_tokens]
-        positions = self.positions[:num_input_tokens]
+        positions = self._get_positions(num_input_tokens)
         hidden_states = self.hidden_states[:num_input_tokens]
 
         hidden_states, positions = self.maybe_pad_and_reduce(
@@ -474,7 +478,7 @@ def _propose(
 
             # copy inputs to buffer for cudagraph
             self.input_ids[:batch_size] = input_ids
-            self.positions[:batch_size] = clamped_positions
+            self._set_positions(batch_size, clamped_positions)
             self.hidden_states[:hidden_states.shape[0]] = hidden_states
             if self.pcp_size * self.dcp_size > 1:
                 # update local seq_len and batch_seq_mask
@@ -502,7 +506,10 @@ def _propose(
             else:
                 attn_metadata_i.slot_mapping[:batch_size] = slot_mapping
             if self.speculative_config.disable_padded_drafter_batch:
-                self.positions[batch_size:num_input_tokens] = 0
+                if self.uses_mrope:
+                    self.mrope_positions[:, batch_size:num_input_tokens] = 0
+                else:
+                    self.positions[batch_size:num_input_tokens] = 0
                 self.input_ids[batch_size:num_input_tokens] = 0
                 self.hidden_states[batch_size:num_input_tokens].fill_(0)
@@ -511,8 +518,8 @@ def _propose(
                 prefill_metadata.seq_lens_list = prefill_metadata.seq_lens.tolist(
                 )
                 prefill_metadata.context_lens = attn_metadata_i.seq_lens
-                prefill_metadata.input_positions = self.positions[:
-                                                                  num_input_tokens]
+                prefill_metadata.input_positions = self._get_positions(
+                    num_input_tokens)
                 prefill_metadata.max_seq_lens += 1
                 prefill_metadata.max_seq_lens = min(
                     prefill_metadata.max_seq_lens,
@@ -527,8 +534,8 @@ def _propose(
                 decode_metadata.seq_lens_list = decode_seq_lens_list + [
                     0
                 ] * (graph_pad_size - len(decode_seq_lens_list))
-                decode_metadata.input_positions = self.positions[:
-                                                                 num_input_tokens]
+                decode_metadata.input_positions = self._get_positions(
+                    num_input_tokens)
                 decode_metadata.max_seq_lens += 1
                 decode_metadata.max_seq_lens = min(
                     decode_metadata.max_seq_lens,
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 7409d21c2e1..2d5546632ad 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -104,7 +104,7 @@
 from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
                                enable_sp, get_ascend_device_type, is_moe_model,
                                lmhead_tp_enable, maybe_trans_nz,
-                               set_weight_prefetch_method)
+                               set_weight_prefetch_method, vllm_version_is)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 from vllm_ascend.worker.pcp_utils import PCPManager
@@ -578,8 +578,12 @@ def _prepare_inputs(
         if (self.use_aclgraph and
                 total_num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
             # Add padding to the batch size.
-            num_input_tokens = self.vllm_config.pad_for_cudagraph(
-                total_num_scheduled_tokens)
+            if vllm_version_is('0.14.0'):
+                num_input_tokens = self.vllm_config.pad_for_cudagraph(
+                    total_num_scheduled_tokens)
+            else:
+                num_input_tokens = self.cudagraph_dispatcher._bs_to_padded_graph_size[
+                    total_num_scheduled_tokens]
         elif self.use_aclgraph and enable_sp(self.vllm_config):
             # When using aclgraph, if total_num_scheduled_tokens exceeds the maximum graph size,
             # the model will fall back to running its FX graph in eager mode.
@@ -1511,9 +1515,17 @@ def execute_model(
                 head_dim=self.model_config.get_vocab_size(),
                 generators=self.input_batch.sampling_metadata.generators)
 
+        # Encoder-decoder models can only compile the pure decode steps where no
+        # encoder inputs are present. Use eager for the first pass.
+        num_encoder_reqs = len(scheduler_output.scheduled_encoder_inputs)
+        has_encoder_input = (
+            self.model_config.is_encoder_decoder and num_encoder_reqs > 0
+        )
+
         # Run forward pass
         with ProfileExecuteDuration().capture_async("forward"):
-            with set_ascend_forward_context(
+            with (
+                set_ascend_forward_context(
                     attn_metadata,
                     self.vllm_config,
                     num_tokens=num_input_tokens,
@@ -1522,26 +1534,18 @@ def execute_model(
                     batch_descriptor=batch_descriptor,
                     num_actual_tokens=scheduler_output.
                     total_num_scheduled_tokens,
-                    model_instance=self.model):
-                self.maybe_setup_kv_connector(scheduler_output)
-
+                    model_instance=self.model,
+                    skip_compiled=has_encoder_input),
+                self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
+            ):
                 hidden_states = self._generate_process_reqs_hidden_states(
                     maybe_padded_num_tokens, input_ids, positions,
                     intermediate_tensors, inputs_embeds, model_kwargs)
-                self.maybe_wait_for_kv_save()
-                finished_sending, finished_recving = self.get_finished_kv_transfer(
-                    scheduler_output)
-
             aux_hidden_states = None
             if self.use_aux_hidden_state_outputs:
                 hidden_states, aux_hidden_states = hidden_states
 
-            kv_connector_output = KVConnectorOutput(
-                finished_sending=finished_sending,
-                finished_recving=finished_recving)
-            finished_sending = None
-            finished_recving = None
-
         with ProfileExecuteDuration().capture_async("post process"):
             # Broadcast PP output for external_launcher (torchrun)
             # to make sure we are synced across pp ranks
diff --git a/vllm_ascend/worker/v2/aclgraph_utils.py b/vllm_ascend/worker/v2/aclgraph_utils.py
index 1fab82d246d..06f43309d90 100644
--- a/vllm_ascend/worker/v2/aclgraph_utils.py
+++ b/vllm_ascend/worker/v2/aclgraph_utils.py
@@ -22,7 +22,6 @@
 import torch
 import torch.nn as nn
 from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.worker.gpu.block_table import BlockTables
 from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager
@@ -31,6 +30,12 @@
 from vllm.v1.worker.gpu.input_batch import InputBuffers
 
 from vllm_ascend.worker.v2.utils import torch_cuda_wrapper
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is('0.14.0'):
+    from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
+else:
+    from vllm.v1.attention.backend import AttentionMetadataBuilder
 
 
 class AclGraphManager(CudaGraphManager):
diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py
index e8ed5a28445..df473377e79 100644
--- a/vllm_ascend/worker/v2/attn_utils.py
+++ b/vllm_ascend/worker/v2/attn_utils.py
@@ -23,13 +23,18 @@
 import numpy as np
 import torch
 from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.kv_cache_interface import EncoderOnlyAttentionSpec, KVCacheConfig
 
 from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
                                          AscendPrefillContextParallelMetadata)
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is('0.14.0'):
+    from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
+else:
+    from vllm.v1.attention.backend import AttentionMetadataBuilder
 
 _ATTENTION_MASK_BUILDER = None
diff --git a/vllm_ascend/worker/v2/sample/penalties.py b/vllm_ascend/worker/v2/sample/penalties.py
index aaec6cee810..59aff72a5ce 100644
--- a/vllm_ascend/worker/v2/sample/penalties.py
+++ b/vllm_ascend/worker/v2/sample/penalties.py
@@ -20,7 +20,7 @@
 import torch
 from vllm.triton_utils import tl, triton
-from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
+from vllm.v1.sample.metadata import SamplingMetadata
 
 
 @triton.jit
diff --git a/vllm_ascend/worker/v2/sample/sampler.py b/vllm_ascend/worker/v2/sample/sampler.py
index e54536c7a0b..8989363b901 100644
--- a/vllm_ascend/worker/v2/sample/sampler.py
+++ b/vllm_ascend/worker/v2/sample/sampler.py
@@ -17,7 +17,7 @@
 import torch
 from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
-from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
+from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu.sample.min_p import apply_min_p
 from vllm.v1.worker.gpu.sample.sampler import Sampler