diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index a5a0a63aaec..bbad7b5aa3a 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -18,7 +18,7 @@ on:
       continue_on_error:
         required: false
         type: boolean
-        default: false
+        default: true
 env:
   UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
   UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml
index 7e24b5d396e..bdb75d25ae0 100644
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029
+          VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

       - name: Checkout repository
diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint
index 9544bba796f..948d88fdf44 100644
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029
+ARG VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index 5e002c65fbe..d9e8b62e7c1 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index aa508c938da..6368a0e44f9 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 35141a7eeda941a60ad5a4956670c60fd5a77029
+      vllm: 14acf429ac08b6d538ca6feb3e06b6d13895804d
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
     name: e2e-light
     strategy:
      matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml
index 2eecbb1cc2e..1b8e6d3837b 100644
--- a/.github/workflows/schedule_codecov_refresh.yaml
+++ b/.github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index c872c1796bb..cdb9d0465aa 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -129,7 +129,7 @@ def __init__(self, vllm_config: "VllmConfig"):
         # when enable_async_exponential is True, AscendSampler will be different from vllm Sampler,
         # which make batch_invariant mode not working.
         # so we disable async exponential when batch_invariant mode is enabled.
-        from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
+        from vllm_ascend.batch_invariant import vllm_is_batch_invariant
         self.enable_async_exponential = (
             bool(additional_config.get("enable_async_exponential", False)) and not vllm_is_batch_invariant()
diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index 6443c5216e2..183c89afda4 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -688,7 +688,20 @@ def full_graph_pa(
             graph_params.handles[num_tokens].append(handle)
         return output

-    def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: AscendMetadata):
+    def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: AscendMetadata, kv_cache=None):
+        # PrefillNoCache doesn't need key_cache, but other modes do
+        # Only initialize/require cache for modes that actually use it
+        if attn_metadata.attn_state != AscendAttentionState.PrefillNoCache:
+            # Initialize cache from kv_cache if not already set (for DecodeOnly mode)
+            if self.key_cache is None and kv_cache is not None:
+                if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2:
+                    self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+                elif isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2:
+                    self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+
+            if self.key_cache is None:
+                raise RuntimeError(f"key_cache is None in _get_fia_params for mode {attn_metadata.attn_state}. kv_cache={kv_cache}")
+
         if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
             block_size = 128
             block_table = None
@@ -766,6 +779,7 @@ def forward_fused_infer_attention(
         value: torch.Tensor,
         attn_metadata: AscendMetadata,
         output: torch.Tensor,
+        kv_cache=None,
     ):
         # we inherit ForwardContext in model runner v2, when enable model
         # runner v2, there is not capturing attribute in forward_context,
@@ -781,7 +795,7 @@ def forward_fused_infer_attention(
             and self.sinks is None
         ):
             return self._forward_fia_slidingwindow(query, attn_metadata, output)
-        key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(key, value, attn_metadata)
+        key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(key, value, attn_metadata, kv_cache)
         num_tokens = attn_metadata.actual_seq_lengths_q[-1]
         query = query[:num_tokens]
         if (
@@ -927,7 +941,7 @@ def forward_impl(
         ):
             output = self.forward_paged_attention(query, attn_metadata, output)
         else:
-            output = self.forward_fused_infer_attention(query, key, value, attn_metadata, output)
+            output = self.forward_fused_infer_attention(query, key, value, attn_metadata, output, kv_cache)

         return output

@@ -963,6 +977,16 @@ def forward(
         num_tokens = query.shape[0]
         if attn_metadata is None:
             return output.fill_(0)
+
+        # Initialize key_cache and value_cache from kv_cache if not already set.
+        # This is needed for DecodeOnly mode where key/value are None but we still
+        # need access to the cache for attention computation.
+        if self.key_cache is None and kv_cache is not None:
+            if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2:
+                self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+            elif isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2:
+                self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+
         output_padded = None
         if key is not None and value is not None:
             output_padded = output
diff --git a/vllm_ascend/batch_invariant.py b/vllm_ascend/batch_invariant.py
index 7f27fd6e4ef..20270068eb7 100644
--- a/vllm_ascend/batch_invariant.py
+++ b/vllm_ascend/batch_invariant.py
@@ -20,14 +20,28 @@
 import torch
 import torch_npu
+import vllm.envs as envs
 from vllm.logger import logger
-from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
 from vllm.triton_utils import HAS_TRITON

 # in case recursive call in reduce_sum.
 torch_sum = torch.sum


+def vllm_is_batch_invariant() -> bool:
+    """Check if batch-invariant mode is enabled.
+
+    This is a compatibility wrapper for the vllm function that was removed
+    in recent upstream vLLM refactoring.
+ """ + # Try to access from envs module, fall back to environment variable + if hasattr(envs, 'VLLM_BATCH_INVARIANT'): + return bool(envs.VLLM_BATCH_INVARIANT) + else: + # Fallback to environment variable for older vLLM versions + return bool(int(os.getenv("VLLM_BATCH_INVARIANT", "0"))) + + if HAS_TRITON: from vllm_ascend.ops.triton.batch_invariant.matmul import ( addmm_batch_invariant, diff --git a/vllm_ascend/kv_offload/npu.py b/vllm_ascend/kv_offload/npu.py index bd68ed16b27..9cfe4371f3a 100644 --- a/vllm_ascend/kv_offload/npu.py +++ b/vllm_ascend/kv_offload/npu.py @@ -5,12 +5,21 @@ from vllm.v1.attention.backend import AttentionBackend # type: ignore from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager -from vllm.v1.kv_offload.backends.cpu import CPUBackend -from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.spec import OffloadingSpec from vllm.v1.kv_offload.worker.worker import OffloadingHandler +# Handle import compatibility with different vLLM versions +try: + from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager +except ModuleNotFoundError: + # Fallback for older vLLM versions where the path might be different + try: + from vllm.v1.kv_offload.cpu_manager import CPUOffloadingManager # noqa: F401 + except ModuleNotFoundError: + # If still not found, let it fail at usage time with better error message + CPUOffloadingManager = None # type: ignore + from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler @@ -36,8 +45,9 @@ def get_manager(self) -> OffloadingManager: assert len(self.gpu_block_size) == 1 gpu_block_size = self.gpu_block_size[0] offloaded_block_size = gpu_block_size * self.block_size_factor - self._manager = LRUOffloadingManager( - CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks), + self._manager = CPUOffloadingManager( + block_size=offloaded_block_size, + num_blocks=self.num_cpu_blocks, enable_events=enable_events, ) return self._manager diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py index 1d420a14bab..c308c56daec 100644 --- a/vllm_ascend/ops/mla.py +++ b/vllm_ascend/ops/mla.py @@ -183,7 +183,10 @@ def mla_forward( attn_metadata = forward_context.attn_metadata[self.mla_attn.layer_name] else: attn_metadata = forward_context.attn_metadata - kv_cache = self.mla_attn.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0] + if vllm_version_is("0.18.0"): + kv_cache = self.mla_attn.kv_cache[forward_context.virtual_engine] + else: + kv_cache = self.mla_attn.kv_cache self.mla_attn.impl.forward( self.mla_attn.layer_name, hidden_states, kv_cache, attn_metadata, need_gather_q_kv, output ) diff --git a/vllm_ascend/patch/worker/patch_qwen3_5.py b/vllm_ascend/patch/worker/patch_qwen3_5.py index 3c78d2f9fad..133bd30eff6 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_5.py +++ b/vllm_ascend/patch/worker/patch_qwen3_5.py @@ -135,7 +135,10 @@ def _forward_core( non_spec_token_indx = attn_metadata.non_spec_token_indx spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501 non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501 - self_kv_cache = self.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0] + if vllm_version_is("0.18.0"): + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + else: + self_kv_cache = self.kv_cache conv_state = 
self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] num_actual_tokens = attn_metadata.num_actual_tokens diff --git a/vllm_ascend/patch/worker/patch_qwen3_next.py b/vllm_ascend/patch/worker/patch_qwen3_next.py index ff7e0c2292a..642a90f681e 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_next.py +++ b/vllm_ascend/patch/worker/patch_qwen3_next.py @@ -125,7 +125,10 @@ def _forward_core( non_spec_token_indx = attn_metadata.non_spec_token_indx spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501 non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501 - self_kv_cache = self.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0] + if vllm_version_is("0.18.0"): + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + else: + self_kv_cache = self.kv_cache conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] num_actual_tokens = attn_metadata.num_actual_tokens diff --git a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py index 1bd00e0c058..21cb03951e7 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py +++ b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py @@ -44,8 +44,7 @@ def bind_kv_cache( # Bind kv_caches to forward context for layer_name, kv_cache in kv_caches.items(): - # NOTE: Use list because of v0 PP virtual engine. - forward_context[layer_name].kv_cache = [kv_cache] + forward_context[layer_name].kv_cache = kv_cache utils.bind_kv_cache = bind_kv_cache diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index 68082152aed..13d7ab8e474 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -1,5 +1,5 @@ import torch -from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant +from vllm_ascend.batch_invariant import vllm_is_batch_invariant from vllm.triton_utils import HAS_TRITON from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index e47f4590908..f13237818ef 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -91,6 +91,9 @@ class SpecDecodeBaseProposer(EagleProposer): def __init__(self, vllm_config: VllmConfig, device: torch.device, pass_hidden_states_to_model: bool, runner=None): super().__init__(vllm_config, device, runner) + # Assign runner before it's used in the methods below + self.runner = runner + self.use_async_scheduling = self.vllm_config.scheduler_config.async_scheduling self.pass_hidden_states_to_model = pass_hidden_states_to_model self.decode_threshold = 1 + self.num_speculative_tokens diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 7e773432f40..9eb164378ab 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -259,7 +259,7 @@ def enable_custom_op(): Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component. Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device(). 
""" - from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant + from vllm_ascend.batch_invariant import vllm_is_batch_invariant global _CUSTOM_OP_ENABLED diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index bac0cba61ec..c45b52fc596 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -221,6 +221,38 @@ class ExecuteModelState(NamedTuple): class NPUModelRunner(GPUModelRunner): + @staticmethod + def _get_device_tensor(buf): + """Get device tensor from either CpuGpuBuffer or direct Tensor. + + Compatibility wrapper for handling both old (CpuGpuBuffer) and new + (direct Tensor) versions of vLLM. + """ + return buf.gpu if hasattr(buf, 'gpu') else buf + + @staticmethod + def _get_buffer_gpu(buf): + """Get GPU tensor from either CpuGpuBuffer or direct Tensor. + + For CpuGpuBuffer: returns buf.gpu + For plain Tensor on device: returns buf directly + """ + if hasattr(buf, 'gpu'): + return buf.gpu + else: + # Plain tensor - already on device + return buf + + @staticmethod + def _safe_copy_to_gpu(buf, *args, **kwargs): + """Safely copy buffer to GPU, handling both CpuGpuBuffer and plain Tensor. + + For CpuGpuBuffer: calls copy_to_gpu() + For plain Tensor: no-op (already on device) + """ + if hasattr(buf, 'copy_to_gpu'): + buf.copy_to_gpu(*args, **kwargs) + def __init__(self, vllm_config: VllmConfig, device: torch.device): # TODO(qcs): These manual pad and unpad for GPUModelRunner are # used to expand some buffers, which need to be reverted after @@ -260,6 +292,16 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.sampler = AscendSampler() self.attn_state: AscendAttentionState | None = None + # Ensure query_pos is initialized (parent class should do this, but add + # fallback for compatibility with upstream vLLM versions) + if not hasattr(self, 'query_pos'): + arange_size = max(self.max_num_reqs + 1, self.max_num_tokens) + self.query_pos = self._make_buffer(arange_size, dtype=torch.int64) + logger.warning( + "query_pos was not initialized by parent GPUModelRunner, " + "initializing fallback buffer of size %d", arange_size + ) + # Ascend-specific configurations self.ascend_config = get_ascend_config() set_weight_prefetch_method(self.ascend_config.weight_prefetch_config) @@ -333,8 +375,20 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.use_sparse, ) # TODO(zhenwenqi) after https://github.com/vllm-project/vllm/pull/28988 is merged, we can delete this + # Keep input_ids as CpuGpuBuffer (has .copy_to_gpu() method used by upstream) + # But positions must be a plain tensor (needs to be directly subscriptable) self.input_ids = self._make_buffer(max_buffer_num_tokens, dtype=torch.int32) - self.positions = self._make_buffer(max_buffer_num_tokens, dtype=torch.int64) + self.positions = torch.zeros( + max_buffer_num_tokens, dtype=torch.int64, device=self.device + ) + + # Create a CPU numpy buffer for positions computation when + # self.positions is a plain tensor (non-CpuGpuBuffer case). 
+        self._positions_cpu_buf = torch.zeros(
+            max_buffer_num_tokens, dtype=torch.int64,
+            pin_memory=self.pin_memory,
+        )
+        self._positions_np_buf = self._positions_cpu_buf.numpy()

         self._set_up_drafter()

@@ -579,19 +633,28 @@ def _pad_query_start_loc_for_fia(
             # Uniform-batch case: num_reqs must be no greater than num_reqs_padded
             assert num_reqs <= num_reqs_padded
-            last_loc = self.query_start_loc.np[num_reqs]
-            self.query_start_loc.np[num_reqs + 1 : num_reqs_padded + 1] = (
-                self.arange_np[1 : num_reqs_padded + 1 - num_reqs] * self.uniform_decode_query_len + last_loc
-            )
+            if hasattr(self.query_start_loc, 'np'):
+                last_loc = self.query_start_loc.np[num_reqs]
+                self.query_start_loc.np[num_reqs + 1 : num_reqs_padded + 1] = (
+                    self.arange_np[1 : num_reqs_padded + 1 - num_reqs] * self.uniform_decode_query_len + last_loc
+                )
+            else:
+                last_loc = self.query_start_loc.cpu()[num_reqs].item()
+                self.query_start_loc[num_reqs + 1 : num_reqs_padded + 1] = (
+                    torch.from_numpy(self.arange_np[1 : num_reqs_padded + 1 - num_reqs]).to(self.query_start_loc.dtype) * self.uniform_decode_query_len + last_loc
+                ).to(self.query_start_loc.device)
         else:
             # Mixed-batch case: num_reqs must equal num_reqs_padded
             assert num_reqs == num_reqs_padded
             # Insert a dummy request instead of setting query_start_loc[num_reqs] = num_tokens_padded directly
-            self.query_start_loc.np[num_reqs_padded + 1] = num_tokens_padded
+            if hasattr(self.query_start_loc, 'np'):
+                self.query_start_loc.np[num_reqs_padded + 1] = num_tokens_padded
+            else:
+                self.query_start_loc[num_reqs_padded + 1] = num_tokens_padded
             num_reqs_padded = num_reqs_padded + 1

-        self.query_start_loc.copy_to_gpu()
+        self._safe_copy_to_gpu(self.query_start_loc)

         return num_reqs_padded

@@ -637,8 +700,18 @@ def _prepare_inputs(
         self.with_prefill = with_prefill

         # Get positions.
-        positions_np = self.positions.np[:total_num_scheduled_tokens]
-        cu_num_tokens, arange = self._get_cumsum_and_arange(num_scheduled_tokens)
+        # Use query_pos.np as output buffer for _get_cumsum_and_arange to avoid
+        # corrupting self.arange_np (which is used as both read source and would
+        # be overwritten if used as arange_out, causing aliasing bugs).
+        cu_num_tokens = self._get_cumsum_and_arange(num_scheduled_tokens, self.query_pos.np)
+        arange = self.query_pos.np[:total_num_scheduled_tokens]
+
+        # Handle both CpuGpuBuffer (.np property) and plain Tensor compatibility
+        if hasattr(self.positions, 'np'):
+            positions_np = self.positions.np[:total_num_scheduled_tokens]
+        else:
+            # Plain tensor - need a CPU numpy buffer for computation
+            positions_np = self._positions_np_buf[:total_num_scheduled_tokens]

         np.add(self.input_batch.num_computed_tokens_cpu[req_indices], arange, out=positions_np)
         self.input_batch.block_table.compute_slot_mapping(req_indices, positions_np)
@@ -673,8 +746,12 @@
             # Re-update after PCP split sequences.
             total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs])
             req_indices = np.repeat(self.arange_np[:num_reqs], num_scheduled_tokens)
-            cu_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens)
-            positions_np = self.positions.np[:total_num_scheduled_tokens]
+            cu_num_tokens = self._get_cumsum_and_arange(num_scheduled_tokens, self.query_pos.np)
+            # Handle both CpuGpuBuffer (.np property) and plain Tensor compatibility
+            if hasattr(self.positions, 'np'):
+                positions_np = self.positions.np[:total_num_scheduled_tokens]
+            else:
+                positions_np = self._positions_np_buf[:total_num_scheduled_tokens]
             np.add(
                 self.input_batch.num_computed_tokens_cpu[req_indices],
                 position_pcp[:total_num_scheduled_tokens],
@@ -696,16 +773,18 @@
         # NOTE(woosuk): We use torch.index_select instead of np.take here
        # because torch.index_select is much faster than np.take for large
         # tensors.
+        input_ids_cpu = self.input_ids.cpu if hasattr(self.input_ids, 'cpu') and isinstance(self.input_ids.cpu, torch.Tensor) else self.input_ids.cpu()
         torch.index_select(
             self.input_batch.token_ids_cpu_tensor.flatten(),
             0,
             token_indices_tensor,
-            out=self.input_ids.cpu[:total_num_scheduled_tokens],
+            out=input_ids_cpu[:total_num_scheduled_tokens],
         )
         if self.enable_prompt_embeds:
             is_token_ids = self.input_batch.is_token_ids_tensor.flatten()
+            is_token_ids_cpu = self.is_token_ids.cpu if hasattr(self.is_token_ids, 'cpu') and isinstance(self.is_token_ids.cpu, torch.Tensor) else self.is_token_ids.cpu()
             torch.index_select(
-                is_token_ids, 0, token_indices_tensor, out=self.is_token_ids.cpu[:total_num_scheduled_tokens]
+                is_token_ids, 0, token_indices_tensor, out=is_token_ids_cpu[:total_num_scheduled_tokens]
             )

         # Because we did not pre-allocate a massive prompt_embeds CPU tensor on
@@ -740,54 +819,105 @@
                     actual_num_sched = actual_end - start_pos

                     if actual_num_sched > 0:
-                        self.inputs_embeds.cpu[output_idx : output_idx + actual_num_sched].copy_(
+                        inputs_embeds_cpu = self.inputs_embeds.cpu if hasattr(self.inputs_embeds, 'cpu') and isinstance(self.inputs_embeds.cpu, torch.Tensor) else self.inputs_embeds.cpu()
+                        inputs_embeds_cpu[output_idx : output_idx + actual_num_sched].copy_(
                             req_embeds[start_pos:actual_end]
                         )

                     output_idx += num_sched

-        self.query_start_loc.np[0] = 0
-        self.query_start_loc.np[1 : num_reqs + 1] = cu_num_tokens
-        self.query_start_loc.copy_to_gpu()
+        # Handle both CpuGpuBuffer and plain Tensor
+        if hasattr(self.query_start_loc, 'np'):
+            self.query_start_loc.np[0] = 0
+            self.query_start_loc.np[1 : num_reqs + 1] = cu_num_tokens
+        else:
+            self.query_start_loc[0] = 0
+            self.query_start_loc[1 : num_reqs + 1] = torch.from_numpy(cu_num_tokens).to(self.query_start_loc.dtype).to(self.query_start_loc.device)
+        self._safe_copy_to_gpu(self.query_start_loc)
         # Now, query_start_loc is padded.
         # But gdn needs an unpadded one.
         # gdn_query_start_loc is an unpadded version of query_start_loc.
         # TODO delete it if fia's check is removed.
         if self._has_gdn:
-            self.gdn_query_start_loc.np[0] = 0
-            self.gdn_query_start_loc.np[1 : num_reqs + 1] = cu_num_tokens
-            self.gdn_query_start_loc.np[num_reqs + 1 :].fill(cu_num_tokens[-1])
-            self.gdn_query_start_loc.copy_to_gpu()
+            if hasattr(self.gdn_query_start_loc, 'np'):
+                self.gdn_query_start_loc.np[0] = 0
+                self.gdn_query_start_loc.np[1 : num_reqs + 1] = cu_num_tokens
+                self.gdn_query_start_loc.np[num_reqs + 1 :].fill(cu_num_tokens[-1])
+            else:
+                self.gdn_query_start_loc[0] = 0
+                self.gdn_query_start_loc[1 : num_reqs + 1] = torch.from_numpy(cu_num_tokens).to(self.gdn_query_start_loc.dtype).to(self.gdn_query_start_loc.device)
+                self.gdn_query_start_loc[num_reqs + 1 :].fill_(cu_num_tokens[-1])
+            self._safe_copy_to_gpu(self.gdn_query_start_loc)
+
+        # Handle both CpuGpuBuffer (.np property) and plain Tensor compatibility
+        if hasattr(self.seq_lens, 'np'):
+            self.seq_lens.np[:num_reqs] = self.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens
+        else:
+            # Plain tensor - convert to tensor and assign
+            computed_tokens_tensor = torch.from_numpy(self.input_batch.num_computed_tokens_cpu[:num_reqs]).to(self.seq_lens.dtype)
+            self.seq_lens[:num_reqs] = computed_tokens_tensor + num_scheduled_tokens

-        self.seq_lens.np[:num_reqs] = self.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens
-        self.seq_lens.cpu[num_reqs:].fill_(0)
-        self.seq_lens.copy_to_gpu()
+        if hasattr(self.seq_lens, 'np'):
+            # CpuGpuBuffer - .cpu is a property
+            self.seq_lens.cpu[num_reqs:].fill_(0)
+        else:
+            # Plain tensor on GPU - fill directly on the tensor
+            self.seq_lens[num_reqs:].fill_(0)
+
+        self._safe_copy_to_gpu(self.seq_lens)

         # Fill unused with -1. Needed for reshape_and_cache in attention_cp
-        self.query_start_loc.gpu[num_reqs + 1 :].fill_(-1)
+        if hasattr(self.query_start_loc, 'gpu'):
+            self.query_start_loc.gpu[num_reqs + 1 :].fill_(-1)
+        else:
+            # Plain tensor - already on device
+            self.query_start_loc[num_reqs + 1 :].fill_(-1)
+
+        # Build prev_positions mapping for async scheduling input_ids handling.
+        self._compute_prev_positions(num_reqs)

         # Copy the tensors to the NPU.
-        self._prepare_input_ids(scheduler_output, total_num_scheduled_tokens, cu_num_tokens)
+        self._prepare_input_ids(scheduler_output, num_reqs, total_num_scheduled_tokens, cu_num_tokens)

         # Calculate M-RoPE positions.
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
             self._calc_mrope_positions(scheduler_output)
-            self.mrope_positions.gpu.copy_(
-                self.mrope_positions.cpu,
-                non_blocking=True,
-            )
+            if hasattr(self.mrope_positions, 'gpu'):
+                self.mrope_positions.gpu.copy_(
+                    self.mrope_positions.cpu,
+                    non_blocking=True,
+                )
+            else:
+                # Plain tensor - already on GPU, no-op or move from CPU if needed
+                mrope_positions_cpu = self.mrope_positions.cpu if hasattr(self.mrope_positions, 'cpu') and isinstance(self.mrope_positions.cpu, torch.Tensor) else self.mrope_positions.cpu()
+                self.mrope_positions.copy_(mrope_positions_cpu, non_blocking=True)
         elif self.uses_xdrope_dim > 0:
             self._calc_xdrope_positions(scheduler_output)
             # Only relevant for models using XD-RoPE (e.g, HunYuan-VL)
-            self.xdrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
-                self.xdrope_positions.cpu[:, :total_num_scheduled_tokens],
-                non_blocking=True,
-            )
+            xdrope_cpu = self.xdrope_positions.cpu if hasattr(self.xdrope_positions, 'cpu') and isinstance(self.xdrope_positions.cpu, torch.Tensor) else self.xdrope_positions.cpu()
+            if hasattr(self.xdrope_positions, 'gpu'):
+                self.xdrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
+                    xdrope_cpu[:, :total_num_scheduled_tokens],
+                    non_blocking=True,
+                )
+            else:
+                # Plain tensor - already on GPU, copy from CPU version
+                self.xdrope_positions[:, :total_num_scheduled_tokens].copy_(
+                    xdrope_cpu[:, :total_num_scheduled_tokens],
+                    non_blocking=True,
+                )
         else:
             # Common case (1D positions)
-            self.positions.copy_to_gpu(total_num_scheduled_tokens)
+            if hasattr(self.positions, 'copy_to_gpu'):
+                self.positions.copy_to_gpu(total_num_scheduled_tokens)
+            else:
+                # Plain tensor: copy from CPU numpy buffer to GPU
+                self.positions[:total_num_scheduled_tokens].copy_(
+                    self._positions_cpu_buf[:total_num_scheduled_tokens],
+                    non_blocking=True,
+                )

         # Record the index of requests that should not be sampled,
         # so that we could clear the sampled tokens before returning
@@ -805,12 +935,20 @@
             )
             discard_requests_mask = original_seq_lens_np < num_tokens_np
         else:
-            discard_requests_mask = self.seq_lens.np[:num_reqs] < num_tokens_np
+            # Handle both CpuGpuBuffer and plain Tensor
+            if hasattr(self.seq_lens, 'np'):
+                discard_requests_mask = self.seq_lens.np[:num_reqs] < num_tokens_np
+            else:
+                discard_requests_mask = self.seq_lens.cpu().numpy()[:num_reqs] < num_tokens_np
         discard_request_indices = np.nonzero(discard_requests_mask)[0]
         self.num_discarded_requests = len(discard_request_indices)
-        self.discard_request_indices.np[: self.num_discarded_requests] = discard_request_indices
-        self.discard_request_indices.copy_to_gpu(self.num_discarded_requests)
+        # Handle both CpuGpuBuffer and plain Tensor
+        if hasattr(self.discard_request_indices, 'np'):
+            self.discard_request_indices.np[: self.num_discarded_requests] = discard_request_indices
+        else:
+            self.discard_request_indices[: self.num_discarded_requests] = torch.from_numpy(discard_request_indices).to(self.discard_request_indices.dtype)
+        self._safe_copy_to_gpu(self.discard_request_indices, self.num_discarded_requests)
         use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
         if not use_spec_decode:
             # NOTE(woosuk): Due to chunked prefills, the batch may contain
@@ -825,7 +963,8 @@
                 logits_indices = self.pcp_manager.get_logits_indices(cu_num_tokens, num_reqs, tokens_original)
                 logits_indices = logits_indices.pin_memory().to(self.device, non_blocking=True)
             else:
-                logits_indices = self.query_start_loc.gpu[1 : num_reqs + 1] - 1
+                query_start_loc_gpu = self._get_buffer_gpu(self.query_start_loc)
+                logits_indices = query_start_loc_gpu[1 : num_reqs + 1] - 1
         else:
             # Get the number of draft tokens for each request.
             # Iterate over the dictionary rather than all requests since not all
@@ -857,9 +996,13 @@
             num_sampled_tokens = num_draft_tokens + 1

             # For DECODE only cuda graph of some attention backends (e.g., GDN).
-            self.num_decode_draft_tokens.np[:num_reqs] = num_decode_draft_tokens
-            self.num_decode_draft_tokens.np[num_reqs:].fill(-1)
-            self.num_decode_draft_tokens.copy_to_gpu()
+            if hasattr(self.num_decode_draft_tokens, 'np'):
+                self.num_decode_draft_tokens.np[:num_reqs] = num_decode_draft_tokens
+                self.num_decode_draft_tokens.np[num_reqs:].fill(-1)
+            else:
+                self.num_decode_draft_tokens[:num_reqs] = torch.from_numpy(num_decode_draft_tokens).to(self.num_decode_draft_tokens.dtype).to(self.num_decode_draft_tokens.device)
+                self.num_decode_draft_tokens[num_reqs:].fill_(-1)
+            self._safe_copy_to_gpu(self.num_decode_draft_tokens)
             # save logits_indices for pcp spec decode usage
             self.logits_indices = logits_indices
@@ -972,7 +1115,8 @@ def _calc_spec_decode_metadata(
         # Compute the draft token ids.
         # draft_token_indices:      [  1,  2,  3, 105, 106, 208]
-        draft_token_ids = self.input_ids.gpu[logits_indices]
+        input_ids_gpu = self._get_buffer_gpu(self.input_ids)
+        draft_token_ids = input_ids_gpu[logits_indices]
         draft_token_ids = draft_token_ids[target_logits_indices + 1]
         if self.pcp_size > 1:
             logits_indices = logits_indices_pcp
@@ -1047,9 +1191,9 @@ def propose_draft_token_ids(
         req_scheduled_tokens = scheduler_output.num_scheduled_tokens
         if self.use_cp:
             long_seq_metadata = self.long_seq_metadata  # type: ignore
-            input_ids_pcp_full = self.pcp_manager.input_ids_pcp_full.gpu
-            query_start_loc_pcp_full = self.pcp_manager.query_start_loc_pcp_full.gpu
-            query_start_loc_pcp_full_cpu = self.pcp_manager.query_start_loc_pcp_full.cpu
+            input_ids_pcp_full = self._get_buffer_gpu(self.pcp_manager.input_ids_pcp_full)
+            query_start_loc_pcp_full = self._get_buffer_gpu(self.pcp_manager.query_start_loc_pcp_full)
+            query_start_loc_pcp_full_cpu = self.pcp_manager.query_start_loc_pcp_full.cpu if hasattr(self.pcp_manager.query_start_loc_pcp_full, 'cpu') and isinstance(self.pcp_manager.query_start_loc_pcp_full.cpu, torch.Tensor) else self.pcp_manager.query_start_loc_pcp_full.cpu()
             num_reqs = self.input_batch.num_reqs
             num_prefill_reqs = self.pcp_manager.num_prefill_reqs
             num_decode_reqs = self.pcp_manager.num_decode_reqs
@@ -1071,7 +1215,8 @@ def propose_draft_token_ids(
             else:
                 token_indices_to_sample = None
             # input_ids can be None for multimodal models.
-            target_token_ids = self.input_ids.gpu[:num_scheduled_tokens]
+            input_ids_gpu = self._get_buffer_gpu(self.input_ids)
+            target_token_ids = input_ids_gpu[:num_scheduled_tokens]
             target_positions = self._get_positions(num_scheduled_tokens)
             if self.use_aux_hidden_state_outputs:
                 target_hidden_states = torch.cat([h[:num_scheduled_tokens] for h in aux_hidden_states], dim=-1)
@@ -1106,7 +1251,8 @@
                 if self.use_aux_hidden_state_outputs:
                     target_hidden_states = torch.cat([h for h in aux_hidden_states], dim=-1)
             else:
-                target_token_ids = self.input_ids.gpu[token_indices]
+                input_ids_gpu = self._get_buffer_gpu(self.input_ids)
+                target_token_ids = input_ids_gpu[token_indices]
                 target_positions = self._get_positions(token_indices)
                 if self.use_aux_hidden_state_outputs:
                     target_hidden_states = torch.cat([h[token_indices] for h in aux_hidden_states], dim=-1)
@@ -1297,7 +1443,10 @@ def execute_model(
             if enable_sp() and num_tokens_padded == num_tokens_unpadded:
                 if num_reqs_padded > old_num_reqs_padded:
                     num_reqs_padded = old_num_reqs_padded
-                    self.query_start_loc.np[num_reqs_padded + 1] = 0
+                    if hasattr(self.query_start_loc, 'np'):
+                        self.query_start_loc.np[num_reqs_padded + 1] = 0
+                    else:
+                        self.query_start_loc[num_reqs_padded + 1] = 0

             (attn_metadata, spec_decode_common_attn_metadata) = self._build_attention_metadata(
                 num_tokens=num_tokens_unpadded
@@ -1677,7 +1826,10 @@ def _bookkeeping_sync(
         list[int],
     ]:
         # TODO: implement PR 28597 from vllm
-        discard_sampled_tokens_req_indices = self.discard_request_indices.np[: self.num_discarded_requests]
+        if hasattr(self.discard_request_indices, 'np'):
+            discard_sampled_tokens_req_indices = self.discard_request_indices.np[: self.num_discarded_requests]
+        else:
+            discard_sampled_tokens_req_indices = self.discard_request_indices.cpu().numpy()[: self.num_discarded_requests]
         for i in discard_sampled_tokens_req_indices:
             gen = self.input_batch.generators.get(int(i))
             if gen is not None:
@@ -2042,11 +2194,18 @@ def _build_attention_metadata(
             # window size when capturing to make sure the correct kernel is selected.
             max_seq_len = self.max_model_len
         else:
-            max_seq_len = self.seq_lens.np[:num_reqs].max().item()
+            if hasattr(self.seq_lens, 'np'):
+                max_seq_len = self.seq_lens.np[:num_reqs].max().item()
+            else:
+                max_seq_len = self.seq_lens.cpu()[:num_reqs].max().item()
         if use_spec_decode and self.need_accepted_tokens:
-            self.num_accepted_tokens.np[:num_reqs] = self.input_batch.num_accepted_tokens_cpu[:num_reqs]
-            self.num_accepted_tokens.np[num_reqs:].fill(1)
-            self.num_accepted_tokens.copy_to_gpu()
+            if hasattr(self.num_accepted_tokens, 'np'):
+                self.num_accepted_tokens.np[:num_reqs] = self.input_batch.num_accepted_tokens_cpu[:num_reqs]
+                self.num_accepted_tokens.np[num_reqs:].fill(1)
+            else:
+                self.num_accepted_tokens[:num_reqs] = torch.from_numpy(self.input_batch.num_accepted_tokens_cpu[:num_reqs]).to(self.num_accepted_tokens.dtype).to(self.num_accepted_tokens.device)
+                self.num_accepted_tokens[num_reqs:].fill_(1)
+            self._safe_copy_to_gpu(self.num_accepted_tokens)

         kv_cache_groups = self.kv_cache_config.kv_cache_groups
@@ -2089,7 +2248,7 @@ def _get_block_table_and_slot_mapping(kv_cache_gid: int):
                 )
             else:
                 blk_table = self.input_batch.block_table[kv_cache_gid]
-                slot_mapping = blk_table.slot_mapping.gpu[:maybe_pcp_full_tokens]
+                slot_mapping = self._get_buffer_gpu(blk_table.slot_mapping)[:maybe_pcp_full_tokens]
                 maybe_num_reqs_padded = num_reqs_padded * self.decode_token_per_req if self.use_cp else num_reqs_padded
                 blk_table_tensor = blk_table.get_device_tensor()[:maybe_num_reqs_padded]
@@ -2111,12 +2270,18 @@
         block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0)
         self.long_seq_metadata, block_table_gid_0 = _get_pcp_metadata(block_table_gid_0)

+        # Handle both CpuGpuBuffer and plain Tensor for CPU access
+        query_start_loc_cpu = self.query_start_loc.cpu if hasattr(self.query_start_loc, 'cpu') and isinstance(self.query_start_loc.cpu, torch.Tensor) else self.query_start_loc.cpu()
+        seq_lens_cpu = self.seq_lens.cpu if hasattr(self.seq_lens, 'cpu') and isinstance(self.seq_lens.cpu, torch.Tensor) else self.seq_lens.cpu()
+        query_start_loc_gpu = self._get_buffer_gpu(self.query_start_loc)
+        seq_lens_gpu = self._get_buffer_gpu(self.seq_lens)
+
         cm_base = AscendCommonAttentionMetadata(
-            query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1],
-            query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1],
-            seq_lens=self.seq_lens.gpu[:num_reqs_padded],
+            query_start_loc=query_start_loc_gpu[: num_reqs_padded + 1],
+            query_start_loc_cpu=query_start_loc_cpu[: num_reqs_padded + 1],
+            seq_lens=seq_lens_gpu[:num_reqs_padded],
             # TODO
-            seq_lens_cpu=self.seq_lens.cpu[:num_reqs_padded],
+            seq_lens_cpu=seq_lens_cpu[:num_reqs_padded],
             # TODO
             num_computed_tokens_cpu=self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs_padded],
             num_reqs=num_reqs_padded,
@@ -2128,7 +2293,7 @@
             causal=True,
             num_input_tokens=num_tokens_padded,
             actual_seq_lengths_q=self.actual_seq_lengths_q,
-            positions=self.positions.gpu,
+            positions=self._get_buffer_gpu(self.positions),
             attn_state=self.attn_state,
             decode_token_per_req=self.decode_token_per_req,
             prefill_context_parallel_metadata=self.long_seq_metadata,
@@ -2154,9 +2319,10 @@ def _build_attn_group_metadata(
         if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder):
             assert ubid is None, "UBatching not supported with GDN yet"
             patch_torch_npu_argsort()
+            num_decode_draft_tokens_cpu = self.num_decode_draft_tokens.cpu if hasattr(self.num_decode_draft_tokens, 'cpu') and isinstance(self.num_decode_draft_tokens.cpu, torch.Tensor) else self.num_decode_draft_tokens.cpu()
             extra_attn_metadata_args = dict(
-                num_accepted_tokens=self.num_accepted_tokens.gpu[:num_reqs_padded],
-                num_decode_draft_tokens_cpu=self.num_decode_draft_tokens.cpu[:num_reqs_padded],
+                num_accepted_tokens=self._get_buffer_gpu(self.num_accepted_tokens)[:num_reqs_padded],
+                num_decode_draft_tokens_cpu=num_decode_draft_tokens_cpu[:num_reqs_padded],
             )

         if for_cudagraph_capture:
@@ -2205,8 +2371,9 @@
             attn_group = self.attn_groups[kv_cache_gid][0]
             builder = attn_group.get_metadata_builder(0)
             if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder):
-                cm.query_start_loc_cpu = self.gdn_query_start_loc.cpu[: num_reqs_padded + 1]
-                cm.query_start_loc = self.gdn_query_start_loc.gpu[: num_reqs_padded + 1]
+                gdn_query_start_loc_cpu = self.gdn_query_start_loc.cpu if hasattr(self.gdn_query_start_loc, 'cpu') and isinstance(self.gdn_query_start_loc.cpu, torch.Tensor) else self.gdn_query_start_loc.cpu()
+                cm.query_start_loc_cpu = gdn_query_start_loc_cpu[: num_reqs_padded + 1]
+                cm.query_start_loc = self._get_buffer_gpu(self.gdn_query_start_loc)[: num_reqs_padded + 1]

             if kv_cache_gid > 0:
                 cm.block_table_tensor, cm.slot_mapping = _get_block_table_and_slot_mapping(kv_cache_gid)
@@ -2348,8 +2515,9 @@ def _dummy_run(
                 num_reqs,
             )
             if self.speculative_config:
-                self.pcp_manager.query_lens_pcp_full.cpu[:num_reqs] = torch.from_numpy(num_scheduled_tokens)
-                self.pcp_manager.query_lens_pcp_full.copy_to_gpu()
+                query_lens_pcp_full_cpu = self.pcp_manager.query_lens_pcp_full.cpu if hasattr(self.pcp_manager.query_lens_pcp_full, 'cpu') and isinstance(self.pcp_manager.query_lens_pcp_full.cpu, torch.Tensor) else self.pcp_manager.query_lens_pcp_full.cpu()
+                query_lens_pcp_full_cpu[:num_reqs] = torch.from_numpy(num_scheduled_tokens)
+                self._safe_copy_to_gpu(self.pcp_manager.query_lens_pcp_full)
         if cudagraph_runtime_mode is None:
             cudagraph_runtime_mode = _cudagraph_mode
         else:
@@ -2392,13 +2560,20 @@
                 if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config)
                 else max_query_len
             )  # type: ignore[assignment]
-            self.seq_lens.np[:num_reqs_padded] = seq_lens
-            self.seq_lens.np[num_reqs_padded:] = 0
-            self.seq_lens.copy_to_gpu()
+            if hasattr(self.seq_lens, 'np'):
+                self.seq_lens.np[:num_reqs_padded] = seq_lens
+                self.seq_lens.np[num_reqs_padded:] = 0
+            else:
+                self.seq_lens[:num_reqs_padded] = torch.tensor(seq_lens, dtype=self.seq_lens.dtype, device=self.seq_lens.device)
+                self.seq_lens[num_reqs_padded:] = 0
+            self._safe_copy_to_gpu(self.seq_lens)

-            cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens)
-            self.query_start_loc.np[1 : num_reqs_padded + 1] = cum_num_tokens
-            self.query_start_loc.copy_to_gpu()
+            cum_num_tokens = self._get_cumsum_and_arange(num_scheduled_tokens, self.arange_np)
+            if hasattr(self.query_start_loc, 'np'):
+                self.query_start_loc.np[1 : num_reqs_padded + 1] = cum_num_tokens
+            else:
+                self.query_start_loc[1 : num_reqs_padded + 1] = torch.from_numpy(cum_num_tokens).to(self.query_start_loc.dtype).to(self.query_start_loc.device)
+            self._safe_copy_to_gpu(self.query_start_loc)
             num_reqs_padded = self._pad_query_start_loc_for_fia(
                 num_tokens_padded, num_reqs_padded, num_reqs, cudagraph_runtime_mode, batch_desc.num_reqs
             )
@@ -2428,17 +2603,17 @@
         assert num_tokens_padded <= self.max_num_tokens
         if self.is_multimodal_model and not self.model_config.is_encoder_decoder or self.enable_prompt_embeds:
             input_ids = None
-            inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
+            inputs_embeds = self._get_device_tensor(self.inputs_embeds)[:num_tokens_padded]
         else:
-            input_ids = self.input_ids.gpu[:num_tokens_padded]
+            input_ids = self._get_device_tensor(self.input_ids)[:num_tokens_padded]
             inputs_embeds = None
         if self.uses_mrope:
-            positions = self.mrope_positions.gpu[:, :num_tokens_padded]
+            positions = self._get_device_tensor(self.mrope_positions)[:, :num_tokens_padded]
         elif self.uses_xdrope_dim > 0:
-            positions = self.xdrope_positions.gpu[:, :num_tokens_padded]
+            positions = self._get_device_tensor(self.xdrope_positions)[:, :num_tokens_padded]
        else:
-            positions = self.positions.gpu[:num_tokens_padded]
+            positions = self._get_device_tensor(self.positions)[:num_tokens_padded]
         # update global cos, sin
         update_cos_sin(positions)
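
Reviewer note on the pattern used throughout vllm_ascend/worker/model_runner_v1.py: the new _get_device_tensor, _get_buffer_gpu and _safe_copy_to_gpu helpers duck-type between vLLM's paired CPU/device buffer (which exposes .np, .cpu, .gpu and copy_to_gpu()) and a plain device tensor, so the same call site works against both old and new upstream versions. The sketch below is illustrative only; FakeCpuGpuBuffer and fill_prefix are hypothetical names invented for the example and are not part of this patch or of vLLM.

import numpy as np
import torch


class FakeCpuGpuBuffer:
    """Minimal stand-in for a paired CPU/device buffer exposing .np/.cpu/.gpu."""

    def __init__(self, size, dtype=torch.int64):
        self.cpu = torch.zeros(size, dtype=dtype)
        self.np = self.cpu.numpy()                  # shares memory with .cpu
        self.gpu = torch.zeros(size, dtype=dtype)   # stands in for the device copy

    def copy_to_gpu(self, n=None):
        n = self.cpu.shape[0] if n is None else n
        self.gpu[:n].copy_(self.cpu[:n])


def fill_prefix(buf, values):
    """Write `values` into the first len(values) slots of either buffer flavor."""
    n = len(values)
    if hasattr(buf, "np"):
        # CpuGpuBuffer-style: stage on the CPU view, then push to the device copy.
        buf.np[:n] = values
        buf.copy_to_gpu()
    else:
        # Plain tensor: it already lives on the target device, write directly.
        buf[:n] = torch.from_numpy(values).to(buf.dtype)


if __name__ == "__main__":
    vals = np.arange(4, dtype=np.int64)
    fill_prefix(FakeCpuGpuBuffer(8), vals)                  # old-style paired buffer
    fill_prefix(torch.zeros(8, dtype=torch.int64), vals)    # new-style plain tensor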
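
A second note on the kv_cache handling in attention_v1.py and ops/mla.py: vLLM 0.18.0 bound a per-virtual-engine list of caches to each layer (hence the forward_context.virtual_engine indexing kept behind vllm_version_is("0.18.0")), while newer vLLM binds the cache directly, either as a stacked tensor with shape[0] == 2 or as a (key, value) pair. The helper below is a hypothetical restatement of the checks the patch performs inline in _get_fia_params and forward; it is not part of the patch.

import torch


def split_kv_cache(kv_cache):
    """Split a bound kv_cache into (key_cache, value_cache), or (None, None).

    Mirrors the inline checks in the patch: a stacked tensor of shape
    [2, num_blocks, ...] or a (key, value) pair are both accepted.
    """
    if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2:
        return kv_cache[0], kv_cache[1]
    if isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2:
        return kv_cache[0], kv_cache[1]
    return None, None


# Toy example: one layer's stacked cache of [key/value, blocks, block_size, heads, head_dim].
toy_cache = torch.zeros(2, 4, 16, 8, 32)
key_cache, value_cache = split_kv_cache(toy_cache)
assert key_cache.shape == (4, 16, 8, 32)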