diff --git a/tests/unit_tests/sampler/test_hpu_sampler.py b/tests/unit_tests/sampler/test_hpu_sampler.py index 560a630085..1f0f2d51c7 100644 --- a/tests/unit_tests/sampler/test_hpu_sampler.py +++ b/tests/unit_tests/sampler/test_hpu_sampler.py @@ -13,7 +13,7 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.sampler import Sampler -from vllm.model_executor.utils import set_random_seed +from vllm.utils.torch_utils import set_random_seed from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.utils.platform_utils import is_pin_memory_available diff --git a/vllm_gaudi/v1/spec_decode/hpu_eagle.py b/vllm_gaudi/v1/spec_decode/hpu_eagle.py index dba9e5189f..7a0d508b0a 100644 --- a/vllm_gaudi/v1/spec_decode/hpu_eagle.py +++ b/vllm_gaudi/v1/spec_decode/hpu_eagle.py @@ -174,6 +174,7 @@ def prepare_attn_metadata( positions, model_runner): # Prepare attn metadata on CPU. (Improve for pure HPU based attn metadata preparation) + block_size = model_runner.block_size batch_size = positions.shape[0] exceeds_max_model_len = positions >= self.max_model_len clamped_positions = torch.where(exceeds_max_model_len, 0, positions) @@ -186,7 +187,7 @@ def prepare_attn_metadata( # block_tables_list is a nested list of shape [num_seq, num_blocks] # num_blocks should include the slots needed for the current token # positions are the context lengths, and we need +1 for num_blocks - num_blocks = torch.ceil((positions + 1) / self.block_size).int() + num_blocks = torch.ceil((positions + 1) / block_size).int() num_blocks = num_blocks[:num_seq].tolist() block_tables_list = [] for i, n in enumerate(num_blocks): @@ -198,7 +199,7 @@ def prepare_attn_metadata( # Compute slot mapping in [batch_size, 1] shape clamped_positions = clamped_positions.view(-1, 1) - block_numbers = clamped_positions // self.block_size + block_numbers = clamped_positions // block_size # Limit with num_seq because block_table_cpu_tensor is in the shape [num_seq, x] block_numbers = block_numbers.to(torch.int64)[:num_seq] @@ -208,8 +209,8 @@ def prepare_attn_metadata( block_ids.apply_(model_runner.defragmenter.resolve) # Calculate the slot mapping and fill with padding - slot_mapping = block_ids * self.block_size + clamped_positions % self.block_size - dummy_slots = itertools.cycle(range(model_runner._PAD_SLOT_ID, model_runner._PAD_SLOT_ID + self.block_size)) + slot_mapping = block_ids * block_size + clamped_positions % block_size + dummy_slots = itertools.cycle(range(model_runner._PAD_SLOT_ID, model_runner._PAD_SLOT_ID + block_size)) slot_mapping[num_seq:].apply_(lambda _, ds=dummy_slots: next(ds)) # Slot mapping needs to be int64 (long) type slot_mapping = slot_mapping.to(torch.int64) @@ -232,7 +233,7 @@ def prepare_attn_metadata( block_groups=block_groups_device, input_positions=None, slot_mapping=slot_mapping_device, - block_size=self.block_size, + block_size=block_size, window_block_list=None, window_block_usage=None, window_block_groups=None, diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 7d3d392d35..1c431ec3be 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -2841,9 +2841,7 @@ def _pool( pooling_metadata = self.input_batch.get_pooling_metadata() seq_lens_cpu = self.seq_lens.cpu[:self.input_batch.num_reqs] - pooling_metadata.build_pooling_cursor(num_scheduled_tokens_np.tolist(), - seq_lens_cpu, - device=hidden_states.device) + pooling_metadata.build_pooling_cursor(num_scheduled_tokens_np, seq_lens_cpu, device=hidden_states.device) num_reqs = self.input_batch.num_reqs @@ -3158,6 +3156,25 @@ def execute_model( return EMPTY_MODEL_RUNNER_OUTPUT # For D case, wait until kv finish load here return self.kv_connector_no_forward(scheduler_output, self.vllm_config) + + if self.input_batch.pooling_params: + (input_ids, position_ids, num_scheduled_tokens, attn_metadata, + total_scheduled_tokens) = self._prepare_inputs_for_pooling(scheduler_output) + + with set_forward_context(attn_metadata, self.vllm_config): + hidden_states = self.model.forward( + input_ids=input_ids, + positions=position_ids, + ) + + flattened = hidden_states.view(-1, hidden_states.shape[-1]) + pooled_output = self._pool( + flattened, + total_scheduled_tokens, + np.array(num_scheduled_tokens, dtype=np.int32), + ) + return pooled_output + self.scheduler_output = scheduler_output self.warmup_mode = warmup_mode self.batch_changed = batch_changed @@ -3233,23 +3250,7 @@ def sample_tokens(self, grammar_output: "GrammarOutput | None") -> ModelRunnerOu # Return [tokD0, tokD1, tokD2, tokP0, tokP1, tokP2] batch_changed = self.batch_changed - if self.input_batch.pooling_params: - (input_ids, position_ids, num_scheduled_tokens, attn_metadata, - total_scheduled_tokens) = self._prepare_inputs_for_pooling(scheduler_output) - with set_forward_context(attn_metadata, self.vllm_config): - hidden_states = self.model.forward( - input_ids=input_ids, - positions=position_ids, - ) - - flattened = hidden_states.view(-1, hidden_states.shape[-1]) - pooled_output = self._pool( - flattened, - total_scheduled_tokens, - np.array(num_scheduled_tokens, dtype=np.int32), - ) - return pooled_output # If necessary, swap decodes/prompts to have all decodes on the start ensure_decodes_first(self.input_batch) @@ -3903,8 +3904,8 @@ def warmup_pooler(self): ) # flattened = hidden_states.view(-1, hidden_states.shape[-1]) - num_scheduled_tokens_list = [query_len] * bs - prompt_lens_cpu = torch.tensor(num_scheduled_tokens_list, dtype=torch.int32, device="cpu") + num_scheduled_tokens_np = np.full(query_len, bs) + prompt_lens_cpu = torch.tensor(num_scheduled_tokens_np, dtype=torch.int32, device="cpu") prompt_token_ids = dummy_input_ids.view(bs, query_len).to(device=device, dtype=torch.int32) supported_tasks = self.get_supported_pooling_tasks() if "embed" in supported_tasks: @@ -3927,8 +3928,8 @@ def warmup_pooler(self): pooling_params=pooling_params_list, pooling_states=[PoolingStates() for _ in range(bs)], ) - seq_lens_cpu = seq_lens_tensor.cpu().tolist() - pooling_metadata.build_pooling_cursor(num_scheduled_tokens_list, seq_lens_cpu, device=hidden_states.device) + seq_lens_cpu = seq_lens_tensor.cpu() + pooling_metadata.build_pooling_cursor(num_scheduled_tokens_np, seq_lens_cpu, device=hidden_states.device) try: _pooler_output = model.pooler(hidden_states=hidden_states, pooling_metadata=pooling_metadata) diff --git a/vllm_gaudi/v1/worker/hpu_worker.py b/vllm_gaudi/v1/worker/hpu_worker.py index 2f00c5cfb5..f1b3878ff3 100644 --- a/vllm_gaudi/v1/worker/hpu_worker.py +++ b/vllm_gaudi/v1/worker/hpu_worker.py @@ -26,8 +26,7 @@ has_kv_transfer_group, ) from vllm.distributed.parallel_state import get_tp_group -from vllm.model_executor import set_random_seed -from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils.torch_utils import (STR_DTYPE_TO_TORCH_DTYPE, set_random_seed) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.outputs import (DraftTokenIds, AsyncModelRunnerOutput, ModelRunnerOutput) from vllm.v1.worker.utils import bind_kv_cache @@ -85,11 +84,6 @@ def __init__( else: self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[self.cache_config.cache_dtype] - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils.import_utils import init_cached_hf_modules - init_cached_hf_modules() - self.gc_track_recompiles = get_config().track_graph_compilation and not get_config().high_level_profiler_enabled self.step = 0 self.profile_steps = get_config().VLLM_PROFILE_STEPS