From a54fc7400171bf5a35d5fa1c74a1e51ed8a10787 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 7 Jan 2026 09:22:12 -0500 Subject: [PATCH 01/28] is_null instead of 0 check Signed-off-by: NickLucche --- vllm/v1/core/kv_cache_manager.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 7f8d804753d2..3ff9f2b7b048 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -84,6 +84,11 @@ def get_unhashed_block_ids(self) -> list[int]: assert len(self.blocks) == 1, "Only one group is supported" return [block.block_id for block in self.blocks[0] if block.block_hash is None] + def get_unhashed_block_ids_all_groups(self) -> list[int]: + """Get block_ids of unhashed blocks from KVCacheBlocks instance.""" + # Skip padding blocks. + return [[block.block_id for block in group if block.block_hash is None and not block.is_null] for group in self.blocks] + def new_empty(self) -> "KVCacheBlocks": """ Creates a new KVCacheBlocks instance with no blocks. From c664dbf5e05c9a5408565e605715b63dee6f9102 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 7 Jan 2026 10:11:00 -0500 Subject: [PATCH 02/28] get_sw_clippped_blocks to fix over-allocation for swa on D Signed-off-by: NickLucche --- .../kv_connector/v1/nixl_connector.py | 52 +++++++++++++++++-- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 87091d650b17..7f02d4cdb235 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -57,6 +57,7 @@ from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata from vllm.v1.attention.backends.utils import get_kv_cache_layout from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import SlidingWindowSpec from vllm.v1.worker.block_table import BlockTable if TYPE_CHECKING: @@ -333,7 +334,7 @@ def __init__( self.kv_transfer_config = vllm_config.kv_transfer_config if role == KVConnectorRole.SCHEDULER: self.connector_scheduler: NixlConnectorScheduler | None = ( - NixlConnectorScheduler(vllm_config, self.engine_id) + NixlConnectorScheduler(vllm_config, self.engine_id, kv_cache_config) ) self.connector_worker: NixlConnectorWorker | None = None elif role == KVConnectorRole.WORKER: @@ -515,10 +516,11 @@ def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None: class NixlConnectorScheduler: """Implementation of Scheduler side methods""" - def __init__(self, vllm_config: VllmConfig, engine_id: str): + def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: Optional["KVCacheConfig"] = None): self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size self.engine_id: EngineId = engine_id + self.kv_cache_config = kv_cache_config self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST self.side_channel_port = ( envs.VLLM_NIXL_SIDE_CHANNEL_PORT @@ -551,12 +553,35 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): # remote prefill or aborted. self._reqs_not_processed: set[ReqId] = set() + # Gather Sliding Window sizes for each kv cache group (if any) + # in number of blocks per SW group. 
+ sw_sizes_tokens = [group.kv_cache_spec.sliding_window if isinstance(group.kv_cache_spec, SlidingWindowSpec) else 0 for group in kv_cache_config.kv_cache_groups] + self.sw_sizes = [n_tokens // self.block_size for n_tokens in sw_sizes_tokens] + print(f"sw_sizes: {self.sw_sizes}\n", flush=True) + def shutdown(self): self._stop_event.set() if self._nixl_handshake_listener_t is not None: self._nixl_handshake_listener_t.join() self._nixl_handshake_listener_t = None + + def get_sw_clippped_blocks(self, block_ids: tuple[list[int], ...]) -> tuple[list[int], ...]: + """ + Clip the number of blocks to the sliding window size for each kv cache group + that employs SWA. + This is necessary because the KV Cache manager initially allocates blocks for + the entire sequence length, and successively cleans up blocks that are outside + the window prior to the `request_finished_all_groups` hook. + """ + # NOTE (NickLucche) This logic is currently handled at the connector level + # because offloading connectors might want to receive the whole sequence even + # for SWA groups. We will abstract this logic once the interface is more stable + assert len(block_ids) == len(self.sw_sizes), "Number of KV cache groups must match" + print("CLIPPING BLOCKS", block_ids) + print("to ", tuple([blocks[-self.sw_sizes[i]:] for i, blocks in enumerate(block_ids)]), "\n", flush=True) + return tuple([blocks[-self.sw_sizes[i]:] for i, blocks in enumerate(block_ids)]) + def set_xfer_handshake_metadata( self, metadata: dict[int, KVConnectorHandshakeMetadata] ) -> None: @@ -705,10 +730,19 @@ def update_state_after_alloc( # a full prefix cache hit on the D worker. We need to call # send_notif in _read_blocks to free the memory on the P. local_block_ids = ( - blocks.get_unhashed_block_ids() + blocks.get_unhashed_block_ids_all_groups() if num_external_tokens > 0 else [] ) + local_block_ids = self.get_sw_clippped_blocks(local_block_ids) + # FIXME we're allocating one more here for the SWA ones, which break len(local)==len(remote)..? + # this is still 17 + print( + f"update_state_after_alloc local_block_ids unhashed: {local_block_ids}\n", + flush=True, + ) + # ok so if num_external_tokens==0, we just record the request here but dont actually + # read from worker, just send_notif # Get unhashed blocks to pull from remote. self._reqs_need_recv[request.request_id] = ( request, @@ -750,9 +784,11 @@ def build_connector_meta( req = req_to_save assert req.kv_transfer_params is not None + new_block_id_groups = self.get_sw_clippped_blocks(new_block_id_groups) meta.add_new_req_to_save( request_id=req_id, - local_block_ids=new_block_id_groups[0], + # FIXME new_block_id_groups[0] when hma is off? + local_block_ids=new_block_id_groups, kv_transfer_params=req.kv_transfer_params, ) assert scheduler_output.num_scheduled_tokens is not None @@ -777,6 +813,8 @@ def build_connector_meta( self._reqs_in_batch = set() self._reqs_not_processed = set() self._reqs_need_send = {} + if len(meta.reqs_to_recv) > 0: + print("build_connector_meta", meta.reqs_to_recv, "\n", flush=True) return meta @@ -838,6 +876,12 @@ def request_finished( self._reqs_need_send[request.request_id] = ( time.perf_counter() + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT ) + # NOTE HMA will "mark" empty/null blocks in groups with 0s (eg SWA ones), + # trimming down after allocating for the whole sequence length. + # Here we "unpad" blocks to send the actual remote blocks to be read. 
+ # Equal to `get_sw_clippped_blocks` in functionality but for P, after + # manager has cleaned up blocks and marked them as null. + block_ids = tuple([block for block in group if block != 0] for group in block_ids) return delay_free_blocks, dict( do_remote_prefill=True, From f284578ca4f0f6e5aa8b7f9f314281615ddc483a Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 7 Jan 2026 10:14:16 -0500 Subject: [PATCH 03/28] fix issue with null blocks on P being one extra (17) by clipping Signed-off-by: NickLucche --- .../kv_transfer/kv_connector/v1/nixl_connector.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 7f02d4cdb235..cf72eab84ddb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -863,7 +863,12 @@ def request_finished( # TODO: check whether block_ids actually ever be 0. If not we could # remove the conditional below - delay_free_blocks = len(block_ids) > 0 + print(f"request_finished block_ids: {block_ids}\n\n", flush=True) + if isinstance(block_ids, tuple): + # FIXME just use kvcache_config to figure out if hma is on + delay_free_blocks = any(len(group) > 0 for group in block_ids) + else: + delay_free_blocks = len(block_ids) > 0 if delay_free_blocks: # Prefill request on remote. It will be read from D upon completion @@ -877,11 +882,11 @@ def request_finished( time.perf_counter() + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT ) # NOTE HMA will "mark" empty/null blocks in groups with 0s (eg SWA ones), - # trimming down after allocating for the whole sequence length. + # trimming down after allocating for the whole sequence length. Empty + # blocks are always at the start of the list. # Here we "unpad" blocks to send the actual remote blocks to be read. - # Equal to `get_sw_clippped_blocks` in functionality but for P, after - # manager has cleaned up blocks and marked them as null. - block_ids = tuple([block for block in group if block != 0] for group in block_ids) + block_ids = self.get_sw_clippped_blocks(block_ids) + print(f"request_finished unpadded block_ids: {block_ids}\n\n", flush=True) return delay_free_blocks, dict( do_remote_prefill=True, From 2e9e384261d1c2ba63ffad0fa5035852f64d0322 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 7 Jan 2026 10:16:21 -0500 Subject: [PATCH 04/28] remove llama4 opt Signed-off-by: NickLucche --- .../kv_connector/v1/nixl_connector.py | 118 ++++-------------- 1 file changed, 22 insertions(+), 96 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index cf72eab84ddb..726d82069f5f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -729,14 +729,14 @@ def update_state_after_alloc( # If remote_blocks and num_external_tokens = 0, we have # a full prefix cache hit on the D worker. We need to call # send_notif in _read_blocks to free the memory on the P. + + # TODO sync with Chen on prefix cache + HMA local_block_ids = ( blocks.get_unhashed_block_ids_all_groups() if num_external_tokens > 0 else [] ) local_block_ids = self.get_sw_clippped_blocks(local_block_ids) - # FIXME we're allocating one more here for the SWA ones, which break len(local)==len(remote)..? 
- # this is still 17 print( f"update_state_after_alloc local_block_ids unhashed: {local_block_ids}\n", flush=True, @@ -1063,10 +1063,6 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config - # TODO(mgoin): remove this once we have hybrid memory allocator - # Optimization for models with local attention (Llama 4) - # List of block window sizes for each layer for local attention - self.block_window_per_layer: list[int | None] = [] self.use_mla = self.model_config.use_mla # Get the attention backend from the first layer @@ -1503,28 +1499,6 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self.register_local_xfer_handler(self.block_size) ) - # TODO(mgoin): Hybrid memory allocator is currently disabled for - # models with local attention (Llama 4). Can remove this once enabled. - if self.model_config.hf_config.model_type == "llama4": - from transformers import Llama4TextConfig - - assert isinstance(self.model_config.hf_text_config, Llama4TextConfig) - llama4_config = self.model_config.hf_text_config - no_rope_layers = llama4_config.no_rope_layers - chunk_size = llama4_config.attention_chunk_size - chunk_block_size = math.ceil(chunk_size / self.block_size) - for layer_idx in range(self.num_layers): - # no_rope_layers[layer_idx] == 0 means NoPE (global) - # Any other value means RoPE (local chunked) - is_local_attention = no_rope_layers[layer_idx] != 0 - block_window = chunk_block_size if is_local_attention else None - self.block_window_per_layer.append(block_window) - logger.debug( - "Llama 4 block window per layer mapping: %s", - self.block_window_per_layer, - ) - assert len(self.block_window_per_layer) == self.num_layers - # After KV Caches registered, listen for new connections. agent_metadata = NixlAgentMetadata( engine_id=self.engine_id, @@ -2354,55 +2328,15 @@ def _read_blocks( # workers will issue xfers to parts of the P worker remote kv caches. # Get descs ids. - local_block_descs_ids: np.ndarray - remote_block_descs_ids: np.ndarray - - if not self.block_window_per_layer: - # Default case: assume global attention - remote_block_descs_ids = self._get_block_descs_ids( - dst_engine_id, - remote_block_ids, - ) - local_block_descs_ids = self._get_block_descs_ids( - self.engine_id, - local_block_ids, - block_size_ratio=block_size_ratio, - ) - else: - # TODO(mgoin): remove this once we have hybrid memory allocator - # Optimization for models with local attention (Llama 4) - local_descs_list = [] - remote_descs_list = [] - for layer_idx, block_window in enumerate(self.block_window_per_layer): - # For each layer: - if block_window is None: - # If not chunked, we just use the - # full block lists (global attention) - layer_local_block_ids = local_block_ids - layer_remote_block_ids = remote_block_ids - else: - # If chunked, get the last block_window blocks - layer_local_block_ids = local_block_ids[-block_window:] - layer_remote_block_ids = remote_block_ids[-block_window:] - - # Get descs ids for the layer. 
- layer_local_desc_ids = self._get_block_descs_ids( - self.engine_id, - layer_local_block_ids, - layer_idx, - block_size_ratio=block_size_ratio, - ) - layer_remote_desc_ids = self._get_block_descs_ids( - dst_engine_id, - layer_remote_block_ids, - layer_idx, - ) - - local_descs_list.append(layer_local_desc_ids) - remote_descs_list.append(layer_remote_desc_ids) - - local_block_descs_ids = np.concatenate(local_descs_list) - remote_block_descs_ids = np.concatenate(remote_descs_list) + remote_block_descs_ids = self._get_block_descs_ids( + dst_engine_id, + remote_block_ids, + ) + local_block_descs_ids = self._get_block_descs_ids( + self.engine_id, + local_block_ids, + block_size_ratio=block_size_ratio, + ) assert len(local_block_descs_ids) == len(remote_block_descs_ids) @@ -2462,30 +2396,22 @@ def get_mapped_blocks(self, block_ids, block_size_ratio): def _get_block_descs_ids( self, engine_id: str, - block_ids: list[int], - layer_idx: int | None = None, + block_ids: tuple[list[int], ...], block_size_ratio: float | None = None, ) -> np.ndarray: """ Get the descs ids for a set of block ids. - If layer_idx is provided, we use the region_ids for the given layer. - Otherwise, we use all regions. + When HMA is enabled number of descriptors across kv cache groups might differ. + A single flattened array is returned for all groups anyway. """ - if layer_idx is None: - region_ids = np.arange(self.num_regions) - else: - assert layer_idx < self.num_layers - if self.num_layers < self.num_regions: - # If we have more regions than layers, we assume that - # the regions are organized as [K0, V0, K1, V1, ...] - # and we select K_i and V_i - assert 2 * self.num_layers == self.num_regions - region_ids = np.arange(2 * layer_idx, 2 * layer_idx + 2) - else: - # Otherwise, we assume we have MLA and select i-th layer - assert self.num_layers == self.num_regions - region_ids = np.arange(layer_idx, layer_idx + 1) - + region_ids = np.arange(self.num_regions) + # NOTE (NickLucche) With HMA, every kv group has the same number of layers and + # layers from different groups share the same kv tensor. + # eg block_ids=[[1, 2], [3]]->blocks [1, 2] need to be read across all regions, + # same for [3], but group0-group1 blocks will always differ (different areas). + # Therefore we can just flatten the block_ids and compute the descs ids for all + # groups at once. 
+ print("get_block_descs_ids", block_ids, "\n") num_blocks = self.dst_num_blocks[engine_id] if block_size_ratio is not None: num_blocks = int(num_blocks * block_size_ratio) From c1234f0a95129c93a073c486242e6b7b37121e79 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 7 Jan 2026 14:07:51 -0500 Subject: [PATCH 05/28] supportshma + scheduler change Signed-off-by: NickLucche --- .../kv_connector/v1/nixl_connector.py | 44 ++++++++++++++----- vllm/v1/core/sched/scheduler.py | 9 ++-- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 726d82069f5f..7eba1f8cd022 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -38,6 +38,7 @@ KVConnectorHandshakeMetadata, KVConnectorMetadata, KVConnectorRole, + SupportsHMA ) from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( KVConnectorPromMetrics, @@ -236,7 +237,7 @@ def compute_nixl_compatibility_hash( @dataclass class RemoteMeta: - block_ids: list[int] + block_ids: tuple[list[int], ...] host: str port: int engine_id: str @@ -245,9 +246,9 @@ class RemoteMeta: @dataclass class ReqMeta: - local_block_ids: list[int] + local_block_ids: tuple[list[int], ...] # To be used when logical block size does not match the kernel block size - local_physical_block_ids: list[int] + local_physical_block_ids: tuple[list[int], ...] tp_size: int remote: RemoteMeta | None = None @@ -275,7 +276,7 @@ def _add_new_req( def add_new_req_to_save( self, request_id: ReqId, - local_block_ids: list[int], + local_block_ids: tuple[list[int], ...], kv_transfer_params: dict[str, Any], ): self.reqs_to_save[request_id] = self._add_new_req( @@ -299,7 +300,8 @@ def add_new_req_to_recv( self.reqs_to_recv[request_id] = req -class NixlConnector(KVConnectorBase_V1): +class NixlConnector(KVConnectorBase_V1, SupportsHMA): + @property def prefer_cross_layer_blocks(self) -> bool: backend = get_current_attn_backend(self._vllm_config) @@ -327,6 +329,7 @@ def __init__( kv_cache_config: "KVCacheConfig | None" = None, ): super().__init__(vllm_config, role, kv_cache_config) + print("NixlConnector init", kv_cache_config.kv_cache_groups, "\n", flush=True) assert vllm_config.kv_transfer_config is not None assert vllm_config.kv_transfer_config.engine_id is not None @@ -397,6 +400,18 @@ def request_finished( ) -> tuple[bool, dict[str, Any] | None]: assert self.connector_scheduler is not None return self.connector_scheduler.request_finished(request, block_ids) + + def request_finished_all_groups( + self, + request: "Request", + block_ids: tuple[list[int], ...], + ) -> tuple[bool, dict[str, Any] | None]: + print( + f"request_finished_all_groups: {request.request_id}, {block_ids}", + flush=True, + ) + return self.connector_scheduler.request_finished(request, block_ids) + def set_xfer_handshake_metadata( self, metadata: dict[int, KVConnectorHandshakeMetadata] @@ -741,8 +756,6 @@ def update_state_after_alloc( f"update_state_after_alloc local_block_ids unhashed: {local_block_ids}\n", flush=True, ) - # ok so if num_external_tokens==0, we just record the request here but dont actually - # read from worker, just send_notif # Get unhashed blocks to pull from remote. 
self._reqs_need_recv[request.request_id] = ( request, @@ -821,7 +834,7 @@ def build_connector_meta( def request_finished( self, request: "Request", - block_ids: list[int], + block_ids: list[int] | tuple[list[int], ...], ) -> tuple[bool, dict[str, Any] | None]: """ Once a request is finished, determine whether request blocks @@ -1072,8 +1085,8 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): self.backend_name = self.attn_backend.get_name() self.kv_cache_layout = get_kv_cache_layout() self.host_buffer_kv_cache_layout = self.kv_cache_layout - logger.debug("Detected attention backend %s", self.backend_name) - logger.debug("Detected kv cache layout %s", self.kv_cache_layout) + logger.info("Detected attention backend %s", self.backend_name) + logger.info("Detected kv cache layout %s", self.kv_cache_layout) # lazy initialized in register_kv_caches self.compat_hash: str | None = None @@ -1412,6 +1425,11 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): for cache in cache_list: base_addr = cache.data_ptr() if base_addr in seen_base_addresses: + # NOTE (NickLucche) HMA employs memory pooling to share tensors + # across groups. This results in skipping all tensors but the ones + # pointed to by group0. Also, generally we will have more blocks + # per tensor but fewer regions. + print(f"layer {layer_name} already seen, skipping", flush=True) continue logger.debug( @@ -2418,8 +2436,11 @@ def _get_block_descs_ids( # Compute the desc ids for each block. region_ids = region_ids[:, None] - block_ids = np.array(block_ids)[None, :] + block_ids = np.concatenate(block_ids)[None, :] descs_ids = region_ids * num_blocks + block_ids + print( + "get_block_descs_ids num output", len(descs_ids.flatten()), "\n", flush=True + ) return descs_ids.flatten() def _logical_to_kernel_block_ids(self, block_ids: list[int]) -> list[int]: @@ -2431,6 +2452,7 @@ def _logical_to_kernel_block_ids(self, block_ids: list[int]) -> list[int]: if self._physical_blocks_per_logical_kv_block == 1: # Noop when physical and logical block sizes are the same return block_ids + # FIXME should you just flatten the tuple here? Result should be the same block_ids_np = np.array(block_ids) block_arange = np.arange(0, self._physical_blocks_per_logical_kv_block).reshape( 1, -1 diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index bf397ad681ca..4ba57e045680 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1990,9 +1990,12 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: self.failed_recving_kv_req_ids.remove(request.request_id) else: # Now that the blocks are ready, actually cache them. - (block_ids,) = self.kv_cache_manager.get_block_ids(request.request_id) - num_computed_tokens = len(block_ids) * self.block_size - # Handle the case where num request tokens less than one block. + # FIXME this should only be changed if hma is enabled support_hma check here! + # (block_ids,) = self.kv_cache_manager.get_block_ids(request.request_id) + block_ids = self.kv_cache_manager.get_block_ids(request.request_id) + # Get number of blocks on full attention layer, we can retrieve at most + # this many tokens + num_computed_tokens = max(len(group) for group in block_ids) * self.block_size # Handle the case where num request tokens less than one block. 
num_computed_tokens = min(num_computed_tokens, request.num_tokens) if num_computed_tokens == request.num_tokens: num_computed_tokens -= 1 From 8cfd981a7bde7140d5c1604b125ab4c491498c91 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 12 Jan 2026 06:46:55 -0500 Subject: [PATCH 06/28] partial prefix cache hit + block_size_ratio + signatures Signed-off-by: NickLucche --- .../kv_connector/v1/nixl_connector.py | 76 ++++++++++++------- vllm/v1/core/kv_cache_manager.py | 2 +- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 7eba1f8cd022..ea7dfbede775 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -68,6 +68,7 @@ TransferHandle = int ReqId = str +BlockIds = list[int] | tuple[list[int], ...] # # NIXL Connector Version @@ -237,7 +238,8 @@ def compute_nixl_compatibility_hash( @dataclass class RemoteMeta: - block_ids: tuple[list[int], ...] + # Non-HMA | HMA blocks + block_ids: BlockIds host: str port: int engine_id: str @@ -246,9 +248,9 @@ class RemoteMeta: @dataclass class ReqMeta: - local_block_ids: tuple[list[int], ...] + local_block_ids: BlockIds # To be used when logical block size does not match the kernel block size - local_physical_block_ids: tuple[list[int], ...] + local_physical_block_ids: BlockIds tp_size: int remote: RemoteMeta | None = None @@ -263,7 +265,7 @@ def __init__(self): def _add_new_req( self, - local_block_ids: list[int], + local_block_ids: BlockIds, kv_transfer_params: dict[str, Any], ) -> ReqMeta: return ReqMeta( @@ -276,7 +278,7 @@ def _add_new_req( def add_new_req_to_save( self, request_id: ReqId, - local_block_ids: tuple[list[int], ...], + local_block_ids: BlockIds, kv_transfer_params: dict[str, Any], ): self.reqs_to_save[request_id] = self._add_new_req( @@ -286,7 +288,7 @@ def add_new_req_to_save( def add_new_req_to_recv( self, request_id: ReqId, - local_block_ids: list[int], + local_block_ids: BlockIds, kv_transfer_params: dict[str, Any], ): req = self._add_new_req(local_block_ids, kv_transfer_params) @@ -326,7 +328,7 @@ def __init__( self, vllm_config: VllmConfig, role: KVConnectorRole, - kv_cache_config: "KVCacheConfig | None" = None, + kv_cache_config: KVCacheConfig, ): super().__init__(vllm_config, role, kv_cache_config) print("NixlConnector init", kv_cache_config.kv_cache_groups, "\n", flush=True) @@ -342,7 +344,7 @@ def __init__( self.connector_worker: NixlConnectorWorker | None = None elif role == KVConnectorRole.WORKER: self.connector_scheduler = None - self.connector_worker = NixlConnectorWorker(vllm_config, self.engine_id) + self.connector_worker = NixlConnectorWorker(vllm_config, self.engine_id, kv_cache_config) ############################################################ # Class Methods @@ -398,6 +400,7 @@ def request_finished( request: "Request", block_ids: list[int], ) -> tuple[bool, dict[str, Any] | None]: + # Hybrid memory allocator (HMA) disabled assert self.connector_scheduler is not None return self.connector_scheduler.request_finished(request, block_ids) @@ -406,10 +409,12 @@ def request_finished_all_groups( request: "Request", block_ids: tuple[list[int], ...], ) -> tuple[bool, dict[str, Any] | None]: + # Hybrid memory allocator (HMA) enabled print( f"request_finished_all_groups: {request.request_id}, {block_ids}", flush=True, ) + assert self.connector_scheduler is not None return 
self.connector_scheduler.request_finished(request, block_ids) @@ -531,7 +536,7 @@ def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None: class NixlConnectorScheduler: """Implementation of Scheduler side methods""" - def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: Optional["KVCacheConfig"] = None): + def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: KVCacheConfig): self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size self.engine_id: EngineId = engine_id @@ -548,6 +553,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: Opt self.use_host_buffer = ( vllm_config.kv_transfer_config.kv_buffer_device == "cpu" ) + self._is_hma_enabled = not vllm_config.scheduler_config.disable_hybrid_kv_cache_manager logger.info("Initializing NIXL Scheduler %s", engine_id) @@ -568,8 +574,8 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: Opt # remote prefill or aborted. self._reqs_not_processed: set[ReqId] = set() - # Gather Sliding Window sizes for each kv cache group (if any) - # in number of blocks per SW group. + # Gather Sliding Window sizes for each kv cache group (if any) in number of + # blocks per KV cache group. This is used to clip the local attention window. sw_sizes_tokens = [group.kv_cache_spec.sliding_window if isinstance(group.kv_cache_spec, SlidingWindowSpec) else 0 for group in kv_cache_config.kv_cache_groups] self.sw_sizes = [n_tokens // self.block_size for n_tokens in sw_sizes_tokens] print(f"sw_sizes: {self.sw_sizes}\n", flush=True) @@ -581,7 +587,7 @@ def shutdown(self): self._nixl_handshake_listener_t = None - def get_sw_clippped_blocks(self, block_ids: tuple[list[int], ...]) -> tuple[list[int], ...]: + def get_sw_clippped_blocks(self, block_ids: BlockIds) -> BlockIds: """ Clip the number of blocks to the sliding window size for each kv cache group that employs SWA. @@ -589,6 +595,9 @@ def get_sw_clippped_blocks(self, block_ids: tuple[list[int], ...]) -> tuple[list the entire sequence length, and successively cleans up blocks that are outside the window prior to the `request_finished_all_groups` hook. """ + if len(block_ids) == 0 or not self._is_hma_enabled: + # No blocks to clip eg Full prefix cache hit + return block_ids # NOTE (NickLucche) This logic is currently handled at the connector level # because offloading connectors might want to receive the whole sequence even # for SWA groups. We will abstract this logic once the interface is more stable @@ -745,7 +754,6 @@ def update_state_after_alloc( # a full prefix cache hit on the D worker. We need to call # send_notif in _read_blocks to free the memory on the P. - # TODO sync with Chen on prefix cache + HMA local_block_ids = ( blocks.get_unhashed_block_ids_all_groups() if num_external_tokens > 0 @@ -756,7 +764,8 @@ def update_state_after_alloc( f"update_state_after_alloc local_block_ids unhashed: {local_block_ids}\n", flush=True, ) - # Get unhashed blocks to pull from remote. + # Get unhashed blocks to pull from remote. Mind that a full prefix + # cache hit is indicated with an empty list. 
self._reqs_need_recv[request.request_id] = ( request, local_block_ids, @@ -834,7 +843,7 @@ def build_connector_meta( def request_finished( self, request: "Request", - block_ids: list[int] | tuple[list[int], ...], + block_ids: BlockIds, ) -> tuple[bool, dict[str, Any] | None]: """ Once a request is finished, determine whether request blocks @@ -877,8 +886,7 @@ def request_finished( # TODO: check whether block_ids actually ever be 0. If not we could # remove the conditional below print(f"request_finished block_ids: {block_ids}\n\n", flush=True) - if isinstance(block_ids, tuple): - # FIXME just use kvcache_config to figure out if hma is on + if self._is_hma_enabled: delay_free_blocks = any(len(group) > 0 for group in block_ids) else: delay_free_blocks = len(block_ids) > 0 @@ -916,7 +924,7 @@ def request_finished( class NixlConnectorWorker: """Implementation of Worker side methods""" - def __init__(self, vllm_config: VllmConfig, engine_id: str): + def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: KVCacheConfig): if NixlWrapper is None: logger.error("NIXL is not available") raise RuntimeError("NIXL is not available") @@ -934,6 +942,8 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config( "backends", ["UCX"] ) + self._is_hma_enabled = not vllm_config.scheduler_config.disable_hybrid_kv_cache_manager + self.kv_cache_config = kv_cache_config # Agent. non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"] @@ -1805,6 +1815,10 @@ def _validate_remote_agent_handshake( # Num kv_heads > tp_size and P TP > D TP case, not supported assert not (tp_ratio < 0 and self.kv_topo.is_kv_replicated(remote_engine_id)) + if self._is_hma_enabled: + assert block_size_ratio == 1, "HMA does not support different" + " remote block size yet" + kv_cache_layout = ( self.kv_cache_layout if not self.use_host_buffer @@ -2268,8 +2282,8 @@ def _read_blocks_for_req(self, req_id: str, meta: ReqMeta): def _read_blocks( self, - local_block_ids: list[int], - remote_block_ids: list[int], + local_block_ids: BlockIds, + remote_block_ids: BlockIds, dst_engine_id: str, request_id: str, remote_request_id: str, @@ -2316,8 +2330,8 @@ def _read_blocks( # Full prefix cache hit: do not need to read remote blocks, # just notify P worker that we have the blocks we need. - num_local_blocks = len(local_block_ids) - if num_local_blocks == 0: + if len(local_block_ids) == 0: + # A full prefix cache hit is indicated with an empty list. agent_name = self._remote_agents[dst_engine_id][remote_rank] try: self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id) @@ -2336,10 +2350,13 @@ def _read_blocks( return # Partial prefix cache hit: just read uncomputed blocks. - num_remote_blocks = len(remote_block_ids) - assert num_local_blocks <= num_remote_blocks - if num_local_blocks < num_remote_blocks: - remote_block_ids = remote_block_ids[-num_local_blocks:] + assert len(remote_block_ids) == len(local_block_ids) == len(self.kv_cache_config.kv_cache_groups) + for i, remote_group in enumerate(remote_block_ids): + num_remote_blocks = len(remote_group) + num_local_blocks = len(local_block_ids[i]) + assert num_local_blocks <= num_remote_blocks + if num_local_blocks < num_remote_blocks: + remote_block_ids[i] = remote_group[-num_local_blocks:] # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from # corresponding rank. 
With heterogeneous TP, fixing D>P, the D tp @@ -2414,7 +2431,7 @@ def get_mapped_blocks(self, block_ids, block_size_ratio): def _get_block_descs_ids( self, engine_id: str, - block_ids: tuple[list[int], ...], + block_ids: BlockIds, block_size_ratio: float | None = None, ) -> np.ndarray: """ @@ -2437,6 +2454,11 @@ def _get_block_descs_ids( # Compute the desc ids for each block. region_ids = region_ids[:, None] block_ids = np.concatenate(block_ids)[None, :] + if self._is_hma_enabled: + block_ids = np.concatenate(block_ids)[None, :] + else: + # FIXME check if this can be folded to equivalent concat + block_ids = np.array(block_ids)[None, :] descs_ids = region_ids * num_blocks + block_ids print( "get_block_descs_ids num output", len(descs_ids.flatten()), "\n", flush=True diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 3ff9f2b7b048..065ef8c0a35a 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -84,7 +84,7 @@ def get_unhashed_block_ids(self) -> list[int]: assert len(self.blocks) == 1, "Only one group is supported" return [block.block_id for block in self.blocks[0] if block.block_hash is None] - def get_unhashed_block_ids_all_groups(self) -> list[int]: + def get_unhashed_block_ids_all_groups(self) -> list[list[int]]: """Get block_ids of unhashed blocks from KVCacheBlocks instance.""" # Skip padding blocks. return [[block.block_id for block in group if block.block_hash is None and not block.is_null] for group in self.blocks] From 06d2669b20ca6e2af1cb895e85ca25d211a88e11 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 12 Jan 2026 10:43:03 -0500 Subject: [PATCH 07/28] block failure handling + block_ratio handling + remove old request_finished signature cruft Signed-off-by: NickLucche --- .../kv_transfer/kv_connector/utils.py | 11 +++ .../kv_connector/v1/nixl_connector.py | 79 ++++++------------- vllm/v1/core/sched/scheduler.py | 35 +++++--- 3 files changed, 61 insertions(+), 64 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index f9367da73710..8fe93d6e49a3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -300,6 +300,17 @@ def yield_req_data( (req_id in cached_reqs.resumed_req_ids for req_id in cached_reqs.req_ids), ) +def get_blocks_in_fa_kv_group(block_ids: tuple[list[int], ...]) -> tuple[list[int], ...]: + """ + Get blocks in the full attention KV group, which we assume to be the largest group. + Note that when HMA is disabled or the model is not hybrid, + a single group is present here. 
+ """ + if not block_ids: + # Full prefix cache hit case + return [] + argmax_i = max(range(len(block_ids)), key=lambda x: len(block_ids[x])) + return block_ids[argmax_i] @dataclass class TpKVTopology: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index ea7dfbede775..c56489e4fc80 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -27,6 +27,7 @@ EngineId, TpKVTopology, get_current_attn_backend, + get_blocks_in_fa_kv_group, kv_postprocess_blksize_and_layout_on_receive, kv_postprocess_blksize_on_receive, kv_postprocess_layout_on_receive, @@ -328,10 +329,9 @@ def __init__( self, vllm_config: VllmConfig, role: KVConnectorRole, - kv_cache_config: KVCacheConfig, + kv_cache_config: "KVCacheConfig", ): super().__init__(vllm_config, role, kv_cache_config) - print("NixlConnector init", kv_cache_config.kv_cache_groups, "\n", flush=True) assert vllm_config.kv_transfer_config is not None assert vllm_config.kv_transfer_config.engine_id is not None @@ -395,25 +395,12 @@ def build_connector_meta( assert self.connector_scheduler is not None return self.connector_scheduler.build_connector_meta(scheduler_output) - def request_finished( - self, - request: "Request", - block_ids: list[int], - ) -> tuple[bool, dict[str, Any] | None]: - # Hybrid memory allocator (HMA) disabled - assert self.connector_scheduler is not None - return self.connector_scheduler.request_finished(request, block_ids) - def request_finished_all_groups( self, request: "Request", block_ids: tuple[list[int], ...], ) -> tuple[bool, dict[str, Any] | None]: # Hybrid memory allocator (HMA) enabled - print( - f"request_finished_all_groups: {request.request_id}, {block_ids}", - flush=True, - ) assert self.connector_scheduler is not None return self.connector_scheduler.request_finished(request, block_ids) @@ -536,7 +523,7 @@ def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None: class NixlConnectorScheduler: """Implementation of Scheduler side methods""" - def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: KVCacheConfig): + def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: "KVCacheConfig"): self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size self.engine_id: EngineId = engine_id @@ -578,7 +565,6 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: KVC # blocks per KV cache group. This is used to clip the local attention window. sw_sizes_tokens = [group.kv_cache_spec.sliding_window if isinstance(group.kv_cache_spec, SlidingWindowSpec) else 0 for group in kv_cache_config.kv_cache_groups] self.sw_sizes = [n_tokens // self.block_size for n_tokens in sw_sizes_tokens] - print(f"sw_sizes: {self.sw_sizes}\n", flush=True) def shutdown(self): self._stop_event.set() @@ -602,8 +588,6 @@ def get_sw_clippped_blocks(self, block_ids: BlockIds) -> BlockIds: # because offloading connectors might want to receive the whole sequence even # for SWA groups. 
We will abstract this logic once the interface is more stable assert len(block_ids) == len(self.sw_sizes), "Number of KV cache groups must match" - print("CLIPPING BLOCKS", block_ids) - print("to ", tuple([blocks[-self.sw_sizes[i]:] for i, blocks in enumerate(block_ids)]), "\n", flush=True) return tuple([blocks[-self.sw_sizes[i]:] for i, blocks in enumerate(block_ids)]) def set_xfer_handshake_metadata( @@ -760,10 +744,7 @@ def update_state_after_alloc( else [] ) local_block_ids = self.get_sw_clippped_blocks(local_block_ids) - print( - f"update_state_after_alloc local_block_ids unhashed: {local_block_ids}\n", - flush=True, - ) + # Get unhashed blocks to pull from remote. Mind that a full prefix # cache hit is indicated with an empty list. self._reqs_need_recv[request.request_id] = ( @@ -809,7 +790,6 @@ def build_connector_meta( new_block_id_groups = self.get_sw_clippped_blocks(new_block_id_groups) meta.add_new_req_to_save( request_id=req_id, - # FIXME new_block_id_groups[0] when hma is off? local_block_ids=new_block_id_groups, kv_transfer_params=req.kv_transfer_params, ) @@ -835,8 +815,6 @@ def build_connector_meta( self._reqs_in_batch = set() self._reqs_not_processed = set() self._reqs_need_send = {} - if len(meta.reqs_to_recv) > 0: - print("build_connector_meta", meta.reqs_to_recv, "\n", flush=True) return meta @@ -885,11 +863,7 @@ def request_finished( # TODO: check whether block_ids actually ever be 0. If not we could # remove the conditional below - print(f"request_finished block_ids: {block_ids}\n\n", flush=True) - if self._is_hma_enabled: - delay_free_blocks = any(len(group) > 0 for group in block_ids) - else: - delay_free_blocks = len(block_ids) > 0 + delay_free_blocks = any(len(group) > 0 for group in block_ids) if delay_free_blocks: # Prefill request on remote. It will be read from D upon completion @@ -907,7 +881,6 @@ def request_finished( # blocks are always at the start of the list. # Here we "unpad" blocks to send the actual remote blocks to be read. block_ids = self.get_sw_clippped_blocks(block_ids) - print(f"request_finished unpadded block_ids: {block_ids}\n\n", flush=True) return delay_free_blocks, dict( do_remote_prefill=True, @@ -924,7 +897,7 @@ def request_finished( class NixlConnectorWorker: """Implementation of Worker side methods""" - def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: KVCacheConfig): + def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: "KVCacheConfig"): if NixlWrapper is None: logger.error("NIXL is not available") raise RuntimeError("NIXL is not available") @@ -1367,7 +1340,8 @@ def request_ready(f: Future[Any], entry=(req_id, meta)): meta=meta, ) if req_meta := self._recving_metadata.get(req_id): - self._invalid_block_ids.update(req_meta.local_block_ids) + local_block_ids = get_blocks_in_fa_kv_group(req_meta.local_block_ids) + self._invalid_block_ids.update(local_block_ids) self._failed_recv_reqs.add(req_id) fut.add_done_callback(request_ready) @@ -1439,7 +1413,6 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): # across groups. This results in skipping all tensors but the ones # pointed to by group0. Also, generally we will have more blocks # per tensor but fewer regions. 
- print(f"layer {layer_name} already seen, skipping", flush=True) continue logger.debug( @@ -2159,7 +2132,10 @@ def _handle_failed_transfer(self, req_id: str, handle: int): """ # Use .get() here as the metadata cleanup is handled by get_finished() if meta := self._recving_metadata.get(req_id): - self._invalid_block_ids.update(meta.local_block_ids) + # For the purpose of marking blocks as invalid, only report FA ones to + # handle blocks<>tokens mapping consistently. + local_block_ids = get_blocks_in_fa_kv_group(meta.local_block_ids) + self._invalid_block_ids.update(local_block_ids) self.nixl_wrapper.release_xfer_handle(handle) self.xfer_stats.record_failed_transfer() @@ -2298,6 +2274,9 @@ def _read_blocks( assert self.kv_topo is not None block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id) if block_size_ratio > 1: + # TODO (NickLucche) assume HMA is off. Change to handle multiple KV groups. + local_block_ids = local_block_ids[0] if local_block_ids else [] + remote_block_ids = remote_block_ids[0] local_block_ids = self.get_mapped_blocks( np.asarray(local_block_ids), block_size_ratio ) @@ -2308,12 +2287,14 @@ def _read_blocks( # prefill block_ids with block_size as 4: # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] # Local decode block_ids with block_size as 16: [1, 2, 3] - # expland ecode block_ids with get_mapped_blocks from [1, 2, 3] to + # expanded decode block_ids with get_mapped_blocks from [1, 2, 3] to # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] # Then we clip local to align with prefill # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] to # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] local_block_ids = local_block_ids[: len(remote_block_ids)] + local_block_ids = tuple(local_block_ids) if local_block_ids else [] + remote_block_ids = tuple(remote_block_ids) # NOTE(rob): having the staging blocks be on the READER side is # not going to work well (since we will have to call rearrange tensors). # after we detect the txn is complete (which means we cannot make the @@ -2403,13 +2384,14 @@ def _read_blocks( remote_rank=remote_rank, ) if meta := self._recving_metadata.get(request_id): - self._invalid_block_ids.update(meta.local_block_ids) + local_block_ids = get_blocks_in_fa_kv_group(meta.local_block_ids) + self._invalid_block_ids.update(local_block_ids) self.xfer_stats.record_failed_transfer() if handle is not None: self.nixl_wrapper.release_xfer_handle(handle) self._failed_recv_reqs.add(request_id) - def get_mapped_blocks(self, block_ids, block_size_ratio): + def get_mapped_blocks(self, block_ids: np.ndarray, block_size_ratio: int) -> np.ndarray: """ Calculates the new set of block IDs by mapping every element in the (potentially sparse) input array. @@ -2446,7 +2428,6 @@ def _get_block_descs_ids( # same for [3], but group0-group1 blocks will always differ (different areas). # Therefore we can just flatten the block_ids and compute the descs ids for all # groups at once. - print("get_block_descs_ids", block_ids, "\n") num_blocks = self.dst_num_blocks[engine_id] if block_size_ratio is not None: num_blocks = int(num_blocks * block_size_ratio) @@ -2454,18 +2435,10 @@ def _get_block_descs_ids( # Compute the desc ids for each block. 
region_ids = region_ids[:, None] block_ids = np.concatenate(block_ids)[None, :] - if self._is_hma_enabled: - block_ids = np.concatenate(block_ids)[None, :] - else: - # FIXME check if this can be folded to equivalent concat - block_ids = np.array(block_ids)[None, :] descs_ids = region_ids * num_blocks + block_ids - print( - "get_block_descs_ids num output", len(descs_ids.flatten()), "\n", flush=True - ) return descs_ids.flatten() - def _logical_to_kernel_block_ids(self, block_ids: list[int]) -> list[int]: + def _logical_to_kernel_block_ids(self, block_ids: BlockIds) -> BlockIds: """ Convert logical block ids to kernel physical block ids. This is required when the logical block size (the one set by the user) @@ -2474,14 +2447,12 @@ def _logical_to_kernel_block_ids(self, block_ids: list[int]) -> list[int]: if self._physical_blocks_per_logical_kv_block == 1: # Noop when physical and logical block sizes are the same return block_ids - # FIXME should you just flatten the tuple here? Result should be the same - block_ids_np = np.array(block_ids) block_arange = np.arange(0, self._physical_blocks_per_logical_kv_block).reshape( 1, -1 ) - return BlockTable.map_to_kernel_blocks( - block_ids_np, self._physical_blocks_per_logical_kv_block, block_arange - ).tolist() + return [BlockTable.map_to_kernel_blocks( + np.array(group), self._physical_blocks_per_logical_kv_block, block_arange + ).tolist() for group in block_ids] def get_backend_aware_kv_block_len(self, layer_idx: int) -> int: """ diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4ba57e045680..039236e108f5 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -116,6 +116,7 @@ def __init__( self.connector = None self.connector_prefix_cache_stats: PrefixCacheStats | None = None self.recompute_kv_load_failures = True + self._connector_supports_hma = False if self.vllm_config.kv_transfer_config is not None: assert not self.is_encoder_decoder, ( "Encoder-decoder models are not currently supported with KV connectors" @@ -131,6 +132,7 @@ def __init__( self.vllm_config.kv_transfer_config.kv_load_failure_policy ) self.recompute_kv_load_failures = kv_load_failure_policy == "recompute" + self._connector_supports_hma = isinstance(self.connector, SupportsHMA) self.kv_event_publisher = EventPublisherFactory.create( self.kv_events_config, @@ -1950,7 +1952,7 @@ def _connector_finished( block_ids = self.kv_cache_manager.get_block_ids(request.request_id) - if not isinstance(self.connector, SupportsHMA): + if not self._connector_supports_hma: # NOTE(Kuntai): We should deprecate this code path after we enforce # all connectors to support HMA. # Hybrid memory allocator should be already turned off for this @@ -1990,12 +1992,11 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: self.failed_recving_kv_req_ids.remove(request.request_id) else: # Now that the blocks are ready, actually cache them. - # FIXME this should only be changed if hma is enabled support_hma check here! 
- # (block_ids,) = self.kv_cache_manager.get_block_ids(request.request_id) block_ids = self.kv_cache_manager.get_block_ids(request.request_id) + # When connector does not support HMA, a single group is present here + num_computed_tokens = max(len(group) for group in block_ids) * self.block_size # Get number of blocks on full attention layer, we can retrieve at most # this many tokens - num_computed_tokens = max(len(group) for group in block_ids) * self.block_size # Handle the case where num request tokens less than one block. num_computed_tokens = min(num_computed_tokens, request.num_tokens) if num_computed_tokens == request.num_tokens: num_computed_tokens -= 1 @@ -2078,8 +2079,11 @@ def _update_requests_with_invalid_blocks( is_affected = False marked_invalid_block = False req_id = request.request_id - # TODO (davidb): add support for hybrid memory allocator - (req_block_ids,) = self.kv_cache_manager.get_block_ids(req_id) + req_block_ids = self.kv_cache_manager.get_block_ids(req_id) + # Assume FA group is present to infer number of computed tokens + # TODO this is not padded for SW right? + fa_blocks_idx = max(range(len(req_block_ids)), key=lambda x: len(req_block_ids[x])) + max_num_blocks = len(req_block_ids[fa_blocks_idx]) # We iterate only over blocks that may contain externally computed # tokens if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: @@ -2088,7 +2092,7 @@ def _update_requests_with_invalid_blocks( req_num_computed_tokens = ( request.num_computed_tokens if req_id in self.failed_recving_kv_req_ids - else len(req_block_ids) * self.block_size + else max_num_blocks * self.block_size ) else: # Sync loading. num_computed_tokens includes new tokens @@ -2097,7 +2101,10 @@ def _update_requests_with_invalid_blocks( req_num_computed_blocks = ( req_num_computed_tokens + self.block_size - 1 ) // self.block_size - for idx, block_id in zip(range(req_num_computed_blocks), req_block_ids): + # For the purpose of marking blocks as invalid, only report FA ones to + # handle blocks<>tokens mapping consistently. + # for idx, block_id in zip(range(req_num_computed_blocks), req_block_ids): + for idx, block_id in zip(range(req_num_computed_blocks), req_block_ids[fa_blocks_idx]): if block_id not in invalid_block_ids: continue @@ -2127,9 +2134,17 @@ def _update_requests_with_invalid_blocks( ) total_affected_tokens += num_affected_tokens request.num_external_computed_tokens -= num_affected_tokens - # collect invalid block and all downstream dependent blocks + # Collect invalid block and all downstream dependent blocks, across + # all groups. if evict_blocks: - blocks_to_evict.update(req_block_ids[idx:]) + # Assuming groups are not padded, do SW-aware eviction, example: + # FA: [A B C D C] + # SW: [ E F] + # =>Evict E only when failure index <= E. 
+ for group in req_block_ids: + offset = max_num_blocks - len(group) + start_idx = max(0, idx - offset) + blocks_to_evict.update(group[start_idx:]) if is_affected: if not marked_invalid_block: From 7198beca3764f9bb41db1aa2ac70de4aa340ec8b Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 12 Jan 2026 12:27:10 -0500 Subject: [PATCH 08/28] update tests hma specific tests Signed-off-by: NickLucche --- .../kv_connector/unit/test_nixl_connector.py | 121 +++++++---- .../unit/test_nixl_connector_hma.py | 202 ++++++++++++++++++ 2 files changed, 282 insertions(+), 41 deletions(-) create mode 100644 tests/v1/kv_connector/unit/test_nixl_connector_hma.py diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 1975d2226073..fc20b8037ab3 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -59,7 +59,7 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin from vllm.v1.worker.utils import AttentionGroup -from .utils import create_request, create_scheduler, create_vllm_config +from .utils import create_request, create_scheduler, create_vllm_config, make_kv_cache_config @pytest.fixture(scope="module", autouse=True) @@ -263,7 +263,7 @@ def test_basic_interface(): req_meta = kv_connector_metadata.reqs_to_recv[request_id] for block_id, block in zip( - req_meta.local_block_ids, + req_meta.local_block_ids[0], scheduler.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks[ request_id ], @@ -327,7 +327,9 @@ def test_kv_transfer_handshake(dist_init): # Prefill connector will register KV cache to populate proper handshake # metadata. - prefill_connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + prefill_connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape( num_blocks=2, block_size=16, num_kv_heads=4, head_size=64 ) @@ -367,13 +369,15 @@ def test_kv_transfer_handshake(dist_init): do_remote_decode=True, ) request.status = RequestStatus.FINISHED_LENGTH_CAPPED - delay, kv_connector_metadata = scheduler.get_kv_connector().request_finished( - request, [0, 1, 2] + delay, kv_connector_metadata = scheduler.get_kv_connector().request_finished_all_groups( + request, ([0, 1, 2],) ) assert delay # Decode connector will be able to create handshake with the prefill connector. - decode_connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + decode_connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) decode_connector.register_kv_caches(kv_caches) # Here we are testing the retrieval of NIXLAgentMetadata. @@ -406,7 +410,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): def __init__( self, *args, hand_shake_latency: float = 1.8, kv_cache_layout="HND", **kwargs ): - super().__init__(*args, **kwargs) + kv_cache_config = make_kv_cache_config(block_size=16) + super().__init__(*args, kv_cache_config=kv_cache_config, **kwargs) self._hand_shake_latency = hand_shake_latency self.kv_cache_layout = kv_cache_layout # Mock register_kv_caches attribute needed for tests that do not call it. @@ -507,7 +512,9 @@ def test_multi_xfer_one_engine( request_id = "req_id" # Test worker role in decode server. 
- connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id, hand_shake_latency=0 ) @@ -528,13 +535,15 @@ def test_multi_xfer_one_engine( num_xfers -= 1 metadata.add_new_req_to_recv( request_id=request_id, - local_block_ids=[num_xfers + 1, num_xfers + 2, num_xfers + 3], + local_block_ids=([num_xfers + 1, num_xfers + 2, num_xfers + 3],), kv_transfer_params={ - "remote_block_ids": [ - num_xfers + 4, - num_xfers + 5, - num_xfers + 6, - ], + "remote_block_ids": ( + [ + num_xfers + 4, + num_xfers + 5, + num_xfers + 6, + ], + ), "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, "remote_request_id": f"prefill-{request_id}", "remote_host": "localhost", @@ -594,16 +603,18 @@ def test_async_load_kv( vllm_config.parallel_config.tensor_parallel_size = decode_tp_size # Test worker role in decode server. - connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id ) metadata = NixlConnectorMetadata() metadata.add_new_req_to_recv( request_id="id", - local_block_ids=[1, 2, 3], + local_block_ids=([1, 2, 3],), kv_transfer_params={ - "remote_block_ids": [4, 5, 6], + "remote_block_ids": ([4, 5, 6],), "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, "remote_request_id": "prefill-id", "remote_host": "localhost", @@ -652,7 +663,9 @@ def test_prefill_tp_size_greater_than_decode_tp_size( local_tp_size = 1 vllm_config.parallel_config.tensor_parallel_size = local_tp_size - connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id, hand_shake_latency=0 ) @@ -717,8 +730,12 @@ def test_prefill_tp_size_greater_than_decode_tp_size_mla( p_tp_size = 2 # Build two separate connectors/workers to emulate P TP=2 ranks. - conn_p0 = NixlConnector(vllm_config, KVConnectorRole.WORKER) - conn_p1 = NixlConnector(vllm_config, KVConnectorRole.WORKER) + conn_p0 = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) + conn_p1 = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) conn_p0.connector_worker = FakeNixlConnectorWorker( vllm_config, conn_p0.engine_id, hand_shake_latency=0 ) @@ -815,7 +832,9 @@ def test_concurrent_load_kv( vllm_config = create_vllm_config() # Test worker role in decode server. 
- connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id ) @@ -827,9 +846,9 @@ def test_concurrent_load_kv( for i in range(total_reqs): metadata.add_new_req_to_recv( request_id=f"id_{i}", - local_block_ids=[1, 2, 3], + local_block_ids=([1, 2, 3],), kv_transfer_params={ - "remote_block_ids": [4, 5, 6], + "remote_block_ids": ([4, 5, 6],), "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, "remote_request_id": f"prefill-id-{i}", "remote_host": "localhost", @@ -884,7 +903,9 @@ def test_handshake_fails_on_kv_cache_layout_mismatch( return_value=2, ): # Initialize connector and worker (with fake NIXL wrapper) - connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id, hand_shake_latency=0 ) @@ -934,7 +955,9 @@ def test_handshake_succeed_on_kv_cache_layout_mismatch_with_experimental( return_value=2, ): # Initialize connector and worker (with fake NIXL wrapper) - connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id, @@ -979,7 +1002,9 @@ def test_kv_connector_stats(default_vllm_config, dist_init): vllm_config = create_vllm_config() # Test worker role in decode server. - connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id, hand_shake_latency=0 ) @@ -993,9 +1018,9 @@ def test_kv_connector_stats(default_vllm_config, dist_init): metadata = NixlConnectorMetadata() metadata.add_new_req_to_recv( request_id=request_id, - local_block_ids=[1, 2, 3], + local_block_ids=([1, 2, 3],), kv_transfer_params={ - "remote_block_ids": [4, 5, 6], + "remote_block_ids": ([4, 5, 6],), "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, "remote_request_id": f"prefill-{request_id}", "remote_host": "localhost", @@ -1448,7 +1473,9 @@ def test_register_kv_caches( mock_get_attn_backend.return_value = backend_cls # Create connector - connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id, hand_shake_latency=0 ) @@ -1676,7 +1703,9 @@ def test_kv_buffer_to_nixl_memory_types( ), ): # noqa: E501 # Create connector and replace its worker with a fake one for isolation - connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) # Verify get_reg_descs was called with the correct memory_type assert connector.connector_worker.kv_buffer_device == kv_buffer_device @@ -1692,9 +1721,9 @@ def test_shutdown_cleans_up_resources(default_vllm_config, dist_init): vllm_config = create_vllm_config() scheduler = NixlConnectorScheduler( - vllm_config, vllm_config.kv_transfer_config.engine_id + vllm_config, 
vllm_config.kv_transfer_config.engine_id, make_kv_cache_config(block_size=16) ) - worker = NixlConnectorWorker(vllm_config, vllm_config.kv_transfer_config.engine_id) + worker = NixlConnectorWorker(vllm_config, vllm_config.kv_transfer_config.engine_id, make_kv_cache_config(block_size=16)) nixl_wrapper = worker.nixl_wrapper with ( @@ -1756,7 +1785,9 @@ def test_aborted_request_removed_from_worker_in_batch(default_vllm_config, dist_ scheduler = create_scheduler(vllm_config) # KVConnector Worker in P - connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id, hand_shake_latency=0 ) @@ -2007,7 +2038,9 @@ def test_handshake_failure_returns_finished(default_vllm_config, dist_init): """Test that handshake failures mark blocks invalid and return via get_finished.""" vllm_config = create_vllm_config() - connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id, hand_shake_latency=0.1 ) @@ -2017,9 +2050,9 @@ def test_handshake_failure_returns_finished(default_vllm_config, dist_init): metadata = NixlConnectorMetadata() metadata.add_new_req_to_recv( request_id=request_id, - local_block_ids=[1, 2, 3], + local_block_ids=([1, 2, 3],), kv_transfer_params={ - "remote_block_ids": [4, 5, 6], + "remote_block_ids": ([4, 5, 6],), "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, "remote_request_id": f"prefill-{request_id}", "remote_host": "localhost", @@ -2058,7 +2091,9 @@ def test_transfer_setup_failure_returns_finished(default_vllm_config, dist_init) and return via get_finished.""" vllm_config = create_vllm_config() - connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id, hand_shake_latency=0 ) @@ -2068,9 +2103,9 @@ def test_transfer_setup_failure_returns_finished(default_vllm_config, dist_init) metadata = NixlConnectorMetadata() metadata.add_new_req_to_recv( request_id=request_id, - local_block_ids=[7, 8, 9], + local_block_ids=([7, 8, 9],), kv_transfer_params={ - "remote_block_ids": [10, 11, 12], + "remote_block_ids": ([10, 11, 12],), "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, "remote_request_id": f"prefill-{request_id}", "remote_host": "localhost", @@ -2154,7 +2189,9 @@ def test_compatibility_hash_validation( "enforce_handshake_compat": enforce_handshake_compat }, ) - decode_connector = NixlConnector(local_vllm_config, KVConnectorRole.WORKER) + decode_connector = NixlConnector( + local_vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) decode_worker = decode_connector.connector_worker kv_cache_shape = decode_worker.attn_backend.get_kv_cache_shape( num_blocks=2, block_size=16, num_kv_heads=4, head_size=64 @@ -2267,7 +2304,9 @@ def test_handshake_decode_errors(default_vllm_config, dist_init, error_scenario) model="facebook/opt-125m", block_size=16, ) - decode_connector = NixlConnector(local_vllm_config, KVConnectorRole.WORKER) + decode_connector = NixlConnector( + local_vllm_config, KVConnectorRole.WORKER, make_kv_cache_config(block_size=16) + ) decode_worker = 
decode_connector.connector_worker backend = get_current_attn_backend(local_vllm_config) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py new file mode 100644 index 000000000000..e09f85a76845 --- /dev/null +++ b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py @@ -0,0 +1,202 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for NixlConnectorScheduler sw_sizes calculation with HMA.""" + +from unittest.mock import patch + +import pytest + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig +from vllm.v1.core.single_type_kv_cache_manager import ( + FullAttentionManager, + SlidingWindowManager, +) + +from .utils import ( + create_vllm_config, + make_kv_cache_config, +) + + +@pytest.mark.cpu_test +@pytest.mark.parametrize( + "hma_enabled,expected_sw_sizes", + [ + # HMA enabled: FullAttentionSpec (0) + SlidingWindowSpec (2048/16=128) + (True, [0, 128]), + # HMA disabled: only FullAttentionSpec (0) + (False, [0]), + ], +) +@patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.current_platform") +def test_sw_sizes(mock_platform, hma_enabled, expected_sw_sizes): + """Test sw_sizes is correctly computed based on HMA enabled/disabled.""" + from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( + NixlConnectorScheduler, + ) + + mock_platform.device_type = "cpu" + + block_size = 16 + vllm_config = create_vllm_config(block_size=block_size) + kv_cache_config = make_kv_cache_config( + block_size=block_size, hma_enabled=hma_enabled + ) + + scheduler = NixlConnectorScheduler( + vllm_config=vllm_config, + engine_id="test-engine", + kv_cache_config=kv_cache_config, + ) + + assert scheduler.sw_sizes == expected_sw_sizes, ( + f"Expected sw_sizes={expected_sw_sizes}, got {scheduler.sw_sizes}" + ) + + +@pytest.mark.cpu_test +def test_logical_to_kernel_block_ids_with_hma(): + """Test _logical_to_kernel_block_ids expands blocks when HMA is enabled. + + When HMA is enabled, the logical block size may differ from the kernel + block size. Each logical block maps to multiple kernel blocks. + """ + from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( + NixlConnectorWorker, + ) + + # Create a mock worker with just the required attributes + # (use __new__ to skip __init__) + worker = object.__new__(NixlConnectorWorker) + + # Simulate HMA scenario: logical block size = 32, kernel block size = 16 + # So each logical block maps to 2 kernel blocks eg [0]->[0,1] + worker._physical_blocks_per_logical_kv_block = 2 + + # Test conversion: FA + SW group + logical_block_ids = [[0, 1, 2], [3, 4]] + kernel_block_ids = worker._logical_to_kernel_block_ids(logical_block_ids) + + expected_kernel_block_ids = [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9]] + assert kernel_block_ids == expected_kernel_block_ids, ( + f"Expected {expected_kernel_block_ids}, got {kernel_block_ids}" + ) + + +@pytest.mark.parametrize("model_name, sw_size", [("google/gemma-3-1b-it", 512)]) +def test_fewer_blocks_with_hma(monkeypatch, model_name, sw_size): + """Test that a prefill instance returns fewer "remote blocks" for the SWA groups + when sequence exceeds the sliding window. 
+ """ + kv_transfer_config = KVTransferConfig( + kv_connector="NixlConnector", + kv_role="kv_both", + ) + block_size = 16 + llm_kwargs = { + "model": model_name, + "enforce_eager": True, + "gpu_memory_utilization": 0.5, + "kv_transfer_config": kv_transfer_config, + "max_model_len": 2048, + # NOTE: Make sure HMA is enabled + "disable_hybrid_kv_cache_manager": False, + "max_num_batched_tokens": 1024, + "enable_prefix_caching": False, + "block_size": block_size, + } + + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + + def run_hma_test(llm: LLM): + remote_prefill_opts = { + "do_remote_decode": True, + "do_remote_prefill": False, + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": None, + "remote_port": None, + } + # Simulate sidecar request + sampling_params = SamplingParams( + temperature=0.0, + max_tokens=1, + extra_args={"kv_transfer_params": remote_prefill_opts}, + ) + scheduler = llm.llm_engine.engine_core.engine_core.scheduler + kv_managers = scheduler.kv_cache_manager.coordinator.single_type_managers + # HMA enabled with FA + SWA groups + assert len(kv_managers) > 2 + for kv_manager in kv_managers: + assert isinstance(kv_manager, (SlidingWindowManager, FullAttentionManager)) + req_to_blocks = kv_managers[0].req_to_blocks + assert len(req_to_blocks) == 0 + + # Process some request with length exceeding the sliding window + outputs = llm.generate(["hi" * 1401], sampling_params) + kv_params = outputs[0].kv_transfer_params + print("kv_params", kv_params) + + expected_num_remote_blocks = sw_size // block_size + remote_block_ids = kv_params["remote_block_ids"] + assert ( + len(remote_block_ids[0]) + == expected_num_remote_blocks + < len(remote_block_ids[-1]) + ) + for group_block_ids in remote_block_ids[:-1]: + assert len(group_block_ids) == expected_num_remote_blocks + + def run_test_and_cleanup(): + llm = LLM(**llm_kwargs) + try: + run_hma_test(llm) + finally: + llm.llm_engine.engine_core.shutdown() + + run_test_and_cleanup() + + +@pytest.mark.cpu_test +def test_nixl_metadata_hma_block_ids_structure(): + """ + Test that NixlConnectorMetadata correctly stores block IDs for multiple + KV cache groups when HMA is enabled. 
+ """ + from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( + NixlConnectorMetadata, + ) + + metadata = NixlConnectorMetadata() + + # Add request with block IDs for 2 groups (FA + SW) + fa_blocks = [0, 1, 2, 3, 4, 5, 6, 7] # 8 blocks for FA + sw_blocks = [8, 9, 10, 11] # 4 blocks for SW (clipped) + + metadata.add_new_req_to_recv( + request_id="test-req-hma", + local_block_ids=(fa_blocks, sw_blocks), + kv_transfer_params={ + "remote_block_ids": ([10, 11, 12, 13, 14, 15, 16, 17], [18, 19, 20, 21]), + "remote_engine_id": "remote-engine", + "remote_request_id": "prefill-test-req-hma", + "remote_host": "localhost", + "remote_port": 1234, + "tp_size": 1, + }, + ) + + assert "test-req-hma" in metadata.reqs_to_recv + req_meta = metadata.reqs_to_recv["test-req-hma"] + + # Verify local block IDs structure + assert len(req_meta.local_block_ids) == 2 + assert list(req_meta.local_block_ids[0]) == fa_blocks + assert list(req_meta.local_block_ids[1]) == sw_blocks + + # Verify remote block IDs structure + assert req_meta.remote is not None + assert len(req_meta.remote.block_ids) == 2 + assert list(req_meta.remote.block_ids[0]) == [10, 11, 12, 13, 14, 15, 16, 17] + assert list(req_meta.remote.block_ids[1]) == [18, 19, 20, 21] From 08c55dc258db1209aab9c8360c54f30843761c12 Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Mon, 12 Jan 2026 18:03:40 -0800 Subject: [PATCH 09/28] fix issue for heterogenuous block_size and layout Signed-off-by: Chendi Xue --- .../kv_transfer/kv_connector/v1/nixl_connector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index c56489e4fc80..88c73c2c1637 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1999,7 +1999,7 @@ def get_finished(self) -> tuple[set[str], set[str]]: block_size_ratio > 1 or self.enable_permute_local_kv ): block_ids_for_blocksize_post_process[block_size_ratio].append( - meta.local_physical_block_ids + meta.local_physical_block_ids[0] ) for ( block_size_ratio, @@ -2279,7 +2279,7 @@ def _read_blocks( remote_block_ids = remote_block_ids[0] local_block_ids = self.get_mapped_blocks( np.asarray(local_block_ids), block_size_ratio - ) + ).tolist() if len(local_block_ids) > len(remote_block_ids): # NOTE: # get_mapped_blocks will always expand block_ids for n times. @@ -2293,8 +2293,8 @@ def _read_blocks( # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] to # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] local_block_ids = local_block_ids[: len(remote_block_ids)] - local_block_ids = tuple(local_block_ids) if local_block_ids else [] - remote_block_ids = tuple(remote_block_ids) + local_block_ids = [local_block_ids] if local_block_ids else [] + remote_block_ids = [remote_block_ids] # NOTE(rob): having the staging blocks be on the READER side is # not going to work well (since we will have to call rearrange tensors). 
# after we detect the txn is complete (which means we cannot make the From 6ec65ba33f08ee8d4a28e7ddb80eb0bb1a508c4d Mon Sep 17 00:00:00 2001 From: NickLucche Date: Tue, 13 Jan 2026 09:02:26 -0500 Subject: [PATCH 10/28] cpu-buffer case+precommit Signed-off-by: NickLucche --- .../kv_connector/unit/test_nixl_connector.py | 23 ++- .../kv_transfer/kv_connector/utils.py | 9 +- .../kv_connector/v1/nixl_connector.py | 152 +++++++++++------- vllm/v1/core/kv_cache_manager.py | 9 +- vllm/v1/core/sched/scheduler.py | 20 ++- 5 files changed, 141 insertions(+), 72 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index fc20b8037ab3..8cea3e4db059 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -59,7 +59,12 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin from vllm.v1.worker.utils import AttentionGroup -from .utils import create_request, create_scheduler, create_vllm_config, make_kv_cache_config +from .utils import ( + create_request, + create_scheduler, + create_vllm_config, + make_kv_cache_config, +) @pytest.fixture(scope="module", autouse=True) @@ -369,8 +374,10 @@ def test_kv_transfer_handshake(dist_init): do_remote_decode=True, ) request.status = RequestStatus.FINISHED_LENGTH_CAPPED - delay, kv_connector_metadata = scheduler.get_kv_connector().request_finished_all_groups( - request, ([0, 1, 2],) + delay, kv_connector_metadata = ( + scheduler.get_kv_connector().request_finished_all_groups( + request, ([0, 1, 2],) + ) ) assert delay @@ -1721,9 +1728,15 @@ def test_shutdown_cleans_up_resources(default_vllm_config, dist_init): vllm_config = create_vllm_config() scheduler = NixlConnectorScheduler( - vllm_config, vllm_config.kv_transfer_config.engine_id, make_kv_cache_config(block_size=16) + vllm_config, + vllm_config.kv_transfer_config.engine_id, + make_kv_cache_config(block_size=16), + ) + worker = NixlConnectorWorker( + vllm_config, + vllm_config.kv_transfer_config.engine_id, + make_kv_cache_config(block_size=16), ) - worker = NixlConnectorWorker(vllm_config, vllm_config.kv_transfer_config.engine_id, make_kv_cache_config(block_size=16)) nixl_wrapper = worker.nixl_wrapper with ( diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 8fe93d6e49a3..bb76d25c510e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -24,6 +24,7 @@ logger = init_logger(__name__) EngineId = str +BlockIds = tuple[list[int], ...] | list[list[int]] def get_kv_connector_cache_layout(): @@ -300,10 +301,13 @@ def yield_req_data( (req_id in cached_reqs.resumed_req_ids for req_id in cached_reqs.req_ids), ) -def get_blocks_in_fa_kv_group(block_ids: tuple[list[int], ...]) -> tuple[list[int], ...]: + +def get_blocks_in_fa_kv_group( + block_ids: BlockIds, +) -> list[int]: """ Get blocks in the full attention KV group, which we assume to be the largest group. - Note that when HMA is disabled or the model is not hybrid, + Note that when HMA is disabled or the model is not hybrid, a single group is present here. 
""" if not block_ids: @@ -312,6 +316,7 @@ def get_blocks_in_fa_kv_group(block_ids: tuple[list[int], ...]) -> tuple[list[in argmax_i = max(range(len(block_ids)), key=lambda x: len(block_ids[x])) return block_ids[argmax_i] + @dataclass class TpKVTopology: """ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 88c73c2c1637..fc2757926768 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -24,6 +24,7 @@ from vllm import envs from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.utils import ( + BlockIds, EngineId, TpKVTopology, get_current_attn_backend, @@ -39,7 +40,7 @@ KVConnectorHandshakeMetadata, KVConnectorMetadata, KVConnectorRole, - SupportsHMA + SupportsHMA, ) from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( KVConnectorPromMetrics, @@ -69,7 +70,6 @@ TransferHandle = int ReqId = str -BlockIds = list[int] | tuple[list[int], ...] # # NIXL Connector Version @@ -344,7 +344,9 @@ def __init__( self.connector_worker: NixlConnectorWorker | None = None elif role == KVConnectorRole.WORKER: self.connector_scheduler = None - self.connector_worker = NixlConnectorWorker(vllm_config, self.engine_id, kv_cache_config) + self.connector_worker = NixlConnectorWorker( + vllm_config, self.engine_id, kv_cache_config + ) ############################################################ # Class Methods @@ -404,7 +406,6 @@ def request_finished_all_groups( assert self.connector_scheduler is not None return self.connector_scheduler.request_finished(request, block_ids) - def set_xfer_handshake_metadata( self, metadata: dict[int, KVConnectorHandshakeMetadata] ) -> None: @@ -523,7 +524,9 @@ def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None: class NixlConnectorScheduler: """Implementation of Scheduler side methods""" - def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: "KVCacheConfig"): + def __init__( + self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: "KVCacheConfig" + ): self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size self.engine_id: EngineId = engine_id @@ -540,7 +543,9 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: "KV self.use_host_buffer = ( vllm_config.kv_transfer_config.kv_buffer_device == "cpu" ) - self._is_hma_enabled = not vllm_config.scheduler_config.disable_hybrid_kv_cache_manager + self._is_hma_enabled = ( + not vllm_config.scheduler_config.disable_hybrid_kv_cache_manager + ) logger.info("Initializing NIXL Scheduler %s", engine_id) @@ -552,7 +557,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: "KV # Requests that need to start recv/send. # New requests are added by update_state_after_alloc in # the scheduler. Used to make metadata passed to Worker. - self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {} + self._reqs_need_recv: dict[ReqId, tuple[Request, BlockIds]] = {} self._reqs_need_save: dict[ReqId, Request] = {} # Reqs to send and their expiration time self._reqs_need_send: dict[ReqId, float] = {} @@ -561,9 +566,14 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: "KV # remote prefill or aborted. 
self._reqs_not_processed: set[ReqId] = set() - # Gather Sliding Window sizes for each kv cache group (if any) in number of + # Gather Sliding Window sizes for each kv cache group (if any) in number of # blocks per KV cache group. This is used to clip the local attention window. - sw_sizes_tokens = [group.kv_cache_spec.sliding_window if isinstance(group.kv_cache_spec, SlidingWindowSpec) else 0 for group in kv_cache_config.kv_cache_groups] + sw_sizes_tokens = [ + group.kv_cache_spec.sliding_window + if isinstance(group.kv_cache_spec, SlidingWindowSpec) + else 0 + for group in kv_cache_config.kv_cache_groups + ] self.sw_sizes = [n_tokens // self.block_size for n_tokens in sw_sizes_tokens] def shutdown(self): @@ -572,23 +582,26 @@ def shutdown(self): self._nixl_handshake_listener_t.join() self._nixl_handshake_listener_t = None - def get_sw_clippped_blocks(self, block_ids: BlockIds) -> BlockIds: """ - Clip the number of blocks to the sliding window size for each kv cache group - that employs SWA. - This is necessary because the KV Cache manager initially allocates blocks for + Clip the number of blocks to the sliding window size for each kv cache group + that employs SWA. + This is necessary because the KV Cache manager initially allocates blocks for the entire sequence length, and successively cleans up blocks that are outside the window prior to the `request_finished_all_groups` hook. """ if len(block_ids) == 0 or not self._is_hma_enabled: # No blocks to clip eg Full prefix cache hit return block_ids - # NOTE (NickLucche) This logic is currently handled at the connector level + # NOTE (NickLucche) This logic is currently handled at the connector level # because offloading connectors might want to receive the whole sequence even # for SWA groups. We will abstract this logic once the interface is more stable - assert len(block_ids) == len(self.sw_sizes), "Number of KV cache groups must match" - return tuple([blocks[-self.sw_sizes[i]:] for i, blocks in enumerate(block_ids)]) + assert len(block_ids) == len(self.sw_sizes), ( + "Number of KV cache groups must match" + ) + return tuple( + [blocks[-self.sw_sizes[i] :] for i, blocks in enumerate(block_ids)] + ) def set_xfer_handshake_metadata( self, metadata: dict[int, KVConnectorHandshakeMetadata] @@ -738,12 +751,14 @@ def update_state_after_alloc( # a full prefix cache hit on the D worker. We need to call # send_notif in _read_blocks to free the memory on the P. - local_block_ids = ( + unhashed_local_block_ids: BlockIds = ( blocks.get_unhashed_block_ids_all_groups() if num_external_tokens > 0 - else [] + else () + ) + local_block_ids = self.get_sw_clippped_blocks( + unhashed_local_block_ids ) - local_block_ids = self.get_sw_clippped_blocks(local_block_ids) # Get unhashed blocks to pull from remote. Mind that a full prefix # cache hit is indicated with an empty list. 
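For reference, a minimal sketch (not part of the diff) of the slicing behaviour behind `get_sw_clippped_blocks` in the hunk above. It assumes `blocks_per_sw` holds `cdiv(sliding_window, block_size)` for SWA groups and 0 for full-attention groups; the helper name and example sizes are illustrative only.

    def clip_to_sliding_window(block_ids, blocks_per_sw):
        # blocks[-0:] == blocks[0:], so groups with blocks_per_sw == 0 (full
        # attention) come back unchanged, while SWA groups keep only the last
        # blocks_per_sw[i] blocks that still fall inside the window.
        return tuple(blocks[-n:] for blocks, n in zip(block_ids, blocks_per_sw))

    # block_size=16, sliding_window=64 -> keep the last 4 blocks of the SWA group.
    fa_blocks = [1, 2, 3, 4, 5, 6]
    swa_blocks = [7, 8, 9, 10, 11, 12]
    assert clip_to_sliding_window((fa_blocks, swa_blocks), [0, 4]) == (
        [1, 2, 3, 4, 5, 6],
        [9, 10, 11, 12],
    )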
@@ -787,10 +802,10 @@ def build_connector_meta( req = req_to_save assert req.kv_transfer_params is not None - new_block_id_groups = self.get_sw_clippped_blocks(new_block_id_groups) + clipped_block_id_groups = self.get_sw_clippped_blocks(new_block_id_groups) meta.add_new_req_to_save( request_id=req_id, - local_block_ids=new_block_id_groups, + local_block_ids=clipped_block_id_groups, kv_transfer_params=req.kv_transfer_params, ) assert scheduler_output.num_scheduled_tokens is not None @@ -877,7 +892,7 @@ def request_finished( time.perf_counter() + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT ) # NOTE HMA will "mark" empty/null blocks in groups with 0s (eg SWA ones), - # trimming down after allocating for the whole sequence length. Empty + # trimming down after allocating for the whole sequence length. Empty # blocks are always at the start of the list. # Here we "unpad" blocks to send the actual remote blocks to be read. block_ids = self.get_sw_clippped_blocks(block_ids) @@ -897,7 +912,9 @@ def request_finished( class NixlConnectorWorker: """Implementation of Worker side methods""" - def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: "KVCacheConfig"): + def __init__( + self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: "KVCacheConfig" + ): if NixlWrapper is None: logger.error("NIXL is not available") raise RuntimeError("NIXL is not available") @@ -915,7 +932,9 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str, kv_cache_config: "KV self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config( "backends", ["UCX"] ) - self._is_hma_enabled = not vllm_config.scheduler_config.disable_hybrid_kv_cache_manager + self._is_hma_enabled = ( + not vllm_config.scheduler_config.disable_hybrid_kv_cache_manager + ) self.kv_cache_config = kv_cache_config # Agent. 
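A small sketch (not part of the diff) of what the "unpad" note in `request_finished` above refers to. With HMA, out-of-window blocks of SWA groups are marked with the null block ("with 0s", per the note); `NULL_BLOCK_ID = 0` and the block ids below are assumptions for illustration only.

    fa_group = [12, 13, 14, 15, 16, 17]   # full attention: whole sequence
    swa_group = [0, 0, 0, 0, 42, 43]      # SWA: null blocks outside the window
    NULL_BLOCK_ID = 0                     # illustrative stand-in for the null block
    # Only real (non-null) blocks are advertised to the decode side / read over NIXL.
    unpadded = [
        [b for b in group if b != NULL_BLOCK_ID] for group in (fa_group, swa_group)
    ]
    assert unpadded == [[12, 13, 14, 15, 16, 17], [42, 43]]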
@@ -1340,7 +1359,9 @@ def request_ready(f: Future[Any], entry=(req_id, meta)): meta=meta, ) if req_meta := self._recving_metadata.get(req_id): - local_block_ids = get_blocks_in_fa_kv_group(req_meta.local_block_ids) + local_block_ids = get_blocks_in_fa_kv_group( + req_meta.local_block_ids + ) self._invalid_block_ids.update(local_block_ids) self._failed_recv_reqs.add(req_id) @@ -1861,13 +1882,15 @@ def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta): assert self.copy_blocks is not None local_block_ids = meta.local_physical_block_ids - self.copy_blocks( - self.host_xfer_buffers, - self.device_kv_caches, - local_block_ids, - local_block_ids, - "h2d", - ) + # TODO (NickLucche) D2H<>H2D ops could benefit from coalescing io across groups + for group_block_ids in local_block_ids: + self.copy_blocks( + self.host_xfer_buffers, + self.device_kv_caches, + group_block_ids, + group_block_ids, + "h2d", + ) if logger.isEnabledFor(logging.DEBUG): logger.debug( "synced recved kv of request[%s] to device kv buffer," @@ -1893,13 +1916,14 @@ def save_kv_to_host(self, metadata: NixlConnectorMetadata): ",".join(map(str, meta.local_physical_block_ids)), ) # blocking - self.copy_blocks( - self.device_kv_caches, - self.host_xfer_buffers, - meta.local_physical_block_ids, - meta.local_physical_block_ids, - "d2h", - ) + for group_block_ids in meta.local_physical_block_ids: + self.copy_blocks( + self.device_kv_caches, + self.host_xfer_buffers, + group_block_ids, + group_block_ids, + "d2h", + ) def post_process_device_kv_on_receive( self, @@ -2132,8 +2156,8 @@ def _handle_failed_transfer(self, req_id: str, handle: int): """ # Use .get() here as the metadata cleanup is handled by get_finished() if meta := self._recving_metadata.get(req_id): - # For the purpose of marking blocks as invalid, only report FA ones to - # handle blocks<>tokens mapping consistently. + # For the purpose of marking blocks as invalid, only report FA ones to + # handle blocks<>tokens mapping consistently. local_block_ids = get_blocks_in_fa_kv_group(meta.local_block_ids) self._invalid_block_ids.update(local_block_ids) self.nixl_wrapper.release_xfer_handle(handle) @@ -2275,12 +2299,12 @@ def _read_blocks( block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id) if block_size_ratio > 1: # TODO (NickLucche) assume HMA is off. Change to handle multiple KV groups. - local_block_ids = local_block_ids[0] if local_block_ids else [] - remote_block_ids = remote_block_ids[0] - local_block_ids = self.get_mapped_blocks( - np.asarray(local_block_ids), block_size_ratio + local_block_ids0 = local_block_ids[0] if local_block_ids else [] + remote_block_ids0 = remote_block_ids[0] + local_block_ids_mapped = self.get_mapped_blocks( + np.asarray(local_block_ids0), block_size_ratio ).tolist() - if len(local_block_ids) > len(remote_block_ids): + if len(local_block_ids_mapped) > len(remote_block_ids0): # NOTE: # get_mapped_blocks will always expand block_ids for n times. 
# ex: @@ -2292,9 +2316,11 @@ def _read_blocks( # Then we clip local to align with prefill # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] to # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - local_block_ids = local_block_ids[: len(remote_block_ids)] - local_block_ids = [local_block_ids] if local_block_ids else [] - remote_block_ids = [remote_block_ids] + local_block_ids_mapped = local_block_ids_mapped[ + : len(remote_block_ids0) + ] + local_block_ids = [local_block_ids_mapped] if local_block_ids_mapped else [] + remote_block_ids = [remote_block_ids0] # NOTE(rob): having the staging blocks be on the READER side is # not going to work well (since we will have to call rearrange tensors). # after we detect the txn is complete (which means we cannot make the @@ -2330,12 +2356,17 @@ def _read_blocks( self.xfer_stats.record_failed_notification() return - # Partial prefix cache hit: just read uncomputed blocks. - assert len(remote_block_ids) == len(local_block_ids) == len(self.kv_cache_config.kv_cache_groups) + assert ( + len(remote_block_ids) + == len(local_block_ids) + == len(self.kv_cache_config.kv_cache_groups) + ) + remote_block_ids = list(remote_block_ids) for i, remote_group in enumerate(remote_block_ids): num_remote_blocks = len(remote_group) num_local_blocks = len(local_block_ids[i]) assert num_local_blocks <= num_remote_blocks + # Partial prefix cache hit: just read uncomputed blocks. if num_local_blocks < num_remote_blocks: remote_block_ids[i] = remote_group[-num_local_blocks:] @@ -2384,14 +2415,16 @@ def _read_blocks( remote_rank=remote_rank, ) if meta := self._recving_metadata.get(request_id): - local_block_ids = get_blocks_in_fa_kv_group(meta.local_block_ids) - self._invalid_block_ids.update(local_block_ids) + fa_local_block_ids = get_blocks_in_fa_kv_group(meta.local_block_ids) + self._invalid_block_ids.update(fa_local_block_ids) self.xfer_stats.record_failed_transfer() if handle is not None: self.nixl_wrapper.release_xfer_handle(handle) self._failed_recv_reqs.add(request_id) - def get_mapped_blocks(self, block_ids: np.ndarray, block_size_ratio: int) -> np.ndarray: + def get_mapped_blocks( + self, block_ids: np.ndarray, block_size_ratio: int + ) -> np.ndarray: """ Calculates the new set of block IDs by mapping every element in the (potentially sparse) input array. @@ -2450,9 +2483,14 @@ def _logical_to_kernel_block_ids(self, block_ids: BlockIds) -> BlockIds: block_arange = np.arange(0, self._physical_blocks_per_logical_kv_block).reshape( 1, -1 ) - return [BlockTable.map_to_kernel_blocks( - np.array(group), self._physical_blocks_per_logical_kv_block, block_arange - ).tolist() for group in block_ids] + return [ + BlockTable.map_to_kernel_blocks( + np.array(group), + self._physical_blocks_per_logical_kv_block, + block_arange, + ).tolist() + for group in block_ids + ] def get_backend_aware_kv_block_len(self, layer_idx: int) -> int: """ diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 065ef8c0a35a..ee198a57f0b1 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -87,7 +87,14 @@ def get_unhashed_block_ids(self) -> list[int]: def get_unhashed_block_ids_all_groups(self) -> list[list[int]]: """Get block_ids of unhashed blocks from KVCacheBlocks instance.""" # Skip padding blocks. 
- return [[block.block_id for block in group if block.block_hash is None and not block.is_null] for group in self.blocks] + return [ + [ + block.block_id + for block in group + if block.block_hash is None and not block.is_null + ] + for group in self.blocks + ] def new_empty(self) -> "KVCacheBlocks": """ diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 039236e108f5..fc5b6ee7e058 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1994,7 +1994,9 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: # Now that the blocks are ready, actually cache them. block_ids = self.kv_cache_manager.get_block_ids(request.request_id) # When connector does not support HMA, a single group is present here - num_computed_tokens = max(len(group) for group in block_ids) * self.block_size + num_computed_tokens = ( + max(len(group) for group in block_ids) * self.block_size + ) # Get number of blocks on full attention layer, we can retrieve at most # this many tokens num_computed_tokens = min(num_computed_tokens, request.num_tokens) @@ -2082,7 +2084,9 @@ def _update_requests_with_invalid_blocks( req_block_ids = self.kv_cache_manager.get_block_ids(req_id) # Assume FA group is present to infer number of computed tokens # TODO this is not padded for SW right? - fa_blocks_idx = max(range(len(req_block_ids)), key=lambda x: len(req_block_ids[x])) + fa_blocks_idx = max( + range(len(req_block_ids)), key=lambda x: len(req_block_ids[x]) + ) max_num_blocks = len(req_block_ids[fa_blocks_idx]) # We iterate only over blocks that may contain externally computed # tokens @@ -2101,10 +2105,12 @@ def _update_requests_with_invalid_blocks( req_num_computed_blocks = ( req_num_computed_tokens + self.block_size - 1 ) // self.block_size - # For the purpose of marking blocks as invalid, only report FA ones to - # handle blocks<>tokens mapping consistently. + # For the purpose of marking blocks as invalid, only report FA ones to + # handle blocks<>tokens mapping consistently. # for idx, block_id in zip(range(req_num_computed_blocks), req_block_ids): - for idx, block_id in zip(range(req_num_computed_blocks), req_block_ids[fa_blocks_idx]): + for idx, block_id in zip( + range(req_num_computed_blocks), req_block_ids[fa_blocks_idx] + ): if block_id not in invalid_block_ids: continue @@ -2134,12 +2140,12 @@ def _update_requests_with_invalid_blocks( ) total_affected_tokens += num_affected_tokens request.num_external_computed_tokens -= num_affected_tokens - # Collect invalid block and all downstream dependent blocks, across + # Collect invalid block and all downstream dependent blocks, across # all groups. if evict_blocks: # Assuming groups are not padded, do SW-aware eviction, example: # FA: [A B C D C] - # SW: [ E F] + # SW: [ E F] # =>Evict E only when failure index <= E. 
for group in req_block_ids: offset = max_num_blocks - len(group) From cf9c2e519f598b8727990827cff3a2996d9a240c Mon Sep 17 00:00:00 2001 From: NickLucche Date: Tue, 13 Jan 2026 10:15:35 -0500 Subject: [PATCH 11/28] failure logging for hma update tests Signed-off-by: NickLucche --- .../kv_connector/unit/test_nixl_connector.py | 20 +++++++++-- .../unit/test_nixl_connector_hma.py | 5 +-- tests/v1/kv_connector/unit/utils.py | 33 +++++++++++++++++++ .../kv_connector/v1/nixl_connector.py | 14 +++++--- 4 files changed, 63 insertions(+), 9 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 8cea3e4db059..5c42f0c29034 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1919,12 +1919,14 @@ def check_xfer_state(self, handle: int) -> str: ("transfer_exception", {"fail_transfer_exception": True}, True), ], ) +@pytest.mark.parametrize("enable_hma", [False]) def test_transfer_failure_logging( default_vllm_config, dist_init, failure_type, wrapper_config, needs_get_finished, + enable_hma, ): """Test that transfer failures are logged with structured context. @@ -1941,7 +1943,11 @@ def test_transfer_failure_logging( vllm_config = create_vllm_config() - connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector = NixlConnector( + vllm_config, + KVConnectorRole.WORKER, + make_kv_cache_config(block_size=16, hma_enabled=enable_hma), + ) connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id, hand_shake_latency=0.0 ) @@ -1954,8 +1960,16 @@ def test_transfer_failure_logging( # For notification_failed, we need empty local blocks # (full cache hit path to trigger send_notif) - local_blocks = [] if failure_type == "notification_failed" else [10, 11, 12] - remote_blocks = [20, 21, 22] + if enable_hma: + # HMA enabled: multiple groups (FA + SW) + local_blocks = ( + () if failure_type == "notification_failed" else ([10, 11, 12], [13, 14]) + ) + remote_blocks = [[20, 21, 22], [23, 24]] + else: + # HMA disabled: single group + local_blocks = () if failure_type == "notification_failed" else ([10, 11, 12],) + remote_blocks = [[20, 21, 22]] metadata = NixlConnectorMetadata() metadata.add_new_req_to_recv( diff --git a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py index e09f85a76845..a56a68af9150 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py @@ -40,8 +40,9 @@ def test_sw_sizes(mock_platform, hma_enabled, expected_sw_sizes): block_size = 16 vllm_config = create_vllm_config(block_size=block_size) + # SW 2048 tokens=>128 blocks kv_cache_config = make_kv_cache_config( - block_size=block_size, hma_enabled=hma_enabled + block_size=block_size, hma_enabled=hma_enabled, sw_size=2048 ) scheduler = NixlConnectorScheduler( @@ -49,7 +50,7 @@ def test_sw_sizes(mock_platform, hma_enabled, expected_sw_sizes): engine_id="test-engine", kv_cache_config=kv_cache_config, ) - + # in number of blocks assert scheduler.sw_sizes == expected_sw_sizes, ( f"Expected sw_sizes={expected_sw_sizes}, got {scheduler.sw_sizes}" ) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 7539da3e93ff..15c8f0d325d2 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -36,6 +36,7 @@ FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, + 
SlidingWindowSpec, ) from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput from vllm.v1.request import Request @@ -412,3 +413,35 @@ def wait_for_save(self): KVConnectorFactory.register_connector( "MockKVConnector", __name__, MockKVConnector.__name__ ) + + +def make_kv_cache_config( + block_size: int, hma_enabled: bool = False, sw_size: int = 128 +) -> KVCacheConfig: + kv_cache_groups = [ + KVCacheGroupSpec( + ["layer0", "layer2"], + FullAttentionSpec( + block_size=block_size, + num_kv_heads=4, + head_size=16, + dtype=torch.float16, + ), + ) + ] + if hma_enabled: + kv_cache_groups.append( + KVCacheGroupSpec( + ["layer1", "layer3"], + SlidingWindowSpec( + block_size=block_size, + num_kv_heads=4, + head_size=16, + dtype=torch.float16, + sliding_window=sw_size, + ), + ) + ) + return KVCacheConfig( + num_blocks=100, kv_cache_tensors=[], kv_cache_groups=kv_cache_groups + ) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index fc2757926768..5bef78429e6a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -27,8 +27,8 @@ BlockIds, EngineId, TpKVTopology, - get_current_attn_backend, get_blocks_in_fa_kv_group, + get_current_attn_backend, kv_postprocess_blksize_and_layout_on_receive, kv_postprocess_blksize_on_receive, kv_postprocess_layout_on_receive, @@ -1295,9 +1295,15 @@ def _log_failure( "remote_request_id": meta.remote.request_id, "remote_host": meta.remote.host, "remote_port": meta.remote.port, - "num_local_blocks": len(meta.local_block_ids), - "num_remote_blocks": len(meta.remote.block_ids), - "local_block_ids_sample": meta.local_block_ids[:10], + "num_local_blocks": sum( + len(group) for group in meta.local_block_ids + ), + "num_remote_blocks": sum( + len(group) for group in meta.remote.block_ids + ), + "local_block_ids_sample": meta.local_block_ids[0][:10] + if meta.local_block_ids + else [], } ) From d9cec70100d9bac6dc4ccecca6cac95dd0d96e29 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Tue, 13 Jan 2026 10:24:17 -0500 Subject: [PATCH 12/28] hma e2e lm-eval test review Signed-off-by: NickLucche --- .../config_sweep_accuracy_test.sh | 1 + .../nixl_integration/run_accuracy_test.sh | 25 ++++++++- .../nixl_integration/test_accuracy.py | 1 + .../unit/test_nixl_connector_hma.py | 4 +- .../kv_transfer/kv_connector/utils.py | 51 +++++++++++++++++-- .../kv_connector/v1/nixl_connector.py | 44 ++++++++++------ vllm/v1/core/sched/scheduler.py | 24 ++++----- 7 files changed, 115 insertions(+), 35 deletions(-) diff --git a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh index abdf88ad6722..b84d3bd0c9d9 100755 --- a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh @@ -12,6 +12,7 @@ tp_configs=( "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" + "ENABLE_HMA_FLAG=1 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it" # HMA case ) dp_ep_configs=( "DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, 
D-DPEP=2 (TP=1) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index 673236625d5c..b3a2eec1160b 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -5,6 +5,12 @@ set -xe KV_BUFFER_DEVICE="cuda" # Default to cuda ATTENTION_BACKEND="" # Default to empty (use vllm default) CROSS_LAYERS_BLOCKS="False" +ENABLE_HMA_FLAG="" # Default to empty (HMA disabled by default for kv connector) +# Check for ENABLE_HMA_FLAG environment variable +if [[ -n "${ENABLE_HMA_FLAG:-}" ]]; then + ENABLE_HMA_FLAG="--no-disable-hybrid-kv-cache-manager" +fi + while [[ $# -gt 0 ]]; do case $1 in --kv_buffer_device) @@ -31,6 +37,9 @@ echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE" if [[ -n "$ATTENTION_BACKEND" ]]; then echo "Using attention backend: $ATTENTION_BACKEND" fi +if [[ -n "$ENABLE_HMA_FLAG" ]]; then + echo "HMA (Hybrid KV Cache Manager) enabled" +fi DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then @@ -157,7 +166,16 @@ run_tests_for_model() { BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND" fi - FULL_CMD="$BASE_CMD" + # Add HMA flag if specified + if [[ -n "$ENABLE_HMA_FLAG" ]]; then + BASE_CMD="${BASE_CMD} $ENABLE_HMA_FLAG" + fi + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi eval "$FULL_CMD &" @@ -199,6 +217,11 @@ run_tests_for_model() { BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND" fi + # Add HMA flag if specified + if [[ -n "$ENABLE_HMA_FLAG" ]]; then + BASE_CMD="${BASE_CMD} $ENABLE_HMA_FLAG" + fi + # DP-EP attention mode if [[ -z "$DP_EP" ]]; then BASE_CMD="${BASE_CMD} --tensor-parallel-size $DECODER_TP_SIZE" diff --git a/tests/v1/kv_connector/nixl_integration/test_accuracy.py b/tests/v1/kv_connector/nixl_integration/test_accuracy.py index a70f4caeb937..674e65c25ef4 100644 --- a/tests/v1/kv_connector/nixl_integration/test_accuracy.py +++ b/tests/v1/kv_connector/nixl_integration/test_accuracy.py @@ -17,6 +17,7 @@ "deepseek-ai/deepseek-vl2-small": 0.59, "deepseek-ai/deepseek-vl2-tiny": 0.19, "deepseek-ai/DeepSeek-V2-Lite-Chat": 0.65, + "google/gemma-3-4b-it": 0.74, } SIMPLE_PROMPT = ( diff --git a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py index a56a68af9150..edc1bc4477fc 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py @@ -51,8 +51,8 @@ def test_sw_sizes(mock_platform, hma_enabled, expected_sw_sizes): kv_cache_config=kv_cache_config, ) # in number of blocks - assert scheduler.sw_sizes == expected_sw_sizes, ( - f"Expected sw_sizes={expected_sw_sizes}, got {scheduler.sw_sizes}" + assert scheduler.blocks_per_sw == expected_sw_sizes, ( + f"Expected sw_sizes={expected_sw_sizes}, got {scheduler.blocks_per_sw}" ) diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index bb76d25c510e..7eb333b76120 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -20,10 +20,13 @@ if TYPE_CHECKING: from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase + from vllm.v1.kv_cache_interface import KVCacheConfig logger = init_logger(__name__) EngineId = str +# block ids as returned by the 
hybrid KV cache manager. list[list[int]] are allow +# mutability and are for connector internal use only. BlockIds = tuple[list[int], ...] | list[list[int]] @@ -302,19 +305,57 @@ def yield_req_data( ) +def get_full_attention_group_idx( + kv_cache_config: "KVCacheConfig", +) -> int: + """ + Get the index of the full attention KV cache group from KVCacheConfig. + + Args: + kv_cache_config: The KV cache configuration + + Returns: + The index of the full attention group + + Raises: + AssertionError: If no full attention group is found + """ + from vllm.v1.kv_cache_interface import FullAttentionSpec + + fa_group_idx = next( + ( + i + for i, group in enumerate(kv_cache_config.kv_cache_groups) + if isinstance(group.kv_cache_spec, FullAttentionSpec) + ), + None, + ) + assert fa_group_idx is not None, ( + "No full attention KV cache group found in kv_cache_config" + ) + return fa_group_idx + + def get_blocks_in_fa_kv_group( block_ids: BlockIds, + kv_cache_config: "KVCacheConfig", ) -> list[int]: """ - Get blocks in the full attention KV group, which we assume to be the largest group. - Note that when HMA is disabled or the model is not hybrid, - a single group is present here. + Get blocks in the full attention KV group using KVCacheConfig to determine + the correct group index. + + Args: + block_ids: Block IDs organized by KV cache group + kv_cache_config: The KV cache configuration used to identify the FA group + + Returns: + The block IDs for the full attention KV cache group """ if not block_ids: # Full prefix cache hit case return [] - argmax_i = max(range(len(block_ids)), key=lambda x: len(block_ids[x])) - return block_ids[argmax_i] + fa_group_idx = get_full_attention_group_idx(kv_cache_config) + return block_ids[fa_group_idx] @dataclass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 5bef78429e6a..8ec720116ae1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -3,7 +3,6 @@ import contextlib import copy import logging -import math import os import queue import sys @@ -56,11 +55,12 @@ from vllm.forward_context import ForwardContext from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.utils.math_utils import cdiv from vllm.utils.network_utils import make_zmq_path, make_zmq_socket from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata from vllm.v1.attention.backends.utils import get_kv_cache_layout from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.kv_cache_interface import SlidingWindowSpec +from vllm.v1.kv_cache_interface import MambaSpec, SlidingWindowSpec from vllm.v1.worker.block_table import BlockTable if TYPE_CHECKING: @@ -206,6 +206,7 @@ def compute_nixl_compatibility_hash( model_config = vllm_config.model_config cache_config = vllm_config.cache_config + is_hma_enabled = not vllm_config.scheduler_config.disable_hybrid_kv_cache_manager factors = { # Version compatibility @@ -221,6 +222,7 @@ def compute_nixl_compatibility_hash( "attn_backend_name": attn_backend_name, "cache_dtype": str(cache_config.cache_dtype), "cross_layers_blocks": cross_layers_blocks, + "is_hma_enabled": is_hma_enabled, } compat_hash = hash_factors(factors) @@ -239,7 +241,6 @@ def compute_nixl_compatibility_hash( @dataclass class RemoteMeta: - # Non-HMA | HMA blocks block_ids: BlockIds host: str port: int @@ -304,7 +305,6 @@ def add_new_req_to_recv( 
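To illustrate how the `get_full_attention_group_idx` helper added above is meant to be consumed, here is a self-contained sketch (not part of the diff). The `_FullAttnSpec` / `_SlidingWindowSpec` / `_Group` classes are local stand-ins for vLLM's spec types; only the `isinstance` dispatch and the use of the index matter.

    class _FullAttnSpec: ...
    class _SlidingWindowSpec: ...

    class _Group:
        def __init__(self, spec):
            self.kv_cache_spec = spec

    def _fa_group_idx(kv_cache_groups):
        for i, group in enumerate(kv_cache_groups):
            if isinstance(group.kv_cache_spec, _FullAttnSpec):
                return i
        raise AssertionError("No full attention KV cache group found")

    groups = [_Group(_SlidingWindowSpec()), _Group(_FullAttnSpec())]
    fa_idx = _fa_group_idx(groups)
    # Per-group block ids for one request; the full-attention group spans the
    # whole sequence, so it is the one that bounds num_computed_tokens.
    block_ids = ([40, 41], [10, 11, 12, 13])
    block_size = 16
    num_computed_tokens = len(block_ids[fa_idx]) * block_size
    assert fa_idx == 1 and num_computed_tokens == 64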
class NixlConnector(KVConnectorBase_V1, SupportsHMA): - @property def prefer_cross_layer_blocks(self) -> bool: backend = get_current_attn_backend(self._vllm_config) @@ -335,6 +335,9 @@ def __init__( assert vllm_config.kv_transfer_config is not None assert vllm_config.kv_transfer_config.engine_id is not None + for group in kv_cache_config.kv_cache_groups: + if isinstance(group.kv_cache_spec, MambaSpec): + raise ValueError("NixlConnector does not support Mamba models.") self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id self.kv_transfer_config = vllm_config.kv_transfer_config if role == KVConnectorRole.SCHEDULER: @@ -402,7 +405,6 @@ def request_finished_all_groups( request: "Request", block_ids: tuple[list[int], ...], ) -> tuple[bool, dict[str, Any] | None]: - # Hybrid memory allocator (HMA) enabled assert self.connector_scheduler is not None return self.connector_scheduler.request_finished(request, block_ids) @@ -574,7 +576,9 @@ def __init__( else 0 for group in kv_cache_config.kv_cache_groups ] - self.sw_sizes = [n_tokens // self.block_size for n_tokens in sw_sizes_tokens] + self.blocks_per_sw = [ + cdiv(n_tokens, self.block_size) for n_tokens in sw_sizes_tokens + ] def shutdown(self): self._stop_event.set() @@ -596,11 +600,12 @@ def get_sw_clippped_blocks(self, block_ids: BlockIds) -> BlockIds: # NOTE (NickLucche) This logic is currently handled at the connector level # because offloading connectors might want to receive the whole sequence even # for SWA groups. We will abstract this logic once the interface is more stable - assert len(block_ids) == len(self.sw_sizes), ( + assert len(block_ids) == len(self.blocks_per_sw), ( "Number of KV cache groups must match" ) + # For non-SWA groups, blocks_per_sw is 0 so we return all block_ids unchanged return tuple( - [blocks[-self.sw_sizes[i] :] for i, blocks in enumerate(block_ids)] + [blocks[-self.blocks_per_sw[i] :] for i, blocks in enumerate(block_ids)] ) def set_xfer_handshake_metadata( @@ -1366,7 +1371,7 @@ def request_ready(f: Future[Any], entry=(req_id, meta)): ) if req_meta := self._recving_metadata.get(req_id): local_block_ids = get_blocks_in_fa_kv_group( - req_meta.local_block_ids + req_meta.local_block_ids, self.kv_cache_config ) self._invalid_block_ids.update(local_block_ids) self._failed_recv_reqs.add(req_id) @@ -1816,8 +1821,9 @@ def _validate_remote_agent_handshake( assert not (tp_ratio < 0 and self.kv_topo.is_kv_replicated(remote_engine_id)) if self._is_hma_enabled: - assert block_size_ratio == 1, "HMA does not support different" - " remote block size yet" + assert block_size_ratio == 1, ( + "HMA does not support different remote block size yet" + ) kv_cache_layout = ( self.kv_cache_layout @@ -1833,6 +1839,9 @@ def _validate_remote_agent_handshake( "Remote is HND and local is NHD, enabled additional permute " "on local device KV." ) + assert not self._is_hma_enabled, ( + "HMA does not support block size post processing" + ) self.enable_permute_local_kv = True else: raise RuntimeError( @@ -2028,6 +2037,7 @@ def get_finished(self) -> tuple[set[str], set[str]]: if not self.use_mla and ( block_size_ratio > 1 or self.enable_permute_local_kv ): + assert not self._is_hma_enabled block_ids_for_blocksize_post_process[block_size_ratio].append( meta.local_physical_block_ids[0] ) @@ -2164,7 +2174,9 @@ def _handle_failed_transfer(self, req_id: str, handle: int): if meta := self._recving_metadata.get(req_id): # For the purpose of marking blocks as invalid, only report FA ones to # handle blocks<>tokens mapping consistently. 
- local_block_ids = get_blocks_in_fa_kv_group(meta.local_block_ids) + local_block_ids = get_blocks_in_fa_kv_group( + meta.local_block_ids, self.kv_cache_config + ) self._invalid_block_ids.update(local_block_ids) self.nixl_wrapper.release_xfer_handle(handle) self.xfer_stats.record_failed_transfer() @@ -2305,6 +2317,7 @@ def _read_blocks( block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id) if block_size_ratio > 1: # TODO (NickLucche) assume HMA is off. Change to handle multiple KV groups. + assert not self._is_hma_enabled local_block_ids0 = local_block_ids[0] if local_block_ids else [] remote_block_ids0 = remote_block_ids[0] local_block_ids_mapped = self.get_mapped_blocks( @@ -2334,8 +2347,7 @@ def _read_blocks( # then we will need to have the staging blocks on the remote side. # NOTE(rob): according to nvidia the staging blocks are used to - # saturate IB with heterogeneous TP sizes. We should remove the staging - # blocks until we are ready. + # saturate IB with heterogeneous TP sizes. # Number of D TP workers that will read from dst P. Propagate info # on notification so that dst worker can wait before freeing blocks. @@ -2421,7 +2433,9 @@ def _read_blocks( remote_rank=remote_rank, ) if meta := self._recving_metadata.get(request_id): - fa_local_block_ids = get_blocks_in_fa_kv_group(meta.local_block_ids) + fa_local_block_ids = get_blocks_in_fa_kv_group( + meta.local_block_ids, self.kv_cache_config + ) self._invalid_block_ids.update(fa_local_block_ids) self.xfer_stats.record_failed_transfer() if handle is not None: diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index fc5b6ee7e058..dcc31d71dca6 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -19,6 +19,9 @@ from vllm.distributed.ec_transfer.ec_connector.factory import ECConnectorFactory from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory +from vllm.distributed.kv_transfer.kv_connector.utils import ( + get_full_attention_group_idx, +) from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, KVConnectorRole, @@ -116,7 +119,7 @@ def __init__( self.connector = None self.connector_prefix_cache_stats: PrefixCacheStats | None = None self.recompute_kv_load_failures = True - self._connector_supports_hma = False + self._full_attention_group_idx = 0 if self.vllm_config.kv_transfer_config is not None: assert not self.is_encoder_decoder, ( "Encoder-decoder models are not currently supported with KV connectors" @@ -132,7 +135,9 @@ def __init__( self.vllm_config.kv_transfer_config.kv_load_failure_policy ) self.recompute_kv_load_failures = kv_load_failure_policy == "recompute" - self._connector_supports_hma = isinstance(self.connector, SupportsHMA) + self._full_attention_group_idx = get_full_attention_group_idx( + kv_cache_config + ) self.kv_event_publisher = EventPublisherFactory.create( self.kv_events_config, @@ -1952,7 +1957,7 @@ def _connector_finished( block_ids = self.kv_cache_manager.get_block_ids(request.request_id) - if not self._connector_supports_hma: + if not isinstance(self.connector, SupportsHMA): # NOTE(Kuntai): We should deprecate this code path after we enforce # all connectors to support HMA. 
# Hybrid memory allocator should be already turned off for this @@ -1995,7 +2000,7 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: block_ids = self.kv_cache_manager.get_block_ids(request.request_id) # When connector does not support HMA, a single group is present here num_computed_tokens = ( - max(len(group) for group in block_ids) * self.block_size + len(block_ids[self._full_attention_group_idx]) * self.block_size ) # Get number of blocks on full attention layer, we can retrieve at most # this many tokens @@ -2084,10 +2089,8 @@ def _update_requests_with_invalid_blocks( req_block_ids = self.kv_cache_manager.get_block_ids(req_id) # Assume FA group is present to infer number of computed tokens # TODO this is not padded for SW right? - fa_blocks_idx = max( - range(len(req_block_ids)), key=lambda x: len(req_block_ids[x]) - ) - max_num_blocks = len(req_block_ids[fa_blocks_idx]) + fa_blocks = req_block_ids[self._full_attention_group_idx] + max_num_blocks = len(fa_blocks) # We iterate only over blocks that may contain externally computed # tokens if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: @@ -2107,10 +2110,7 @@ def _update_requests_with_invalid_blocks( ) // self.block_size # For the purpose of marking blocks as invalid, only report FA ones to # handle blocks<>tokens mapping consistently. - # for idx, block_id in zip(range(req_num_computed_blocks), req_block_ids): - for idx, block_id in zip( - range(req_num_computed_blocks), req_block_ids[fa_blocks_idx] - ): + for idx, block_id in zip(range(req_num_computed_blocks), fa_blocks): if block_id not in invalid_block_ids: continue From b22efd7b951400d9a27232f6a6e1a87b5b76892f Mon Sep 17 00:00:00 2001 From: NickLucche Date: Thu, 5 Feb 2026 15:56:55 +0000 Subject: [PATCH 13/28] enable hma on all configs opt Signed-off-by: NickLucche --- .../nixl_integration/config_sweep_accuracy_test.sh | 10 +++++++++- .../nixl_integration/run_accuracy_test.sh | 14 +++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh index b84d3bd0c9d9..fb268994ef78 100755 --- a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh @@ -12,7 +12,7 @@ tp_configs=( "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" - "ENABLE_HMA_FLAG=1 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it" # HMA case + "ENABLE_HMA_FLAG=1 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it" # SW model ) dp_ep_configs=( "DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1) @@ -27,6 +27,14 @@ else configs=("${tp_configs[@]}") fi +if [[ -n "${ENABLE_HMA_FLAG:-}" ]]; then + # Append ENABLE_HMA_FLAG=1 to each config in the selected array + echo "ENABLE_HMA_FLAG is set, appending ENABLE_HMA_FLAG=1 to each config" + for i in "${!configs[@]}"; do + configs[$i]="ENABLE_HMA_FLAG=1 ${configs[$i]}" + done +fi + run_tests() { local label=$1 local extra_args=$2 diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh 
b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index b3a2eec1160b..81ee303e8339 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -5,10 +5,10 @@ set -xe KV_BUFFER_DEVICE="cuda" # Default to cuda ATTENTION_BACKEND="" # Default to empty (use vllm default) CROSS_LAYERS_BLOCKS="False" -ENABLE_HMA_FLAG="" # Default to empty (HMA disabled by default for kv connector) +ENABLE_HMA_VAR="" # Default to empty (HMA disabled by default for kv connector) # Check for ENABLE_HMA_FLAG environment variable if [[ -n "${ENABLE_HMA_FLAG:-}" ]]; then - ENABLE_HMA_FLAG="--no-disable-hybrid-kv-cache-manager" + ENABLE_HMA_VAR="--no-disable-hybrid-kv-cache-manager" fi while [[ $# -gt 0 ]]; do @@ -37,7 +37,7 @@ echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE" if [[ -n "$ATTENTION_BACKEND" ]]; then echo "Using attention backend: $ATTENTION_BACKEND" fi -if [[ -n "$ENABLE_HMA_FLAG" ]]; then +if [[ -n "$ENABLE_HMA_VAR" ]]; then echo "HMA (Hybrid KV Cache Manager) enabled" fi @@ -167,8 +167,8 @@ run_tests_for_model() { fi # Add HMA flag if specified - if [[ -n "$ENABLE_HMA_FLAG" ]]; then - BASE_CMD="${BASE_CMD} $ENABLE_HMA_FLAG" + if [[ -n "$ENABLE_HMA_VAR" ]]; then + BASE_CMD="${BASE_CMD} $ENABLE_HMA_VAR" fi if [ -n "$model_args" ]; then @@ -218,8 +218,8 @@ run_tests_for_model() { fi # Add HMA flag if specified - if [[ -n "$ENABLE_HMA_FLAG" ]]; then - BASE_CMD="${BASE_CMD} $ENABLE_HMA_FLAG" + if [[ -n "$ENABLE_HMA_VAR" ]]; then + BASE_CMD="${BASE_CMD} $ENABLE_HMA_VAR" fi # DP-EP attention mode From 41122abcebbadc87046e6c21ed841c7826070825 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Fri, 6 Feb 2026 11:21:23 +0000 Subject: [PATCH 14/28] request-level failure for hma Signed-off-by: NickLucche --- .../kv_transfer/kv_connector/utils.py | 22 -------- .../kv_connector/v1/nixl_connector.py | 21 +++----- vllm/v1/core/sched/scheduler.py | 52 +++++++++++-------- 3 files changed, 36 insertions(+), 59 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 7eb333b76120..25178f4db202 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -336,28 +336,6 @@ def get_full_attention_group_idx( return fa_group_idx -def get_blocks_in_fa_kv_group( - block_ids: BlockIds, - kv_cache_config: "KVCacheConfig", -) -> list[int]: - """ - Get blocks in the full attention KV group using KVCacheConfig to determine - the correct group index. 
- - Args: - block_ids: Block IDs organized by KV cache group - kv_cache_config: The KV cache configuration used to identify the FA group - - Returns: - The block IDs for the full attention KV cache group - """ - if not block_ids: - # Full prefix cache hit case - return [] - fa_group_idx = get_full_attention_group_idx(kv_cache_config) - return block_ids[fa_group_idx] - - @dataclass class TpKVTopology: """ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 8ec720116ae1..a1b949283dbf 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -26,7 +26,6 @@ BlockIds, EngineId, TpKVTopology, - get_blocks_in_fa_kv_group, get_current_attn_backend, kv_postprocess_blksize_and_layout_on_receive, kv_postprocess_blksize_on_receive, @@ -1370,10 +1369,8 @@ def request_ready(f: Future[Any], entry=(req_id, meta)): meta=meta, ) if req_meta := self._recving_metadata.get(req_id): - local_block_ids = get_blocks_in_fa_kv_group( - req_meta.local_block_ids, self.kv_cache_config - ) - self._invalid_block_ids.update(local_block_ids) + for group_block_ids in req_meta.local_block_ids: + self._invalid_block_ids.update(group_block_ids) self._failed_recv_reqs.add(req_id) fut.add_done_callback(request_ready) @@ -2172,12 +2169,8 @@ def _handle_failed_transfer(self, req_id: str, handle: int): """ # Use .get() here as the metadata cleanup is handled by get_finished() if meta := self._recving_metadata.get(req_id): - # For the purpose of marking blocks as invalid, only report FA ones to - # handle blocks<>tokens mapping consistently. - local_block_ids = get_blocks_in_fa_kv_group( - meta.local_block_ids, self.kv_cache_config - ) - self._invalid_block_ids.update(local_block_ids) + for group_block_ids in meta.local_block_ids: + self._invalid_block_ids.update(group_block_ids) self.nixl_wrapper.release_xfer_handle(handle) self.xfer_stats.record_failed_transfer() @@ -2433,10 +2426,8 @@ def _read_blocks( remote_rank=remote_rank, ) if meta := self._recving_metadata.get(request_id): - fa_local_block_ids = get_blocks_in_fa_kv_group( - meta.local_block_ids, self.kv_cache_config - ) - self._invalid_block_ids.update(fa_local_block_ids) + for group_block_ids in meta.local_block_ids: + self._invalid_block_ids.update(group_block_ids) self.xfer_stats.record_failed_transfer() if handle is not None: self.nixl_wrapper.release_xfer_handle(handle) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index dcc31d71dca6..1cb6a3296cc9 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -2087,8 +2087,8 @@ def _update_requests_with_invalid_blocks( marked_invalid_block = False req_id = request.request_id req_block_ids = self.kv_cache_manager.get_block_ids(req_id) + is_hma = len(req_block_ids) > 1 # Assume FA group is present to infer number of computed tokens - # TODO this is not padded for SW right? fa_blocks = req_block_ids[self._full_attention_group_idx] max_num_blocks = len(fa_blocks) # We iterate only over blocks that may contain externally computed @@ -2105,12 +2105,17 @@ def _update_requests_with_invalid_blocks( # Sync loading. 
num_computed_tokens includes new tokens req_num_computed_tokens = request.num_cached_tokens + all_req_block_ids = ( + (block_id for group in req_block_ids for block_id in group) + if is_hma + else req_block_ids[0] + ) req_num_computed_blocks = ( req_num_computed_tokens + self.block_size - 1 ) // self.block_size - # For the purpose of marking blocks as invalid, only report FA ones to - # handle blocks<>tokens mapping consistently. - for idx, block_id in zip(range(req_num_computed_blocks), fa_blocks): + for idx, block_id in enumerate(all_req_block_ids): + if idx >= req_num_computed_blocks: + break if block_id not in invalid_block_ids: continue @@ -2133,24 +2138,27 @@ def _update_requests_with_invalid_blocks( continue marked_invalid_block = True - # Truncate the computed tokens at the first failed block - request.num_computed_tokens = idx * self.block_size - num_affected_tokens = ( - req_num_computed_tokens - request.num_computed_tokens - ) - total_affected_tokens += num_affected_tokens - request.num_external_computed_tokens -= num_affected_tokens - # Collect invalid block and all downstream dependent blocks, across - # all groups. - if evict_blocks: - # Assuming groups are not padded, do SW-aware eviction, example: - # FA: [A B C D C] - # SW: [ E F] - # =>Evict E only when failure index <= E. - for group in req_block_ids: - offset = max_num_blocks - len(group) - start_idx = max(0, idx - offset) - blocks_to_evict.update(group[start_idx:]) + if is_hma: + # TODO (NickLucche) HMA: Partial recovery is not supported because + # SW blocks only cover a suffix of the original sequence. + # After truncation, the sliding window shifts and may require + # blocks that were never transferred. Evict all and restart fresh. + total_affected_tokens += req_num_computed_tokens + request.num_computed_tokens = 0 + request.num_external_computed_tokens = 0 + if evict_blocks: + for group in req_block_ids: + blocks_to_evict.update(group) + else: + # Truncate the computed tokens at the first failed block + request.num_computed_tokens = idx * self.block_size + num_affected_tokens = ( + req_num_computed_tokens - request.num_computed_tokens + ) + total_affected_tokens += num_affected_tokens + request.num_external_computed_tokens -= num_affected_tokens + if evict_blocks: + blocks_to_evict.update(fa_blocks[idx:]) if is_affected: if not marked_invalid_block: From 3602394061d6b987c0fb73e8d696b19e1b88a13e Mon Sep 17 00:00:00 2001 From: NickLucche Date: Fri, 6 Feb 2026 11:39:46 +0000 Subject: [PATCH 15/28] add request-level failure tests Signed-off-by: NickLucche --- tests/out_eval | 161 ++++++++++++++++++ .../unit/test_invalid_blocks_correctness.py | 135 +++++++++++++++ tests/v1/kv_connector/unit/utils.py | 39 +++-- 3 files changed, 318 insertions(+), 17 deletions(-) create mode 100644 tests/out_eval diff --git a/tests/out_eval b/tests/out_eval new file mode 100644 index 000000000000..b950194d5348 --- /dev/null +++ b/tests/out_eval @@ -0,0 +1,161 @@ +ENABLE_HMA_FLAG is set, appending ENABLE_HMA_FLAG=1 to each config +=== Running tests (default backend) === +-> Running with ENABLE_HMA_FLAG=1 GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2 +Running accuracy tests with kv_buffer_device=cuda +HMA (Hybrid KV Cache Manager) enabled +================================ +Testing model: Qwen/Qwen3-0.6B +================================ +Starting prefill instance 0 on GPU 0,1, port 8100 +Starting decode instance 0 on GPU 2,3, port 8200 +Waiting for prefill instance on port 8100 to start... 
+(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] +(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] █ █ █▄ ▄█ +(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.15.2rc1.dev34+g62deffca0.d20260205 +(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] █▄█▀ █ █ █ █ model Qwen/Qwen3-0.6B +(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] +(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:250] non-default args: {'model_tag': 'Qwen/Qwen3-0.6B', 'api_server_count': 1, 'port': 8200, 'enforce_eager': True, 'tensor_parallel_size': 2, 'block_size': 128, 'gpu_memory_utilization': 0.6, 'disable_hybrid_kv_cache_manager': False, 'kv_transfer_config': KVTransferConfig(kv_connector='NixlConnector', engine_id='2390b1ed-9351-43eb-a05d-19c4de3adef5', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None, enable_permute_local_kv=False, kv_load_failure_policy='recompute')} +(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] +(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] █ █ █▄ ▄█ +(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.15.2rc1.dev34+g62deffca0.d20260205 +(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] █▄█▀ █ █ █ █ model Qwen/Qwen3-0.6B +(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] +(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:250] non-default args: {'model_tag': 'Qwen/Qwen3-0.6B', 'api_server_count': 1, 'port': 8100, 'enforce_eager': True, 'tensor_parallel_size': 2, 'block_size': 128, 'gpu_memory_utilization': 0.6, 'disable_hybrid_kv_cache_manager': False, 'kv_transfer_config': KVTransferConfig(kv_connector='NixlConnector', engine_id='38dfb72a-cbc2-42f6-842c-66ac26a9e290', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None, enable_permute_local_kv=False, kv_load_failure_policy='recompute')} +(APIServer pid=2088919) INFO 02-06 10:41:18 [model.py:529] Resolved architecture: Qwen3ForCausalLM +(APIServer pid=2088919) INFO 02-06 10:41:18 [model.py:1544] Using max model len 40960 +(APIServer pid=2088903) INFO 02-06 10:41:18 [model.py:529] Resolved architecture: Qwen3ForCausalLM +(APIServer pid=2088903) INFO 02-06 10:41:18 [model.py:1544] Using max model len 40960 +(APIServer pid=2088919) INFO 02-06 10:41:18 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=2088919) INFO 02-06 10:41:18 [vllm.py:666] Asynchronous scheduling is enabled. +(APIServer pid=2088919) WARNING 02-06 10:41:18 [vllm.py:704] Enforce eager set, overriding optimization level to -O0 +(APIServer pid=2088919) INFO 02-06 10:41:18 [vllm.py:809] Cudagraph is disabled under eager mode +(APIServer pid=2088903) INFO 02-06 10:41:18 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=2088903) INFO 02-06 10:41:18 [vllm.py:666] Asynchronous scheduling is enabled. 
+(APIServer pid=2088903) WARNING 02-06 10:41:18 [vllm.py:704] Enforce eager set, overriding optimization level to -O0 +(APIServer pid=2088903) INFO 02-06 10:41:18 [vllm.py:809] Cudagraph is disabled under eager mode +(EngineCore_DP0 pid=2089638) INFO 02-06 10:41:28 [core.py:96] Initializing a V1 LLM engine (v0.15.2rc1.dev34+g62deffca0.d20260205) with config: model='Qwen/Qwen3-0.6B', speculative_config=None, tokenizer='Qwen/Qwen3-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=40960, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-0.6B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'static_all_moe_layers': []} +(EngineCore_DP0 pid=2089638) WARNING 02-06 10:41:28 [multiproc_executor.py:910] Reducing Torch parallelism from 80 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore_DP0 pid=2089634) INFO 02-06 10:41:28 [core.py:96] Initializing a V1 LLM engine (v0.15.2rc1.dev34+g62deffca0.d20260205) with config: model='Qwen/Qwen3-0.6B', speculative_config=None, tokenizer='Qwen/Qwen3-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=40960, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-0.6B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'static_all_moe_layers': []} +(EngineCore_DP0 pid=2089634) WARNING 02-06 10:41:28 [multiproc_executor.py:910] Reducing Torch parallelism from 80 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+INFO 02-06 10:41:40 [parallel_state.py:1234] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:42381 backend=nccl +INFO 02-06 10:41:40 [parallel_state.py:1234] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:35589 backend=nccl +INFO 02-06 10:41:40 [parallel_state.py:1234] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:35589 backend=nccl +INFO 02-06 10:41:40 [parallel_state.py:1234] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:42381 backend=nccl +INFO 02-06 10:41:40 [pynccl.py:111] vLLM is using nccl==2.27.5 +INFO 02-06 10:41:41 [pynccl.py:111] vLLM is using nccl==2.27.5 +INFO 02-06 10:41:43 [parallel_state.py:1445] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A +INFO 02-06 10:41:43 [parallel_state.py:1445] rank 1 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 1, EP rank N/A +INFO 02-06 10:41:43 [parallel_state.py:1445] rank 1 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 1, EP rank N/A +INFO 02-06 10:41:43 [parallel_state.py:1445] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A +(Worker_TP0 pid=2090292) INFO 02-06 10:41:44 [gpu_model_runner.py:4119] Starting to load model Qwen/Qwen3-0.6B... +(Worker_TP0 pid=2090156) INFO 02-06 10:41:44 [gpu_model_runner.py:4119] Starting to load model Qwen/Qwen3-0.6B... +(Worker_TP0 pid=2090292) INFO 02-06 10:41:44 [cuda.py:367] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(Worker_TP0 pid=2090156) INFO 02-06 10:41:45 [cuda.py:367] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(Worker_TP0 pid=2090292) INFO 02-06 10:41:45 [weight_utils.py:567] No model.safetensors.index.json found in remote. +(Worker_TP0 pid=2090292) INFO 02-06 10:41:45 [default_loader.py:291] Loading weights took 0.17 seconds +(Worker_TP1 pid=2090293) INFO 02-06 10:41:45 [weight_utils.py:567] No model.safetensors.index.json found in remote. +(Worker_TP1 pid=2090157) INFO 02-06 10:41:46 [weight_utils.py:567] No model.safetensors.index.json found in remote. +(Worker_TP0 pid=2090292) INFO 02-06 10:41:46 [gpu_model_runner.py:4216] Model loading took 0.57 GiB memory and 1.603964 seconds +(Worker_TP0 pid=2090156) INFO 02-06 10:41:46 [weight_utils.py:567] No model.safetensors.index.json found in remote. +(Worker_TP0 pid=2090156) INFO 02-06 10:41:46 [default_loader.py:291] Loading weights took 0.20 seconds +(Worker_TP0 pid=2090156) INFO 02-06 10:41:47 [gpu_model_runner.py:4216] Model loading took 0.57 GiB memory and 2.198764 seconds +(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [gpu_worker.py:360] Available KV cache memory: 40.84 GiB +(EngineCore_DP0 pid=2089634) INFO 02-06 10:41:49 [kv_cache_utils.py:1307] GPU KV cache size: 764,672 tokens +(EngineCore_DP0 pid=2089634) INFO 02-06 10:41:49 [kv_cache_utils.py:1312] Maximum concurrency for 40,960 tokens per request: 18.67x +(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:105] Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare memory leak in UCX when using NIXL. +(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:105] Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare memory leak in UCX when using NIXL. 
+(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:118] NIXL is available +(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:118] NIXL is available +(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [factory.py:64] Creating v1 connector with name: NixlConnector and engine_id: 2390b1ed-9351-43eb-a05d-19c4de3adef5 +(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [factory.py:64] Creating v1 connector with name: NixlConnector and engine_id: 2390b1ed-9351-43eb-a05d-19c4de3adef5 +(Worker_TP0 pid=2090292) WARNING 02-06 10:41:49 [base.py:166] Initializing KVConnectorBase_V1. This API is experimental and subject to change in the future as we iterate the design. +(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:928] Initializing NIXL wrapper +(Worker_TP1 pid=2090293) WARNING 02-06 10:41:49 [base.py:166] Initializing KVConnectorBase_V1. This API is experimental and subject to change in the future as we iterate the design. +(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:929] Initializing NIXL worker 2390b1ed-9351-43eb-a05d-19c4de3adef5 +(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:928] Initializing NIXL wrapper +(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:929] Initializing NIXL worker 2390b1ed-9351-43eb-a05d-19c4de3adef5 +(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [gpu_worker.py:360] Available KV cache memory: 40.84 GiB +(EngineCore_DP0 pid=2089638) INFO 02-06 10:41:49 [kv_cache_utils.py:1307] GPU KV cache size: 764,672 tokens +(EngineCore_DP0 pid=2089638) INFO 02-06 10:41:49 [kv_cache_utils.py:1312] Maximum concurrency for 40,960 tokens per request: 18.67x +(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [nixl_connector.py:105] Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare memory leak in UCX when using NIXL. +(Worker_TP1 pid=2090157) INFO 02-06 10:41:49 [nixl_connector.py:105] Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare memory leak in UCX when using NIXL. +(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [nixl_connector.py:118] NIXL is available +(Worker_TP1 pid=2090157) INFO 02-06 10:41:49 [nixl_connector.py:118] NIXL is available +(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [factory.py:64] Creating v1 connector with name: NixlConnector and engine_id: 38dfb72a-cbc2-42f6-842c-66ac26a9e290 +(Worker_TP0 pid=2090156) WARNING 02-06 10:41:49 [base.py:166] Initializing KVConnectorBase_V1. This API is experimental and subject to change in the future as we iterate the design. +(Worker_TP1 pid=2090157) INFO 02-06 10:41:49 [factory.py:64] Creating v1 connector with name: NixlConnector and engine_id: 38dfb72a-cbc2-42f6-842c-66ac26a9e290 +(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [nixl_connector.py:928] Initializing NIXL wrapper +(Worker_TP1 pid=2090157) WARNING 02-06 10:41:49 [base.py:166] Initializing KVConnectorBase_V1. This API is experimental and subject to change in the future as we iterate the design. 
+(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [nixl_connector.py:929] Initializing NIXL worker 38dfb72a-cbc2-42f6-842c-66ac26a9e290 +(Worker_TP1 pid=2090157) INFO 02-06 10:41:49 [nixl_connector.py:928] Initializing NIXL wrapper +(Worker_TP1 pid=2090157) INFO 02-06 10:41:49 [nixl_connector.py:929] Initializing NIXL worker 38dfb72a-cbc2-42f6-842c-66ac26a9e290 +(Worker_TP1 pid=2090293) 2026-02-06 10:41:49 NIXL INFO _api.py:363 Backend UCX was instantiated +(Worker_TP1 pid=2090293) 2026-02-06 10:41:49 NIXL INFO _api.py:253 Initialized NIXL agent: 70bd73a1-9d3e-40d3-8e54-28edb0363079 +(Worker_TP0 pid=2090292) 2026-02-06 10:41:49 NIXL INFO _api.py:363 Backend UCX was instantiated +(Worker_TP0 pid=2090292) 2026-02-06 10:41:49 NIXL INFO _api.py:253 Initialized NIXL agent: 624f5a93-a261-4ef9-92a9-6d19ce6b47bc +(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [utils.py:73] `VLLM_KV_CACHE_LAYOUT` environment variable detected. Setting KV cache layout to HND. +(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:1082] Detected attention backend FLASH_ATTN +(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [utils.py:73] `VLLM_KV_CACHE_LAYOUT` environment variable detected. Setting KV cache layout to HND. +(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:1083] Detected kv cache layout HND +(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:1082] Detected attention backend FLASH_ATTN +(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:1083] Detected kv cache layout HND +(Worker_TP1 pid=2090157) 2026-02-06 10:41:50 NIXL INFO _api.py:363 Backend UCX was instantiated +(Worker_TP1 pid=2090157) 2026-02-06 10:41:50 NIXL INFO _api.py:253 Initialized NIXL agent: ab967c8d-ebd5-4dff-aaa1-64833ef5fb69 +(Worker_TP1 pid=2090157) INFO 02-06 10:41:50 [utils.py:73] `VLLM_KV_CACHE_LAYOUT` environment variable detected. Setting KV cache layout to HND. +(Worker_TP1 pid=2090157) INFO 02-06 10:41:50 [nixl_connector.py:1082] Detected attention backend FLASH_ATTN +(Worker_TP1 pid=2090157) INFO 02-06 10:41:50 [nixl_connector.py:1083] Detected kv cache layout HND +(Worker_TP0 pid=2090156) 2026-02-06 10:41:50 NIXL INFO _api.py:363 Backend UCX was instantiated +(Worker_TP0 pid=2090156) 2026-02-06 10:41:50 NIXL INFO _api.py:253 Initialized NIXL agent: 4cd176c9-f171-422b-8536-79b8abdf23bf +(Worker_TP0 pid=2090156) INFO 02-06 10:41:50 [utils.py:73] `VLLM_KV_CACHE_LAYOUT` environment variable detected. Setting KV cache layout to HND. +(Worker_TP0 pid=2090156) INFO 02-06 10:41:50 [nixl_connector.py:1082] Detected attention backend FLASH_ATTN +(Worker_TP0 pid=2090156) INFO 02-06 10:41:50 [nixl_connector.py:1083] Detected kv cache layout HND +(Worker_TP0 pid=2090292) INFO 02-06 10:41:50 [nixl_connector.py:1399] Registering KV_Caches. use_mla: False, kv_buffer_device: cuda, use_host_buffer: False +(Worker_TP1 pid=2090293) INFO 02-06 10:41:50 [nixl_connector.py:1399] Registering KV_Caches. use_mla: False, kv_buffer_device: cuda, use_host_buffer: False +(Worker_TP1 pid=2090157) INFO 02-06 10:41:50 [nixl_connector.py:1399] Registering KV_Caches. use_mla: False, kv_buffer_device: cuda, use_host_buffer: False +(Worker_TP0 pid=2090156) INFO 02-06 10:41:50 [nixl_connector.py:1399] Registering KV_Caches. 
use_mla: False, kv_buffer_device: cuda, use_host_buffer: False +(EngineCore_DP0 pid=2089638) INFO 02-06 10:41:50 [core.py:272] init engine (profile, create kv cache, warmup model) took 3.72 seconds +(EngineCore_DP0 pid=2089634) INFO 02-06 10:41:50 [core.py:272] init engine (profile, create kv cache, warmup model) took 4.27 seconds +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] EngineCore failed to start. +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] Traceback (most recent call last): +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 957, in run_engine_core +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 711, in __init__ +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] super().__init__( +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 123, in __init__ +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] Scheduler = vllm_config.scheduler_config.get_scheduler_cls() +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/config/scheduler.py", line 156, in get_scheduler_cls +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] from vllm.v1.core.sched.async_scheduler import AsyncScheduler +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/core/sched/async_scheduler.py", line 6, in +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] from vllm.v1.core.sched.scheduler import Scheduler +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/core/sched/scheduler.py", line 2070 +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] if +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] ^ +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] SyntaxError: invalid syntax +(Worker_TP0 pid=2090156) WARNING 02-06 10:41:52 [multiproc_executor.py:786] WorkerProc was terminated +(Worker_TP1 pid=2090157) WARNING 02-06 10:41:52 [multiproc_executor.py:786] WorkerProc was terminated +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] EngineCore failed to start. 
+(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] Traceback (most recent call last): +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 957, in run_engine_core +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 711, in __init__ +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] super().__init__( +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 123, in __init__ +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] Scheduler = vllm_config.scheduler_config.get_scheduler_cls() +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/config/scheduler.py", line 156, in get_scheduler_cls +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] from vllm.v1.core.sched.async_scheduler import AsyncScheduler +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/core/sched/async_scheduler.py", line 6, in +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] from vllm.v1.core.sched.scheduler import Scheduler +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/core/sched/scheduler.py", line 2070 +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] if +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] ^ +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] SyntaxError: invalid syntax +(Worker_TP1 pid=2090293) WARNING 02-06 10:41:52 [multiproc_executor.py:786] WorkerProc was terminated +(Worker_TP0 pid=2090292) WARNING 02-06 10:41:52 [multiproc_executor.py:786] WorkerProc was terminated +(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:54 [multiproc_executor.py:246] Worker proc VllmWorker-0 died unexpectedly, shutting down executor. +(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:54 [multiproc_executor.py:246] Worker proc VllmWorker-0 died unexpectedly, shutting down executor. 
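For reference, the test added in the diff below exercises the HMA failure path introduced in PATCH 14: when any block of a request is reported invalid and more than one KV cache group is allocated, the scheduler resets num_computed_tokens to 0 and evicts the blocks of every group, because sliding-window groups only hold a suffix of the sequence and a partial rollback could reference blocks that were never transferred. The sketch below is illustrative only: it assumes group 0 is the full-attention group, and the function name handle_invalid_blocks and its signature are not part of vLLM.

def handle_invalid_blocks(
    req_block_ids: list[list[int]],  # block ids per KV cache group, group 0 = full attention
    invalid_block_ids: set[int],
    num_computed_tokens: int,
    block_size: int,
) -> tuple[int, set[int]]:
    """Return (new_num_computed_tokens, blocks_to_evict) after a failed KV load."""
    is_hma = len(req_block_ids) > 1
    fa_blocks = req_block_ids[0]
    num_computed_blocks = (num_computed_tokens + block_size - 1) // block_size
    blocks_to_evict: set[int] = set()
    for idx, block_id in enumerate(fa_blocks[:num_computed_blocks]):
        if block_id not in invalid_block_ids:
            continue
        if is_hma:
            # Sliding-window groups cover only a suffix of the sequence, so a
            # partial rollback may need blocks that were never transferred:
            # evict everything and recompute from scratch.
            for group in req_block_ids:
                blocks_to_evict.update(group)
            return 0, blocks_to_evict
        # Single full-attention group: truncate at the first failed block.
        blocks_to_evict.update(fa_blocks[idx:])
        return idx * block_size, blocks_to_evict
    # No computed block was invalid; nothing changes.
    return num_computed_tokens, blocks_to_evict

# Same shape as the test below: a failure at FA block index 50 of 99 computed blocks.
fa = list(range(100))        # full-attention group
sw = list(range(200, 204))   # sliding-window group (suffix only)
tokens, evict = handle_invalid_blocks([fa, sw], {fa[50]}, 99 * 16, 16)
assert tokens == 0 and evict == set(fa) | set(sw)

In the non-HMA case the same input would instead truncate to 50 * 16 computed tokens and evict only the failed full-attention block and its successors, which is the behavior the pre-existing recompute tests cover.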
diff --git a/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py b/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py index 6cb2d3ea4d97..581fb0bf18a4 100644 --- a/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py +++ b/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py @@ -24,6 +24,7 @@ create_request, create_scheduler, create_vllm_config, + make_kv_cache_config, ) pytestmark = pytest.mark.cpu_test @@ -56,6 +57,18 @@ def recompute_scheduler(): return create_scheduler(vllm_config) +@pytest.fixture +def hma_recompute_scheduler(): + """scheduler with HMA enabled and kv_load_failure_policy='recompute'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "recompute" + # Create scheduler with HMA (FA + SW groups) + kv_cache_config = make_kv_cache_config( + block_size=16, hma_enabled=True, num_blocks=10000 + ) + return create_scheduler(vllm_config, kv_cache_config=kv_cache_config) + + def test_sync_recompute_blocks_not_freed_for_running_requests( recompute_scheduler: Scheduler, ): @@ -478,3 +491,125 @@ def cache_blocks_spy(req, num_tokens): # request should be in the running queue assert request in recompute_scheduler.running + + +def test_hma_sync_recompute_evicts_all_blocks( + hma_recompute_scheduler: Scheduler, +): + """ + Test HMA sync recompute case - all blocks must be evicted on any failure. + + With HMA, sliding window (SW) blocks only cover a suffix of the sequence. + After truncating num_computed_tokens, the sliding window shifts and may + require blocks that were never transferred. + + Therefore, when any block fails we evict ALL blocks and reset + num_computed_tokens to 0. + + This test verifies: + 1. All blocks (FA and SW) are evicted on partial failure + 2. num_computed_tokens is reset to 0 (not truncated to failure point) + 3. 
Request can be rescheduled for full recomputation + """ + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + # Fail a block in the middle - in non-HMA this would allow partial recovery + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * hma_recompute_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * hma_recompute_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + hma_recompute_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + # mock connector indicating sync load + hma_recompute_scheduler.connector = Mock() + hma_recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + hma_recompute_scheduler.connector.request_finished.return_value = (False, None) + hma_recompute_scheduler.connector.take_events.return_value = () + + scheduler_output = hma_recompute_scheduler.schedule() + + # request should be running with sync KV load + assert len(hma_recompute_scheduler.running) == 1 + assert len(scheduler_output.scheduled_new_reqs) == 1 + assert request.status == RequestStatus.RUNNING + + # verify we have multiple KV cache groups (HMA is enabled) + req_block_ids = hma_recompute_scheduler.kv_cache_manager.get_block_ids( + request.request_id + ) + assert len(req_block_ids) == 2 + + # get the FA block IDs + fa_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_ids = {fa_block_ids[invalid_block_idx]} + + model_runner_output = create_model_runner_output( + [request], + invalid_block_ids=invalid_block_ids, + use_eos=False, # not finished - should continue running + ) + + hma_recompute_scheduler.update_from_output(scheduler_output, model_runner_output) + + # Critical assertions for HMA recompute case: + + # 1. request should still be RUNNING (for recomputation) + assert request.status == RequestStatus.RUNNING, ( + f"Request should remain RUNNING for recompute, got {request.status}" + ) + + # 2. num_computed_tokens should be reset to 0 (not truncated to failure point) + # This is the key difference from non-HMA: we can't do partial recovery + assert request.num_computed_tokens == 0, ( + f"HMA: num_computed_tokens should be reset to 0, " + f"got {request.num_computed_tokens}. " + f"Partial recovery is not supported with sliding window layers." + ) + + # 3. num_external_computed_tokens should also be reset to 0 + assert request.num_external_computed_tokens == 0, ( + f"HMA: num_external_computed_tokens should be reset to 0, " + f"got {request.num_external_computed_tokens}" + ) + + # 4. verify blocks are still allocated (not freed yet, just marked for eviction) + # The actual eviction happens after update_from_output + allocated_blocks = hma_recompute_scheduler.kv_cache_manager.get_block_ids( + request.request_id + ) + assert allocated_blocks is not None + + # 5. request should still be in running queue + assert request in hma_recompute_scheduler.running, ( + "Request should remain in running queue for recomputation" + ) + + # 6. request should still be in scheduler.requests + assert request.request_id in hma_recompute_scheduler.requests, ( + "Request should not be deleted from scheduler.requests" + ) + + # 7. 
verify request can be rescheduled for full recomputation + scheduler_output_2 = hma_recompute_scheduler.schedule() + + # request should be reschedulable + scheduled_req_ids = [ + req.request_id for req in scheduler_output_2.scheduled_new_reqs + ] + if scheduler_output_2.num_scheduled_tokens: + scheduled_req_ids.extend(scheduler_output_2.num_scheduled_tokens.keys()) + + assert ( + request.request_id in scheduled_req_ids + or len(hma_recompute_scheduler.running) > 0 + ), "Request should be reschedulable for full recomputation" diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 15c8f0d325d2..d267299815a6 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -143,24 +143,26 @@ def create_vllm_config( def create_scheduler( vllm_config: VllmConfig, num_blocks: int = 10000, + kv_cache_config: KVCacheConfig | None = None, ) -> Scheduler: """Initialize Scheduler For Testing.""" block_size = vllm_config.cache_config.block_size - kv_cache_config = KVCacheConfig( - num_blocks=num_blocks, # A large number of blocks to hold all requests - kv_cache_tensors=[], - kv_cache_groups=[ - KVCacheGroupSpec( - ["layer"], - FullAttentionSpec( - block_size=block_size, - num_kv_heads=1, - head_size=1, - dtype=torch.float32, - ), - ) - ], - ) + if kv_cache_config is None: + kv_cache_config = KVCacheConfig( + num_blocks=num_blocks, # A large number of blocks to hold all requests + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec( + ["layer"], + FullAttentionSpec( + block_size=block_size, + num_kv_heads=1, + head_size=1, + dtype=torch.float32, + ), + ) + ], + ) vllm_config.cache_config.num_gpu_blocks = num_blocks return Scheduler( vllm_config=vllm_config, @@ -416,7 +418,10 @@ def wait_for_save(self): def make_kv_cache_config( - block_size: int, hma_enabled: bool = False, sw_size: int = 128 + block_size: int, + hma_enabled: bool = False, + sw_size: int = 128, + num_blocks: int = 100, ) -> KVCacheConfig: kv_cache_groups = [ KVCacheGroupSpec( @@ -443,5 +448,5 @@ def make_kv_cache_config( ) ) return KVCacheConfig( - num_blocks=100, kv_cache_tensors=[], kv_cache_groups=kv_cache_groups + num_blocks=num_blocks, kv_cache_tensors=[], kv_cache_groups=kv_cache_groups ) From 0b481675cac96ca1a383add4b7d4df37541fff9d Mon Sep 17 00:00:00 2001 From: NickLucche Date: Fri, 6 Feb 2026 11:42:42 +0000 Subject: [PATCH 16/28] micro-opt for sw clip Signed-off-by: NickLucche --- tests/out_eval | 161 ------------------ .../config_sweep_accuracy_test.sh | 2 +- .../unit/test_nixl_connector_hma.py | 1 - .../kv_connector/v1/nixl_connector.py | 29 ++-- vllm/v1/core/sched/scheduler.py | 8 +- 5 files changed, 25 insertions(+), 176 deletions(-) delete mode 100644 tests/out_eval diff --git a/tests/out_eval b/tests/out_eval deleted file mode 100644 index b950194d5348..000000000000 --- a/tests/out_eval +++ /dev/null @@ -1,161 +0,0 @@ -ENABLE_HMA_FLAG is set, appending ENABLE_HMA_FLAG=1 to each config -=== Running tests (default backend) === --> Running with ENABLE_HMA_FLAG=1 GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2 -Running accuracy tests with kv_buffer_device=cuda -HMA (Hybrid KV Cache Manager) enabled -================================ -Testing model: Qwen/Qwen3-0.6B -================================ -Starting prefill instance 0 on GPU 0,1, port 8100 -Starting decode instance 0 on GPU 2,3, port 8200 -Waiting for prefill instance on port 8100 to start... 
-(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] -(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] █ █ █▄ ▄█ -(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.15.2rc1.dev34+g62deffca0.d20260205 -(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] █▄█▀ █ █ █ █ model Qwen/Qwen3-0.6B -(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:314] -(APIServer pid=2088919) INFO 02-06 10:41:15 [utils.py:250] non-default args: {'model_tag': 'Qwen/Qwen3-0.6B', 'api_server_count': 1, 'port': 8200, 'enforce_eager': True, 'tensor_parallel_size': 2, 'block_size': 128, 'gpu_memory_utilization': 0.6, 'disable_hybrid_kv_cache_manager': False, 'kv_transfer_config': KVTransferConfig(kv_connector='NixlConnector', engine_id='2390b1ed-9351-43eb-a05d-19c4de3adef5', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None, enable_permute_local_kv=False, kv_load_failure_policy='recompute')} -(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] -(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] █ █ █▄ ▄█ -(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.15.2rc1.dev34+g62deffca0.d20260205 -(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] █▄█▀ █ █ █ █ model Qwen/Qwen3-0.6B -(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:314] -(APIServer pid=2088903) INFO 02-06 10:41:15 [utils.py:250] non-default args: {'model_tag': 'Qwen/Qwen3-0.6B', 'api_server_count': 1, 'port': 8100, 'enforce_eager': True, 'tensor_parallel_size': 2, 'block_size': 128, 'gpu_memory_utilization': 0.6, 'disable_hybrid_kv_cache_manager': False, 'kv_transfer_config': KVTransferConfig(kv_connector='NixlConnector', engine_id='38dfb72a-cbc2-42f6-842c-66ac26a9e290', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None, enable_permute_local_kv=False, kv_load_failure_policy='recompute')} -(APIServer pid=2088919) INFO 02-06 10:41:18 [model.py:529] Resolved architecture: Qwen3ForCausalLM -(APIServer pid=2088919) INFO 02-06 10:41:18 [model.py:1544] Using max model len 40960 -(APIServer pid=2088903) INFO 02-06 10:41:18 [model.py:529] Resolved architecture: Qwen3ForCausalLM -(APIServer pid=2088903) INFO 02-06 10:41:18 [model.py:1544] Using max model len 40960 -(APIServer pid=2088919) INFO 02-06 10:41:18 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=2088919) INFO 02-06 10:41:18 [vllm.py:666] Asynchronous scheduling is enabled. -(APIServer pid=2088919) WARNING 02-06 10:41:18 [vllm.py:704] Enforce eager set, overriding optimization level to -O0 -(APIServer pid=2088919) INFO 02-06 10:41:18 [vllm.py:809] Cudagraph is disabled under eager mode -(APIServer pid=2088903) INFO 02-06 10:41:18 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=2088903) INFO 02-06 10:41:18 [vllm.py:666] Asynchronous scheduling is enabled. 
-(APIServer pid=2088903) WARNING 02-06 10:41:18 [vllm.py:704] Enforce eager set, overriding optimization level to -O0 -(APIServer pid=2088903) INFO 02-06 10:41:18 [vllm.py:809] Cudagraph is disabled under eager mode -(EngineCore_DP0 pid=2089638) INFO 02-06 10:41:28 [core.py:96] Initializing a V1 LLM engine (v0.15.2rc1.dev34+g62deffca0.d20260205) with config: model='Qwen/Qwen3-0.6B', speculative_config=None, tokenizer='Qwen/Qwen3-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=40960, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-0.6B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'static_all_moe_layers': []} -(EngineCore_DP0 pid=2089638) WARNING 02-06 10:41:28 [multiproc_executor.py:910] Reducing Torch parallelism from 80 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore_DP0 pid=2089634) INFO 02-06 10:41:28 [core.py:96] Initializing a V1 LLM engine (v0.15.2rc1.dev34+g62deffca0.d20260205) with config: model='Qwen/Qwen3-0.6B', speculative_config=None, tokenizer='Qwen/Qwen3-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=40960, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-0.6B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'static_all_moe_layers': []} -(EngineCore_DP0 pid=2089634) WARNING 02-06 10:41:28 [multiproc_executor.py:910] Reducing Torch parallelism from 80 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-INFO 02-06 10:41:40 [parallel_state.py:1234] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:42381 backend=nccl -INFO 02-06 10:41:40 [parallel_state.py:1234] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:35589 backend=nccl -INFO 02-06 10:41:40 [parallel_state.py:1234] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:35589 backend=nccl -INFO 02-06 10:41:40 [parallel_state.py:1234] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:42381 backend=nccl -INFO 02-06 10:41:40 [pynccl.py:111] vLLM is using nccl==2.27.5 -INFO 02-06 10:41:41 [pynccl.py:111] vLLM is using nccl==2.27.5 -INFO 02-06 10:41:43 [parallel_state.py:1445] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A -INFO 02-06 10:41:43 [parallel_state.py:1445] rank 1 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 1, EP rank N/A -INFO 02-06 10:41:43 [parallel_state.py:1445] rank 1 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 1, EP rank N/A -INFO 02-06 10:41:43 [parallel_state.py:1445] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A -(Worker_TP0 pid=2090292) INFO 02-06 10:41:44 [gpu_model_runner.py:4119] Starting to load model Qwen/Qwen3-0.6B... -(Worker_TP0 pid=2090156) INFO 02-06 10:41:44 [gpu_model_runner.py:4119] Starting to load model Qwen/Qwen3-0.6B... -(Worker_TP0 pid=2090292) INFO 02-06 10:41:44 [cuda.py:367] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(Worker_TP0 pid=2090156) INFO 02-06 10:41:45 [cuda.py:367] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(Worker_TP0 pid=2090292) INFO 02-06 10:41:45 [weight_utils.py:567] No model.safetensors.index.json found in remote. -(Worker_TP0 pid=2090292) INFO 02-06 10:41:45 [default_loader.py:291] Loading weights took 0.17 seconds -(Worker_TP1 pid=2090293) INFO 02-06 10:41:45 [weight_utils.py:567] No model.safetensors.index.json found in remote. -(Worker_TP1 pid=2090157) INFO 02-06 10:41:46 [weight_utils.py:567] No model.safetensors.index.json found in remote. -(Worker_TP0 pid=2090292) INFO 02-06 10:41:46 [gpu_model_runner.py:4216] Model loading took 0.57 GiB memory and 1.603964 seconds -(Worker_TP0 pid=2090156) INFO 02-06 10:41:46 [weight_utils.py:567] No model.safetensors.index.json found in remote. -(Worker_TP0 pid=2090156) INFO 02-06 10:41:46 [default_loader.py:291] Loading weights took 0.20 seconds -(Worker_TP0 pid=2090156) INFO 02-06 10:41:47 [gpu_model_runner.py:4216] Model loading took 0.57 GiB memory and 2.198764 seconds -(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [gpu_worker.py:360] Available KV cache memory: 40.84 GiB -(EngineCore_DP0 pid=2089634) INFO 02-06 10:41:49 [kv_cache_utils.py:1307] GPU KV cache size: 764,672 tokens -(EngineCore_DP0 pid=2089634) INFO 02-06 10:41:49 [kv_cache_utils.py:1312] Maximum concurrency for 40,960 tokens per request: 18.67x -(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:105] Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare memory leak in UCX when using NIXL. -(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:105] Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare memory leak in UCX when using NIXL. 
-(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:118] NIXL is available -(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:118] NIXL is available -(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [factory.py:64] Creating v1 connector with name: NixlConnector and engine_id: 2390b1ed-9351-43eb-a05d-19c4de3adef5 -(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [factory.py:64] Creating v1 connector with name: NixlConnector and engine_id: 2390b1ed-9351-43eb-a05d-19c4de3adef5 -(Worker_TP0 pid=2090292) WARNING 02-06 10:41:49 [base.py:166] Initializing KVConnectorBase_V1. This API is experimental and subject to change in the future as we iterate the design. -(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:928] Initializing NIXL wrapper -(Worker_TP1 pid=2090293) WARNING 02-06 10:41:49 [base.py:166] Initializing KVConnectorBase_V1. This API is experimental and subject to change in the future as we iterate the design. -(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:929] Initializing NIXL worker 2390b1ed-9351-43eb-a05d-19c4de3adef5 -(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:928] Initializing NIXL wrapper -(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:929] Initializing NIXL worker 2390b1ed-9351-43eb-a05d-19c4de3adef5 -(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [gpu_worker.py:360] Available KV cache memory: 40.84 GiB -(EngineCore_DP0 pid=2089638) INFO 02-06 10:41:49 [kv_cache_utils.py:1307] GPU KV cache size: 764,672 tokens -(EngineCore_DP0 pid=2089638) INFO 02-06 10:41:49 [kv_cache_utils.py:1312] Maximum concurrency for 40,960 tokens per request: 18.67x -(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [nixl_connector.py:105] Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare memory leak in UCX when using NIXL. -(Worker_TP1 pid=2090157) INFO 02-06 10:41:49 [nixl_connector.py:105] Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare memory leak in UCX when using NIXL. -(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [nixl_connector.py:118] NIXL is available -(Worker_TP1 pid=2090157) INFO 02-06 10:41:49 [nixl_connector.py:118] NIXL is available -(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [factory.py:64] Creating v1 connector with name: NixlConnector and engine_id: 38dfb72a-cbc2-42f6-842c-66ac26a9e290 -(Worker_TP0 pid=2090156) WARNING 02-06 10:41:49 [base.py:166] Initializing KVConnectorBase_V1. This API is experimental and subject to change in the future as we iterate the design. -(Worker_TP1 pid=2090157) INFO 02-06 10:41:49 [factory.py:64] Creating v1 connector with name: NixlConnector and engine_id: 38dfb72a-cbc2-42f6-842c-66ac26a9e290 -(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [nixl_connector.py:928] Initializing NIXL wrapper -(Worker_TP1 pid=2090157) WARNING 02-06 10:41:49 [base.py:166] Initializing KVConnectorBase_V1. This API is experimental and subject to change in the future as we iterate the design. 
-(Worker_TP0 pid=2090156) INFO 02-06 10:41:49 [nixl_connector.py:929] Initializing NIXL worker 38dfb72a-cbc2-42f6-842c-66ac26a9e290 -(Worker_TP1 pid=2090157) INFO 02-06 10:41:49 [nixl_connector.py:928] Initializing NIXL wrapper -(Worker_TP1 pid=2090157) INFO 02-06 10:41:49 [nixl_connector.py:929] Initializing NIXL worker 38dfb72a-cbc2-42f6-842c-66ac26a9e290 -(Worker_TP1 pid=2090293) 2026-02-06 10:41:49 NIXL INFO _api.py:363 Backend UCX was instantiated -(Worker_TP1 pid=2090293) 2026-02-06 10:41:49 NIXL INFO _api.py:253 Initialized NIXL agent: 70bd73a1-9d3e-40d3-8e54-28edb0363079 -(Worker_TP0 pid=2090292) 2026-02-06 10:41:49 NIXL INFO _api.py:363 Backend UCX was instantiated -(Worker_TP0 pid=2090292) 2026-02-06 10:41:49 NIXL INFO _api.py:253 Initialized NIXL agent: 624f5a93-a261-4ef9-92a9-6d19ce6b47bc -(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [utils.py:73] `VLLM_KV_CACHE_LAYOUT` environment variable detected. Setting KV cache layout to HND. -(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:1082] Detected attention backend FLASH_ATTN -(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [utils.py:73] `VLLM_KV_CACHE_LAYOUT` environment variable detected. Setting KV cache layout to HND. -(Worker_TP1 pid=2090293) INFO 02-06 10:41:49 [nixl_connector.py:1083] Detected kv cache layout HND -(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:1082] Detected attention backend FLASH_ATTN -(Worker_TP0 pid=2090292) INFO 02-06 10:41:49 [nixl_connector.py:1083] Detected kv cache layout HND -(Worker_TP1 pid=2090157) 2026-02-06 10:41:50 NIXL INFO _api.py:363 Backend UCX was instantiated -(Worker_TP1 pid=2090157) 2026-02-06 10:41:50 NIXL INFO _api.py:253 Initialized NIXL agent: ab967c8d-ebd5-4dff-aaa1-64833ef5fb69 -(Worker_TP1 pid=2090157) INFO 02-06 10:41:50 [utils.py:73] `VLLM_KV_CACHE_LAYOUT` environment variable detected. Setting KV cache layout to HND. -(Worker_TP1 pid=2090157) INFO 02-06 10:41:50 [nixl_connector.py:1082] Detected attention backend FLASH_ATTN -(Worker_TP1 pid=2090157) INFO 02-06 10:41:50 [nixl_connector.py:1083] Detected kv cache layout HND -(Worker_TP0 pid=2090156) 2026-02-06 10:41:50 NIXL INFO _api.py:363 Backend UCX was instantiated -(Worker_TP0 pid=2090156) 2026-02-06 10:41:50 NIXL INFO _api.py:253 Initialized NIXL agent: 4cd176c9-f171-422b-8536-79b8abdf23bf -(Worker_TP0 pid=2090156) INFO 02-06 10:41:50 [utils.py:73] `VLLM_KV_CACHE_LAYOUT` environment variable detected. Setting KV cache layout to HND. -(Worker_TP0 pid=2090156) INFO 02-06 10:41:50 [nixl_connector.py:1082] Detected attention backend FLASH_ATTN -(Worker_TP0 pid=2090156) INFO 02-06 10:41:50 [nixl_connector.py:1083] Detected kv cache layout HND -(Worker_TP0 pid=2090292) INFO 02-06 10:41:50 [nixl_connector.py:1399] Registering KV_Caches. use_mla: False, kv_buffer_device: cuda, use_host_buffer: False -(Worker_TP1 pid=2090293) INFO 02-06 10:41:50 [nixl_connector.py:1399] Registering KV_Caches. use_mla: False, kv_buffer_device: cuda, use_host_buffer: False -(Worker_TP1 pid=2090157) INFO 02-06 10:41:50 [nixl_connector.py:1399] Registering KV_Caches. use_mla: False, kv_buffer_device: cuda, use_host_buffer: False -(Worker_TP0 pid=2090156) INFO 02-06 10:41:50 [nixl_connector.py:1399] Registering KV_Caches. 
use_mla: False, kv_buffer_device: cuda, use_host_buffer: False -(EngineCore_DP0 pid=2089638) INFO 02-06 10:41:50 [core.py:272] init engine (profile, create kv cache, warmup model) took 3.72 seconds -(EngineCore_DP0 pid=2089634) INFO 02-06 10:41:50 [core.py:272] init engine (profile, create kv cache, warmup model) took 4.27 seconds -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] EngineCore failed to start. -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] Traceback (most recent call last): -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 957, in run_engine_core -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 711, in __init__ -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] super().__init__( -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 123, in __init__ -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] Scheduler = vllm_config.scheduler_config.get_scheduler_cls() -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/config/scheduler.py", line 156, in get_scheduler_cls -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] from vllm.v1.core.sched.async_scheduler import AsyncScheduler -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/core/sched/async_scheduler.py", line 6, in -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] from vllm.v1.core.sched.scheduler import Scheduler -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/core/sched/scheduler.py", line 2070 -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] if -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] ^ -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:52 [core.py:966] SyntaxError: invalid syntax -(Worker_TP0 pid=2090156) WARNING 02-06 10:41:52 [multiproc_executor.py:786] WorkerProc was terminated -(Worker_TP1 pid=2090157) WARNING 02-06 10:41:52 [multiproc_executor.py:786] WorkerProc was terminated -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] EngineCore failed to start. 
-(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] Traceback (most recent call last): -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 957, in run_engine_core -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 711, in __init__ -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] super().__init__( -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/engine/core.py", line 123, in __init__ -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] Scheduler = vllm_config.scheduler_config.get_scheduler_cls() -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/config/scheduler.py", line 156, in get_scheduler_cls -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] from vllm.v1.core.sched.async_scheduler import AsyncScheduler -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/core/sched/async_scheduler.py", line 6, in -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] from vllm.v1.core.sched.scheduler import Scheduler -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] File "/home/NickLucche/llmd/worktree/vllm/v1/core/sched/scheduler.py", line 2070 -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] if -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] ^ -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:52 [core.py:966] SyntaxError: invalid syntax -(Worker_TP1 pid=2090293) WARNING 02-06 10:41:52 [multiproc_executor.py:786] WorkerProc was terminated -(Worker_TP0 pid=2090292) WARNING 02-06 10:41:52 [multiproc_executor.py:786] WorkerProc was terminated -(EngineCore_DP0 pid=2089638) ERROR 02-06 10:41:54 [multiproc_executor.py:246] Worker proc VllmWorker-0 died unexpectedly, shutting down executor. -(EngineCore_DP0 pid=2089634) ERROR 02-06 10:41:54 [multiproc_executor.py:246] Worker proc VllmWorker-0 died unexpectedly, shutting down executor. 
diff --git a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh index fb268994ef78..7e232e2fe490 100755 --- a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh @@ -12,7 +12,7 @@ tp_configs=( "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" - "ENABLE_HMA_FLAG=1 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it" # SW model + "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it" # SW model ) dp_ep_configs=( "DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py index edc1bc4477fc..8e444df65fca 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py @@ -137,7 +137,6 @@ def run_hma_test(llm: LLM): # Process some request with length exceeding the sliding window outputs = llm.generate(["hi" * 1401], sampling_params) kv_params = outputs[0].kv_transfer_params - print("kv_params", kv_params) expected_num_remote_blocks = sw_size // block_size remote_block_ids = kv_params["remote_block_ids"] diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index a1b949283dbf..2467abd82f25 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -549,6 +549,8 @@ def __init__( ) logger.info("Initializing NIXL Scheduler %s", engine_id) + if self._is_hma_enabled: + logger.info("Hybrid Memory Allocator is enabled with NIXL") # Background thread for handling new handshake requests. self._nixl_handshake_listener_t: threading.Thread | None = None @@ -569,14 +571,14 @@ def __init__( # Gather Sliding Window sizes for each kv cache group (if any) in number of # blocks per KV cache group. This is used to clip the local attention window. - sw_sizes_tokens = [ - group.kv_cache_spec.sliding_window - if isinstance(group.kv_cache_spec, SlidingWindowSpec) - else 0 - for group in kv_cache_config.kv_cache_groups + sw_sizes_tokens: list[tuple[int, int]] = [ + (g.kv_cache_spec.sliding_window, g.kv_cache_spec.block_size) + if isinstance(g.kv_cache_spec, SlidingWindowSpec) + else (0, self.block_size) + for g in kv_cache_config.kv_cache_groups ] self.blocks_per_sw = [ - cdiv(n_tokens, self.block_size) for n_tokens in sw_sizes_tokens + cdiv(n_tokens, block_size) for n_tokens, block_size in sw_sizes_tokens ] def shutdown(self): @@ -585,7 +587,7 @@ def shutdown(self): self._nixl_handshake_listener_t.join() self._nixl_handshake_listener_t = None - def get_sw_clippped_blocks(self, block_ids: BlockIds) -> BlockIds: + def get_sw_clipped_blocks(self, block_ids: BlockIds) -> BlockIds: """ Clip the number of blocks to the sliding window size for each kv cache group that employs SWA. 
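        # Illustrative sketch (assumed values, not from the patch): how the
        # per-group clipping behaves. With blocks_per_sw = [0, 3] (full-attention
        # group first, sliding-window group second):
        #   get_sw_clipped_blocks(([1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]))
        #   -> ([1, 2, 3, 4, 5, 6], [4, 5, 6])
        # i.e. the full-attention group is returned untouched (size 0 means
        # "never clip") while the sliding-window group keeps only its last 3 blocks.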
@@ -604,7 +606,12 @@ def get_sw_clippped_blocks(self, block_ids: BlockIds) -> BlockIds: ) # For non-SWA groups, blocks_per_sw is 0 so we return all block_ids unchanged return tuple( - [blocks[-self.blocks_per_sw[i] :] for i, blocks in enumerate(block_ids)] + [ + blocks[-self.blocks_per_sw[i] :] + if self.blocks_per_sw[i] > 0 + else blocks + for i, blocks in enumerate(block_ids) + ] ) def set_xfer_handshake_metadata( @@ -760,7 +767,7 @@ def update_state_after_alloc( if num_external_tokens > 0 else () ) - local_block_ids = self.get_sw_clippped_blocks( + local_block_ids = self.get_sw_clipped_blocks( unhashed_local_block_ids ) @@ -806,7 +813,7 @@ def build_connector_meta( req = req_to_save assert req.kv_transfer_params is not None - clipped_block_id_groups = self.get_sw_clippped_blocks(new_block_id_groups) + clipped_block_id_groups = self.get_sw_clipped_blocks(new_block_id_groups) meta.add_new_req_to_save( request_id=req_id, local_block_ids=clipped_block_id_groups, @@ -899,7 +906,7 @@ def request_finished( # trimming down after allocating for the whole sequence length. Empty # blocks are always at the start of the list. # Here we "unpad" blocks to send the actual remote blocks to be read. - block_ids = self.get_sw_clippped_blocks(block_ids) + block_ids = self.get_sw_clipped_blocks(block_ids) return delay_free_blocks, dict( do_remote_prefill=True, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 1cb6a3296cc9..b9fbd7427b18 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -2105,6 +2105,8 @@ def _update_requests_with_invalid_blocks( # Sync loading. num_computed_tokens includes new tokens req_num_computed_tokens = request.num_cached_tokens + # Unravel blocks across groups for HMA, as any block may have failed. When + # one block is detected as invalid, we reset request state and recompute. all_req_block_ids = ( (block_id for group in req_block_ids for block_id in group) if is_hma @@ -2114,7 +2116,8 @@ def _update_requests_with_invalid_blocks( req_num_computed_tokens + self.block_size - 1 ) // self.block_size for idx, block_id in enumerate(all_req_block_ids): - if idx >= req_num_computed_blocks: + if not is_hma and idx >= req_num_computed_blocks: + # When HMA is off, we can easily consider actually computed blocks break if block_id not in invalid_block_ids: continue @@ -2150,7 +2153,8 @@ def _update_requests_with_invalid_blocks( for group in req_block_ids: blocks_to_evict.update(group) else: - # Truncate the computed tokens at the first failed block + # Truncate the computed tokens at the first failed block. 
When HMA + # is off, the implicit assumption is that the only group is FA request.num_computed_tokens = idx * self.block_size num_affected_tokens = ( req_num_computed_tokens - request.num_computed_tokens From 33bb65ea7d3eb742826ef5d9e9f81697fcf51d1e Mon Sep 17 00:00:00 2001 From: NickLucche Date: Fri, 20 Feb 2026 08:39:04 +0000 Subject: [PATCH 17/28] account for window across blocks Signed-off-by: NickLucche --- .../v1/kv_connector/unit/test_nixl_connector.py | 17 +++++++++++++---- .../kv_connector/v1/nixl_connector.py | 5 ++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 5c42f0c29034..656d879dbeda 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -415,9 +415,15 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): REMOTE_ENGINE_ID = "remote_engine" def __init__( - self, *args, hand_shake_latency: float = 1.8, kv_cache_layout="HND", **kwargs + self, + *args, + hand_shake_latency: float = 1.8, + kv_cache_layout="HND", + kv_cache_config=None, + **kwargs, ): - kv_cache_config = make_kv_cache_config(block_size=16) + if kv_cache_config is None: + kv_cache_config = make_kv_cache_config(block_size=16) super().__init__(*args, kv_cache_config=kv_cache_config, **kwargs) self._hand_shake_latency = hand_shake_latency self.kv_cache_layout = kv_cache_layout @@ -1919,7 +1925,7 @@ def check_xfer_state(self, handle: int) -> str: ("transfer_exception", {"fail_transfer_exception": True}, True), ], ) -@pytest.mark.parametrize("enable_hma", [False]) +@pytest.mark.parametrize("enable_hma", [False, True]) def test_transfer_failure_logging( default_vllm_config, dist_init, @@ -1949,7 +1955,10 @@ def test_transfer_failure_logging( make_kv_cache_config(block_size=16, hma_enabled=enable_hma), ) connector.connector_worker = FakeNixlConnectorWorker( - vllm_config, connector.engine_id, hand_shake_latency=0.0 + vllm_config, + connector.engine_id, + hand_shake_latency=0.0, + kv_cache_config=connector._kv_cache_config, ) # Configure FailingNixlWrapper to fail in the specified way diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 2467abd82f25..aeec11bb0e41 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -577,8 +577,11 @@ def __init__( else (0, self.block_size) for g in kv_cache_config.kv_cache_groups ] + # cdiv(n_tokens, block_size) gives blocks/window; add 1 to conservatively + # account for boundary overlap eg window isn't fully aligned with blocks. 
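        # Worked example (illustrative, assumed alignment): with sliding_window=2048
        # and block_size=16, cdiv(2048, 16) = 128 blocks suffice when the window
        # starts on a block boundary, but a window covering tokens [7, 2055) touches
        # blocks 0..128, i.e. 129 blocks, hence the extra +1. Groups without a
        # sliding window (n_tokens == 0) keep a size of 0, meaning "never clip".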
self.blocks_per_sw = [ - cdiv(n_tokens, block_size) for n_tokens, block_size in sw_sizes_tokens + cdiv(n_tokens, block_size) + 1 if n_tokens else 0 + for n_tokens, block_size in sw_sizes_tokens ] def shutdown(self): From b6870bce2de951838f7f07fad168ddbb8fc52e2c Mon Sep 17 00:00:00 2001 From: NickLucche Date: Fri, 27 Feb 2026 10:53:03 +0000 Subject: [PATCH 18/28] revert all sched changes Signed-off-by: NickLucche --- vllm/v1/core/sched/scheduler.py | 70 ++++++++------------------------- 1 file changed, 17 insertions(+), 53 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index b9fbd7427b18..bf397ad681ca 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -19,9 +19,6 @@ from vllm.distributed.ec_transfer.ec_connector.factory import ECConnectorFactory from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory -from vllm.distributed.kv_transfer.kv_connector.utils import ( - get_full_attention_group_idx, -) from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, KVConnectorRole, @@ -119,7 +116,6 @@ def __init__( self.connector = None self.connector_prefix_cache_stats: PrefixCacheStats | None = None self.recompute_kv_load_failures = True - self._full_attention_group_idx = 0 if self.vllm_config.kv_transfer_config is not None: assert not self.is_encoder_decoder, ( "Encoder-decoder models are not currently supported with KV connectors" @@ -135,9 +131,6 @@ def __init__( self.vllm_config.kv_transfer_config.kv_load_failure_policy ) self.recompute_kv_load_failures = kv_load_failure_policy == "recompute" - self._full_attention_group_idx = get_full_attention_group_idx( - kv_cache_config - ) self.kv_event_publisher = EventPublisherFactory.create( self.kv_events_config, @@ -1997,13 +1990,9 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: self.failed_recving_kv_req_ids.remove(request.request_id) else: # Now that the blocks are ready, actually cache them. - block_ids = self.kv_cache_manager.get_block_ids(request.request_id) - # When connector does not support HMA, a single group is present here - num_computed_tokens = ( - len(block_ids[self._full_attention_group_idx]) * self.block_size - ) - # Get number of blocks on full attention layer, we can retrieve at most - # this many tokens + (block_ids,) = self.kv_cache_manager.get_block_ids(request.request_id) + num_computed_tokens = len(block_ids) * self.block_size + # Handle the case where num request tokens less than one block. 
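            # Illustrative walk-through (assumed numbers, not from the patch): with
            # block_size=16 and a 10-token prompt covered by one remote block we
            # would count 16 "computed" tokens, clamped to the 10 real prompt tokens
            # below; and when every prompt token is covered, one token is stepped
            # back so the decoder still has at least one token to run a forward pass
            # on and sample its first output:
            #   num_computed_tokens = min(16, 10)   # -> 10
            #   num_computed_tokens -= 1            # -> 9, since 10 == request.num_tokens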
num_computed_tokens = min(num_computed_tokens, request.num_tokens) if num_computed_tokens == request.num_tokens: num_computed_tokens -= 1 @@ -2086,11 +2075,8 @@ def _update_requests_with_invalid_blocks( is_affected = False marked_invalid_block = False req_id = request.request_id - req_block_ids = self.kv_cache_manager.get_block_ids(req_id) - is_hma = len(req_block_ids) > 1 - # Assume FA group is present to infer number of computed tokens - fa_blocks = req_block_ids[self._full_attention_group_idx] - max_num_blocks = len(fa_blocks) + # TODO (davidb): add support for hybrid memory allocator + (req_block_ids,) = self.kv_cache_manager.get_block_ids(req_id) # We iterate only over blocks that may contain externally computed # tokens if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: @@ -2099,26 +2085,16 @@ def _update_requests_with_invalid_blocks( req_num_computed_tokens = ( request.num_computed_tokens if req_id in self.failed_recving_kv_req_ids - else max_num_blocks * self.block_size + else len(req_block_ids) * self.block_size ) else: # Sync loading. num_computed_tokens includes new tokens req_num_computed_tokens = request.num_cached_tokens - # Unravel blocks across groups for HMA, as any block may have failed. When - # one block is detected as invalid, we reset request state and recompute. - all_req_block_ids = ( - (block_id for group in req_block_ids for block_id in group) - if is_hma - else req_block_ids[0] - ) req_num_computed_blocks = ( req_num_computed_tokens + self.block_size - 1 ) // self.block_size - for idx, block_id in enumerate(all_req_block_ids): - if not is_hma and idx >= req_num_computed_blocks: - # When HMA is off, we can easily consider actually computed blocks - break + for idx, block_id in zip(range(req_num_computed_blocks), req_block_ids): if block_id not in invalid_block_ids: continue @@ -2141,28 +2117,16 @@ def _update_requests_with_invalid_blocks( continue marked_invalid_block = True - if is_hma: - # TODO (NickLucche) HMA: Partial recovery is not supported because - # SW blocks only cover a suffix of the original sequence. - # After truncation, the sliding window shifts and may require - # blocks that were never transferred. Evict all and restart fresh. - total_affected_tokens += req_num_computed_tokens - request.num_computed_tokens = 0 - request.num_external_computed_tokens = 0 - if evict_blocks: - for group in req_block_ids: - blocks_to_evict.update(group) - else: - # Truncate the computed tokens at the first failed block. 
When HMA - # is off, the implicit assumption is that the only group is FA - request.num_computed_tokens = idx * self.block_size - num_affected_tokens = ( - req_num_computed_tokens - request.num_computed_tokens - ) - total_affected_tokens += num_affected_tokens - request.num_external_computed_tokens -= num_affected_tokens - if evict_blocks: - blocks_to_evict.update(fa_blocks[idx:]) + # Truncate the computed tokens at the first failed block + request.num_computed_tokens = idx * self.block_size + num_affected_tokens = ( + req_num_computed_tokens - request.num_computed_tokens + ) + total_affected_tokens += num_affected_tokens + request.num_external_computed_tokens -= num_affected_tokens + # collect invalid block and all downstream dependent blocks + if evict_blocks: + blocks_to_evict.update(req_block_ids[idx:]) if is_affected: if not marked_invalid_block: From 036af11115e7f5afe507d07253e5f5733404b5b9 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 2 Mar 2026 15:19:09 +0000 Subject: [PATCH 19/28] disable failure recovery Signed-off-by: NickLucche --- .../kv_connector/v1/nixl_connector.py | 49 ++++++++++++------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index aeec11bb0e41..8930ccce8c07 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -59,7 +59,7 @@ from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata from vllm.v1.attention.backends.utils import get_kv_cache_layout from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.kv_cache_interface import MambaSpec, SlidingWindowSpec +from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, SlidingWindowSpec from vllm.v1.worker.block_table import BlockTable if TYPE_CHECKING: @@ -544,12 +544,17 @@ def __init__( self.use_host_buffer = ( vllm_config.kv_transfer_config.kv_buffer_device == "cpu" ) - self._is_hma_enabled = ( + self._is_hma_required = ( not vllm_config.scheduler_config.disable_hybrid_kv_cache_manager + # Also handle unlikely SW-only model case instead of checking num_groups>1. + and any( + not isinstance(g.kv_cache_spec, FullAttentionSpec) + for g in kv_cache_config.kv_cache_groups + ) ) logger.info("Initializing NIXL Scheduler %s", engine_id) - if self._is_hma_enabled: + if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager: logger.info("Hybrid Memory Allocator is enabled with NIXL") # Background thread for handling new handshake requests. @@ -598,8 +603,8 @@ def get_sw_clipped_blocks(self, block_ids: BlockIds) -> BlockIds: the entire sequence length, and successively cleans up blocks that are outside the window prior to the `request_finished_all_groups` hook. """ - if len(block_ids) == 0 or not self._is_hma_enabled: - # No blocks to clip eg Full prefix cache hit + if len(block_ids) == 0 or not self._is_hma_required: + # No blocks to clip eg Full prefix cache hit or not a hybrid model. 
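        # Illustrative cases (assumed values) for the early return below:
        #   block_ids == ()            -> full prefix cache hit, nothing to clip
        #   _is_hma_required == False  -> single full-attention group, so e.g.
        #       get_sw_clipped_blocks(([1, 2, 3],)) returns ([1, 2, 3],) unchanged.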
return block_ids # NOTE (NickLucche) This logic is currently handled at the connector level # because offloading connectors might want to receive the whole sequence even @@ -946,8 +951,12 @@ def __init__( self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config( "backends", ["UCX"] ) - self._is_hma_enabled = ( + self._is_hma_required = ( not vllm_config.scheduler_config.disable_hybrid_kv_cache_manager + and any( + not isinstance(g.kv_cache_spec, FullAttentionSpec) + for g in kv_cache_config.kv_cache_groups + ) ) self.kv_cache_config = kv_cache_config @@ -1378,9 +1387,10 @@ def request_ready(f: Future[Any], entry=(req_id, meta)): error=e, meta=meta, ) - if req_meta := self._recving_metadata.get(req_id): - for group_block_ids in req_meta.local_block_ids: - self._invalid_block_ids.update(group_block_ids) + if ( + req_meta := self._recving_metadata.get(req_id) + ) and not self._is_hma_required: + self._invalid_block_ids.update(req_meta.local_block_ids[0]) self._failed_recv_reqs.add(req_id) fut.add_done_callback(request_ready) @@ -1827,7 +1837,7 @@ def _validate_remote_agent_handshake( # Num kv_heads > tp_size and P TP > D TP case, not supported assert not (tp_ratio < 0 and self.kv_topo.is_kv_replicated(remote_engine_id)) - if self._is_hma_enabled: + if self._is_hma_required: assert block_size_ratio == 1, ( "HMA does not support different remote block size yet" ) @@ -1846,7 +1856,7 @@ def _validate_remote_agent_handshake( "Remote is HND and local is NHD, enabled additional permute " "on local device KV." ) - assert not self._is_hma_enabled, ( + assert not self._is_hma_required, ( "HMA does not support block size post processing" ) self.enable_permute_local_kv = True @@ -2044,7 +2054,7 @@ def get_finished(self) -> tuple[set[str], set[str]]: if not self.use_mla and ( block_size_ratio > 1 or self.enable_permute_local_kv ): - assert not self._is_hma_enabled + assert not self._is_hma_required block_ids_for_blocksize_post_process[block_size_ratio].append( meta.local_physical_block_ids[0] ) @@ -2178,9 +2188,9 @@ def _handle_failed_transfer(self, req_id: str, handle: int): handle: The transfer handle. """ # Use .get() here as the metadata cleanup is handled by get_finished() - if meta := self._recving_metadata.get(req_id): - for group_block_ids in meta.local_block_ids: - self._invalid_block_ids.update(group_block_ids) + # TODO (NickLucche) handle failed transfer for HMA. + if (meta := self._recving_metadata.get(req_id)) and not self._is_hma_required: + self._invalid_block_ids.update(meta.local_block_ids) self.nixl_wrapper.release_xfer_handle(handle) self.xfer_stats.record_failed_transfer() @@ -2320,7 +2330,7 @@ def _read_blocks( block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id) if block_size_ratio > 1: # TODO (NickLucche) assume HMA is off. Change to handle multiple KV groups. 
- assert not self._is_hma_enabled + assert not self._is_hma_required local_block_ids0 = local_block_ids[0] if local_block_ids else [] remote_block_ids0 = remote_block_ids[0] local_block_ids_mapped = self.get_mapped_blocks( @@ -2435,9 +2445,10 @@ def _read_blocks( dst_engine_id=dst_engine_id, remote_rank=remote_rank, ) - if meta := self._recving_metadata.get(request_id): - for group_block_ids in meta.local_block_ids: - self._invalid_block_ids.update(group_block_ids) + if ( + meta := self._recving_metadata.get(request_id) + ) and not self._is_hma_required: + self._invalid_block_ids.update(meta.local_block_ids) self.xfer_stats.record_failed_transfer() if handle is not None: self.nixl_wrapper.release_xfer_handle(handle) From a1ddbf63615805ec5f221bea5fb71ee4aac0b231 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 2 Mar 2026 15:25:31 +0000 Subject: [PATCH 20/28] fix Signed-off-by: NickLucche --- .../distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 8930ccce8c07..0239816ab953 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -2190,7 +2190,7 @@ def _handle_failed_transfer(self, req_id: str, handle: int): # Use .get() here as the metadata cleanup is handled by get_finished() # TODO (NickLucche) handle failed transfer for HMA. if (meta := self._recving_metadata.get(req_id)) and not self._is_hma_required: - self._invalid_block_ids.update(meta.local_block_ids) + self._invalid_block_ids.update(meta.local_block_ids[0]) self.nixl_wrapper.release_xfer_handle(handle) self.xfer_stats.record_failed_transfer() @@ -2448,7 +2448,7 @@ def _read_blocks( if ( meta := self._recving_metadata.get(request_id) ) and not self._is_hma_required: - self._invalid_block_ids.update(meta.local_block_ids) + self._invalid_block_ids.update(meta.local_block_ids[0]) self.xfer_stats.record_failed_transfer() if handle is not None: self.nixl_wrapper.release_xfer_handle(handle) From 9d08e751d7884b4426c04846a1af1896033ae782 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 2 Mar 2026 15:32:18 +0000 Subject: [PATCH 21/28] missing sched changes Signed-off-by: NickLucche --- vllm/v1/core/sched/scheduler.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index bf397ad681ca..4ed1bc79742b 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -19,6 +19,7 @@ from vllm.distributed.ec_transfer.ec_connector.factory import ECConnectorFactory from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory +from vllm.distributed.kv_transfer.kv_connector.utils import get_full_attention_group_idx from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, KVConnectorRole, @@ -116,6 +117,7 @@ def __init__( self.connector = None self.connector_prefix_cache_stats: PrefixCacheStats | None = None self.recompute_kv_load_failures = True + self._full_attention_group_idx = 0 if self.vllm_config.kv_transfer_config is not None: assert not self.is_encoder_decoder, ( "Encoder-decoder models are not currently supported with KV connectors" @@ -131,6 +133,9 @@ def __init__( 
self.vllm_config.kv_transfer_config.kv_load_failure_policy ) self.recompute_kv_load_failures = kv_load_failure_policy == "recompute" + self._full_attention_group_idx = get_full_attention_group_idx( + kv_cache_config + ) self.kv_event_publisher = EventPublisherFactory.create( self.kv_events_config, @@ -1990,8 +1995,11 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: self.failed_recving_kv_req_ids.remove(request.request_id) else: # Now that the blocks are ready, actually cache them. - (block_ids,) = self.kv_cache_manager.get_block_ids(request.request_id) - num_computed_tokens = len(block_ids) * self.block_size + block_ids = self.kv_cache_manager.get_block_ids(request.request_id) + # When connector does not support HMA, a single group is present here + num_computed_tokens = ( + len(block_ids[self._full_attention_group_idx]) * self.block_size + ) # Handle the case where num request tokens less than one block. num_computed_tokens = min(num_computed_tokens, request.num_tokens) if num_computed_tokens == request.num_tokens: From 380d543d869075ebedd9deeae5d6b3cefad6f906 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 2 Mar 2026 15:36:35 +0000 Subject: [PATCH 22/28] rebase cruft Signed-off-by: NickLucche --- .../kv_connector/nixl_integration/run_accuracy_test.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index 81ee303e8339..e37522a7ccd2 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -170,13 +170,8 @@ run_tests_for_model() { if [[ -n "$ENABLE_HMA_VAR" ]]; then BASE_CMD="${BASE_CMD} $ENABLE_HMA_VAR" fi - - if [ -n "$model_args" ]; then - FULL_CMD="$BASE_CMD $model_args" - else - FULL_CMD="$BASE_CMD" - fi - + + FULL_CMD="$BASE_CMD" eval "$FULL_CMD &" # Store host and port for proxy configuration From b29597b1195eb73bad98188f78ac26dcc56858e6 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 2 Mar 2026 15:48:41 +0000 Subject: [PATCH 23/28] revert invalid block changes Signed-off-by: NickLucche --- .../unit/test_invalid_blocks_correctness.py | 135 ------------------ 1 file changed, 135 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py b/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py index 581fb0bf18a4..6cb2d3ea4d97 100644 --- a/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py +++ b/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py @@ -24,7 +24,6 @@ create_request, create_scheduler, create_vllm_config, - make_kv_cache_config, ) pytestmark = pytest.mark.cpu_test @@ -57,18 +56,6 @@ def recompute_scheduler(): return create_scheduler(vllm_config) -@pytest.fixture -def hma_recompute_scheduler(): - """scheduler with HMA enabled and kv_load_failure_policy='recompute'""" - vllm_config = create_vllm_config() - vllm_config.kv_transfer_config.kv_load_failure_policy = "recompute" - # Create scheduler with HMA (FA + SW groups) - kv_cache_config = make_kv_cache_config( - block_size=16, hma_enabled=True, num_blocks=10000 - ) - return create_scheduler(vllm_config, kv_cache_config=kv_cache_config) - - def test_sync_recompute_blocks_not_freed_for_running_requests( recompute_scheduler: Scheduler, ): @@ -491,125 +478,3 @@ def cache_blocks_spy(req, num_tokens): # request should be in the running queue assert request in recompute_scheduler.running - - -def 
test_hma_sync_recompute_evicts_all_blocks( - hma_recompute_scheduler: Scheduler, -): - """ - Test HMA sync recompute case - all blocks must be evicted on any failure. - - With HMA, sliding window (SW) blocks only cover a suffix of the sequence. - After truncating num_computed_tokens, the sliding window shifts and may - require blocks that were never transferred. - - Therefore, when any block fails we evict ALL blocks and reset - num_computed_tokens to 0. - - This test verifies: - 1. All blocks (FA and SW) are evicted on partial failure - 2. num_computed_tokens is reset to 0 (not truncated to failure point) - 3. Request can be rescheduled for full recomputation - """ - num_prompt_blocks = 100 - num_external_computed_blocks = 99 - # Fail a block in the middle - in non-HMA this would allow partial recovery - invalid_block_idx = 50 - - num_prompt_tokens = num_prompt_blocks * hma_recompute_scheduler.block_size - num_external_computed_tokens = ( - num_external_computed_blocks * hma_recompute_scheduler.block_size - ) - - request = create_request(num_tokens=num_prompt_tokens) - hma_recompute_scheduler.add_request(request=request) - - req_num_new_matched_tokens = { - request.request_id: num_external_computed_tokens, - } - - # mock connector indicating sync load - hma_recompute_scheduler.connector = Mock() - hma_recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = ( - _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) - ) - hma_recompute_scheduler.connector.request_finished.return_value = (False, None) - hma_recompute_scheduler.connector.take_events.return_value = () - - scheduler_output = hma_recompute_scheduler.schedule() - - # request should be running with sync KV load - assert len(hma_recompute_scheduler.running) == 1 - assert len(scheduler_output.scheduled_new_reqs) == 1 - assert request.status == RequestStatus.RUNNING - - # verify we have multiple KV cache groups (HMA is enabled) - req_block_ids = hma_recompute_scheduler.kv_cache_manager.get_block_ids( - request.request_id - ) - assert len(req_block_ids) == 2 - - # get the FA block IDs - fa_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] - invalid_block_ids = {fa_block_ids[invalid_block_idx]} - - model_runner_output = create_model_runner_output( - [request], - invalid_block_ids=invalid_block_ids, - use_eos=False, # not finished - should continue running - ) - - hma_recompute_scheduler.update_from_output(scheduler_output, model_runner_output) - - # Critical assertions for HMA recompute case: - - # 1. request should still be RUNNING (for recomputation) - assert request.status == RequestStatus.RUNNING, ( - f"Request should remain RUNNING for recompute, got {request.status}" - ) - - # 2. num_computed_tokens should be reset to 0 (not truncated to failure point) - # This is the key difference from non-HMA: we can't do partial recovery - assert request.num_computed_tokens == 0, ( - f"HMA: num_computed_tokens should be reset to 0, " - f"got {request.num_computed_tokens}. " - f"Partial recovery is not supported with sliding window layers." - ) - - # 3. num_external_computed_tokens should also be reset to 0 - assert request.num_external_computed_tokens == 0, ( - f"HMA: num_external_computed_tokens should be reset to 0, " - f"got {request.num_external_computed_tokens}" - ) - - # 4. 
verify blocks are still allocated (not freed yet, just marked for eviction) - # The actual eviction happens after update_from_output - allocated_blocks = hma_recompute_scheduler.kv_cache_manager.get_block_ids( - request.request_id - ) - assert allocated_blocks is not None - - # 5. request should still be in running queue - assert request in hma_recompute_scheduler.running, ( - "Request should remain in running queue for recomputation" - ) - - # 6. request should still be in scheduler.requests - assert request.request_id in hma_recompute_scheduler.requests, ( - "Request should not be deleted from scheduler.requests" - ) - - # 7. verify request can be rescheduled for full recomputation - scheduler_output_2 = hma_recompute_scheduler.schedule() - - # request should be reschedulable - scheduled_req_ids = [ - req.request_id for req in scheduler_output_2.scheduled_new_reqs - ] - if scheduler_output_2.num_scheduled_tokens: - scheduled_req_ids.extend(scheduler_output_2.num_scheduled_tokens.keys()) - - assert ( - request.request_id in scheduled_req_ids - or len(hma_recompute_scheduler.running) > 0 - ), "Request should be reschedulable for full recomputation" From de7a452b2e09151ddef060da649273e7b340c08a Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 2 Mar 2026 16:10:26 +0000 Subject: [PATCH 24/28] update tests Signed-off-by: NickLucche --- tests/v1/kv_connector/unit/test_nixl_connector_hma.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py index 8e444df65fca..636d51402bde 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py @@ -24,7 +24,7 @@ "hma_enabled,expected_sw_sizes", [ # HMA enabled: FullAttentionSpec (0) + SlidingWindowSpec (2048/16=128) - (True, [0, 128]), + (True, [0, 128 + 1]), # HMA disabled: only FullAttentionSpec (0) (False, [0]), ], @@ -138,7 +138,8 @@ def run_hma_test(llm: LLM): outputs = llm.generate(["hi" * 1401], sampling_params) kv_params = outputs[0].kv_transfer_params - expected_num_remote_blocks = sw_size // block_size + # +1 to account for overlapping window across blocks. 
+ expected_num_remote_blocks = sw_size // block_size + 1 remote_block_ids = kv_params["remote_block_ids"] assert ( len(remote_block_ids[0]) From 72a709d01f968ff66b2a7e0a5b3d7366741e8121 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Thu, 5 Mar 2026 13:16:41 +0000 Subject: [PATCH 25/28] revert sched changes Signed-off-by: NickLucche --- vllm/v1/core/sched/scheduler.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4ed1bc79742b..bf397ad681ca 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -19,7 +19,6 @@ from vllm.distributed.ec_transfer.ec_connector.factory import ECConnectorFactory from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory -from vllm.distributed.kv_transfer.kv_connector.utils import get_full_attention_group_idx from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, KVConnectorRole, @@ -117,7 +116,6 @@ def __init__( self.connector = None self.connector_prefix_cache_stats: PrefixCacheStats | None = None self.recompute_kv_load_failures = True - self._full_attention_group_idx = 0 if self.vllm_config.kv_transfer_config is not None: assert not self.is_encoder_decoder, ( "Encoder-decoder models are not currently supported with KV connectors" @@ -133,9 +131,6 @@ def __init__( self.vllm_config.kv_transfer_config.kv_load_failure_policy ) self.recompute_kv_load_failures = kv_load_failure_policy == "recompute" - self._full_attention_group_idx = get_full_attention_group_idx( - kv_cache_config - ) self.kv_event_publisher = EventPublisherFactory.create( self.kv_events_config, @@ -1995,11 +1990,8 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: self.failed_recving_kv_req_ids.remove(request.request_id) else: # Now that the blocks are ready, actually cache them. - block_ids = self.kv_cache_manager.get_block_ids(request.request_id) - # When connector does not support HMA, a single group is present here - num_computed_tokens = ( - len(block_ids[self._full_attention_group_idx]) * self.block_size - ) + (block_ids,) = self.kv_cache_manager.get_block_ids(request.request_id) + num_computed_tokens = len(block_ids) * self.block_size # Handle the case where num request tokens less than one block. num_computed_tokens = min(num_computed_tokens, request.num_tokens) if num_computed_tokens == request.num_tokens: From dde50e04ccad775717a9975f0522de6e8a1ee0ae Mon Sep 17 00:00:00 2001 From: NickLucche Date: Thu, 5 Mar 2026 14:12:08 +0000 Subject: [PATCH 26/28] precommit Signed-off-by: NickLucche --- tests/v1/kv_connector/unit/test_nixl_connector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 656d879dbeda..d97afb1aaa12 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1969,6 +1969,7 @@ def test_transfer_failure_logging( # For notification_failed, we need empty local blocks # (full cache hit path to trigger send_notif) + local_blocks: tuple[()] | tuple[list[int], ...] 
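    # Illustrative shapes (assumed values): block ids are grouped per KV cache
    # group, one list per group, e.g.
    #   local_blocks = ([1, 2, 3], [7, 8])   # HMA: full-attention + sliding-window groups
    #   local_blocks = ([1, 2, 3],)          # single full-attention group
    #   local_blocks = ()                    # full prefix cache hit, nothing to pull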
if enable_hma: # HMA enabled: multiple groups (FA + SW) local_blocks = ( From 70f929eaad76a622b8788f6c27cbd6bee7e82ea0 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Thu, 5 Mar 2026 15:28:52 +0000 Subject: [PATCH 27/28] cruft Signed-off-by: NickLucche --- .../unit/test_remote_decode_lifecycle.py | 4 ++- .../kv_transfer/kv_connector/utils.py | 32 ------------------- 2 files changed, 3 insertions(+), 33 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index b2ec2ddfb64d..b656e0809543 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -208,7 +208,9 @@ def test_prefix_cache_lifecycle(): # Ensure we send all block ids, including the partial blocks, # even if there is a cache hit. - assert len(kv_transfer_params["remote_block_ids"]) == (NUM_EXTERNAL_FULL_BLOCKS + 1) + # remote_block_ids is BlockIds (tuple of lists); sum block counts across groups. + num_remote_blocks = sum(len(g) for g in kv_transfer_params["remote_block_ids"]) + assert num_remote_blocks == (NUM_EXTERNAL_FULL_BLOCKS + 1) # STEP (2): Ensure it is freed. scheduler_output = scheduler.schedule() diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 138f8826fc45..eb93ea324836 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -20,7 +20,6 @@ if TYPE_CHECKING: from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase - from vllm.v1.kv_cache_interface import KVCacheConfig logger = init_logger(__name__) @@ -305,37 +304,6 @@ def yield_req_data( ) -def get_full_attention_group_idx( - kv_cache_config: "KVCacheConfig", -) -> int: - """ - Get the index of the full attention KV cache group from KVCacheConfig. 
- - Args: - kv_cache_config: The KV cache configuration - - Returns: - The index of the full attention group - - Raises: - AssertionError: If no full attention group is found - """ - from vllm.v1.kv_cache_interface import FullAttentionSpec - - fa_group_idx = next( - ( - i - for i, group in enumerate(kv_cache_config.kv_cache_groups) - if isinstance(group.kv_cache_spec, FullAttentionSpec) - ), - None, - ) - assert fa_group_idx is not None, ( - "No full attention KV cache group found in kv_cache_config" - ) - return fa_group_idx - - @dataclass class TpKVTopology: """ From f9c31f318b49c654b465f8fe941812bd25ddbc41 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Thu, 5 Mar 2026 17:50:39 +0000 Subject: [PATCH 28/28] max model len gemma Signed-off-by: NickLucche --- .../config_sweep_accuracy_test.sh | 2 +- .../nixl_integration/run_accuracy_test.sh | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh index 7e232e2fe490..c35f4bfe8890 100755 --- a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh @@ -12,7 +12,7 @@ tp_configs=( "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" - "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it" # SW model + "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192" # SW model ) dp_ep_configs=( "DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index e37522a7ccd2..fe95249602a8 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -40,6 +40,9 @@ fi if [[ -n "$ENABLE_HMA_VAR" ]]; then echo "HMA (Hybrid KV Cache Manager) enabled" fi +if [[ -n "$VLLM_SERVE_EXTRA_ARGS" ]]; then + echo "vLLM serve extra args: $VLLM_SERVE_EXTRA_ARGS" +fi DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then @@ -79,6 +82,8 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2} PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-128} DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-128} +# Comma-separated extra args for vllm serve (e.g. 
--max-model-len,2048) +VLLM_SERVE_EXTRA_ARGS=${VLLM_SERVE_EXTRA_ARGS:-} # Find the git repository root directory GIT_ROOT=$(git rev-parse --show-toplevel) @@ -160,6 +165,12 @@ run_tests_for_model() { --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --tensor-parallel-size $PREFILLER_TP_SIZE \ --kv-transfer-config '$KV_CONFIG'" + if [[ -n "$VLLM_SERVE_EXTRA_ARGS" ]]; then + IFS=',' read -r -a extra_args <<< "$VLLM_SERVE_EXTRA_ARGS" + for arg in "${extra_args[@]}"; do + BASE_CMD="${BASE_CMD} $arg" + done + fi # Add attention backend config if specified if [[ -n "$ATTENTION_BACKEND" ]]; then @@ -206,6 +217,12 @@ run_tests_for_model() { --block-size ${DECODE_BLOCK_SIZE} \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --kv-transfer-config '$KV_CONFIG'" + if [[ -n "$VLLM_SERVE_EXTRA_ARGS" ]]; then + IFS=',' read -r -a extra_args <<< "$VLLM_SERVE_EXTRA_ARGS" + for arg in "${extra_args[@]}"; do + BASE_CMD="${BASE_CMD} $arg" + done + fi # Add attention backend config if specified if [[ -n "$ATTENTION_BACKEND" ]]; then