2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions lib/bindings/kvbm/Cargo.lock

Some generated files are not rendered by default.

11 changes: 11 additions & 0 deletions lib/bindings/kvbm/python/kvbm/v2/__init__.py
@@ -21,6 +21,11 @@
    SchedulerOutput = _v2.SchedulerOutput
    Tensor = _v2.Tensor

    # Scheduler classes (thin wrappers around the real Rust scheduler)
    RustScheduler = _v2.RustScheduler
    SchedulerConfig = _v2.SchedulerConfig
    RequestStatus = _v2.RequestStatus

    _V2_CORE_AVAILABLE = True
except ImportError:
    # Provide stubs when the v2 feature is not compiled
@@ -36,6 +41,9 @@ def is_available() -> bool:
    KvbmRequest = _make_feature_stub("KvbmRequest", "v2")
    SchedulerOutput = _make_feature_stub("SchedulerOutput", "v2")
    Tensor = _make_feature_stub("Tensor", "v2")
    RustScheduler = _make_feature_stub("RustScheduler", "v2")
    SchedulerConfig = _make_feature_stub("SchedulerConfig", "v2")
    RequestStatus = _make_feature_stub("RequestStatus", "v2")
    _V2_CORE_AVAILABLE = False

__all__ = [
@@ -47,5 +55,8 @@ def is_available() -> bool:
    "KvbmRequest",
    "SchedulerOutput",
    "Tensor",
    "RustScheduler",
    "SchedulerConfig",
    "RequestStatus",
    "_V2_CORE_AVAILABLE",
]
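
A minimal usage sketch of the exports above: gate construction of the Rust scheduler on `_V2_CORE_AVAILABLE`, since the stub classes raise when the v2 feature is not compiled in. Calling `SchedulerConfig()` with no arguments is an assumption for illustration, not something this diff establishes.

```python
# Hypothetical sketch: feature-gate use of the Rust scheduler bindings.
# SchedulerConfig() with no arguments is an assumption; check the actual
# Rust binding signature before relying on it.
from kvbm.v2 import _V2_CORE_AVAILABLE, RustScheduler, SchedulerConfig


def make_scheduler():
    if not _V2_CORE_AVAILABLE:
        # The imported names are feature stubs here, so fail early
        # with a clear message instead of erroring on first use.
        raise RuntimeError("kvbm was built without the v2 feature")
    return RustScheduler(SchedulerConfig())
```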
505 changes: 434 additions & 71 deletions lib/bindings/kvbm/python/kvbm/v2/vllm/schedulers/dynamo.py

Large diffs are not rendered by default.

93 changes: 93 additions & 0 deletions lib/bindings/kvbm/python/kvbm/v2/vllm/schedulers/output.py
@@ -0,0 +1,93 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Scheduler output implementations conforming to Protocol definitions.

These dataclasses implement the SchedulerOutputProtocol and related protocols,
allowing us to construct scheduler outputs from Rust scheduler results.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Dict, List, Set, Tuple


@dataclass
class RustNewRequestData:
    """
    Our implementation of NewRequestDataProtocol.

    Conforms to the same interface as vLLM's NewRequestData.
    """

    req_id: str
    prompt_token_ids: List[int] | None
    block_ids: Tuple[List[int], ...]
    num_computed_tokens: int
    # Fields we get from stored Request objects
    mm_features: List[Any] = field(default_factory=list)
    sampling_params: Any | None = None
    pooling_params: Any | None = None
    lora_request: Any | None = None
    prompt_embeds: Any | None = None


@dataclass
class RustCachedRequestData:
    """
    Our implementation of CachedRequestDataProtocol.

    Conforms to the same interface as vLLM's CachedRequestData.
    """

    req_ids: List[str] = field(default_factory=list)
    resumed_req_ids: Set[str] = field(default_factory=set)
    resumed_from_preemption: List[bool] = field(default_factory=list)
    new_token_ids: List[List[int]] = field(default_factory=list)
    all_token_ids: Dict[str, List[int]] = field(default_factory=dict)
    new_block_ids: List[Tuple[List[int], ...] | None] = field(default_factory=list)
    num_computed_tokens: List[int] = field(default_factory=list)
    num_output_tokens: List[int] = field(default_factory=list)

    @property
    def num_reqs(self) -> int:
        return len(self.req_ids)

    @classmethod
    def make_empty(cls) -> "RustCachedRequestData":
        """Create an empty cached request data."""
        return cls()


@dataclass
class RustSchedulerOutput:
    """
    Our implementation of SchedulerOutputProtocol.

    Conforms to the same interface as vLLM's SchedulerOutput.
    """

    scheduled_new_reqs: List[RustNewRequestData]
    scheduled_cached_reqs: RustCachedRequestData
    num_scheduled_tokens: Dict[str, int]
    total_num_scheduled_tokens: int
    scheduled_spec_decode_tokens: Dict[str, List[int]] = field(default_factory=dict)
    scheduled_encoder_inputs: Dict[str, List[int]] = field(default_factory=dict)
    num_common_prefix_blocks: List[int] = field(default_factory=list)
    finished_req_ids: Set[str] = field(default_factory=set)
    free_encoder_mm_hashes: List[str] = field(default_factory=list)
    pending_structured_output_tokens: bool = False
    kv_connector_metadata: Any | None = None
    ec_connector_metadata: Any | None = None

    @classmethod
    def make_empty(cls) -> "RustSchedulerOutput":
        """Create an empty scheduler output."""
        return cls(
            scheduled_new_reqs=[],
            scheduled_cached_reqs=RustCachedRequestData.make_empty(),
            num_scheduled_tokens={},
            total_num_scheduled_tokens=0,
        )
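
To illustrate how these dataclasses fit together, here is a minimal construction of a one-request scheduler output. The request id, token ids, and block ids are illustrative values, not taken from this diff.

```python
from kvbm.v2.vllm.schedulers.output import (
    RustCachedRequestData,
    RustNewRequestData,
    RustSchedulerOutput,
)

# A request scheduled for the first time: three prompt tokens mapped to
# two KV-cache blocks in a single KV-cache group.
new_req = RustNewRequestData(
    req_id="req-0",
    prompt_token_ids=[101, 102, 103],
    block_ids=([0, 1],),
    num_computed_tokens=0,
)

output = RustSchedulerOutput(
    scheduled_new_reqs=[new_req],
    scheduled_cached_reqs=RustCachedRequestData.make_empty(),
    num_scheduled_tokens={"req-0": 3},
    total_num_scheduled_tokens=3,
)
assert output.scheduled_cached_reqs.num_reqs == 0
```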
103 changes: 103 additions & 0 deletions lib/bindings/kvbm/python/kvbm/v2/vllm/schedulers/protocols.py
@@ -0,0 +1,103 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Protocol definitions for vLLM scheduler output types.

These Protocols define the interface that vLLM expects from scheduler outputs.
By defining them as Protocols, we can:
1. Use vLLM's dataclasses directly when convenient
2. Implement our own classes (Python or Rust PyO3) that conform to the same interface
3. See version differences as explicit Protocol changes

Based on vLLM v0.11+ SchedulerOutput structure.
"""

from __future__ import annotations

from typing import Any, Dict, List, Protocol, Set, Tuple, runtime_checkable


@runtime_checkable
class NewRequestDataProtocol(Protocol):
    """
    Protocol matching vLLM's NewRequestData.

    Represents a request being scheduled for the first time.
    The worker processes will cache this data.
    """

    req_id: str
    prompt_token_ids: List[int] | None
    mm_features: List[Any]  # List[MultiModalFeatureSpec]
    sampling_params: Any | None  # SamplingParams | None
    pooling_params: Any | None  # PoolingParams | None
    block_ids: Tuple[List[int], ...]
    num_computed_tokens: int
    lora_request: Any | None  # LoRARequest | None
    prompt_embeds: Any | None  # torch.Tensor | None


@runtime_checkable
class CachedRequestDataProtocol(Protocol):
    """
    Protocol matching vLLM's CachedRequestData.

    Represents requests that have been scheduled before.
    Only the diff is sent to minimize communication cost.
    """

    req_ids: List[str]
    # For request ids not in resumed_req_ids, new_block_ids will be appended.
    # For those in the set, new_block_ids replaces the existing block IDs.
    resumed_req_ids: Set[str]
    # Only used for pipeline parallelism; empty when PP is not used.
    new_token_ids: List[List[int]]
    # For requests not scheduled in the last step, propagate token ids.
    all_token_ids: Dict[str, List[int]]
    new_block_ids: List[Tuple[List[int], ...] | None]
    num_computed_tokens: List[int]
    num_output_tokens: List[int]

    @property
    def num_reqs(self) -> int:
        """Number of cached requests."""
        ...


@runtime_checkable
class SchedulerOutputProtocol(Protocol):
    """
    Protocol matching vLLM's SchedulerOutput.

    Contains all scheduling decisions for a single step.
    """

    # Requests being scheduled for the first time
    scheduled_new_reqs: List[NewRequestDataProtocol]
    # Requests that have been scheduled before (only the diff is sent)
    scheduled_cached_reqs: CachedRequestDataProtocol

    # req_id -> num_scheduled_tokens
    num_scheduled_tokens: Dict[str, int]
    # Total tokens scheduled (sum of num_scheduled_tokens.values())
    total_num_scheduled_tokens: int
    # req_id -> spec_token_ids (only for requests with spec decode tokens)
    scheduled_spec_decode_tokens: Dict[str, List[int]]
    # req_id -> encoder input indices to process
    scheduled_encoder_inputs: Dict[str, List[int]]
    # Common prefix blocks per KV cache group (for cascade attention)
    num_common_prefix_blocks: List[int]

    # Finished request IDs (to notify workers to free cached states)
    finished_req_ids: Set[str]
    # mm_hash strings for encoder outputs to free from cache
    free_encoder_mm_hashes: List[str]

    # Whether scheduled requests have all output tokens for grammar bitmask
    pending_structured_output_tokens: bool

    # KV Cache Connector metadata
    kv_connector_metadata: Any | None
    # EC Cache Connector metadata
    ec_connector_metadata: Any | None
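
Because the protocols are decorated with `@runtime_checkable`, conformance of the dataclasses above can be spot-checked with `isinstance`. Note that such checks are structural and only verify attribute presence, not attribute types. A small sketch:

```python
from kvbm.v2.vllm.schedulers.output import RustSchedulerOutput
from kvbm.v2.vllm.schedulers.protocols import (
    CachedRequestDataProtocol,
    SchedulerOutputProtocol,
)

# isinstance against a runtime_checkable Protocol checks that the
# required attributes and methods exist on the object; it does not
# validate their types or signatures.
output = RustSchedulerOutput.make_empty()
assert isinstance(output, SchedulerOutputProtocol)
assert isinstance(output.scheduled_cached_reqs, CachedRequestDataProtocol)
```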