Skip to content

Commit 5166a3d

Browse files
authored
feat: Router replicas with state-sharing (#2264)
1 parent 10f4302 commit 5166a3d

File tree

18 files changed

+1718
-464
lines changed

18 files changed

+1718
-464
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

components/frontend/src/dynamo/frontend/main.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,12 @@ def parse_args():
112112
help=" KV Router. Disable KV events.",
113113
)
114114
parser.set_defaults(use_kv_events=True)
115+
parser.add_argument(
116+
"--router-replica-sync",
117+
action="store_true",
118+
default=False,
119+
help="KV Router: Enable replica synchronization across multiple router instances. When true, routers will publish and subscribe to events to maintain consistent state.",
120+
)
115121
parser.add_argument(
116122
"--static-endpoint",
117123
type=validate_static_endpoint,
@@ -148,6 +154,7 @@ async def async_main():
148154
overlap_score_weight=flags.kv_overlap_score_weight,
149155
router_temperature=flags.router_temperature,
150156
use_kv_events=flags.use_kv_events,
157+
router_replica_sync=flags.router_replica_sync,
151158
)
152159
elif flags.router_mode == "random":
153160
router_mode = RouterMode.Random

components/metrics/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ use std::net::SocketAddr;
8484
use std::time::Duration as StdDuration;
8585

8686
use dynamo_llm::kv_router::protocols::{ForwardPassMetrics, LoadMetrics};
87-
use dynamo_llm::kv_router::scheduler::Endpoint;
87+
use dynamo_llm::kv_router::scoring::Endpoint;
8888
use dynamo_llm::kv_router::scoring::ProcessedEndpoints;
8989

9090
use dynamo_runtime::{

components/router/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ async fn app(runtime: Runtime) -> Result<()> {
6666

6767
let selector = Box::new(CustomWorkerSelector::default());
6868

69-
let router = KvRouter::new(component.clone(), args.block_size, Some(selector), true).await?;
69+
let router = KvRouter::new(component.clone(), args.block_size, Some(selector), None).await?;
7070
let router = Ingress::for_engine(Arc::new(router))?;
7171

7272
component

docs/architecture/kv_cache_routing.md

Lines changed: 57 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@ For performance testing, compare a typical workload with `--router-mode random|r
1717

1818
The KV-aware routing arguments:
1919

20-
- `--kv-overlap-score-weight`: Sets the amount of weighting on overlaps with prefix caches, which directly contributes to the prefill cost. A large weight is expected to yield a better TTFT (at the expense of worse ITL). When set to 0, prefix caches are not considered at all (falling back to pure load balancing behavior on the active blocks).
20+
- `--kv-overlap-score-weight`: Sets the amount of weighting on overlaps with prefix caches, which directly contributes to the prefill cost. A large weight is expected to yield a better TTFT (at the expense of worse ITL). When set to 0, prefix caches are not considered at all (falling back to pure load balancing behavior on the active blocks). Defaults to 1.
2121

22-
- `--router-temperature`: Sets the temperature when randomly selecting workers to route to via softmax sampling on the router cost logits. Setting it to 0 recovers the deterministic behavior where the min logit is picked.
22+
- `--router-temperature`: Sets the temperature when randomly selecting workers to route to via softmax sampling on the router cost logits. Setting it to 0 (default) recovers the deterministic behavior where the min logit is picked.
2323

24-
- `--use-kv-events`: Sets whether to listen to KV events for maintaining the global view of cached blocks. If true, then we use the `KvIndexer` to listen to the block creation and deletion events. If false, `ApproxKvIndexer`, which assumes the kv cache of historical prompts exists for fixed time durations (hard-coded to 120s), is used to predict the kv cache hit ratio in each engine. Set false if your backend engine does not emit KV events.
24+
- `--use-kv-events`/`--no-kv-events`: Sets whether to listen to KV events for maintaining the global view of cached blocks. If true (default), then we use the `KvIndexer` to listen to the block creation and deletion events. If false, `ApproxKvIndexer`, which assumes the kv cache of historical prompts exists for fixed time durations (hard-coded to 120s), is used to predict the kv cache hit ratio in each engine. Set false if your backend engine does not emit KV events.
2525

26+
- `--router-replica-sync`: Enables state synchronization between multiple router replicas via NATS. Disabled by default, and can be enabled by passing the flag in. When enabled, router replicas share their view of KV cache distribution and active sequences, allowing all routers to make optimal routing decisions even when requests are distributed across multiple router instances. This improves fault tolerance and routing accuracy in multi-router deployments.
2627

2728
## Architecture
2829

@@ -45,6 +46,22 @@ We can then use the default routing methods exposed by the client class to send
4546

4647
KV Cache routing uses direct routing with a special worker selection algorithm.
4748

49+
## Serving Two Router Replicas
50+
51+
For improved fault tolerance, you can launch two frontend + router replicas. Since the frontend and router are currently tied together, you'll need to use two different HTTP ports for each instance.
52+
53+
To enable state sharing between the router replicas (which provides more accurate routing decisions), use the `--router-replica-sync` flag when starting the frontend:
54+
55+
```bash
56+
# Router replica 1
57+
python -m dynamo.frontend --router-mode kv --port 8000 --router-replica-sync
58+
59+
# Router replica 2
60+
python -m dynamo.frontend --router-mode kv --port 8001 --router-replica-sync
61+
```
62+
63+
When `--router-replica-sync` is enabled, the router replicas will communicate with each other via NATS to maintain consistent state across instances. This allows both routers to have a complete view of the KV cache distribution and make optimal routing decisions, even when requests are distributed across multiple router instances.
64+
4865
## Understanding KV Cache
4966
The leading Large Language Models (LLMs) today are auto-regressive and based off of the [transformer architecture](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf). One key inference optimization technique is to cache the already computed keys and values and to reuse them for the future tokens. This is called the [KV Cache](https://developer.nvidia.com/blog/mastering-llm-techniques-inference-optimization/#key-value_caching).
5067

@@ -88,30 +105,46 @@ Further details can be found for: [TRT-LLM](https://developer.nvidia.com/blog/in
88105
|
89106
+------------------+------------------+
90107
| | |
91-
| KV match: 15% | KV match: 50% | KV match: 75%
108+
| Cached: 2 blocks | Cached: 5 blocks | Cached: 8 blocks
109+
| Prefill: 8 blks | Prefill: 5 blks | Prefill: 2 blks
110+
| Decode: 10 blks | Decode: 7 blks | Decode: 9 blks
92111
v v v
93112
+----------------+ +----------------+ +----------------+
94113
| Worker 1 | | Worker 2 | | Worker 3 |
95-
| (Load: 30%) | | (Load: 50%) | | (Load: 80%) |
96114
+----------------+ +----------------+ +----------------+
97115
```
98116

99117
Load balancing in LLM serving becomes complex when enabling KV Cache reuse. While KV Cache reuse can save significant computation, if the routing strategy is not aware of the unique KV states of each worker we can:
100-
- miss opportunities for KV Cache reuse if routing to the wrong node
118+
- miss opportunities for KV Cache reuse if routing to the "wrong" node
101119
- get into an imbalanced state where a few workers are processing many requests, lowering throughput of entire system
102120

103-
The best way to solve these issues is for the router to have a global view of KV Cache and load. With this view, the router can use a cost function to score the workers and make decisions to maximize cache hits while keeping the system balanced and throughput high.
121+
The router uses a cost function that considers both the prefill cost (influenced by cached blocks) and the decode load to make optimal routing decisions:
122+
123+
### Cost Calculation
124+
125+
1. **Prefill blocks**: The number of tokens that need to be processed during prefill is predicted based on the request's input tokens and the cached blocks available on each worker. This is divided by the block size to get the effective "prefill blocks". This prediction is updated when the first output token is produced, signaling prefill completion.
104126

105-
In the above image, our cost function is (KV match - Load) so we select Worker 2 even though Worker 3 would offer the best KV match.
106-
- Worker 1 = (0.15 - 0.30) = -0.15
107-
- **Worker 2 = (0.50 - 0.50) = 0**
108-
- Worker 3 = (0.75 - 0.80) = -0.05
127+
2. **Decode blocks**: The number of blocks needed during the decode phase is predicted based on the request's input tokens and the current active sequences on each worker. This is updated when the request is freed (blocks are dereferenced or freed).
128+
129+
3. **Cost formula**: `cost = overlap_score_weight * prefill_blocks + decode_blocks`
130+
- Lower cost is better
131+
- The `overlap_score_weight` parameter controls the importance of cache hits vs. load balancing
132+
- A higher weight prioritizes cache reuse (better TTFT) while a lower weight prioritizes load distribution (better ITL)
133+
134+
### Worker Selection
135+
136+
The router selects the worker with the lowest cost. When `router_temperature` is set to a non-zero value, the router uses softmax sampling on the normalized cost logits to introduce randomness in the selection, which can help with load distribution.
137+
138+
Example calculation with `overlap_score_weight = 1.0`:
139+
- Worker 1: cost = 1.0 * 8 + 10 = 18
140+
- **Worker 2: cost = 1.0 * 5 + 7 = 12** (selected - lowest cost)
141+
- Worker 3: cost = 1.0 * 2 + 9 = 11
109142

110143
## Events
111144

112-
In Dynamo, we want to support KV Cache Routing and load balancing for many backends that have different implementations of KV Cache and record different metrics. To that end, we built a KVPublisher that can be plugged into any framework to publish KV Events and a WorkerMetricsPublisher that can publish Metric Events.
145+
In Dynamo, we support KV Cache Routing for many backends that have different implementations of KV Cache. To enable this, we built a KVPublisher that can be plugged into any framework to publish KV Events.
113146

114-
On the receiving side we have a KVIndexer which accepts events from the KVPublisher and puts them into a global prefix tree and a KvMetricsAggregator which aggregates metric events by worker.
147+
On the receiving side we have a KVIndexer which accepts events from the KVPublisher and puts them into a global prefix tree for tracking cached blocks across all workers.
115148

116149
```text
117150
+----------------+ +-----------------+
@@ -121,13 +154,8 @@ On the receiving side we have a KVIndexer which accepts events from the KVPublis
121154
| +------------+ | remove_kv_block() | | KVIndexer | |
122155
| |KVPublisher | |------------------------>| +-------------+ |
123156
| +------------+ | | |
124-
| | num_request_waiting | +--------------+|
125-
| +------------+ | gpu_cache_usage_perc | |KvMetricsAggre||
126-
| |KvMetrics | |------------------------>| | gator ||
127-
| |Publisher | | ... | +--------------+|
128-
| +------------+ | +-----------------+
129-
+----------------+
130-
157+
| | | |
158+
+----------------+ +-----------------+
131159
```
132160

133161
### KVPublisher
@@ -144,18 +172,15 @@ The KVIndexer builds and maintains a global view of cached blocks in a prefix tr
144172

145173
The KVIndexer has a method `find_matches_for_request`, which takes in tokens and returns a dictionary with keys of worker id and values of the number of matched KV Blocks.
146174

147-
### WorkerMetricsPublisher
148-
We added a KvMetrics Publisher which sends the following metrics to the KvMetricsAggregator:
149-
- num_requests_waiting
150-
- gpu_cache_usage_perc
151-
- gpu_prefix_cache_hit_rate
152-
- request_active_slots
153-
- request_total_slots
154-
- kv_active_blocks
155-
- kv_total_blocks
175+
### Inter-Router Communication
176+
177+
In multi-router deployments, each router only observes a subset of requests. To maintain a consistent global view of active sequences and KV cache states, routers broadcast their local actions to other replicas through three synchronization events:
178+
179+
1. **AddRequest**: Published when assigning a request to a worker, containing the request ID, worker ID, token sequence blocks, and overlap score. This updates other routers' tracking of which blocks are in use.
180+
181+
2. **MarkPrefillCompleted**: Published when a request transitions from prefill to decode phase, signaling that prefill tokens should no longer count toward the worker's active prefill load.
156182

157-
Currently, the WorkerMetricsPublisher exists as a Python binding.
183+
3. **Free**: Published when a request completes and its resources are released, allowing other routers to update their block reference counts.
158184

159-
### KvMetricsAggregator
160-
The KvMetricsAggregator receives these metrics and aggregates them. It has a method `get_metrics` which returns an object of `AggregatedMetrics`.
185+
Each event includes a unique router ID to prevent processing of self-generated events. This asynchronous communication ensures all routers maintain synchronized KV cache state for optimal routing decisions despite handling different request streams.
161186

launch/dynamo-run/src/flags.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,12 @@ pub struct Flags {
9696
#[arg(long)]
9797
pub use_kv_events: Option<bool>,
9898

99+
/// KV Router: Whether to enable replica synchronization across multiple router instances.
100+
/// When true, routers will publish and subscribe to events to maintain consistent state.
101+
/// Default: false
102+
#[arg(long)]
103+
pub router_replica_sync: Option<bool>,
104+
99105
/// Max model context length. Reduce this if you don't have enough VRAM for the full model
100106
/// context length (e.g. Llama 4).
101107
/// Defaults to the model's max, which is usually model_max_length in tokenizer_config.json.
@@ -223,6 +229,7 @@ impl Flags {
223229
self.kv_overlap_score_weight,
224230
self.router_temperature,
225231
self.use_kv_events,
232+
self.router_replica_sync,
226233
self.max_num_batched_tokens,
227234
),
228235
)

lib/bindings/python/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/bindings/python/rust/llm/entrypoint.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,19 @@ pub struct KvRouterConfig {
3535
#[pymethods]
3636
impl KvRouterConfig {
3737
#[new]
38-
#[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true))]
39-
fn new(overlap_score_weight: f64, router_temperature: f64, use_kv_events: bool) -> Self {
38+
#[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, router_replica_sync=false))]
39+
fn new(
40+
overlap_score_weight: f64,
41+
router_temperature: f64,
42+
use_kv_events: bool,
43+
router_replica_sync: bool,
44+
) -> Self {
4045
KvRouterConfig {
4146
inner: RsKvRouterConfig {
4247
overlap_score_weight,
4348
router_temperature,
4449
use_kv_events,
50+
router_replica_sync,
4551
..Default::default()
4652
},
4753
}

lib/llm/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ derive-getters = "0.5"
8585
offset-allocator = "0.2"
8686
regex = "1"
8787
rayon = "1"
88+
dashmap = { version = "5.5.3" }
8889

8990
# input/text
9091
dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] }

lib/llm/src/discovery/model_manager.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ impl ModelManager {
217217
component.clone(),
218218
kv_cache_block_size,
219219
Some(selector),
220-
kv_router_config.unwrap_or_default().use_kv_events,
220+
kv_router_config,
221221
)
222222
.await?;
223223
let new_kv_chooser = Arc::new(chooser);

0 commit comments

Comments
 (0)