Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
"router_queue_threshold",
"router_event_threads",
"router_queue_policy",
"remote_indexer_component",
"use_remote_indexer",
"serve_indexer",
)


Expand All @@ -61,7 +62,8 @@ class KvRouterConfigBase(ConfigBase):
router_queue_threshold: Optional[float]
router_event_threads: int
router_queue_policy: str
remote_indexer_component: Optional[str]
use_remote_indexer: bool = False
serve_indexer: bool = False

def kv_router_kwargs(self) -> dict:
"""Return a dict suitable for ``KvRouterConfig(**kwargs)``."""
Expand Down Expand Up @@ -286,15 +288,14 @@ def add_arguments(self, parser) -> None:
arg_type=str,
choices=["fcfs", "wspt"],
)
add_argument(
add_negatable_bool_argument(
g,
flag_name="--remote-indexer-component",
env_var="DYN_REMOTE_INDEXER_COMPONENT",
default=None,
flag_name="--use-remote-indexer",
env_var="DYN_USE_REMOTE_INDEXER",
default=False,
help=(
"[EXPERIMENTAL] KV Router: Component name of a standalone KV indexer to use for overlap scoring. "
"When set, the router queries the standalone indexer via the request plane instead "
"of maintaining a local radix tree (e.g. 'kv-indexer')."
"[EXPERIMENTAL] KV Router: Query a remote KV indexer served from the worker "
"component via the request plane instead of maintaining a local radix tree."
),
arg_type=str,
dest="use_remote_indexer",
)
15 changes: 15 additions & 0 deletions components/src/dynamo/frontend/frontend_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,13 @@ def validate(self) -> None:
"--router-prefill-load-model=aic requires "
"--router-track-prefill-tokens"
)
if self.serve_indexer:
if self.router_mode != "kv":
raise ValueError("--serve-indexer requires --router-mode=kv")
if self.use_remote_indexer:
raise ValueError(
"--serve-indexer and --use-remote-indexer are mutually exclusive"
)


@register_encoder(FrontendConfig)
Expand Down Expand Up @@ -193,6 +200,14 @@ def add_arguments(self, parser) -> None:
help="HTTP port for the engine (u16).",
arg_type=int,
)
add_negatable_bool_argument(
g,
flag_name="--serve-indexer",
env_var="DYN_SERVE_INDEXER",
default=False,
help="Serve this frontend's local KV indexers over the request plane.",
dest="serve_indexer",
)
add_argument(
g,
flag_name="--tls-cert-path",
Expand Down
16 changes: 15 additions & 1 deletion components/src/dynamo/router/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
KvRouterArgGroup,
KvRouterConfigBase,
)
from dynamo.common.configuration.utils import add_argument
from dynamo.common.configuration.utils import add_argument, add_negatable_bool_argument
from dynamo.llm import AicPerfConfig, KvRouterConfig


Expand All @@ -25,6 +25,7 @@ class DynamoRouterConfig(KvRouterConfigBase, AicPerfConfigBase):
namespace: str
endpoint: str
router_block_size: int
serve_indexer: bool = False

def validate(self) -> None:
"""Validate config invariants (aligned with Rust KvRouterConfig where applicable)."""
Expand All @@ -40,6 +41,10 @@ def validate(self) -> None:
"Expected format: namespace.component.endpoint"
)
self.namespace = parts[0]
if self.serve_indexer and self.use_remote_indexer:
raise ValueError(
"--serve-indexer and --use-remote-indexer are mutually exclusive"
)
if self.router_prefill_load_model == "aic":
missing = [
flag
Expand Down Expand Up @@ -89,6 +94,15 @@ def add_arguments(self, parser) -> None:
obsolete_flag="--block-size",
)

add_negatable_bool_argument(
g,
flag_name="--serve-indexer",
env_var="DYN_SERVE_INDEXER",
default=False,
help="Serve this router's local KV indexer over the request plane.",
dest="serve_indexer",
)

# KV router options (shared with dynamo.frontend)
KvRouterArgGroup().add_arguments(parser)
AicPerfArgGroup().add_arguments(parser)
Expand Down
61 changes: 60 additions & 1 deletion docs/components/router/router-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ When using KV routing, the router needs to know what each worker has cached. The
|------------|---------------|-------------|
| **NATS Core (local indexer)** | Default (no extra flags) | Workers maintain a local indexer; router queries workers on startup and receives events via NATS Core |
| **JetStream (durable)** | `--router-durable-kv-events` | Events persisted in NATS JetStream; supports snapshots and durable consumers. *Deprecated.* |
| **ZMQ** | `--event-plane zmq` | Workers publish via ZMQ PUB sockets; standalone indexer aggregates events |
| **ZMQ** | `--event-plane zmq` | Workers publish via ZMQ PUB sockets; the standalone `dynamo.indexer` service aggregates events |
| **Approximate (no events)** | `--no-router-kv-events` | No events consumed; router predicts cache state from its own routing decisions with TTL-based expiration |

### Aggregated vs. Disaggregated Topology
Expand Down Expand Up @@ -93,6 +93,8 @@ Backend workers register themselves using the `register_model` API, after which
| `--router-prefill-load-model <none\|aic>` | `none` | Prompt-side load model. `aic` decays only the oldest active prefill using an AIC-predicted duration |
| `--router-queue-threshold <float>` | `4.0` | Queue threshold fraction; enables priority scheduling via `priority` |
| `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
| `--serve-indexer` | `false` | Serve this frontend/router's local KV indexer over the request plane, as the Dynamo-native remote indexer on the worker component, so that other routers/frontends can query it |
| `--use-remote-indexer` | `false` | Query the worker component's served remote indexer instead of maintaining a local overlap indexer |

For all available options: `python -m dynamo.frontend --help`

Expand Down Expand Up @@ -444,6 +446,63 @@ graph TD

For improved fault tolerance, you can launch multiple frontend + router replicas. If multiple `dynamo.frontend` processes share the same host or network namespace, give each instance a different HTTP port. In Kubernetes or on separate hosts, replicas can usually reuse the same container port. Alternatively, you can deploy the router separately as the standalone `python -m dynamo.router` service; see the [Standalone Router README](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/router/README.md).

### Dynamo-Native Remote Indexer

For Dynamo-native deployments, the remote indexer is served by `dynamo.frontend` or `dynamo.router`, not by `dynamo.indexer`.

- Use `--serve-indexer` on router/frontend replicas that should expose the `kv_indexer_query` endpoint on the worker component (shown as `backend.kv_indexer_query` in the diagram below).
- Use `--use-remote-indexer` on consumer routers/frontends that should query that served endpoint instead of maintaining a local overlap indexer.
- `dynamo.indexer` remains the standalone HTTP + ZMQ microservice for non-Dynamo / direct-ZMQ deployments.

Frontend example:

```bash
# Serving frontend (exposes its local KV indexer over the request plane)
python -m dynamo.frontend --router-mode kv --serve-indexer

# Consumer frontend
python -m dynamo.frontend --router-mode kv --use-remote-indexer
```

The served indexer endpoint is request-plane only. Each serving router/frontend keeps its normal local KV event ingestion, gap detection, and worker-query recovery path; remote consumers only issue hash-based overlap queries.

Approximate mode (`--no-router-kv-events`) is singleton-only for remote serving: only one `--serve-indexer` replica may exist for a given worker component. Event-driven mode allows multiple serving replicas behind the same worker component.
Comment thread
PeaBrane marked this conversation as resolved.

```mermaid
graph TD
subgraph "Workers"
W1["Worker 1"]
W2["Worker 2"]
end

subgraph "Event Plane"
EP["KV Events"]
end

subgraph "Serving Routers / Frontends"
S1["Router / Frontend A<br/>--serve-indexer"]
S2["Router / Frontend B<br/>--serve-indexer"]
I1["Local Indexer"]
I2["Local Indexer"]
end

subgraph "Request Plane"
RP["backend.kv_indexer_query"]
end

C["Consumer Router / Frontend<br/>--use-remote-indexer"]

W1 --> EP
W2 --> EP
EP --> S1
EP --> S2
S1 --> I1
S2 --> I2
C --> RP
RP --> S1
RP --> S2
```

### Router State Management

The KV Router tracks two types of state (see [Router Design](../../design-docs/router-design.md) for details):
Expand Down
127 changes: 10 additions & 117 deletions docs/components/router/standalone-indexer.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,16 @@ subtitle: Run the KV cache indexer as an independent HTTP service for querying b

## Overview

The standalone KV indexer (`python -m dynamo.indexer`) is a lightweight service that maintains a radix tree of cached blocks and exposes HTTP endpoints for querying and managing workers. It supports two operational modes:
The standalone KV indexer (`python -m dynamo.indexer`) is a lightweight service that maintains a radix tree of cached blocks and exposes HTTP endpoints for querying and managing workers.

- **Standalone mode** (default): subscribes to ZMQ KV event streams directly from workers. No Dynamo runtime discovery, registration, or event-plane integration required.
- **Dynamo runtime mode** (`--dynamo-runtime`): integrates with the Dynamo runtime for automatic worker discovery via MDC, KV event ingestion via the event plane (NATS or ZMQ), and overlap queries over the request plane for remote frontends.
- It subscribes to ZMQ KV event streams directly from workers.
- It exposes an HTTP API for registration, inspection, and overlap queries.
- It preserves P2P recovery and gap detection/replay for the standalone ZMQ path.

This is distinct from the [Standalone Router](../../../components/src/dynamo/router/README.md), which is a full routing service. The standalone indexer provides only the indexing and query layer without routing logic.

For Dynamo-native remote indexing, use `--serve-indexer` on `dynamo.frontend` or `dynamo.router` and `--use-remote-indexer` on consumers instead. That request-plane service reuses the router's existing event ingestion and recovery machinery; it is not implemented by `dynamo.indexer`.
Comment thread
PeaBrane marked this conversation as resolved.

The HTTP API follows the [Mooncake KV Indexer RFC](https://github.com/kvcache-ai/Mooncake/issues/1403) conventions.

`DYN_ROUTER_MIN_INITIAL_WORKERS` is also honored here. When set to a positive integer, the
Expand All @@ -30,17 +33,15 @@ The indexer maintains one radix tree per `(model_name, tenant_id)` pair. Workers

## Compatibility

In standalone mode, the indexer works with any engine that publishes KV cache events over ZMQ in the expected msgpack format. This includes bare vLLM and SGLang engines, which emit ZMQ KV events natively — no Dynamo-specific wrapper is required.

In Dynamo runtime mode, the indexer discovers workers automatically via MDC and receives KV events through the event plane. It also registers a query endpoint on the request plane, allowing frontends to query overlap scores remotely without needing direct HTTP access.
The standalone indexer works with any engine that publishes KV cache events over ZMQ in the expected msgpack format. This includes bare vLLM and SGLang engines, which emit ZMQ KV events natively — no Dynamo-specific wrapper is required.

## Use Cases

- **Debugging**: Inspect the radix tree state to verify which blocks are cached on which workers.
- **State verification**: Confirm that the indexer's view of KV cache state matches the router's internal state (used in integration tests).
- **Custom routing**: Build external routing logic that queries the indexer for overlap scores and makes its own worker selection decisions.
- **Monitoring**: Observe KV cache distribution across workers without running a full router.
- **Remote indexing**: In Dynamo runtime mode, frontends can offload KV cache indexing to a dedicated service and query it over the request plane.
- **Standalone microservice**: Run an indexer independently of the router/frontend when you want direct HTTP inspection and ZMQ-based ingestion.

## P2P Recovery

Expand Down Expand Up @@ -91,7 +92,6 @@ The service is exposed through the Python bindings package and launched with `py
|---------|-------------|
| `kv-indexer` | Core standalone indexer service path (`python -m dynamo.indexer`: HTTP API, ZMQ listeners, P2P recovery) |
| `kv-indexer-metrics` | Optional `/metrics` endpoint |
| `kv-indexer-runtime` | Dynamo runtime integration (`--dynamo-runtime`, discovery, event plane, request plane) |

### Standalone build

Expand All @@ -109,30 +109,12 @@ cd lib/bindings/python && VIRTUAL_ENV=../../.venv ../../.venv/bin/maturin develo

This keeps the default `kv-indexer` build lean while still allowing Prometheus metrics when needed.

### Runtime-enabled build

```bash
cd lib/bindings/python && VIRTUAL_ENV=../../.venv ../../.venv/bin/maturin develop --uv --features kv-indexer,kv-indexer-runtime
```

This enables the `--dynamo-runtime` CLI flag for MDC discovery, event-plane subscription, and request-plane queries. It also includes the metrics endpoint.

## CLI

### Standalone mode (default)

```bash
python -m dynamo.indexer --port 8090 [--threads 4] [--block-size 16 --model-name my-model --tenant-id default --workers "1=tcp://host:5557,2:1=tcp://host:5558"] [--peers "http://peer1:8090,http://peer2:8091"]
```

### Dynamo runtime mode

```bash
python -m dynamo.indexer --dynamo-runtime --namespace default --component-name kv-indexer --worker-component backend --port 8090 [--threads 4]
```

In runtime mode, workers are discovered automatically via MDC. The `--workers` flag can still be used to register additional static workers alongside discovered ones.

| Flag | Default | Description |
|------|---------|-------------|
| `--block-size` | (none) | KV cache block size for initial `--workers` (required when `--workers` is set) |
Expand All @@ -142,10 +124,6 @@ In runtime mode, workers are discovered automatically via MDC. The `--workers` f
| `--model-name` | `default` | Model name for initial `--workers` |
| `--tenant-id` | `default` | Tenant ID for initial `--workers` |
| `--peers` | (none) | Comma-separated peer indexer URLs for P2P recovery on startup |
| `--dynamo-runtime` | `false` | Enable Dynamo runtime integration (requires `kv-indexer-runtime`) |
| `--namespace` | `default` | Dynamo namespace to register the indexer component under |
| `--component-name` | `kv-indexer` | Component name for this indexer in the Dynamo runtime |
| `--worker-component` | `backend` | Component name that workers register under for event-plane subscription |

### Shared Startup Gate

Expand All @@ -165,7 +143,7 @@ curl http://localhost:8090/health

### `GET /metrics` — Prometheus metrics

Returns metrics in Prometheus text exposition format. Available when the Python bindings are built with the `kv-indexer-metrics` or `kv-indexer-runtime` feature.
Returns metrics in Prometheus text exposition format. Available when the Python bindings are built with the `kv-indexer-metrics` feature.

```bash
curl http://localhost:8090/metrics
Expand Down Expand Up @@ -400,38 +378,9 @@ If no `replay_endpoint` is configured, gaps are logged as warnings but not recov

The sequence counter (`last_seq`) persists across unregister/register cycles, so re-registering a worker after a gap will trigger replay on the first batch received by the new listener.

## Dynamo Runtime Mode

When started with `--dynamo-runtime`, the indexer integrates with the Dynamo distributed runtime:

### Worker Discovery

The indexer watches MDC (Model Discovery Catalog) for worker additions and removals. When a worker registers with MDC, the indexer automatically creates an indexer for its model and block size. Workers discovered via MDC are tracked separately from those registered via `--workers` or the `/register` HTTP API; a worker cannot be registered through both paths simultaneously.

### Event Plane Subscription

Instead of connecting directly to ZMQ PUB sockets on each worker, the indexer subscribes to KV events through the Dynamo event plane. The transport (NATS or ZMQ) is determined by the `DYNAMO_EVENT_TRANSPORT` environment variable. Events are routed to the appropriate indexer based on the worker ID.

### Request Plane Query Endpoint

The indexer registers a query endpoint on the Dynamo request plane, allowing frontends to send `IndexerQueryRequest` messages containing a model name, namespace, and block hashes. The indexer looks up the appropriate radix tree and returns overlap scores. This enables frontends to use a remote indexer for KV-aware routing without direct HTTP access.

### Example

```bash
# Start the indexer with runtime integration
python -m dynamo.indexer --dynamo-runtime \
--namespace my-namespace \
--component-name kv-indexer \
--worker-component backend \
--port 8090 --threads 4
```

The HTTP API remains fully available in runtime mode. Static workers can be added via `--workers` alongside discovered workers.

## Limitations

- **Standalone mode is ZMQ only**: In standalone mode, workers must publish KV events via ZMQ PUB sockets. Build with `kv-indexer-runtime` and use `--dynamo-runtime` to receive events via the event plane (NATS or ZMQ).
- **ZMQ only**: The standalone indexer ingests KV events only from ZMQ PUB sockets, so workers must publish their KV events over ZMQ.
- **No routing logic**: The indexer only maintains the radix tree and answers queries. It does not track active blocks, manage request lifecycle, or perform worker selection.

## Architecture
Expand Down Expand Up @@ -471,62 +420,6 @@ graph TD
style CLIENT fill:#fff3e0,stroke:#333,color:#333
```

### Dynamo Runtime Mode

```mermaid
graph TD
subgraph Workers
W1[Worker 1]
W2[Worker 2]
end

subgraph "Dynamo Runtime"
MDC[MDC Discovery]
EP[Event Plane<br/>NATS / ZMQ]
RP[Request Plane]
end

subgraph "Standalone Indexer"
DISC[Discovery Watcher]
SUB[Event Subscriber]
REG[Worker Registry]
IDX["Indexer Map<br/>(model, tenant) → Radix Tree"]
QE[Query Endpoint]
HTTP[HTTP API<br/>/query /dump /register /metrics]
end

FRONTEND[Frontend / Router]
CLIENT[External Client]

W1 -->|register| MDC
W2 -->|register| MDC
MDC -->|added/removed| DISC
DISC -->|add/remove workers| REG
W1 -->|KV events| EP
W2 -->|KV events| EP
EP -->|RouterEvent| SUB
SUB -->|apply events| IDX
FRONTEND -->|IndexerQueryRequest| RP
RP --> QE
QE -->|query| IDX
CLIENT -->|POST /query, GET /dump| HTTP
HTTP -->|query| IDX

style W1 fill:#f3e5f5,stroke:#333,color:#333
style W2 fill:#f3e5f5,stroke:#333,color:#333
style MDC fill:#e3f2fd,stroke:#333,color:#333
style EP fill:#e3f2fd,stroke:#333,color:#333
style RP fill:#e3f2fd,stroke:#333,color:#333
style IDX fill:#2e8b57,stroke:#333,color:#fff
style SUB fill:#2e8b57,stroke:#333,color:#fff
style DISC fill:#2e8b57,stroke:#333,color:#fff
style REG fill:#2e8b57,stroke:#333,color:#fff
style QE fill:#2e8b57,stroke:#333,color:#fff
style HTTP fill:#2e8b57,stroke:#333,color:#fff
style FRONTEND fill:#fff3e0,stroke:#333,color:#333
style CLIENT fill:#fff3e0,stroke:#333,color:#333
```

### P2P Recovery Flow

```mermaid
Expand Down
Loading
Loading