Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
"router_prune_target_ratio",
"router_queue_threshold",
"router_event_threads",
"router_enable_cache_control",
"router_queue_policy",
"remote_indexer_component",
)
Expand All @@ -59,7 +58,6 @@ class KvRouterConfigBase(ConfigBase):
router_prune_target_ratio: float
router_queue_threshold: Optional[float]
router_event_threads: int
router_enable_cache_control: bool
router_queue_policy: str
remote_indexer_component: Optional[str]

Expand Down Expand Up @@ -260,18 +258,6 @@ def add_arguments(self, parser) -> None:
),
arg_type=int,
)
add_negatable_bool_argument(
g,
flag_name="--enable-cache-control",
env_var="DYN_ENABLE_CACHE_CONTROL",
default=False,
dest="router_enable_cache_control",
help=(
"KV Router: Enable cache control (PIN with TTL). When set, the router creates "
"a cache_control service mesh client and fires pin_prefix after generation for "
"requests with nvext.cache_control."
),
)
add_argument(
g,
flag_name="--router-queue-policy",
Expand Down
2 changes: 0 additions & 2 deletions components/src/dynamo/frontend/frontend_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,6 @@ def validate(self) -> None:
)
if self.min_initial_workers < 0:
raise ValueError("--router-min-initial-workers must be >= 0")
if self.router_enable_cache_control and self.router_mode != "kv":
raise ValueError("--enable-cache-control requires --router-mode=kv")
if self.tokenizer_backend not in self._VALID_TOKENIZER_BACKENDS:
raise ValueError(
f"--tokenizer: invalid value '{self.tokenizer_backend}' "
Expand Down
42 changes: 0 additions & 42 deletions components/src/dynamo/sglang/request_handlers/handler_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,47 +380,6 @@ async def update_weight_version(self, body: dict) -> dict:
"new_version": req.new_version,
}

async def pin_prefix(self, body: dict) -> dict:
    """Pin a prefix by token_ids to resist eviction.

    Args:
        body: Dict with "token_ids" list of token IDs and optional
            "ttl_seconds" (default 300).

    Returns:
        A dict that always carries "status" ("ok" or "error") and
        "message". On a completed engine call it also carries
        "nodes_pinned" as reported by the engine. A missing or empty
        "token_ids", or any exception raised while pinning, produces an
        error dict instead of propagating.
    """
    token_ids = body.get("token_ids", [])
    ttl_seconds = body.get("ttl_seconds", 300)
    if not token_ids:
        return {"status": "error", "message": "token_ids required"}
    try:
        # NOTE(review): the engine result is assumed to expose `success`,
        # `nodes_pinned` and `message`; its type is not visible here.
        # Reading those attributes stays inside the try so that a malformed
        # result is reported as an error dict rather than raised.
        result = await self.engine.tokenizer_manager.pin_prefix(
            token_ids, ttl_seconds
        )
        return {
            "status": "ok" if result.success else "error",
            "nodes_pinned": result.nodes_pinned,
            "message": result.message,
        }
    except Exception as e:
        # Deliberate best-effort boundary: callers get an error payload.
        # logging.exception keeps the traceback, which the previous
        # logging.error(f"...") call discarded.
        logging.exception("Failed to pin prefix: %s", e)
        return {"status": "error", "message": str(e)}

async def cache_control(self, request, context=None):
    """Dispatch a cache control operation received over the service mesh.

    Args:
        request: Dict holding an "action" key plus whatever parameters
            that action needs.
        context: Optional Dynamo context; accepted for protocol
            compatibility and not read here.

    Yields:
        Exactly one dict describing the outcome of the operation.
    """
    requested = request.get("action")

    # Guard clause: anything other than the single supported action is
    # answered with an error payload and the stream ends.
    if requested != "pin_prefix":
        yield {"status": "error", "message": f"Unknown action: {requested}"}
        return

    # The whole request dict is handed to pin_prefix as its body.
    yield await self.pin_prefix(request)

def register_engine_routes(self, runtime: DistributedRuntime) -> None:
"""Register all engine routes for this handler.

Expand All @@ -435,7 +394,6 @@ def register_engine_routes(self, runtime: DistributedRuntime) -> None:
runtime.register_engine_route(
"resume_memory_occupation", self.resume_memory_occupation
)
runtime.register_engine_route("pin_prefix", self.pin_prefix)
runtime.register_engine_route(
"update_weights_from_disk", self.update_weights_from_disk
)
Expand Down
190 changes: 2 additions & 188 deletions docs/backends/sglang/agents.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
title: SGLang for Agentic Workloads
subtitle: Priority scheduling, KV cache eviction policies, and cache pinning for multi-turn agentic serving
subtitle: Priority scheduling and KV cache eviction policies for multi-turn agentic serving
---

# SGLang for Agentic Workloads

This guide covers SGLang-specific configuration for agentic serving with Dynamo. It explains which SGLang engine flags to enable, how Dynamo's [agent hints](../../components/frontend/nvext.md#agent-hints) map to SGLang behavior, and how to use experimental cache pinning to protect KV cache for high-value conversations.
This guide covers SGLang-specific configuration for agentic serving with Dynamo. It explains which SGLang engine flags to enable and how Dynamo's [agent hints](../../components/frontend/nvext.md#agent-hints) map to SGLang behavior.

## Overview

Expand Down Expand Up @@ -109,192 +109,6 @@ for chunk in response:
print(chunk.choices[0].delta.content, end="")
```

## Cache Pinning (Experimental)

> [!WARNING]
> Cache pinning is experimental and available on development branches only. The API may change.

**Required PRs:**
- SGLang: [feat: TTL-based prefix pinning with refresh-on-hit for HiRadixCache](https://github.com/sgl-project/sglang/pull/18941)
- Dynamo: [feat: wire nvext.cache_control TTL-based pinning through Dynamo router](https://github.com/ai-dynamo/dynamo/pull/6213)

Cache pinning lets you explicitly protect KV cache for high-value conversation prefixes. When a request includes `nvext.cache_control`, the router fires a `pin_prefix` call to the SGLang worker after generation completes. Pinned nodes resist eviction for the specified TTL -- even under memory pressure, they are retained (demoted to host memory with HiCache rather than deleted).

### How It Works

```mermaid
sequenceDiagram
participant Client
participant Preprocessor
participant Router
participant Worker as SGLang Worker
participant Cache as Radix Cache

Client->>Preprocessor: chat/completions + nvext.cache_control{ttl}
Preprocessor->>Preprocessor: Extract TTL, attach to RoutingHints
Preprocessor->>Router: PreprocessedRequest (cache_control_ttl=N)
Router->>Router: Select worker, record token_ids + TTL in PinState
Router->>Worker: Generate request
Worker-->>Router: Stream response tokens
Router-->>Client: Stream response tokens

Note over Router,Worker: On stream completion

Router-)Worker: pin_prefix(token_ids, ttl) [fire-and-forget]
Worker->>Cache: Walk radix tree along token sequence
Cache->>Cache: Set pin_expiry, acquire host_ref_counter hold
Worker--)Router: {status: ok, nodes_pinned: N}

Note over Cache: TTL expires

Cache->>Cache: Clear pin_expiry, release host_ref_counter
Note over Cache: Node now eligible for normal eviction
```

1. The client includes `nvext.cache_control` with a TTL in the request.
2. The Dynamo preprocessor extracts the TTL and attaches it to routing hints.
3. The router routes the request normally and records the token IDs in a `PinState`.
4. After the response stream completes, the router spawns a fire-and-forget `pin_prefix` RPC to the worker that served the request.
5. The worker walks the radix tree along the token sequence and pins each node, setting `pin_expiry` and acquiring a `host_ref_counter` hold that prevents eviction.
6. When TTL expires, the pin is cleared and the node becomes eligible for normal eviction.

### Enabling Cache Pinning

**Frontend flag:**

```bash
python -m dynamo.frontend \
--router-mode kv \
--enable-cache-control \
...
```

| Flag | Description |
|------|-------------|
| `--enable-cache-control` | Enables cache control (PIN with TTL). Creates a `cache_control` service mesh client and fires `pin_prefix` after generation for requests with `nvext.cache_control`. Requires `--router-mode=kv`. |

**SGLang worker:** The worker receives PIN requests via its `cache_control` service mesh endpoint. You **must** set the `SGLANG_HICACHE_MAX_PINNED_RATIO` environment variable to a non-zero value -- pinning is disabled by default.

| Environment Variable | Type | Default | Description |
|---------------------|------|---------|-------------|
| `SGLANG_HICACHE_MAX_PINNED_RATIO` | `float` | `0.0` | Max fraction of cache tokens that can be pinned. Must be in `[0, 1)`. `0` disables pinning entirely. |

HiCache is required (`--enable-hierarchical-cache`). Without it, the scheduler rejects PIN requests. For best results, set the write policy to `write_through` (`--hicache-write-policy write_through`) so that pinned nodes are demoted to host memory instead of being deleted when GPU memory fills:

```bash
SGLANG_HICACHE_MAX_PINNED_RATIO=0.1 python -m dynamo.sglang \
--model-path Qwen/Qwen3-14B-FP8 \
--enable-hierarchical-cache \
--hicache-ratio 2.0 \
--hicache-write-policy write_through \
...
```

### Request Format

Include `cache_control` as a top-level field in `nvext`:

```json
{
"model": "Qwen/Qwen3-14B-FP8",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain quantum computing."}
],
"nvext": {
"cache_control": {
"type": "ephemeral",
"ttl": "1h"
}
}
}
```

| Field | Type | Description |
|-------|------|-------------|
| `cache_control.type` | `string` | Currently only `"ephemeral"` is supported. |
| `cache_control.ttl` | `string` | TTL as integer seconds (`"600"`) or shorthand (`"5m"`, `"1h"`). Clamped to [300, 3600] seconds. Unrecognized strings default to 300s. |

### Python Example

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

# First turn -- pin the conversation prefix for 1 hour
response = client.chat.completions.create(
model="Qwen/Qwen3-14B-FP8",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": "Analyze this codebase and suggest improvements."},
],
stream=True,
extra_body={
"nvext": {
"cache_control": {
"type": "ephemeral",
"ttl": "1h"
}
}
}
)

# Collect the assistant reply
assistant_response = ""
for chunk in response:
if chunk.choices[0].delta.content:
assistant_response += chunk.choices[0].delta.content

# Later turns reuse the pinned prefix -- even after heavy load from
# other requests, the KV cache for this conversation is preserved.
response = client.chat.completions.create(
model="Qwen/Qwen3-14B-FP8",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": "Analyze this codebase and suggest improvements."},
{"role": "assistant", "content": assistant_response},
{"role": "user", "content": "Now focus on the database layer."},
],
stream=True,
extra_body={
"nvext": {
"cache_control": {
"type": "ephemeral",
"ttl": "1h"
}
}
}
)
```

### Verifying Cache Hits

The response includes `prompt_tokens_details.cached_tokens` in the `usage` object when `--enable-cache-report` is set on the SGLang worker:

```json
{
"usage": {
"prompt_tokens": 2048,
"completion_tokens": 150,
"prompt_tokens_details": {
"cached_tokens": 1920
}
}
}
```

A high `cached_tokens / prompt_tokens` ratio on subsequent turns confirms that the pinned prefix was preserved.

### Limitations

- **Pinning disabled by default**: `SGLANG_HICACHE_MAX_PINNED_RATIO` defaults to `0.0`. You must set it to a non-zero value (e.g., `0.1`) or all PIN requests will be rejected.
- **HiCache required**: The scheduler rejects PIN requests unless `--enable-hierarchical-cache` is set.
- **TTL clamping**: Values are clamped to [300, 3600] seconds. You cannot pin for less than 5 minutes or more than 1 hour.
- **Pin budget**: Pinned tokens consume a budget controlled by `SGLANG_HICACHE_MAX_PINNED_RATIO` (fraction of host pool capacity). Requests exceeding this budget are rejected.
- **No priority on pinned nodes**: `pin_prefix` does not set a priority on the radix tree nodes. All pinned nodes have equal eviction priority and fall back to LRU ordering among themselves when host memory fills.
- **Requires stack restart for A/B testing**: Pins persist in cache across benchmark runs. When comparing pinned vs. unpinned performance, restart the full stack between phases to avoid false cache hits.

## See Also

- **[NVIDIA Request Extensions (nvext)](../../components/frontend/nvext.md)**: Full `nvext` field reference including agent hints
Expand Down
1 change: 0 additions & 1 deletion docs/components/frontend/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ The Rust HTTP server also reads these environment variables (not exposed as CLI
| `--router-event-threads` | `DYN_ROUTER_EVENT_THREADS` | `4` | Event processing threads. >1 enables concurrent radix tree |
| `--router-queue-threshold` | `DYN_ROUTER_QUEUE_THRESHOLD` | `4.0` | Queue threshold fraction of prefill capacity. Enables priority scheduling |
| `--router-queue-policy` | `DYN_ROUTER_QUEUE_POLICY` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
| `--enable-cache-control` / `--no-enable-cache-control` | `DYN_ENABLE_CACHE_CONTROL` | `false` | Enable TTL-based cache pinning (requires `--router-mode=kv`) |
| `--decode-fallback` / `--no-decode-fallback` | `DYN_DECODE_FALLBACK` | `false` | Fall back to aggregated mode when prefill workers unavailable |

## Fault Tolerance
Expand Down
28 changes: 1 addition & 27 deletions docs/components/frontend/nvext.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ Include `nvext` as a top-level field alongside standard OpenAI-compatible fields
| `prefill_worker_id` | `u64` | `None` | Router | Routes the request to a specific prefill worker (disaggregated serving). |
| `decode_worker_id` | `u64` | `None` | Router | Routes the request to a specific decode worker (disaggregated serving). |
| `agent_hints` | object | `None` | Router | Per-request hints for scheduling and load balancing. See [Agent Hints](#agent-hints). |
| `cache_control` | object | `None` | Router | KV cache pinning hint with TTL. See [Cache Control](#cache-control). |

### Header Overrides

Expand Down Expand Up @@ -130,31 +129,6 @@ Backend details:
}
```

## Cache Control

> [!WARNING]
> Cache control is experimental and available on development branches only. The API may change.

The `cache_control` object enables explicit KV cache pinning with a TTL. When set, the router fires a `pin_prefix` call to the backend worker after generation completes, protecting the conversation's KV cache from eviction for the specified duration.

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `cache_control.type` | `string` | — | Cache control type. Currently only `"ephemeral"` is supported. |
| `cache_control.ttl` | `string` | `"300"` | TTL as integer seconds (`"600"`) or shorthand (`"5m"`, `"1h"`). Clamped to [300, 3600] seconds. |

```json
{
"nvext": {
"cache_control": {
"type": "ephemeral",
"ttl": "1h"
}
}
}
```

Requires `--enable-cache-control` and `--router-mode=kv` on the frontend. See [SGLang for Agentic Workloads](../../backends/sglang/agents.md#cache-pinning-experimental) for full setup and usage details.

## Response Extensions

When the client requests response metadata via `extra_fields`, the response includes an `nvext` object with the requested fields:
Expand Down Expand Up @@ -190,4 +164,4 @@ When the client requests response metadata via `extra_fields`, the response incl
|----------|-------------|
| [Frontend Guide](frontend-guide.md) | KServe gRPC configuration and integration |
| [Router Guide](../router/router-guide.md) | Full router configuration and CLI arguments |
| [SGLang for Agentic Workloads](../../backends/sglang/agents.md) | SGLang engine flags for priority scheduling, eviction policies, and cache pinning |
| [SGLang for Agentic Workloads](../../backends/sglang/agents.md) | SGLang engine flags for priority scheduling and eviction policies |
Loading
Loading