ai-dynamo · ishandhanani · Apr 2, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026
@@ -34,7 +34,6 @@
     "router_prune_target_ratio",
     "router_queue_threshold",
     "router_event_threads",
-    "router_enable_cache_control",
     "router_queue_policy",
     "remote_indexer_component",
 )
@@ -59,7 +58,6 @@ class KvRouterConfigBase(ConfigBase):
     router_prune_target_ratio: float
     router_queue_threshold: Optional[float]
     router_event_threads: int
-    router_enable_cache_control: bool
     router_queue_policy: str
     remote_indexer_component: Optional[str]
 
@@ -260,18 +258,6 @@ def add_arguments(self, parser) -> None:
             ),
             arg_type=int,
         )
-        add_negatable_bool_argument(
-            g,
-            flag_name="--enable-cache-control",
-            env_var="DYN_ENABLE_CACHE_CONTROL",
-            default=False,
-            dest="router_enable_cache_control",
-            help=(
-                "KV Router: Enable cache control (PIN with TTL). When set, the router creates "
-                "a cache_control service mesh client and fires pin_prefix after generation for "
-                "requests with nvext.cache_control."
-            ),
-        )
         add_argument(
             g,
             flag_name="--router-queue-policy",

@@ -93,8 +93,6 @@ def validate(self) -> None:
             )
         if self.min_initial_workers < 0:
             raise ValueError("--router-min-initial-workers must be >= 0")
-        if self.router_enable_cache_control and self.router_mode != "kv":
-            raise ValueError("--enable-cache-control requires --router-mode=kv")
         if self.tokenizer_backend not in self._VALID_TOKENIZER_BACKENDS:
             raise ValueError(
                 f"--tokenizer: invalid value '{self.tokenizer_backend}' "

@@ -380,47 +380,6 @@ async def update_weight_version(self, body: dict) -> dict:
             "new_version": req.new_version,
         }
 
-    async def pin_prefix(self, body: dict) -> dict:
-        """Pin a prefix by token_ids to resist eviction.
-
-        Args:
-            body: Dict with "token_ids" list of token IDs and optional
-                  "ttl_seconds" (default 300).
-        """
-        token_ids = body.get("token_ids", [])
-        ttl_seconds = body.get("ttl_seconds", 300)
-        if not token_ids:
-            return {"status": "error", "message": "token_ids required"}
-        try:
-            result = await self.engine.tokenizer_manager.pin_prefix(
-                token_ids, ttl_seconds
-            )
-            return {
-                "status": "ok" if result.success else "error",
-                "nodes_pinned": result.nodes_pinned,
-                "message": result.message,
-            }
-        except Exception as e:
-            logging.error(f"Failed to pin prefix: {e}")
-            return {"status": "error", "message": str(e)}
-
-    async def cache_control(self, request, context=None):
-        """Service mesh endpoint for cache control operations.
-
-        Args:
-            request: Dict with "action" key and action-specific parameters.
-            context: Optional Dynamo context (unused but required by protocol).
-
-        Yields:
-            Single dict with operation result.
-        """
-        action = request.get("action")
-        if action == "pin_prefix":
-            result = await self.pin_prefix(request)
-        else:
-            result = {"status": "error", "message": f"Unknown action: {action}"}
-        yield result
-
     def register_engine_routes(self, runtime: DistributedRuntime) -> None:
         """Register all engine routes for this handler.
 
@@ -435,7 +394,6 @@ def register_engine_routes(self, runtime: DistributedRuntime) -> None:
         runtime.register_engine_route(
             "resume_memory_occupation", self.resume_memory_occupation
         )
-        runtime.register_engine_route("pin_prefix", self.pin_prefix)
         runtime.register_engine_route(
             "update_weights_from_disk", self.update_weights_from_disk
         )

diff --git a/docs/backends/sglang/agents.md b/docs/backends/sglang/agents.md
@@ -2,12 +2,12 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 title: SGLang for Agentic Workloads
-subtitle: Priority scheduling, KV cache eviction policies, and cache pinning for multi-turn agentic serving
+subtitle: Priority scheduling and KV cache eviction policies for multi-turn agentic serving
 ---
 
 # SGLang for Agentic Workloads
 
-This guide covers SGLang-specific configuration for agentic serving with Dynamo. It explains which SGLang engine flags to enable, how Dynamo's [agent hints](../../components/frontend/nvext.md#agent-hints) map to SGLang behavior, and how to use experimental cache pinning to protect KV cache for high-value conversations.
+This guide covers SGLang-specific configuration for agentic serving with Dynamo. It explains which SGLang engine flags to enable and how Dynamo's [agent hints](../../components/frontend/nvext.md#agent-hints) map to SGLang behavior.
 
 ## Overview
 
@@ -109,192 +109,6 @@ for chunk in response:
         print(chunk.choices[0].delta.content, end="")
 ```
 
-## Cache Pinning (Experimental)
-
-> [!WARNING]
-> Cache pinning is experimental and available on development branches only. The API may change.
-
-**Required PRs:**
-- SGLang: [feat: TTL-based prefix pinning with refresh-on-hit for HiRadixCache](https://github.com/sgl-project/sglang/pull/18941)
-- Dynamo: [feat: wire nvext.cache_control TTL-based pinning through Dynamo router](https://github.com/ai-dynamo/dynamo/pull/6213)
-
-Cache pinning lets you explicitly protect KV cache for high-value conversation prefixes. When a request includes `nvext.cache_control`, the router fires a `pin_prefix` call to the SGLang worker after generation completes. Pinned nodes resist eviction for the specified TTL -- even under memory pressure, they are retained (demoted to host memory with HiCache rather than deleted).
-
-### How It Works
-
-```mermaid
-sequenceDiagram
-    participant Client
-    participant Preprocessor
-    participant Router
-    participant Worker as SGLang Worker
-    participant Cache as Radix Cache
-
-    Client->>Preprocessor: chat/completions + nvext.cache_control{ttl}
-    Preprocessor->>Preprocessor: Extract TTL, attach to RoutingHints
-    Preprocessor->>Router: PreprocessedRequest (cache_control_ttl=N)
-    Router->>Router: Select worker, record token_ids + TTL in PinState
-    Router->>Worker: Generate request
-    Worker-->>Router: Stream response tokens
-    Router-->>Client: Stream response tokens
-
-    Note over Router,Worker: On stream completion
-
-    Router-)Worker: pin_prefix(token_ids, ttl) [fire-and-forget]
-    Worker->>Cache: Walk radix tree along token sequence
-    Cache->>Cache: Set pin_expiry, acquire host_ref_counter hold
-    Worker--)Router: {status: ok, nodes_pinned: N}
-
-    Note over Cache: TTL expires
-
-    Cache->>Cache: Clear pin_expiry, release host_ref_counter
-    Note over Cache: Node now eligible for normal eviction
-```
-
-1. The client includes `nvext.cache_control` with a TTL in the request.
-2. The Dynamo preprocessor extracts the TTL and attaches it to routing hints.
-3. The router routes the request normally and records the token IDs in a `PinState`.
-4. After the response stream completes, the router spawns a fire-and-forget `pin_prefix` RPC to the worker that served the request.
-5. The worker walks the radix tree along the token sequence and pins each node, setting `pin_expiry` and acquiring a `host_ref_counter` hold that prevents eviction.
-6. When TTL expires, the pin is cleared and the node becomes eligible for normal eviction.
-
-### Enabling Cache Pinning
-
-**Frontend flag:**
-
-```bash
-python -m dynamo.frontend \
-  --router-mode kv \
-  --enable-cache-control \
-  ...
-```
-
-| Flag | Description |
-|------|-------------|
-| `--enable-cache-control` | Enables cache control (PIN with TTL). Creates a `cache_control` service mesh client and fires `pin_prefix` after generation for requests with `nvext.cache_control`. Requires `--router-mode=kv`. |
-
-**SGLang worker:** The worker receives PIN requests via its `cache_control` service mesh endpoint. You **must** set the `SGLANG_HICACHE_MAX_PINNED_RATIO` environment variable to a non-zero value -- pinning is disabled by default.
-
-| Environment Variable | Type | Default | Description |
-|---------------------|------|---------|-------------|
-| `SGLANG_HICACHE_MAX_PINNED_RATIO` | `float` | `0.0` | Max fraction of cache tokens that can be pinned. Must be in `[0, 1)`. `0` disables pinning entirely. |
-
-HiCache is required (`--enable-hierarchical-cache`). Without it, the scheduler rejects PIN requests. For best results, use `write_through` so that pinned nodes demote to host memory instead of being deleted when GPU memory fills:
-
-```bash
-SGLANG_HICACHE_MAX_PINNED_RATIO=0.1 python -m dynamo.sglang \
-  --model-path Qwen/Qwen3-14B-FP8 \
-  --enable-hierarchical-cache \
-  --hicache-ratio 2.0 \
-  --hicache-write-policy write_through \
-  ...
-```
-
-### Request Format
-
-Include `cache_control` as a top-level field in `nvext`:
-
-```json
-{
-    "model": "Qwen/Qwen3-14B-FP8",
-    "messages": [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": "Explain quantum computing."}
-    ],
-    "nvext": {
-        "cache_control": {
-            "type": "ephemeral",
-            "ttl": "1h"
-        }
-    }
-}
-```
-
-| Field | Type | Description |
-|-------|------|-------------|
-| `cache_control.type` | `string` | Currently only `"ephemeral"` is supported. |
-| `cache_control.ttl` | `string` | TTL as integer seconds (`"600"`) or shorthand (`"5m"`, `"1h"`). Clamped to [300, 3600] seconds. Unrecognized strings default to 300s. |
-
-### Python Example
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
-
-# First turn -- pin the conversation prefix for 1 hour
-response = client.chat.completions.create(
-    model="Qwen/Qwen3-14B-FP8",
-    messages=[
-        {"role": "system", "content": system_prompt},
-        {"role": "user", "content": "Analyze this codebase and suggest improvements."},
-    ],
-    stream=True,
-    extra_body={
-        "nvext": {
-            "cache_control": {
-                "type": "ephemeral",
-                "ttl": "1h"
-            }
-        }
-    }
-)
-
-# Collect the assistant reply
-assistant_response = ""
-for chunk in response:
-    if chunk.choices[0].delta.content:
-        assistant_response += chunk.choices[0].delta.content
-
-# Later turns reuse the pinned prefix -- even after heavy load from
-# other requests, the KV cache for this conversation is preserved.
-response = client.chat.completions.create(
-    model="Qwen/Qwen3-14B-FP8",
-    messages=[
-        {"role": "system", "content": system_prompt},
-        {"role": "user", "content": "Analyze this codebase and suggest improvements."},
-        {"role": "assistant", "content": assistant_response},
-        {"role": "user", "content": "Now focus on the database layer."},
-    ],
-    stream=True,
-    extra_body={
-        "nvext": {
-            "cache_control": {
-                "type": "ephemeral",
-                "ttl": "1h"
-            }
-        }
-    }
-)
-```
-
-### Verifying Cache Hits
-
-The response includes `prompt_tokens_details.cached_tokens` in the `usage` object when `--enable-cache-report` is set on the SGLang worker:
-
-```json
-{
-    "usage": {
-        "prompt_tokens": 2048,
-        "completion_tokens": 150,
-        "prompt_tokens_details": {
-            "cached_tokens": 1920
-        }
-    }
-}
-```
-
-A high `cached_tokens / prompt_tokens` ratio on subsequent turns confirms that the pinned prefix was preserved.
-
-### Limitations
-
-- **Pinning disabled by default**: `SGLANG_HICACHE_MAX_PINNED_RATIO` defaults to `0.0`. You must set it to a non-zero value (e.g., `0.1`) or all PIN requests will be rejected.
-- **HiCache required**: The scheduler rejects PIN requests unless `--enable-hierarchical-cache` is set.
-- **TTL clamping**: Values are clamped to [300, 3600] seconds. You cannot pin for less than 5 minutes or more than 1 hour.
-- **Pin budget**: Pinned tokens consume a budget controlled by `SGLANG_HICACHE_MAX_PINNED_RATIO` (fraction of host pool capacity). Requests exceeding this budget are rejected.
-- **No priority on pinned nodes**: `pin_prefix` does not set a priority on the radix tree nodes. All pinned nodes have equal eviction priority and fall back to LRU ordering among themselves when host memory fills.
-- **Requires stack restart for A/B testing**: Pins persist in cache across benchmark runs. When comparing pinned vs. unpinned performance, restart the full stack between phases to avoid false cache hits.
-
 ## See Also
 
 - **[NVIDIA Request Extensions (nvext)](../../components/frontend/nvext.md)**: Full `nvext` field reference including agent hints

diff --git a/docs/components/frontend/configuration.md b/docs/components/frontend/configuration.md
@@ -45,7 +45,6 @@ The Rust HTTP server also reads these environment variables (not exposed as CLI
 | `--router-event-threads` | `DYN_ROUTER_EVENT_THREADS` | `4` | Event processing threads. >1 enables concurrent radix tree |
 | `--router-queue-threshold` | `DYN_ROUTER_QUEUE_THRESHOLD` | `4.0` | Queue threshold fraction of prefill capacity. Enables priority scheduling |
 | `--router-queue-policy` | `DYN_ROUTER_QUEUE_POLICY` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
-| `--enable-cache-control` / `--no-enable-cache-control` | `DYN_ENABLE_CACHE_CONTROL` | `false` | Enable TTL-based cache pinning (requires `--router-mode=kv`) |
 | `--decode-fallback` / `--no-decode-fallback` | `DYN_DECODE_FALLBACK` | `false` | Fall back to aggregated mode when prefill workers unavailable |
 
 ## Fault Tolerance

diff --git a/docs/components/frontend/nvext.md b/docs/components/frontend/nvext.md
@@ -39,7 +39,6 @@ Include `nvext` as a top-level field alongside standard OpenAI-compatible fields
 | `prefill_worker_id` | `u64` | `None` | Router | Routes the request to a specific prefill worker (disaggregated serving). |
 | `decode_worker_id` | `u64` | `None` | Router | Routes the request to a specific decode worker (disaggregated serving). |
 | `agent_hints` | object | `None` | Router | Per-request hints for scheduling and load balancing. See [Agent Hints](#agent-hints). |
-| `cache_control` | object | `None` | Router | KV cache pinning hint with TTL. See [Cache Control](#cache-control). |
 
 ### Header Overrides
 
@@ -130,31 +129,6 @@ Backend details:
 }
 ```
 
-## Cache Control
-
-> [!WARNING]
-> Cache control is experimental and available on development branches only. The API may change.
-
-The `cache_control` object enables explicit KV cache pinning with a TTL. When set, the router fires a `pin_prefix` call to the backend worker after generation completes, protecting the conversation's KV cache from eviction for the specified duration.
-
-| Field | Type | Default | Description |
-|-------|------|---------|-------------|
-| `cache_control.type` | `string` | — | Cache control type. Currently only `"ephemeral"` is supported. |
-| `cache_control.ttl` | `string` | `"300"` | TTL as integer seconds (`"600"`) or shorthand (`"5m"`, `"1h"`). Clamped to [300, 3600] seconds. |
-
-```json
-{
-    "nvext": {
-        "cache_control": {
-            "type": "ephemeral",
-            "ttl": "1h"
-        }
-    }
-}
-```
-
-Requires `--enable-cache-control` and `--router-mode=kv` on the frontend. See [SGLang for Agentic Workloads](../../backends/sglang/agents.md#cache-pinning-experimental) for full setup and usage details.
-
 ## Response Extensions
 
 When the client requests response metadata via `extra_fields`, the response includes an `nvext` object with the requested fields:
@@ -190,4 +164,4 @@ When the client requests response metadata via `extra_fields`, the response incl
 |----------|-------------|
 | [Frontend Guide](frontend-guide.md) | KServe gRPC configuration and integration |
 | [Router Guide](../router/router-guide.md) | Full router configuration and CLI arguments |
-| [SGLang for Agentic Workloads](../../backends/sglang/agents.md) | SGLang engine flags for priority scheduling, eviction policies, and cache pinning |
+| [SGLang for Agentic Workloads](../../backends/sglang/agents.md) | SGLang engine flags for priority scheduling and eviction policies |