Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
2b56246
[HiCache]: support runtime attach/detach hicache storage
alphabetc1 Dec 26, 2025
72e3929
add ut
alphabetc1 Dec 26, 2025
1b51810
support hicache_storage_prefetch_policy
alphabetc1 Dec 27, 2025
003f7b2
fix
alphabetc1 Dec 27, 2025
775c998
refactor the existing storage backend init to use the same attach/det…
alphabetc1 Dec 29, 2025
fab4275
fix ci
alphabetc1 Dec 30, 2025
9fac448
fix
alphabetc1 Dec 30, 2025
e878adf
support update hicache_write_policy
alphabetc1 Jan 4, 2026
6033659
support config switch
alphabetc1 Jan 4, 2026
5a130de
Merge remote-tracking branch 'origin/main' into feat/hicache_store_ru…
alphabetc1 Jan 4, 2026
59a479a
fix mtr
alphabetc1 Jan 6, 2026
2934b8a
Merge branch 'main' into feat/hicache_store_runtime_attach_detach
alphabetc1 Jan 6, 2026
908fa97
Merge remote-tracking branch 'origin/main' into feat/hicache_store_ru…
alphabetc1 Jan 6, 2026
86da98a
Merge branch 'main' into feat/hicache_store_runtime_attach_detach
alphabetc1 Jan 9, 2026
bb7e8d7
Merge branch 'main' into feat/hicache_store_runtime_attach_detach
alphabetc1 Jan 14, 2026
5d384fb
add security
alphabetc1 Jan 14, 2026
c23477c
Merge branch 'main' into feat/hicache_store_runtime_attach_detach
alphabetc1 Jan 15, 2026
b8fe011
Merge branch 'main' into feat/hicache_store_runtime_attach_detach
alphabetc1 Jan 16, 2026
105e7d5
mock ADMIN_FORCE
alphabetc1 Jan 17, 2026
0ef30a8
Merge branch 'main' into feat/hicache_store_runtime_attach_detach
alphabetc1 Jan 17, 2026
4e6b48b
make API more RESTful
alphabetc1 Jan 19, 2026
a6f0610
Merge branch 'main' into feat/hicache_store_runtime_attach_detach
alphabetc1 Jan 19, 2026
b25a6c7
Merge branch 'main' into feat/hicache_store_runtime_attach_detach
alphabetc1 Jan 20, 2026
2d994a5
[HiCache] support force attach/detach of HiCache storage
alphabetc1 Jan 21, 2026
50adb6b
[HiCache] storage fault tolerance
alphabetc1 Jan 21, 2026
888e8d5
Merge branch 'main' into feat/hicache_store_ha
alphabetc1 Feb 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions docs/advanced_features/hicache_storage_runtime_attach_detach.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,19 @@

This document explains how to **dynamically attach/detach the HiCache L3 storage backend at runtime** (e.g., `mooncake` / `hf3fs` / `nixl` / `file` / `aibrix` / `eic`) while **SGLang is already running and serving traffic**, without restarting the process.

For safety and consistency, the current implementation **strictly requires** these operations to happen only when the service is **idle**:
For safety and consistency, the default implementation **strictly requires** these operations to happen only when the service is **idle**:

- **No running requests**
- **No waiting/queued requests**

If the idle condition is not met, the API will fail fast (HTTP 400) and **will not modify** the current service state.

You can optionally enable a **force mode** to switch even under load. In force mode:

- Requests **do not use** the storage backend during the switch (reads are treated as misses; writes are skipped).
- The switch waits for **existing storage operations to drain** before actual attach/detach.
- If any step fails, the API returns an error without crashing the server, and the temporary block on storage I/O is rolled back.

---

## 1. Background and implementation overview
Expand Down Expand Up @@ -99,7 +105,8 @@ curl -s -X PUT http://127.0.0.1:30000/hicache/storage-backend \
-d '{
"hicache_storage_backend": "mooncake",
"hicache_storage_backend_extra_config_json": "{\"master_server_address\":\"127.0.0.1:50051\",\"protocol\":\"tcp\",\"global_segment_size\":\"4gb\",\"prefetch_threshold\":256}",
"hicache_storage_prefetch_policy": "timeout"
"hicache_storage_prefetch_policy": "timeout",
"force": true
}'
```

Expand All @@ -115,6 +122,14 @@ Notes:
curl -s -X DELETE http://127.0.0.1:30000/hicache/storage-backend
```

```bash
curl -s -X DELETE http://127.0.0.1:30000/hicache/storage-backend \
-H 'Content-Type: application/json' \
-d '{
"force": true
}'
```

Notes:

- Detach only makes SGLang **stop using** the L3 storage backend and stops prefetch/backup threads
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,8 @@ def _trigger_backup(
incremental_tokens,
hash_value=page_hashes,
)
if ack_id is None:
return
self.ongoing_backup[ack_id] = (req.rid, host_indices, start_time)

def _compute_prefix_hash(self, tokens, prior_hash=""):
Expand Down
20 changes: 16 additions & 4 deletions python/sglang/srt/entrypoints/http_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
ConfigureLoggingReq,
ContinueGenerationReqInput,
DestroyWeightsUpdateGroupReqInput,
DetachHiCacheStorageReqInput,
EmbeddingReqInput,
GenerateReqInput,
GetWeightsByNameReqInput,
Expand Down Expand Up @@ -726,7 +727,8 @@ async def clear_hicache_storage_backend():
# "hicache_storage_backend": "file",
# "hicache_storage_backend_extra_config_json": "{}",
# "hicache_storage_prefetch_policy": "timeout",
# "hicache_write_policy": "write_through"
# "hicache_write_policy": "write_through",
# "force": false
# }'
@app.api_route("/hicache/storage-backend", methods=["PUT"])
@auth_level(AuthLevel.ADMIN_OPTIONAL)
Expand All @@ -743,6 +745,7 @@ async def attach_hicache_storage_backend(obj: AttachHiCacheStorageReqInput):
hicache_storage_backend_extra_config_json=obj.hicache_storage_backend_extra_config_json,
hicache_storage_prefetch_policy=obj.hicache_storage_prefetch_policy,
hicache_write_policy=obj.hicache_write_policy,
force=obj.force,
)
msg = getattr(ret, "message", "")
return Response(
Expand All @@ -759,18 +762,27 @@ async def attach_hicache_storage_backend(obj: AttachHiCacheStorageReqInput):


# example usage:
# curl -s -X DELETE http://127.0.0.1:30000/hicache/storage-backend
# curl -s -X DELETE http://127.0.0.1:30000/hicache/storage-backend \
# -H 'Content-Type: application/json' \
# -d '{
# "force": false
# }'
@app.api_route("/hicache/storage-backend", methods=["DELETE"])
@auth_level(AuthLevel.ADMIN_OPTIONAL)
async def detach_hicache_storage_backend():
async def detach_hicache_storage_backend(
obj: Optional[DetachHiCacheStorageReqInput] = None,
):
"""Detach (disable) HiCache storage backend at runtime.

Only allowed when there are NO running / queued requests.
"""
if not _global_state.tokenizer_manager.server_args.admin_api_key:
return _admin_api_key_missing_response()

ret = await _global_state.tokenizer_manager.detach_hicache_storage()
if obj is None:
obj = DetachHiCacheStorageReqInput()

ret = await _global_state.tokenizer_manager.detach_hicache_storage(force=obj.force)
msg = getattr(ret, "message", "")
return Response(
content=(
Expand Down
25 changes: 25 additions & 0 deletions python/sglang/srt/managers/cache_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ def __init__(
self.storage_backend_type = None
self.pp_rank = pp_rank
self.pp_size = pp_size
self.fault_reporter = None

# Default storage page IO functions (may be overridden by attach).
self.page_get_func = self._generic_page_get
Expand All @@ -284,6 +285,7 @@ def __init__(
# transfer buffers (CPU<->GPU). We want to allow runtime attach/detach of
# storage without stopping the whole controller.
self.storage_stop_event = threading.Event()
self.storage_io_blocked = threading.Event()

self.device = self.mem_pool_device.device
self.layer_num = self.mem_pool_device.layer_num
Expand Down Expand Up @@ -396,6 +398,21 @@ def _stop_storage_threads(self):
)
raise RuntimeError("Failed to stop HiCache storage threads cleanly.")

def set_storage_io_blocked(self, blocked: bool):
    """Raise or lower the gate that blocks new storage I/O.

    While the ``storage_io_blocked`` event is set, ``prefetch`` and
    ``write_storage`` return ``None`` before creating an operation.

    Args:
        blocked: ``True`` sets the ``storage_io_blocked`` event; ``False``
            clears it.
    """
    if blocked:
        self.storage_io_blocked.set()
    else:
        self.storage_io_blocked.clear()

def is_storage_io_blocked(self) -> bool:
    """Return ``True`` while the ``storage_io_blocked`` event is set."""
    return self.storage_io_blocked.is_set()

def set_fault_reporter(self, reporter):
    """Store ``reporter`` and forward it to the current storage backend.

    The reporter is always saved on ``self.fault_reporter``. It is also
    passed to ``self.storage_backend.set_fault_reporter`` when a backend
    is present (attribute exists and is not ``None``) and that backend
    defines ``set_fault_reporter``.

    Args:
        reporter: Object to record and forward; it is not inspected here.
    """
    self.fault_reporter = reporter

    # The attribute may be missing or None when no backend is attached.
    backend = getattr(self, "storage_backend", None)
    if backend is None:
        return
    # Forward only to backends that define the hook.
    if hasattr(backend, "set_fault_reporter"):
        backend.set_fault_reporter(reporter)

def attach_storage_backend(
self,
storage_backend: str,
Expand Down Expand Up @@ -444,6 +461,10 @@ def attach_storage_backend(
storage_backend, self.storage_config, self.mem_pool_host
)
self.storage_backend.register_mem_pool_host(self.mem_pool_host)
if self.fault_reporter is not None and hasattr(
self.storage_backend, "set_fault_reporter"
):
self.storage_backend.set_fault_reporter(self.fault_reporter)

self.enable_storage = True
# todo: threshold policy for prefetching
Expand Down Expand Up @@ -769,6 +790,8 @@ def prefetch(
"""
Prefetch KV caches from storage backend to host memory.
"""
if self.storage_io_blocked.is_set():
return None
operation = PrefetchOperation(
request_id, host_indices, new_input_tokens, last_hash, prefix_keys
)
Expand Down Expand Up @@ -967,6 +990,8 @@ def write_storage(
"""
Write KV caches from host memory to storage backend.
"""
if self.storage_io_blocked.is_set():
return None
operation = StorageOperation(
host_indices, token_ids, hash_value=hash_value, prefix_keys=prefix_keys
)
Expand Down
3 changes: 2 additions & 1 deletion python/sglang/srt/managers/io_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -1179,6 +1179,7 @@ class AttachHiCacheStorageReqInput(BaseReq):
hicache_storage_backend_extra_config_json: Optional[str] = None
hicache_storage_prefetch_policy: Optional[str] = None
hicache_write_policy: Optional[str] = None
force: bool = False

def __post_init__(self):
if self.hicache_storage_prefetch_policy is None:
Expand Down Expand Up @@ -1211,7 +1212,7 @@ class AttachHiCacheStorageReqOutput(BaseReq):
class DetachHiCacheStorageReqInput(BaseReq):
"""Dynamically detach (disable) HiCache storage backend at runtime."""

pass
force: bool = False


@dataclass
Expand Down
Loading
Loading