diff --git a/docs/advanced_features/hicache_design.md b/docs/advanced_features/hicache_design.md index fd06aff17212..226617d4d4dc 100644 --- a/docs/advanced_features/hicache_design.md +++ b/docs/advanced_features/hicache_design.md @@ -121,9 +121,9 @@ Specifically, **LMCache**, an efficient KV cache layer for enterprise-scale LLM - **`--enable-hierarchical-cache`**: Enable hierarchical cache functionality. This is required to use HiCache. -- **`--hicache-ratio HICACHE_RATIO`**: The ratio of the size of host KV cache memory pool to the size of device pool. For example, a value of 2 means the host memory pool is twice as large as the device memory pool. The minimum allowed value is 2. +- **`--hicache-ratio HICACHE_RATIO`**: The ratio of the size of host KV cache memory pool to the size of device pool. For example, a value of 2 means the host memory pool is twice as large as the device memory pool. The value of this parameter must be greater than 1, as the current implementation requires the host memory allocated for the KV cache to be larger than the device memory allocated for the KV cache. -- **`--hicache-size HICACHE_SIZE`**: The size of host KV cache memory pool in gigabytes. This parameter overrides `hicache-ratio` if set. For example, `--hicache-size 30` allocates 30GB for the host memory pool **for each rank**. If there are 8 ranks, then the total memory size is 240GB. +- **`--hicache-size HICACHE_SIZE`**: The size of host KV cache memory pool in gigabytes. This parameter overrides `hicache-ratio` if set. For example, `--hicache-size 30` allocates 30GB (1GB = 1e9 bytes) for the host memory pool **for each rank**. If there are 8 ranks, then the total memory size is 240GB. Just like `hicache-ratio`, the value of this parameter must be larger than the size of device memory allocated for KV cache. **Note**: `--hicache-ratio` and `--hicache-size` are two critical parameters. 
In general, a larger HiCache size leads to a higher cache hit rate, which improves prefill performance. However, the relationship between cache size and hit rate is not linear. Once most reusable KV data—especially hot tokens—are already cached, further increasing the size may yield only marginal performance gains. Users can set these parameters based on their workload characteristics and performance requirements. diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py index ee5b4234972c..72d87dfa06fe 100644 --- a/python/sglang/srt/environ.py +++ b/python/sglang/srt/environ.py @@ -194,6 +194,17 @@ class Envs: SGLANG_MOONCAKE_CUSTOM_MEM_POOL = EnvStr(None) ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE = EnvBool(False) + # Mooncake Store + SGLANG_HICACHE_MOONCAKE_CONFIG_PATH = EnvStr(None) + MOONCAKE_MASTER = EnvStr(None) + MOONCAKE_LOCAL_HOSTNAME = EnvStr("localhost") + MOONCAKE_TE_META_DATA_SERVER = EnvStr("P2PHANDSHAKE") + MOONCAKE_GLOBAL_SEGMENT_SIZE = EnvStr("4gb") + MOONCAKE_PROTOCOL = EnvStr("tcp") + MOONCAKE_DEVICE = EnvStr("") + MOONCAKE_MASTER_METRICS_PORT = EnvInt(9003) + MOONCAKE_CHECK_SERVER = EnvBool(False) + # AMD & ROCm SGLANG_USE_AITER = EnvBool(False) SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False) diff --git a/python/sglang/srt/mem_cache/storage/mooncake_store/README.md b/python/sglang/srt/mem_cache/storage/mooncake_store/README.md index 4ccc8ca7e06b..a70dbe56c7d2 100644 --- a/python/sglang/srt/mem_cache/storage/mooncake_store/README.md +++ b/python/sglang/srt/mem_cache/storage/mooncake_store/README.md @@ -2,21 +2,44 @@ This document describes how to use Mooncake as the L3 KV cache for SGLang. 
+Related documentation: +* [Quick Start: SGLang HiCache with Mooncake Backend](https://kvcache-ai.github.io/Mooncake/getting_started/examples/sglang-integration/hicache-quick-start.html) +* [Complete Guide: SGLang HiCache with Mooncake Backend](https://kvcache-ai.github.io/Mooncake/getting_started/examples/sglang-integration/hicache-integration-v1.html) +* [Mooncake x SGLang HiCache System Design](https://kvcache-ai.github.io/Mooncake/design/hicache-design.html) +* [HiCache System Design and Optimization](https://docs.sglang.ai/advanced_features/hicache_design.html) +* [SGLang HiCache with Mooncake Backend Benchmark](https://kvcache-ai.github.io/Mooncake/performance/sglang-hicache-benchmark-results-v1.html) + ## About Mooncake Mooncake aims to enhance the inference efficiency of large language models (LLMs), especially in slow object storage environments, by constructing a multi-level caching pool on high-speed interconnected DRAM/SSD resources. Compared to traditional caching systems, Mooncake utilizes (GPUDirect) RDMA technology to transfer data directly in a zero-copy manner, while maximizing the use of multi-NIC resources on a single machine. For more details about Mooncake, please refer to [Mooncake project](https://github.com/kvcache-ai/Mooncake) and [Mooncake documents](https://kvcache-ai.github.io/Mooncake/). +### Mooncake & SGLang HiCache + +Mooncake serves as a high-performance L3 storage backend for SGLang HiCache, enabling distributed KV cache storage across multiple servers with RDMA-accelerated data transfer. This integration addresses the capacity limitations of traditional GPU-only or GPU+CPU caching by providing virtually unlimited cache storage through a distributed memory pool. + +When a cache miss occurs in L1 and L2, HiCache automatically fetches the required KV cache from Mooncake's distributed memory pool. 
The system uses intelligent prefetching strategies to minimize latency, and utilizes RDMA technology and zero-copy techniques to ensure high-bandwidth, low-latency data transfer between SGLang instances and Mooncake storage nodes. + +**Key Advantages:** + +- **Scalable Capacity**: Aggregate memory across entire clusters into large distributed pools. +- **Cache Sharing**: KV caches can be shared by all SGLang instances in the cluster. +- **RDMA Acceleration**: Direct memory access eliminates CPU overhead and reduces latency. +- **Zero Copy**: Direct data transfer between L2 and Mooncake without intermediate copying, maximizing throughput. +- **Fault Tolerance**: Distributed architecture provides resilience against individual node failures. + +This integration is particularly valuable for production deployments involving long-context models, multi-turn conversations, and high-throughput serving scenarios where traditional caching approaches become capacity-constrained. + ## Install Mooncake -### Method 1: with pip +**Method 1: with pip** ```bash pip install mooncake-transfer-engine ``` -### Method 2: from source +**Method 2: from source** Clone Mooncake project: @@ -31,7 +54,7 @@ cd Mooncake bash dependencies.sh ``` -Build the project. For additional build options, please refer to [the official guide](https://kvcache-ai.github.io/Mooncake/getting_started/build.html). +Build the project: ```bash mkdir build @@ -46,40 +69,49 @@ Install Mooncake: sudo make install ``` -## Deploy Mooncake +For more details, please refer to [Mooncake official installation guide](https://kvcache-ai.github.io/Mooncake/getting_started/build.html). + +## Deployment **Mooncake** is a distributed system that efficiently aggregates memory resources across multiple servers. It can also be deployed on a single server for simpler setups. 
-When integrated with **SGLang**, the system conceptually consists of four key components: `the master service`, `metadata service`, `store service`, and the `SGLang server`. Among them, the `master service` and `metadata service` are responsible for object and metadata maintenance. The `store service` manages a contiguous memory segment that contributes to the distributed KV cache, making its memory accessible to both local and remote `SGLang servers`. Data transfer occurs directly between the `store service` and `SGLang servers`, bypassing the `master service`. +When integrated with **SGLang**, the system conceptually consists of four key components: `the master service`, `metadata service` (Optional), `store service` (Optional), and the `SGLang server`. Among them, the `master service` and `metadata service` are responsible for object and metadata maintenance. The `store service` manages a contiguous memory segment that contributes to the distributed KV cache, making its memory accessible to both local and remote `SGLang servers`. Data transfer occurs directly between the `store service` and `SGLang servers`, bypassing the `master service`. ### Single Server Deployment -There are four components for deploying Mooncake: metadata service, master service, store service and sglang instance. -Note: *Only **master service** is mandatory for single server deployment.* - -**Launch Mooncake `metadata service`(Optional):** +**Launch Mooncake `metadata service` (Optional):** ```bash python -m mooncake.http_metadata_server ``` +This service is responsible for centralized metadata management including internal connection status and related metadata. + +Deployment of the `metadata service` can be skipped in the following cases: +* Mooncake supports non-centralized metadata management via a P2P handshake mechanism to exchange metadata. When using this mode, deployment of the `metadata service` can be skipped. 
+* Mooncake also supports embedding `metadata service` into `master service`. In this case, only the `master service` needs to be started. + +**Launch Mooncake `master service`:** +The `master service` orchestrates the logical storage space pool across the entire cluster, managing KV cache space allocation and eviction. + +To start `mooncake_master`: + ```bash mooncake_master --eviction_high_watermark_ratio=0.95 ``` -To start both the metadata and master services together: +To start `mooncake_master` with embedded `metadata service` (so that a separate `metadata service` deployment can be skipped): + ```bash -mooncake_master --enable_http_metadata_server=true --eviction_high_watermark_ratio=0.95 +mooncake_master --enable_http_metadata_server=true --http_metadata_server_port=8080 --eviction_high_watermark_ratio=0.95 ``` **Understanding `eviction_high_watermark_ratio`:** When a `PutStart` request fails due to insufficient memory, or when the eviction thread detects that space usage has reached the configured high watermark ratio, an eviction task is triggered to free up space by evicting a portion of objects. -Due to memory fragmentation, allocation failures may occur even when memory usage has not yet reached 100%. The actual threshold depends on the workload. This [benchmark document](https://kvcache-ai.github.io/Mooncake/performance/allocator_benchmark_result.html) - provides memory allocation efficiency results under different scenarios. if excessive allocation failures are observed, consider lowering this parameter accordingly. +Due to memory fragmentation, allocation failures may occur even when memory usage has not yet reached 100%. The actual threshold depends on the workload. This [benchmark document](https://kvcache-ai.github.io/Mooncake/performance/allocator-benchmark-result.html) provides memory allocation efficiency results under different scenarios. If excessive allocation failures are observed, consider lowering this parameter accordingly. 
**Launch Mooncake `store service` (Optional):** @@ -88,8 +120,8 @@ First, create and save a configuration file in JSON format. For example: ```json { "local_hostname": "localhost", - "metadata_server": "http://localhost:8080/metadata", - "master_server_address": "localhost:50051", + "metadata_server": "http://127.0.0.1:8080/metadata", + "master_server_address": "127.0.0.1:50051", "protocol": "rdma", "device_name": "", "global_segment_size": "4gb", @@ -97,13 +129,38 @@ First, create and save a configuration file in JSON format. For example: } ``` -Parameter Explanation: +Note: If the `metadata service` is not deployed, set this field to: -* `local_hostname`: The hostname of the `store service`. -* `metadata_server`: The network address of the `metadata service`. The default port is 8080. -* `master_server_address`: The network address of the `master service`. The default port is 50051. -* `protocol`: The protocol used by Mooncake. Supported values are `"rdma"` or `"tcp"`. For optimal performance, `"rdma"` is recommended. -* `device_name`: For `"rdma"`, you can leave this empty in most cases. Mooncake auto-discovers RDMA NICs by default. If you want to pin specific NICs (e.g., `mlx5_0,mlx5_1`), just set `device_name` accordingly. To list available devices, use `ibv_devices`. 
+```json + "metadata_server": "P2PHANDSHAKE", +``` + +Then start the `store service`: + +```bash +python -m mooncake.mooncake_store_service --config=[config_path] --port=8081 +``` + +Mooncake `store service` configuration can also be provided via environment variables: + +```bash +MOONCAKE_LOCAL_HOSTNAME="localhost" \ +MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \ +MOONCAKE_MASTER="127.0.0.1:50051" \ +MOONCAKE_PROTOCOL="rdma" \ +MOONCAKE_DEVICE="" \ +MOONCAKE_GLOBAL_SEGMENT_SIZE="4gb" \ +MOONCAKE_LOCAL_BUFFER_SIZE=0 \ +python -m mooncake.mooncake_store_service --port=8081 +``` + +**Parameter Explanation:** + +* `local_hostname`, `MOONCAKE_LOCAL_HOSTNAME`: The hostname of the `store service`. +* `metadata_server`, `MOONCAKE_TE_META_DATA_SERVER` : The network address of the `metadata service`. The default port is 8080. If the `metadata service` is not deployed, set this field to: `"metadata_server": "P2PHANDSHAKE"`. +* `master_server_address`, `MOONCAKE_MASTER`: The network address of the `master service`. The default port is 50051. +* `protocol`, `MOONCAKE_PROTOCOL`: The protocol used by Mooncake. Supported values are `"rdma"` or `"tcp"`. For optimal performance, `"rdma"` is recommended. +* `device_name`, `MOONCAKE_DEVICE`: The RDMA devices used by Mooncake. This field can usually be left empty, as Mooncake automatically discovers available NICs by default. This parameter is required only when the protocol is set to `"rdma"` **and** a specific set of NICs needs to be used. Example: `"device_name": "mlx5_0,mlx5_1"`. To list available devices, run `ibv_devices`. **Note:** If the environment variable `MC_MS_AUTO_DISC` is set to `1`, any `device_name` or `MOONCAKE_DEVICE` configuration will be overridden, and Mooncake will switch to auto-discovery mode. 
- For tensor parallel deployments where different ranks should use different devices, you can specify device configurations using JSON format: ```json { @@ -114,87 +171,167 @@ Parameter Explanation: ```bash MOONCAKE_DEVICE="{\"0\": \"ib0,ib1\", \"1\": \"ib2,ib3\", \"2\": \"ib4,ib5\"}" ``` -* `global_segment_size`: The amount of memory contributed to the global memory pool. Accepts either bytes (integer) or a string with the `gb` suffix, e.g., `"16gb"`. A larger value allows Mooncake to cache more KV tensors. -* `local_buffer_size`: Local buffer is used to do request operations such as `Get` or `Put`. In this case, it is set to 0 because the instance functions solely as a storage server, contributing memory to the global pool without issuing any request operations. +* `global_segment_size`, `MOONCAKE_GLOBAL_SEGMENT_SIZE`: The amount of memory contributed to the global memory pool. Accepts either bytes (integer) or a string with the `gb` suffix, e.g., `"4294967296"` or `"4gb"`. A larger value allows Mooncake to cache more KV tensors. +* `local_buffer_size`, `MOONCAKE_LOCAL_BUFFER_SIZE`: Local buffer is used to do request operations such as `Get` or `Put`. In this case, it is set to 0 because the instance functions solely as a storage server, contributing memory to the global pool without issuing any request operations. -Then start the `store service`: +**Important: Understanding Global Segment Size** -```bash -python -m mooncake.mooncake_store_service --config=[config_path] -``` +`global_segment_size` and `MOONCAKE_GLOBAL_SEGMENT_SIZE`: This parameter specifies the amount of memory each instance contributes to the distributed memory pool. The total memory available for KV cache storage across the cluster is the sum of the memory contributed by all instances. + +Adjust this value according to system’s available memory and expected cache requirements. 
Note: If `MOONCAKE_GLOBAL_SEGMENT_SIZE` is set to a non-zero value when starting the `SGLang server`, launching the `store service` can be skipped. In this case, the `SGLang server` also takes on the role of the `store service`, which simplifies deployment but couples the two components together. Users can choose the deployment approach that best fits their needs. **Start the `SGLang server` with Mooncake enabled:** -Mooncake configuration can be provided via environment variables. Note that, for optimal performance, the Mooncake backend currently supports only the `page_first` layout (which optimizes memory access patterns for KV cache operations). +There are three ways to configure Mooncake: -There are three ways to prepare mooncakes: -1. Use environment variables; -2. Use json configuration files; -3. Additional configuration using the sglang parameter. +1. Via extra configuration passed through sglang parameters +2. Using JSON configuration files +3. Using environment variables -**Using env variables to configure Mooncake** +Mooncake loads configuration in the following priority order: + +1. If Mooncake-specific options are provided in `--hicache-storage-backend-extra-config`, they are used first. +2. If not, Mooncake checks whether the environment variable `SGLANG_HICACHE_MOONCAKE_CONFIG_PATH` is set, and loads the JSON config file from that path. +3. If neither of the above is provided, Mooncake falls back to environment variables. 
+ +**Using extra-config of sglang arguments to configure Mooncake** ```bash -MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \ -MOONCAKE_MASTER=127.0.0.1:50051 \ -MOONCAKE_PROTOCOL="rdma" \ -# Leave MOONCAKE_DEVICE empty for auto-discovery (default) -# To pin NICs, disable auto-discovery then set MOONCAKE_DEVICE, e.g.: -# export MC_MS_AUTO_DISC=0 -# export MOONCAKE_DEVICE="mlx5_0,mlx5_1" -MOONCAKE_GLOBAL_SEGMENT_SIZE=4gb \ python -m sglang.launch_server \ --enable-hierarchical-cache \ - --hicache-storage-backend mooncake\ - --model-path [model_path] + --hicache-storage-backend mooncake \ + --model-path [model_path] \ + --hicache-storage-backend-extra-config '{"master_server_address": "127.0.0.1:50051", "local_hostname": "localhost", "metadata_server": "http://127.0.0.1:8080/metadata", "global_segment_size": "4gb", "protocol": "rdma", "device_name": ""}' ``` -Parameter Explanation: - -* `MOONCAKE_TE_META_DATA_SERVER`: The network address of the `metadata service`. The default port is 8080. -* `MOONCAKE_MASTER`: The network address of the `master service`. The default port is 50051. -* `MOONCAKE_PROTOCOL`: The protocol used by Mooncake. Supported values are `"rdma"` or `"tcp"`. For optimal performance, `"rdma"` is recommended. -* `MOONCAKE_DEVICE`: Optional for `"rdma"`. By default, Mooncake auto-discovers RDMA NICs. If you need to pin specific NICs, set `MOONCAKE_DEVICE` (comma-separated list, e.g., `mlx5_0,mlx5_1`). -* `MOONCAKE_GLOBAL_SEGMENT_SIZE`: The amount of memory contributed to the global memory pool. Accepts either bytes (integer) or a value with the `gb` suffix, e.g., `16gb`. If at least one `store service` is launched, this value can be set to `0`. In this case, the `SGLang server` will not contribute any memory to the system. Note that KV tensors cached in the contributed memory will be lost once this process terminates; however, this will not cause any system errors. 
- **Using JSON file to configure Mooncake** +SGLang server can load Mooncake config from `SGLANG_HICACHE_MOONCAKE_CONFIG_PATH`. + ```bash export SGLANG_HICACHE_MOONCAKE_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hicache/mooncake_config.json + echo '{ "local_hostname": "localhost", - "metadata_server": "http://localhost:8080/metadata", - "master_server_address": "localhost:50051", + "metadata_server": "http://127.0.0.1:8080/metadata", + "master_server_address": "127.0.0.1:50051", "protocol": "rdma", "device_name": "", - "global_segment_size": "4gb", - "local_buffer_size": 0 + "global_segment_size": "4gb" }' > ${SGLANG_HICACHE_MOONCAKE_CONFIG_PATH} + +python -m sglang.launch_server \ + --enable-hierarchical-cache \ + --hicache-storage-backend mooncake \ + --model-path [model_path] ``` -**Using extra-config of sglang arguments to configure Mooncake** +**Using env variables to configure Mooncake** ```bash +MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \ +MOONCAKE_MASTER="127.0.0.1:50051" \ +MOONCAKE_PROTOCOL="rdma" \ +MOONCAKE_DEVICE="" \ +MOONCAKE_GLOBAL_SEGMENT_SIZE="4gb" \ python -m sglang.launch_server \ --enable-hierarchical-cache \ - --hicache-storage-backend mooncake \ - --model-path [model_path] \ - --hicache-storage-backend-extra-config '{"master_server_address": "127.0.0.1:50051", "local_hostname": "localhost", "metadata_server": "http://127.0.0.1:8080/metadata", "global_segment_size": "4gb", "local_buffer_size": 16777216, "protocol": "rdma", "device_name": ""}' + --hicache-storage-backend mooncake\ + --model-path [model_path] ``` -**Important: Understanding Global Segment Size** +**Parameter Explanation:** -`global_segment_size` for `store service` and `MOONCAKE_GLOBAL_SEGMENT_SIZE` for `SGLang service`: This parameter specifies the amount of memory each instance contributes to the distributed memory pool. The total memory available for KV cache storage across the cluster is the sum of the memory contributed by all instances. 
+The Mooncake parameters used here are essentially the same as those configured for the `store service`. -Adjust this value according to system’s available memory and expected cache requirements. +In particular, for the `global segment size`, if at least one `store service` instance is running, this value can be set to `0`. In this case, the SGLang server will not contribute any memory to the system. Note that KV tensors stored in this contributed memory will be lost when the process exits; however, this will **not** cause any system errors. + +**Important:** When `tp > 1`, each Tensor Parallel (TP) rank launches its own Mooncake backend instance and contributes an equal share of the configured size, i.e., `global_segment_size / tp_size` memory. Therefore, the total memory contributed by all ranks equals `global_segment_size`. + +**HiCache Related Parameters for SGLang Server** + +For a comprehensive overview of HiCache-related parameters, please refer to [this document](https://docs.sglang.ai/advanced_features/hicache_design.html#related-parameters). + + +Note that, for `--hicache-mem-layout {layer_first,page_first,page_first_direct}`, which specifies the memory layout for the host memory pool, `page_first` or `page_first_direct` is required when using the Mooncake backend. ### Distributed Deployment Distributed deployment of Mooncake is straightforward. Similar to the single-node setup, start one `metadata service` and one `master service` for this cluster. Then start a `store service` on each server. -Mooncake also supports high availability mode. This mode enhances fault tolerance by running the `master service` as a cluster of multiple master nodes coordinated through an `etcd` cluster. The master nodes use `etcd` to elect a leader, which is responsible for handling client requests. For more details about how to deploy in this mode, please refer to our [documents](https://kvcache-ai.github.io/Mooncake/) . +Mooncake also supports high availability mode. 
This mode enhances fault tolerance by running the `master service` as a cluster of multiple master nodes coordinated through an `etcd` cluster. The master nodes use `etcd` to elect a leader, which is responsible for handling client requests. For more details about how to deploy in this mode, please refer to our [documents](https://kvcache-ai.github.io/Mooncake/). + +### Prefill/Decode Disaggregation + +In **PD disaggregation**, the configurations for the `metadata service`, `mooncake master`, and the optional `store service` remain the same as described above. The difference is that SGLang introduces three distinct roles: `prefill worker`, `decode worker`, and `router`. + +Among these, the `prefill worker` supports enabling **HiCache**. To run with PD disaggregation, start from the [PD configuration](https://kvcache-ai.github.io/Mooncake/getting_started/examples/sglang-integration-v1.html), and add the HiCache-related parameters (as previously described for the `SGLang server`) to the `prefill worker`. + +In the example below, one `prefill worker`, one `decode worker`, and one `router` are launched. HiCache is enabled on the `prefill worker` to optimize prefill performance. 
+ +**Prefill worker**: + +```bash +MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \ +MOONCAKE_MASTER=127.0.0.1:50051 \ +MOONCAKE_PROTOCOL="rdma" \ +MOONCAKE_DEVICE="mlx5_1" \ +MOONCAKE_GLOBAL_SEGMENT_SIZE=4294967296 \ +python -m sglang.launch_server \ + --model-path [model_path] \ + --page-size 64 \ + --enable-hierarchical-cache \ + --hicache-storage-prefetch-policy timeout \ + --hicache-storage-backend mooncake \ + --disaggregation-mode prefill \ + --disaggregation-ib-device "mlx5_1" \ + --base-gpu-id 0 \ + --port 30000 +``` + +**Decode worker**: + +```bash +python -m sglang.launch_server \ + --model-path [model_path] \ + --page-size 64 \ + --disaggregation-mode decode \ + --disaggregation-ib-device "mlx5_1" \ + --base-gpu-id 1 \ + --port 30001 +``` + +**Router**: + +```bash +python -m sglang_router.launch_router \ + --pd-disaggregation \ + --prefill "http://127.0.0.1:30000" \ + --decode "http://127.0.0.1:30001" \ + --host 0.0.0.0 \ + --port 8000 +``` + +## Troubleshooting + +**RDMA Registration Failure:** + +* In some environments, RDMA registration may require root privileges. In this case, try running the program as root. +* In certain environments (e.g., eRDMA), there is an upper limit on the total amount of RDMA memory that can be registered. Once this limit is exceeded, registration will fail. To resolve this, you can lower the value of `MOONCAKE_GLOBAL_SEGMENT_SIZE`, or reduce the host memory allocated to HiCache in the `SGLang server` (since this memory is fully registered with RDMA to enable zero-copy). + +**HiCache CPU Memory Usage:** + +When using HiCache, the default L2 host DRAM (CPU memory) size for KV cache is **2 times** the size of the L1 device memory (GPU memory) for KV cache. + +If the model is small but the GPU memory is large — especially in multi-TP (tensor parallel) setups — this may cause the L1 KV cache to become very large, which in turn can consume excessive CPU DRAM. 
+ +In such cases, you should manually configure an appropriate L2 cache size based on your hardware. This can be done by setting `--hicache-ratio` or `--hicache-size`. + +**More Information:** + +Additional troubleshooting information can be found [here](https://kvcache-ai.github.io/Mooncake/troubleshooting/troubleshooting.html). ## Test Mooncake Store @@ -206,8 +343,6 @@ First, start the `metadata service` and `master service`. Then run the `test_moo MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \ MOONCAKE_MASTER=127.0.0.1:50051 \ MOONCAKE_PROTOCOL="rdma" \ -# Auto-discovery by default. To pin NICs: -# export MOONCAKE_DEVICE="mlx5_0,mlx5_1" MOONCAKE_GLOBAL_SEGMENT_SIZE=16777216 \ python3 [path of test_mooncake_store.py] ``` diff --git a/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py b/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py index 73d1ff0138bc..f6101229ec38 100644 --- a/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +++ b/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py @@ -9,6 +9,7 @@ import requests import torch +from sglang.srt.environ import envs from sglang.srt.mem_cache.hicache_storage import ( HiCacheStorage, HiCacheStorageConfig, @@ -16,12 +17,8 @@ ) from sglang.srt.mem_cache.memory_pool_host import HostKVCache -DEFAULT_GLOBAL_SEGMENT_SIZE = 4 * 1024 * 1024 * 1024 # 4 GiB DEFAULT_LOCAL_BUFFER_SIZE = 16 * 1024 * 1024 # 16 MB -DEFAULT_MOONCAKE_CONFIG_PATH_ENV = "SGLANG_HICACHE_MOONCAKE_CONFIG_PATH" SETUP_TIMEOUT = 600 # 10min -DEFAULT_MASTER_METRICS_PORT = 9003 -DEFAULT_CHECK_SERVER = False logger = logging.getLogger(__name__) @@ -47,7 +44,6 @@ class MooncakeStoreConfig: local_hostname: str metadata_server: str global_segment_size: int - local_buffer_size: int protocol: str device_name: str master_server_address: str @@ -57,28 +53,39 @@ class MooncakeStoreConfig: @staticmethod def from_file() -> "MooncakeStoreConfig": """Load the config from a JSON 
file.""" - file_path = os.getenv(DEFAULT_MOONCAKE_CONFIG_PATH_ENV) + if not envs.SGLANG_HICACHE_MOONCAKE_CONFIG_PATH.is_set(): + raise RuntimeError( + f"Config file path not set. Please set {envs.SGLANG_HICACHE_MOONCAKE_CONFIG_PATH.name}" + ) + file_path = envs.SGLANG_HICACHE_MOONCAKE_CONFIG_PATH.value try: with open(file_path) as fin: config = json.load(fin) except Exception as e: raise RuntimeError(f"Failed to load config from {file_path}: {str(e)}") + if "master_server_address" not in config: + raise ValueError("master_server_address is required in config file") + return MooncakeStoreConfig( - local_hostname=config.get("local_hostname"), - metadata_server=config.get("metadata_server"), + local_hostname=config.get( + "local_hostname", envs.MOONCAKE_LOCAL_HOSTNAME.default + ), + metadata_server=config.get( + "metadata_server", envs.MOONCAKE_TE_META_DATA_SERVER.default + ), global_segment_size=_parse_global_segment_size( - config.get("global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE) + config.get( + "global_segment_size", envs.MOONCAKE_GLOBAL_SEGMENT_SIZE.default + ) ), - # Zero copy interface does not need local buffer - local_buffer_size=DEFAULT_LOCAL_BUFFER_SIZE, - protocol=config.get("protocol", "tcp"), - device_name=config.get("device_name", ""), + protocol=config.get("protocol", envs.MOONCAKE_PROTOCOL.default), + device_name=config.get("device_name", envs.MOONCAKE_DEVICE.default), master_server_address=config.get("master_server_address"), master_metrics_port=config.get( - "master_metrics_port", DEFAULT_MASTER_METRICS_PORT + "master_metrics_port", envs.MOONCAKE_MASTER_METRICS_PORT.default ), - check_server=config.get("check_server", DEFAULT_CHECK_SERVER), + check_server=config.get("check_server", envs.MOONCAKE_CHECK_SERVER.default), ) @staticmethod @@ -90,23 +97,30 @@ def load_from_env() -> "MooncakeStoreConfig": export MOONCAKE_TE_META_DATA_SERVER="P2PHANDSHAKE" """ # other required environment variables... 
- if not os.getenv("MOONCAKE_MASTER"): + if not envs.MOONCAKE_MASTER.is_set(): raise ValueError("The environment variable 'MOONCAKE_MASTER' is not set.") + + # Special handling for local_hostname: try MOONCAKE_LOCAL_HOSTNAME first, + # then fall back to LOCAL_HOSTNAME if not set. + # This is for forward compatibility with the legacy LOCAL_HOSTNAME environment variable. + if envs.MOONCAKE_LOCAL_HOSTNAME.is_set(): + local_hostname = envs.MOONCAKE_LOCAL_HOSTNAME.value + else: + local_hostname = os.getenv( + "LOCAL_HOSTNAME", envs.MOONCAKE_LOCAL_HOSTNAME.default + ) + return MooncakeStoreConfig( - local_hostname=os.getenv("LOCAL_HOSTNAME", "localhost"), - metadata_server=os.getenv("MOONCAKE_TE_META_DATA_SERVER", "P2PHANDSHAKE"), + local_hostname=local_hostname, + metadata_server=envs.MOONCAKE_TE_META_DATA_SERVER.value, global_segment_size=_parse_global_segment_size( - os.getenv("MOONCAKE_GLOBAL_SEGMENT_SIZE", DEFAULT_GLOBAL_SEGMENT_SIZE) + envs.MOONCAKE_GLOBAL_SEGMENT_SIZE.value ), - # Zero copy interface does not need local buffer - local_buffer_size=DEFAULT_LOCAL_BUFFER_SIZE, - protocol=os.getenv("MOONCAKE_PROTOCOL", "tcp"), - device_name=os.getenv("MOONCAKE_DEVICE", ""), - master_server_address=os.getenv("MOONCAKE_MASTER"), - master_metrics_port=int( - os.getenv("MOONCAKE_MASTER_METRICS_PORT", DEFAULT_MASTER_METRICS_PORT) - ), - check_server=bool(os.getenv("MOONCAKE_CHECK_SERVER", DEFAULT_CHECK_SERVER)), + protocol=envs.MOONCAKE_PROTOCOL.value, + device_name=envs.MOONCAKE_DEVICE.value, + master_server_address=envs.MOONCAKE_MASTER.value, + master_metrics_port=envs.MOONCAKE_MASTER_METRICS_PORT.value, + check_server=envs.MOONCAKE_CHECK_SERVER.value, ) @staticmethod @@ -116,21 +130,26 @@ def load_from_extra_config(extra_config: dict) -> "MooncakeStoreConfig": raise ValueError("master_server_address is required in extra_config") return MooncakeStoreConfig( - local_hostname=extra_config.get("local_hostname", "localhost"), - 
metadata_server=extra_config.get("metadata_server", "P2PHANDSHAKE"), - global_segment_size=_parse_global_segment_size( - extra_config.get("global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE) + local_hostname=extra_config.get( + "local_hostname", envs.MOONCAKE_LOCAL_HOSTNAME.default ), - local_buffer_size=extra_config.get( - "local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE + metadata_server=extra_config.get( + "metadata_server", envs.MOONCAKE_TE_META_DATA_SERVER.default + ), + global_segment_size=_parse_global_segment_size( + extra_config.get( + "global_segment_size", envs.MOONCAKE_GLOBAL_SEGMENT_SIZE.default + ) ), - protocol=extra_config.get("protocol", "tcp"), - device_name=extra_config.get("device_name", ""), + protocol=extra_config.get("protocol", envs.MOONCAKE_PROTOCOL.default), + device_name=extra_config.get("device_name", envs.MOONCAKE_DEVICE.default), master_server_address=extra_config["master_server_address"], master_metrics_port=extra_config.get( - "master_metrics_port", DEFAULT_MASTER_METRICS_PORT + "master_metrics_port", envs.MOONCAKE_MASTER_METRICS_PORT.default + ), + check_server=extra_config.get( + "check_server", envs.MOONCAKE_CHECK_SERVER.default ), - check_server=extra_config.get("check_server", DEFAULT_CHECK_SERVER), ) @@ -164,7 +183,7 @@ def __init__(self, storage_config: HiCacheStorageConfig = None): logger.info( "Mooncake Configuration loaded from extra_config successfully." 
) - elif os.getenv(DEFAULT_MOONCAKE_CONFIG_PATH_ENV): + elif envs.SGLANG_HICACHE_MOONCAKE_CONFIG_PATH.is_set(): # Load from config file self.config = MooncakeStoreConfig.from_file() logger.info("Mooncake Configuration loaded from file successfully.") @@ -178,7 +197,6 @@ def __init__(self, storage_config: HiCacheStorageConfig = None): per_tp_global_segment_size = ( self.config.global_segment_size // tp_scale_factor ) - per_tp_local_buffer_size = self.config.local_buffer_size // tp_scale_factor # Check if extra_backend_tag should be passed to MooncakeDistributedStore self.extra_backend_tag = None @@ -213,15 +231,17 @@ def __init__(self, storage_config: HiCacheStorageConfig = None): self.config.local_hostname, self.config.metadata_server, per_tp_global_segment_size, - per_tp_local_buffer_size, + DEFAULT_LOCAL_BUFFER_SIZE, # Zero copy interface does not need local buffer self.config.protocol, device_name, self.config.master_server_address, ) if ret_code: - logger.error(f"failed to setup mooncake store, error code: {ret_code}") + raise RuntimeError( + f"Failed to setup Mooncake store, error code: {ret_code}" + ) + logger.info("Mooncake store setup successfully.") - logger.info("Connect to Mooncake store successfully.") self.warmup() logger.info("Mooncake store warmup successfully.") @@ -292,7 +312,10 @@ def register_mem_pool_host(self, mem_pool_host: HostKVCache): buffer_size = buffer.numel() * buffer.element_size() ret_code = self.store.register_buffer(buffer_ptr, buffer_size) if ret_code: - logger.error(f"failed to register buffer, error code: {ret_code}") + logger.error(f"Failed to register buffer, error code: {ret_code}") + raise RuntimeError( + f"Failed to register buffer to Mooncake Store, error code: {ret_code}" + ) except TypeError as err: logger.error("Failed to register buffer to Mooncake Store: %s", err) raise TypeError("Mooncake Store Register Buffer Error.") from err