-
Notifications
You must be signed in to change notification settings - Fork 37
Add all GB200/GB300 FP8 MTP recipes #134
Changes from all commits
c3a657a
a834cd2
503e5d0
3b94448
6b5fb56
377fce0
95b1a22
f1a43bb
d6f43da
7146d97
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,186 @@ | ||||||||||||||
| # GB200 FP8 Mid-Curve 3P1D MTP Configuration | ||||||||||||||
|
|
||||||||||||||
| name: "gb200-fp8-mid-curve-3p1d-mtp" | ||||||||||||||
|
Comment on lines
+1
to
+3
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Inconsistent header comment. The comment on Line 1 says "Max Throughput Configuration" but the name and filename indicate this is a "mid-curve" configuration. This appears to be a copy-paste artifact. Proposed fix-# GB200 FP8 Max Throughput Configuration
+# GB200 FP8 Mid-Curve 3P1D MTP Configuration📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||
|
|
||||||||||||||
| frontend: | ||||||||||||||
| nginx_container: nginx | ||||||||||||||
|
|
||||||||||||||
| model: | ||||||||||||||
| path: "dsfp8" | ||||||||||||||
| container: "lmsysorg/sglang:v0.5.8-cu130-runtime" | ||||||||||||||
| precision: "fp8" | ||||||||||||||
|
|
||||||||||||||
| resources: | ||||||||||||||
| gpu_type: "gb200" | ||||||||||||||
| prefill_nodes: 6 | ||||||||||||||
| prefill_workers: 3 | ||||||||||||||
| decode_nodes: 12 | ||||||||||||||
| decode_workers: 1 | ||||||||||||||
| gpus_per_node: 4 | ||||||||||||||
|
|
||||||||||||||
| backend: | ||||||||||||||
|
|
||||||||||||||
| # Prefill-specific environment variables | ||||||||||||||
| prefill_environment: | ||||||||||||||
| TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" | ||||||||||||||
| DYN_SKIP_SGLANG_LOG_FORMATTING: "1" | ||||||||||||||
| MC_TE_METRIC: "true" | ||||||||||||||
| SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" | ||||||||||||||
| SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" | ||||||||||||||
| SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" | ||||||||||||||
| SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" | ||||||||||||||
| MC_FORCE_MNNVL: "1" | ||||||||||||||
| NCCL_MNNVL_ENABLE: "1" | ||||||||||||||
| NCCL_CUMEM_ENABLE: "1" | ||||||||||||||
| SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" | ||||||||||||||
| SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" | ||||||||||||||
| PYTHONUNBUFFERED: "1" | ||||||||||||||
| SGLANG_ENABLE_SPEC_V2: "1" | ||||||||||||||
| SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1" | ||||||||||||||
| SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" | ||||||||||||||
|
|
||||||||||||||
| # Decode-specific environment variables | ||||||||||||||
| decode_environment: | ||||||||||||||
| TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" | ||||||||||||||
| DYN_SKIP_SGLANG_LOG_FORMATTING: "1" | ||||||||||||||
| SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" | ||||||||||||||
| MC_TE_METRIC: "true" | ||||||||||||||
| SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" | ||||||||||||||
| SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" | ||||||||||||||
| SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" | ||||||||||||||
| SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" | ||||||||||||||
| SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" | ||||||||||||||
| SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" | ||||||||||||||
| MC_FORCE_MNNVL: "1" | ||||||||||||||
| NCCL_MNNVL_ENABLE: "1" | ||||||||||||||
| NCCL_CUMEM_ENABLE: "1" | ||||||||||||||
| SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" | ||||||||||||||
| SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" | ||||||||||||||
| PYTHONUNBUFFERED: "1" | ||||||||||||||
| SGLANG_ENABLE_SPEC_V2: "1" | ||||||||||||||
| SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1" | ||||||||||||||
| SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" | ||||||||||||||
|
|
||||||||||||||
| sglang_config: | ||||||||||||||
| prefill: | ||||||||||||||
| # Model configuration | ||||||||||||||
| served-model-name: "deepseek-ai/DeepSeek-R1" | ||||||||||||||
| skip-tokenizer-init: true | ||||||||||||||
| trust-remote-code: true | ||||||||||||||
|
|
||||||||||||||
| # Parallelism | ||||||||||||||
| tp-size: 8 | ||||||||||||||
| dp-size: 8 | ||||||||||||||
| ep-size: 8 | ||||||||||||||
| enable-dp-attention: true | ||||||||||||||
|
|
||||||||||||||
| # KV cache and attention | ||||||||||||||
| attention-backend: "trtllm_mla" | ||||||||||||||
| kv-cache-dtype: "fp8_e4m3" | ||||||||||||||
|
|
||||||||||||||
| # Radix cache disabled | ||||||||||||||
| disable-radix-cache: true | ||||||||||||||
|
|
||||||||||||||
| # Other flags | ||||||||||||||
| stream-interval: 50 | ||||||||||||||
| max-running-requests: 30000 | ||||||||||||||
| context-length: 2200 | ||||||||||||||
| watchdog-timeout: 1000000 | ||||||||||||||
| disable-shared-experts-fusion: true | ||||||||||||||
| eplb-algorithm: "deepseek" | ||||||||||||||
| disaggregation-bootstrap-port: 30001 | ||||||||||||||
|
|
||||||||||||||
| # Prefill-specific mode | ||||||||||||||
| disaggregation-mode: "prefill" | ||||||||||||||
| disaggregation-transfer-backend: "nixl" | ||||||||||||||
|
|
||||||||||||||
| # Memory and token limits | ||||||||||||||
| mem-fraction-static: 0.75 | ||||||||||||||
| max-total-tokens: 524288 | ||||||||||||||
| chunked-prefill-size: 131072 | ||||||||||||||
|
|
||||||||||||||
| # Request handling | ||||||||||||||
| load-balance-method: "round_robin" | ||||||||||||||
|
|
||||||||||||||
| # Performance optimizations | ||||||||||||||
| disable-cuda-graph: true | ||||||||||||||
|
|
||||||||||||||
| # DeepEP configuration | ||||||||||||||
| moe-a2a-backend: "deepep" | ||||||||||||||
| deepep-mode: "normal" | ||||||||||||||
| ep-dispatch-algorithm: "dynamic" | ||||||||||||||
| moe-dense-tp-size: 1 | ||||||||||||||
| enable-dp-lm-head: true | ||||||||||||||
| ep-num-redundant-experts: 32 | ||||||||||||||
| deepep-config: "/configs/deepep_config.json" | ||||||||||||||
|
|
||||||||||||||
| # MTP | ||||||||||||||
| speculative-algorithm: "EAGLE" | ||||||||||||||
| speculative-num-steps: 1 | ||||||||||||||
| speculative-eagle-topk: 1 | ||||||||||||||
| speculative-num-draft-tokens: 2 | ||||||||||||||
|
|
||||||||||||||
| decode: | ||||||||||||||
| # Model configuration | ||||||||||||||
| served-model-name: "deepseek-ai/DeepSeek-R1" | ||||||||||||||
| skip-tokenizer-init: true | ||||||||||||||
| trust-remote-code: true | ||||||||||||||
|
|
||||||||||||||
| # Parallelism | ||||||||||||||
| tp-size: 48 | ||||||||||||||
| dp-size: 48 | ||||||||||||||
| ep-size: 48 | ||||||||||||||
| enable-dp-attention: true | ||||||||||||||
|
|
||||||||||||||
| # KV cache and attention | ||||||||||||||
| attention-backend: "trtllm_mla" | ||||||||||||||
| kv-cache-dtype: "fp8_e4m3" | ||||||||||||||
|
|
||||||||||||||
| # Radix cache disabled | ||||||||||||||
| disable-radix-cache: true | ||||||||||||||
|
|
||||||||||||||
| # Other flags | ||||||||||||||
| stream-interval: 50 | ||||||||||||||
| decode-log-interval: 1000 | ||||||||||||||
| max-running-requests: 45000 | ||||||||||||||
| context-length: 2200 | ||||||||||||||
| watchdog-timeout: 1000000 | ||||||||||||||
| disable-shared-experts-fusion: true | ||||||||||||||
| eplb-algorithm: "deepseek" | ||||||||||||||
| disaggregation-bootstrap-port: 30001 | ||||||||||||||
|
|
||||||||||||||
| # Decode-specific mode | ||||||||||||||
| disaggregation-mode: "decode" | ||||||||||||||
| disaggregation-transfer-backend: "nixl" | ||||||||||||||
|
|
||||||||||||||
| # Memory and token limits | ||||||||||||||
| mem-fraction-static: 0.75 | ||||||||||||||
| chunked-prefill-size: 36864 | ||||||||||||||
|
|
||||||||||||||
|
Comment on lines
+156
to
+159
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing The prefill section specifies 🤖 Prompt for AI Agents |
||||||||||||||
| # DeepEP configuration | ||||||||||||||
| moe-a2a-backend: "deepep" | ||||||||||||||
| deepep-mode: "low_latency" | ||||||||||||||
| ep-dispatch-algorithm: "static" | ||||||||||||||
| moe-dense-tp-size: 1 | ||||||||||||||
| enable-dp-lm-head: true | ||||||||||||||
| prefill-round-robin-balance: true | ||||||||||||||
| ep-num-redundant-experts: 32 | ||||||||||||||
| deepep-config: "/configs/deepep_config.json" | ||||||||||||||
|
|
||||||||||||||
| # CUDA graphs | ||||||||||||||
| cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384] | ||||||||||||||
| cuda-graph-max-bs: 384 | ||||||||||||||
|
|
||||||||||||||
| # MTP | ||||||||||||||
| speculative-algorithm: "EAGLE" | ||||||||||||||
| speculative-num-steps: 1 | ||||||||||||||
| speculative-eagle-topk: 1 | ||||||||||||||
| speculative-num-draft-tokens: 2 | ||||||||||||||
|
|
||||||||||||||
| benchmark: | ||||||||||||||
| type: "sa-bench" | ||||||||||||||
| isl: 1024 | ||||||||||||||
| osl: 1024 | ||||||||||||||
| concurrencies: "1024x2048x4096" | ||||||||||||||
| req_rate: "inf" | ||||||||||||||
|
|
||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🌐 Web query:
SGLang v0.5.8 Docker image tags lmsysorg CUDA 13💡 Result:
I couldn’t find any published
lmsysorg/sglangDocker tag forv0.5.8that targets CUDA 13 (e.g., no evidence of av0.5.8-cu13*/v0.5.8-cu130*tag in the public listings I could access).What is documented/visible for CUDA 13 with
lmsysorg/sglang:lmsysorg/sglang:dev-cu13(docs.sglang.io)lmsysorg/sglang:latest-cu130-runtime(docs.sglang.io)lmsysorg/sglang:v0.5.7-cu130-runtime(docker.aityp.com)If you want, tell me whether you need build image vs
-runtime, and I can try to enumerate any additional*-cu13/*-cu130-*tags that are currently published.Citations:
🌐 Web query:
Docker Hub lmsysorg/sglang v0.5.8 available tags cu130 runtime💡 Result:
I couldn’t find any evidence that a
v0.5.8-*-cu130-runtimetag exists forlmsysorg/sglangon Docker Hub (no search hits forv0.5.8-cu130-runtime, and the Docker Hub repo page doesn’t surface anyv0.5.8tags). [1]What is referenced/available for CUDA 13.0 runtime:
lmsysorg/sglang:latest-cu130-runtime(called out in the official install docs for “CUDA 13 environment”). [2]lmsysorg/sglang:v0.5.7-cu130-runtime(listed by a Docker Hub mirror page that tracks that exact tag). [3]Sources:
[1] Docker Hub repo page for
lmsysorg/sglang(hub.docker.com)[2] SGLang install docs mentioning
latest-cu130-runtime(docs.sglang.io)[3] Mirror page showing
v0.5.7-cu130-runtimeexists (docker.aityp.com)Citations:
🌐 Web query:
SGLang CUDA 13 Docker image runtime variant "-runtime" suffix💡 Result:
In SGLang’s official Docker Hub images (
lmsysorg/sglang), the-runtimesuffix means the production “runtime” variant of an image tag (e.g.,latest-runtime) that is smaller (~40%) because it excludes build tools and development dependencies. [1]For CUDA 13, SGLang recommends using either: [1]
lmsysorg/sglang:dev-cu13lmsysorg/sglang:latest-cu130-runtime(notecu130+-runtime)Related naming in NVIDIA’s Dynamo artifacts also uses a “runtime” image plus a CUDA 13 tag (e.g.,
sglang-runtime:<version>-cuda13). [2]Update the SGLang Docker image tag for CUDA 13 with the
-runtimesuffix.The tag
lmsysorg/sglang:v0.5.8-cu130does not exist in published repositories. For CUDA 13, SGLang images require the-runtimesuffix (e.g.,lmsysorg/sglang:v0.5.8-cu130-runtime). The latest confirmed versioned CUDA 13 runtime tag islmsysorg/sglang:v0.5.7-cu130-runtime. Verify whether v0.5.8 with CUDA 13 runtime has been released; if not, use the v0.5.7 tag or check the official SGLang documentation for the recommended CUDA 13 image.🤖 Prompt for AI Agents