diff --git a/docs_new/src/snippets/autoregressive/deepseek-v4-deployment.jsx b/docs_new/src/snippets/autoregressive/deepseek-v4-deployment.jsx index 621c98091367..0ccca45766d7 100644 --- a/docs_new/src/snippets/autoregressive/deepseek-v4-deployment.jsx +++ b/docs_new/src/snippets/autoregressive/deepseek-v4-deployment.jsx @@ -178,6 +178,7 @@ export const DeepSeekV4Deployment = () => { "gb300|small|max-throughput", "h200|small|cp", "h200|small|pd-disagg", + "h200|big|max-throughput", "h200|big|pd-disagg", "gb300|small|cp", "gb300|big|cp", @@ -262,7 +263,9 @@ export const DeepSeekV4Deployment = () => { } } else if (recipe === "max-throughput") { if (hardware === "h200") { - recipeEnv.push("SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256"); + recipeEnv.push(isBig + ? "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=128" + : "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256"); } else { recipeEnv.push(isBig ? "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256" @@ -351,7 +354,7 @@ export const DeepSeekV4Deployment = () => { if (!multinode) flags.push(DEEPEP_LARGE_SMS_FLAG); } else if (recipe === "max-throughput") { // allinone max-throughput: TP + DP + DP-attn + DeepEP (NO MTP). - // H200 small: cg=128 max-run=256 | H200 big: cg=128 max-run=256 (same) + // H200 small: cg=128 max-run=256 | H200 big: max-run=64 mem-frac=0.875 // B200 small: no cg/max-run | B200 big: cg=64 max-run=256 // GB300 small: no cg/max-run | GB300 big: cg=128 max-run=256 flags.push(` --tp ${tp}`); @@ -359,8 +362,14 @@ export const DeepSeekV4Deployment = () => { flags.push(" --enable-dp-attention"); if (multinode) flags.push(...multiNodeFlags(nnodes)); flags.push(" --moe-a2a-backend deepep"); - if (isBig) flags.push(" --mem-fraction-static 0.82"); - if (hardware === "h200") { + if (isBig && hardware === "h200") { + flags.push(" --mem-fraction-static 0.875"); + } else if (isBig) { + flags.push(" --mem-fraction-static 0.82"); + } + if (hardware === "h200" && isBig) { + flags.push(" --max-running-requests 64"); + } else if (hardware === "h200") { flags.push(" --cuda-graph-max-bs 128"); flags.push(" --max-running-requests 256"); } else if (isBig && hardware === "b200") { @@ -416,7 +425,20 @@ export const DeepSeekV4Deployment = () => { const envAll = [...HW_ENV, ...recipeEnv, ...COMMON_ENV]; const envBlock = envAll.length ? envAll.join(" \\\n") + " \\\n" : ""; const base = `${envBlock}sglang serve \\\n${flags.join(" \\\n")}`; - const withMultinode = multinode ? prependMultiNodeNote(base, nnodes) : base; + // H200 big is multinode and may need machine-specific NVSHMEM / NCCL / Gloo + // env vars; emit them as commented hints above the env block. + let cmd = base; + if (hardware === "h200" && multinode) { + cmd = + `# The following env vars may be needed depending on your cluster:\n` + + `# NVSHMEM_ENABLE_NIC_PE_MAPPING=1\n` + + `# NVSHMEM_HCA_LIST=\n` + + `# GLOO_SOCKET_IFNAME=\n` + + `# NCCL_SOCKET_IFNAME=\n` + + `# NCCL_IB_HCA=\n` + + cmd; + } + const withMultinode = multinode ? prependMultiNodeNote(cmd, nnodes) : cmd; const verifyKey = `${hardware}|${modelSize}|${recipe}`; if (TBD_RECIPES.has(verifyKey)) return TBD_PLACEHOLDER; return VERIFIED_RECIPES.has(verifyKey)