Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions docs_new/src/snippets/autoregressive/deepseek-v4-deployment.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@ export const DeepSeekV4Deployment = () => {
"gb300|small|max-throughput",
"h200|small|cp",
"h200|small|pd-disagg",
"h200|big|low-latency",
"h200|big|balanced",
"h200|big|max-throughput",
"h200|big|pd-disagg",
"gb300|small|cp",
"gb300|big|cp",
Expand Down Expand Up @@ -272,7 +275,9 @@ export const DeepSeekV4Deployment = () => {
}
} else if (recipe === "balanced") {
if (hardware === "h200") {
recipeEnv.push("SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256");
recipeEnv.push(isBig
? "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=128"
: "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256");
} else {
// Blackwell: small=1024, big=256 (allinone ternary).
recipeEnv.push(isBig
Expand All @@ -281,7 +286,9 @@ export const DeepSeekV4Deployment = () => {
}
} else if (recipe === "max-throughput") {
if (hardware === "h200") {
recipeEnv.push("SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256");
recipeEnv.push(isBig
? "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=128"
: "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256");
} else {
recipeEnv.push(isBig
? "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256"
Expand Down Expand Up @@ -328,8 +335,8 @@ export const DeepSeekV4Deployment = () => {
flags.push(" --moe-runner-backend flashinfer_mxfp4");
}
if (hardware === "h200" && isBig) {
flags.push(" --cuda-graph-max-bs 32");
flags.push(" --max-running-requests 64");
flags.push(" --cuda-graph-max-bs 8");
flags.push(" --max-running-requests 32");
}
Comment on lines 337 to 340
Copy link

Copilot AI Apr 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The surrounding allinone summary comment for H200 big low-latency still mentions `cg=32 max-run=64` (and `mem-frac 0.82`), but the actual flags now emit `--cuda-graph-max-bs 8` and `--max-running-requests 32` (and `mem-frac 0.88`). Please update the comment to match the new tuned values so the snippet remains self-consistent.

Copilot uses AI. Check for mistakes.
// MTP 3/4
flags.push(" --speculative-algo EAGLE");
Expand All @@ -340,7 +347,7 @@ export const DeepSeekV4Deployment = () => {
flags.push(" --chunked-prefill-size 4096");
flags.push(" --disable-flashinfer-autotune");
}
if (isBig) flags.push(" --mem-fraction-static 0.82");
if (isBig) flags.push(" --mem-fraction-static 0.88");
Copy link

Copilot AI Apr 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`--mem-fraction-static` was changed to 0.88 for all `isBig` low-latency runs. The block comment above says Blackwell big should use `mem-frac 0.82`, and the PR description suggests this tuning is specific to H200 Pro. Consider gating 0.88 to `hardware === "h200" && isBig` and keeping the prior value for other big variants to avoid altering existing verified B200/GB300 commands.

Copilot uses AI. Check for mistakes.
} else if (recipe === "balanced") {
// allinone balanced: TP + DP + DP-attn + DeepEP + MTP_112.
// H200 small: cg=128 max-run=128 | H200 big: cg=128 max-run=128 (same)
Expand All @@ -355,12 +362,17 @@ export const DeepSeekV4Deployment = () => {
flags.push(" --speculative-num-steps 1");
flags.push(" --speculative-eagle-topk 1");
flags.push(" --speculative-num-draft-tokens 2");
if (isBig && hardware === "gb200") {
if (hardware === "h200" && isBig) {
flags.push(" --mem-fraction-static 0.88");
} else if (isBig && hardware === "gb200") {
flags.push(" --mem-fraction-static 0.78");
} else if (isBig) {
flags.push(" --mem-fraction-static 0.82");
}
if (hardware === "h200") {
if (hardware === "h200" && isBig) {
flags.push(" --cuda-graph-max-bs 8");
flags.push(" --max-running-requests 32");
} else if (hardware === "h200") {
Comment on lines +372 to +375
Copy link

Copilot AI Apr 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The balanced recipe comment above says `H200 big: cg=128 max-run=128 (same)`, but for `hardware === "h200" && isBig` the code now emits `--cuda-graph-max-bs 8` and `--max-running-requests 32`. Update the comment to reflect the new H200 Pro tuned values to avoid confusing users (and to preserve the "mirror of allinone" intent documented earlier in the file).

Copilot uses AI. Check for mistakes.
flags.push(" --cuda-graph-max-bs 128");
flags.push(" --max-running-requests 128");
} else if (isBig && hardware === "b200") {
Expand All @@ -386,7 +398,9 @@ export const DeepSeekV4Deployment = () => {
flags.push(" --enable-dp-attention");
if (multinode) flags.push(...multiNodeFlags(nnodes));
flags.push(" --moe-a2a-backend deepep");
if (isBig && hardware === "gb200") {
if (hardware === "h200" && isBig) {
flags.push(" --mem-fraction-static 0.88");
} else if (isBig && hardware === "gb200") {
flags.push(" --mem-fraction-static 0.78");
} else if (isBig) {
flags.push(" --mem-fraction-static 0.82");
Expand Down
Loading