diff --git a/recipes/gb200-fp4/1k1k-dsv4/README.md b/recipes/gb200-fp4/1k1k-dsv4/README.md
new file mode 100644
index 00000000..5208fbca
--- /dev/null
+++ b/recipes/gb200-fp4/1k1k-dsv4/README.md
@@ -0,0 +1,21 @@
+# DeepSeek-V4-Pro (1.6T MoE, MXFP4) — 1k/1k aggregated on GB200
+
+NVIDIA-verified SGLang recipes for **DeepSeek-V4-Pro** (MXFP4) on **GB200**
+(ARM64 Grace + Blackwell, 4 GPUs per node), aggregated mode, 1k/1k workload.
+Per-GPU HBM on GB200 is smaller than on GB300, so the 1.6T MXFP4 checkpoint
+only fits across **2 nodes (8 GPUs) at TP=8**.
+
+## Container
+
+Same Grace+Blackwell aarch64 image as GB300 (shared enroot sqsh alias
+`dsv4-grace-blackwell` in `srtslurm.yaml.example`).
+
+## Recipes
+
+| file | parallelism | MTP | notes |
+|---|---|---|---|
+| `agg-2n-low-latency.yaml` | TP=8 | EAGLE 3/4 | low-latency, 2-node |
+| `agg-2n-nomtp.yaml` | TP=8 | — | throughput, 2-node |
+
+See `recipes/gb300-fp4/1k1k-dsv4/README.md` for the full flag rationale — flags
+match the GB300 2-node recipes apart from the partition, gpu_type, and cache paths.
diff --git a/recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml b/recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml
new file mode 100644
index 00000000..cb2e71a1
--- /dev/null
+++ b/recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml
@@ -0,0 +1,68 @@
+# DeepSeek-V4-Pro aggregated on GB200 2 nodes (TP=8) - MTP enabled
+name: "dsv4-pro-gb200-2n-agg-ll-1k1k-official"
+
+slurm:
+  partition: gb200
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
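+    # Example (paths illustrative; check the srt-slurm docs for the exact
+    # extra_mount syntax on your version):
+    #   extra_mount: ["/shared/cache/dsv4:/configs/dsv4"]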
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb200/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb200/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      speculative-algo: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.82
+
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/gb200-fp4/1k1k-dsv4/agg-2n-nomtp.yaml b/recipes/gb200-fp4/1k1k-dsv4/agg-2n-nomtp.yaml
new file mode 100644
index 00000000..570e54e9
--- /dev/null
+++ b/recipes/gb200-fp4/1k1k-dsv4/agg-2n-nomtp.yaml
@@ -0,0 +1,61 @@
+# DeepSeek-V4-Pro aggregated on GB200 2 nodes (TP=8) - no MTP
+name: "dsv4-pro-gb200-2n-agg-nomtp-1k1k-official"
+
+slurm:
+  partition: gb200
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb200/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb200/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.82
+
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/gb300-fp4/1k1k-dsv4/README.md b/recipes/gb300-fp4/1k1k-dsv4/README.md
new file mode 100644
index 00000000..36fb3ebb
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/README.md
@@ -0,0 +1,63 @@
+# DeepSeek-V4-Pro (1.6T MoE, MXFP4) — 1k/1k aggregated on GB300
+
+This directory contains NVIDIA-verified SGLang recipes for **DeepSeek-V4-Pro**
+(1.6T-parameter MoE with MXFP4 MoE weights + FP8 KV, UE8M0 scales) on **GB300**
+(ARM64 Grace + Blackwell, 4 GPUs per node), aggregated serving mode, 1024 input /
+1024 output workload.
+
+## Container
+
+All recipes reference the `dsv4-grace-blackwell` alias defined in
+`srtslurm.yaml.example`. Pull and convert the image:
+
+```bash
+enroot import --output sglang-deepseek-v4-grace-blackwell.sqsh \
+  docker://lmsysorg/sglang:deepseek-v4-grace-blackwell
+```
+
+(Use the `deepseek-v4-blackwell` image for B200 x86_64, or `deepseek-v4-hopper` for H200.)
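+
+Optionally, sanity-check that the pulled image is aarch64 before queueing jobs.
+This is an illustrative check using stock `enroot` commands (the container name
+`dsv4-check` is arbitrary):
+
+```bash
+enroot create --name dsv4-check sglang-deepseek-v4-grace-blackwell.sqsh
+enroot start dsv4-check uname -m   # expect: aarch64
+enroot remove -f dsv4-check
+```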
+
+## Model checkpoint
+
+```bash
+hf download deepseek-ai/DeepSeek-V4-Pro --local-dir /shared/models/deepseek/DeepSeek-V4-Pro
+```
+
+## Recipes
+
+| file | parallelism | MTP | target | notes |
+|---|---|---|---|---|
+| `agg-low-latency.yaml` | TP=4 | EAGLE 3/4 | minimum TPOT / best per-user latency | GB300 1 node |
+| `agg-nomtp.yaml` | TP=4 | — | baseline throughput, no spec decoding | GB300 1 node |
+| `agg-balanced-tep.yaml` | TP=4 + DP=4 + DP-attn + DeepEP | EAGLE 1/2 | Pareto mid-curve | GB300 1 node |
+| `agg-max-tpt-tep.yaml` | TP=4 + DP=4 + DP-attn + DeepEP | — | maximum TPS/GPU | GB300 1 node |
+| `agg-2n-low-latency.yaml` | TP=8 | EAGLE 3/4 | low-latency, 2× memory headroom | GB300 2 nodes |
+| `agg-2n-nomtp.yaml` | TP=8 | — | throughput, 2× memory headroom | GB300 2 nodes |
+
+## Key flags (derived from the SGLang DSv4 cookbook)
+
+- `moe-runner-backend: flashinfer_mxfp4` — MXFP4 MoE kernels (Blackwell only).
+- `chunked-prefill-size: 4096` + `disable-flashinfer-autotune: true` — per the cookbook recipe.
+- `disable-radix-cache: true` — synthetic benchmark best practice; also
+  reduces contiguous-allocator fragmentation at weight-reorder time.
+- `mem-fraction-static: 0.78` — leaves headroom for the MXFP4
+  `reorder_w1w3_to_w3w1` path (0.82 intermittently OOMs on GB300).
+- TEP recipes: `enable-dp-attention` + `moe-a2a-backend: deepep`, plus
+  `deepep-config` with `num_sms=96` (DeepEP `DEEPEP_LARGE_SMS_FLAG` for
+  single-node Blackwell per the cookbook).
+
+## References
+
+- [SGLang cookbook: `docs/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx`](https://github.com/sgl-project/sglang/blob/main/docs/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx)
+- [DeepSeek-V4-Pro model card](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro)
+- Upstream SGLang PR: sgl-project/sglang#23600
diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml
new file mode 100644
index 00000000..35a4fc90
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml
@@ -0,0 +1,66 @@
+# DeepSeek-V4-Pro aggregated on GB300 2 nodes (TP=8) - MTP enabled
+name: "dsv4-pro-gb300-2n-agg-ll-1k1k-official"
+
+slurm:
+  partition: gb300
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      speculative-algo: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.82
+
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-2n-nomtp.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-2n-nomtp.yaml
new file mode 100644
index 00000000..285ae192
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/agg-2n-nomtp.yaml
@@ -0,0 +1,62 @@
+# DeepSeek-V4-Pro aggregated on GB300 2 nodes (TP=8) - no MTP
+name: "dsv4-pro-gb300-2n-agg-nomtp-1k1k-official"
+
+slurm:
+  partition: gb300
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.82
+
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-balanced-tep.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-balanced-tep.yaml
new file mode 100644
index 00000000..740e48fd
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/agg-balanced-tep.yaml
@@ -0,0 +1,79 @@
+# DeepSeek-V4-Pro GB300 1n TEP 'balanced' recipe
+# From SGLang cookbook (DeepSeek-V4.mdx / deepseek-v4-deployment.jsx):
+# TP=4 + DP=4 + DP-attention + DeepEP + MTP 1/2 + cg=128 max-run=256
+# Tests TEP vs pure-TP speedup at medium concurrency.
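+#
+# Note on GPU count (per SGLang's dp-attention semantics; verify on your build):
+# with enable-dp-attention, data-parallel-size=4 subdivides the TP=4 group rather
+# than multiplying it, so this recipe still runs on 4 GPUs total. Attention runs
+# data-parallel (each rank serves its own slice of the batch) while MoE layers
+# run expert-parallel via DeepEP (TEP).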
+name: "dsv4-pro-gb300-agg-balanced-1k1k-official" + +slurm: + partition: gb300 + time_limit: "4:00:00" + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "fp4" + +frontend: + type: sglang + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + # Persistent JIT caches (recommended for repeatable runs). + # Mount your own cache dirs into /configs/** via srt-slurm extra_mount + # or change these to any writable path inside the container. + SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache" + + sglang_config: + aggregated: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + # TEP: TP=DP=4 + dp-attention + deepep + tensor-parallel-size: 4 + data-parallel-size: 4 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + # MTP 1/2 (balanced - gentler than 3/4) + speculative-algo: "EAGLE" + speculative-num-steps: 1 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 2 + + mem-fraction-static: 0.78 + context-length: 2200 + cuda-graph-max-bs: 128 + max-running-requests: 256 + disable-radix-cache: true + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64x128x256" + req_rate: "inf" diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-low-latency.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-low-latency.yaml new file mode 100644 index 00000000..f8bc23df --- /dev/null +++ b/recipes/gb300-fp4/1k1k-dsv4/agg-low-latency.yaml @@ -0,0 +1,71 @@ +# DeepSeek-V4-Pro aggregated mode on GB300 (1 node 4 GPU, TP=4) +# Based on SGLang upstream dsv4-docs cookbook (b200|big|low-latency verified) +# Adapted to GB300 per DeepSeek-V4.mdx: "GB300 4 GPU" is the single-node config +name: "dsv4-pro-gb300-agg-ll-1k1k" + +slurm: + partition: gb300 + time_limit: "4:00:00" + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "fp4" + +frontend: + type: sglang + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + # Persistent JIT caches (recommended for repeatable runs). + # Mount your own cache dirs into /configs/** via srt-slurm extra_mount + # or change these to any writable path inside the container. 
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 4
+
+      # V4 low-latency recipe: MXFP4 MoE + MTP 3/4 + chunked-prefill 4096
+      moe-runner-backend: "flashinfer_mxfp4"
+      speculative-algo: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.78
+
+      # Generic knobs
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-max-tpt-tep.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-max-tpt-tep.yaml
new file mode 100644
index 00000000..66d4e405
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/agg-max-tpt-tep.yaml
@@ -0,0 +1,70 @@
+# DeepSeek-V4-Pro GB300 1n TEP 'max-throughput' recipe
+# From SGLang cookbook (DeepSeek-V4.mdx / deepseek-v4-deployment.jsx):
+# TP=4 + DP=4 + DP-attention + DeepEP, NO MTP + cg=128 max-run=256
+# Tests TEP peak throughput at high concurrency.
+name: "dsv4-pro-gb300-agg-maxtpt-1k1k-official"
+
+slurm:
+  partition: gb300
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      # TEP: TP=DP=4 + dp-attention + deepep
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      # NO MTP for max-tpt
+
+      mem-fraction-static: 0.78
+      context-length: 2200
+      cuda-graph-max-bs: 128
+      max-running-requests: 256
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256"
+  req_rate: "inf"
diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-nomtp.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-nomtp.yaml
new file mode 100644
index 00000000..96b4c690
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/agg-nomtp.yaml
@@ -0,0 +1,65 @@
+# DeepSeek-V4-Pro aggregated mode on GB300 (1 node 4 GPU, TP=4) - NO MTP
+# Baseline for comparison against MTP-enabled run (1567486)
+name: "dsv4-pro-gb300-agg-nomtp-1k1k-official"
+
+slurm:
+  partition: gb300
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 4
+
+      # No MTP - pure MXFP4 MoE baseline
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.78
+
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/srtslurm.yaml.example b/srtslurm.yaml.example
index 3a1462f0..12e34c1b 100644
--- a/srtslurm.yaml.example
+++ b/srtslurm.yaml.example
@@ -24,11 +24,21 @@ containers:
   sglang-latest: "/shared/containers/sglang-v0.4.sqsh"
   sglang-dev: "/shared/containers/sglang-dev.sqsh"
   sglang-fp4: "/shared/containers/sglang-fp4.sqsh"
+  # DeepSeek-V4 cookbook images (pull via enroot from Docker Hub lmsysorg/sglang):
+  #   lmsysorg/sglang:deepseek-v4-blackwell        (B200 / x86_64)
+  #   lmsysorg/sglang:deepseek-v4-grace-blackwell  (GB200 / GB300 / aarch64)
+  #   lmsysorg/sglang:deepseek-v4-hopper           (H200)
+  dsv4-blackwell: "/shared/containers/sglang-deepseek-v4-blackwell.sqsh"
+  dsv4-grace-blackwell: "/shared/containers/sglang-deepseek-v4-grace-blackwell.sqsh"
+  dsv4-hopper: "/shared/containers/sglang-deepseek-v4-hopper.sqsh"

 # Model path aliases
 model_paths:
   deepseek-r1: "/shared/models/deepseek/DeepSeek-R1"
   deepseek-r1-distill: "/shared/models/deepseek/DeepSeek-R1-Distill-Qwen-32B"
+  # DeepSeek-V4 checkpoints (huggingface.co/deepseek-ai/DeepSeek-V4-Pro, -V4-Flash):
+  dsv4-pro: "/shared/models/deepseek/DeepSeek-V4-Pro"
+  dsv4-flash: "/shared/models/deepseek/DeepSeek-V4-Flash"
   llama-3-70b: "/shared/models/meta/llama-3-70b"
   llama-3-405b: "/shared/models/meta/llama-3-405b"