diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml
new file mode 100644
index 00000000..dba4e48f
--- /dev/null
+++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml
@@ -0,0 +1,102 @@
+name: "svf-vllm-disagg-gb200-2p1d-dep8-dep8"
+model:
+  path: "deepseekv4-fp4"
+  container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5"
+  precision: "fp4"
+
+dynamo:
+  version: 1.0.2
+  install: true
+
+setup_script: vllm-container-deps.sh
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 4
+  decode_nodes: 2
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.8
+      no-disable-hybrid-kv-cache-manager: true
+      numa-bind: true
+      offload-group-size: 3
+      offload-num-in-group: 1
+      offload-prefetch-step: 2
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 16384
+      max-num-seqs: 128
+      max-cudagraph-capture-size: 128
+      max-num-batched-tokens: 128
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      tokenizer-mode: deepseek_v4
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      enable-ep-weight-filter: true
+
+
+benchmark:
+  type: "vllm-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256x512x1024"
+  num_warmups: 64
\ No newline at end of file
diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml
new file mode 100644
index 00000000..47220f76
--- /dev/null
+++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml
@@ -0,0 +1,101 @@
+name: "svf-vllm-disagg-gb200-3p1d-dep8-dep8"
+model:
+  path: "deepseekv4-fp4"
+  container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5"
+  precision: "fp4"
+
+dynamo:
+  version: 1.0.2
+  install: true
+
+setup_script: vllm-container-deps.sh
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 6
+  decode_nodes: 2
+  prefill_workers: 3
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.8
+      no-disable-hybrid-kv-cache-manager: true
+      numa-bind: true
+      offload-group-size: 3
+      offload-num-in-group: 1
+      offload-prefetch-step: 2
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 16384
+      max-num-seqs: 256
+      max-cudagraph-capture-size: 256
+      max-num-batched-tokens: 256
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+      all2all-backend: "flashinfer_nvlink_one_sided"
+
+benchmark:
+  type: "vllm-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "2048"
+  num_warmups: 256
\ No newline at end of file
diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml
new file mode 100644
index 00000000..332e7b0f
--- /dev/null
+++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml
@@ -0,0 +1,101 @@
+name: "svf-vllm-disagg-gb200-7p1d-dep8-dep16"
+model:
+  path: "deepseekv4-fp4"
+  container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5"
+  precision: "fp4"
+
+dynamo:
+  version: 1.0.2
+  install: true
+
+setup_script: vllm-container-deps.sh
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 14
+  decode_nodes: 4
+  prefill_workers: 7
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.8
+      no-disable-hybrid-kv-cache-manager: true
+      numa-bind: true
+      offload-group-size: 3
+      offload-num-in-group: 1
+      offload-prefetch-step: 2
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      tokenizer-mode: deepseek_v4
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      enable-ep-weight-filter: true
+
+benchmark:
+  type: "vllm-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4096x8192"
+  num_warmups: 256
\ No newline at end of file