diff --git a/configs/patches/vllm_numa_bind_hash_fix.py b/configs/patches/vllm_numa_bind_hash_fix.py new file mode 100644 index 00000000..0759238c --- /dev/null +++ b/configs/patches/vllm_numa_bind_hash_fix.py @@ -0,0 +1,84 @@ +""" +Patch vLLM's ParallelConfig.compute_hash to exclude NUMA-bind fields +(numa_bind / numa_bind_nodes / numa_bind_cpus) from the DP consistency hash. + +Symptom (seen on GB300, 1 worker, DP=4, numa-bind=True): + RuntimeError: Configuration mismatch detected for engine 3. + All DP workers must have identical configurations for parameters that + affect collective communication ... + +Root cause: when numa-bind is enabled, each DP rank auto-detects and stores +its own per-rank NUMA node in ParallelConfig.numa_bind_nodes. These per-rank +values enter compute_hash(), so ranks on different NUMA nodes produce +different hashes and fail the DP startup check. NUMA binding affects only +host-side memory locality, not collective-communication semantics, so it is +safe to exclude from the DP hash. + +Reference: vllm/config/parallel.py, ParallelConfig.compute_hash(), +ignored_factors set. +""" + +import sys +from pathlib import Path + +TARGET = Path( + "/usr/local/lib/python3.12/dist-packages/vllm/config/parallel.py" +) + +# Idempotency: if any of our additions is already present, skip. +MARKER = '"numa_bind",' + +# Anchor: the last entry of the existing ignored_factors set in the +# upstream compute_hash method. We insert the three numa fields just +# before the closing brace. 
+OLD = ' "_api_process_rank",\n }' + +NEW = ( + ' "_api_process_rank",\n' + ' # srt-slurm-sa hotfix: numa-bind fields are per-rank runtime\n' + ' # topology, not collective-communication semantics.\n' + ' "numa_bind",\n' + ' "numa_bind_nodes",\n' + ' "numa_bind_cpus",\n' + ' }' +) + + +def main(): + if not TARGET.exists(): + print(f"[vllm-numa-bind-hash-fix] Target not found: {TARGET}", file=sys.stderr) + sys.exit(1) + + content = TARGET.read_text() + + if MARKER in content: + print("[vllm-numa-bind-hash-fix] Already patched, skipping.", file=sys.stderr) + return + + count = content.count(OLD) + if count == 0: + print( + "[vllm-numa-bind-hash-fix] Could not find ignored_factors anchor. " + "vLLM version may have drifted; inspect ParallelConfig.compute_hash().", + file=sys.stderr, + ) + sys.exit(1) + if count > 1: + print( + f"[vllm-numa-bind-hash-fix] Anchor is ambiguous ({count} occurrences); " + "refusing to patch.", + file=sys.stderr, + ) + sys.exit(1) + + content = content.replace(OLD, NEW) + TARGET.write_text(content) + print( + "[vllm-numa-bind-hash-fix] Added numa_bind/numa_bind_nodes/numa_bind_cpus " + "to ParallelConfig.compute_hash ignored_factors.", + file=sys.stderr, + ) + + +if __name__ == "__main__": + main() diff --git a/configs/vllm-container-deps.sh b/configs/vllm-container-deps.sh index 43807255..15e7733c 100644 --- a/configs/vllm-container-deps.sh +++ b/configs/vllm-container-deps.sh @@ -2,4 +2,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -pip install msgpack \ No newline at end of file +pip install msgpack + +if [ -f /configs/patches/vllm_numa_bind_hash_fix.py ]; then + python3 /configs/patches/vllm_numa_bind_hash_fix.py +fi diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml new file mode 100644 index 00000000..69ac9cbb --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -0,0 +1,114 @@ +name: "svf-vllm-disagg-gb200-1p1d-dep8-dep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:2af012a17c2cee0bc1428c03a8a5e42b552f25dc6f73495ab5a29ccf4123c257" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" 
+ UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64x256x512x1024" + req_rate: "inf" \ No newline at end of file diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c4096-offload.yaml 
b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c4096-offload.yaml new file mode 100644 index 00000000..a65305a0 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c4096-offload.yaml @@ -0,0 +1,113 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep16" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:2af012a17c2cee0bc1428c03a8a5e42b552f25dc6f73495ab5a29ccf4123c257" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 
1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "3072x4096" + req_rate: "inf" \ No newline at end of file diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-32-c2048-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-32-c2048-offload.yaml new file mode 100644 index 00000000..76ebf1d0 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-32-c2048-offload.yaml @@ -0,0 +1,113 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep8" +model: + path: "deepseekv4-fp4" + container: 
"vllm/vllm-openai@sha256:2af012a17c2cee0bc1428c03a8a5e42b552f25dc6f73495ab5a29ccf4123c257" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + 
gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" \ No newline at end of file diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-6p1d-dep8-dep16-64-c8192-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-6p1d-dep8-dep16-64-c8192-offload.yaml new file mode 100644 index 00000000..3dcd2478 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-6p1d-dep8-dep16-64-c8192-offload.yaml @@ -0,0 +1,113 @@ +name: "svf-vllm-disagg-gb200-6p1d-dep8-dep16" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:2af012a17c2cee0bc1428c03a8a5e42b552f25dc6f73495ab5a29ccf4123c257" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 12 + decode_nodes: 4 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + 
type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", 
"kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" \ No newline at end of file