84 changes: 84 additions & 0 deletions configs/patches/vllm_numa_bind_hash_fix.py
@@ -0,0 +1,84 @@
"""
Patch vLLM's ParallelConfig.compute_hash to exclude NUMA-bind fields
(numa_bind / numa_bind_nodes / numa_bind_cpus) from the DP consistency hash.

Symptom (seen on GB300, 1 worker, DP=4, numa-bind=True):
RuntimeError: Configuration mismatch detected for engine 3.
All DP workers must have identical configurations for parameters that
affect collective communication ...

Root cause: when numa-bind is enabled, each DP rank auto-detects and stores
its own per-rank NUMA node in ParallelConfig.numa_bind_nodes. These per-rank
values enter compute_hash(), so ranks on different NUMA nodes produce
different hashes and fail the DP startup check. NUMA binding affects only
host-side memory locality, not collective-communication semantics, so it is
safe to exclude from the DP hash.

Reference: vllm/config/parallel.py, ParallelConfig.compute_hash(),
ignored_factors set.
"""

import sys
from pathlib import Path

TARGET = Path(
    "/usr/local/lib/python3.12/dist-packages/vllm/config/parallel.py"
)

# Idempotency: if any of our additions is already present, skip.
MARKER = '"numa_bind",'

# Anchor: the last entry of the existing ignored_factors set in the
# upstream compute_hash method. We insert the three numa fields just
# before the closing brace.
OLD = '            "_api_process_rank",\n        }'

NEW = (
    '            "_api_process_rank",\n'
    '            # srt-slurm-sa hotfix: numa-bind fields are per-rank runtime\n'
    '            # topology, not collective-communication semantics.\n'
    '            "numa_bind",\n'
    '            "numa_bind_nodes",\n'
    '            "numa_bind_cpus",\n'
    '        }'
)


def main():
    if not TARGET.exists():
        print(f"[vllm-numa-bind-hash-fix] Target not found: {TARGET}", file=sys.stderr)
        sys.exit(1)

    content = TARGET.read_text()

    if MARKER in content:
        print("[vllm-numa-bind-hash-fix] Already patched, skipping.", file=sys.stderr)
        return

    count = content.count(OLD)
    if count == 0:
        print(
            "[vllm-numa-bind-hash-fix] Could not find ignored_factors anchor. "
            "vLLM version may have drifted; inspect ParallelConfig.compute_hash().",
            file=sys.stderr,
        )
        sys.exit(1)
    if count > 1:
        print(
            f"[vllm-numa-bind-hash-fix] Anchor is ambiguous ({count} occurrences); "
            "refusing to patch.",
            file=sys.stderr,
        )
        sys.exit(1)

    content = content.replace(OLD, NEW)
    TARGET.write_text(content)
    print(
        "[vllm-numa-bind-hash-fix] Added numa_bind/numa_bind_nodes/numa_bind_cpus "
        "to ParallelConfig.compute_hash ignored_factors.",
        file=sys.stderr,
    )


if __name__ == "__main__":
    main()
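
For orientation, the replace above is intended to leave the tail of the ignored_factors set in ParallelConfig.compute_hash looking roughly like the sketch below. This is reconstructed from the OLD/NEW strings in the patch, not copied from vLLM; the upstream entries ahead of "_api_process_rank" are elided and version-dependent.

# Sketch of the post-patch ignored_factors tail (reconstructed, not verbatim vLLM).
ignored_factors = {
    # ... upstream entries elided ...
    "_api_process_rank",
    # srt-slurm-sa hotfix: numa-bind fields are per-rank runtime
    # topology, not collective-communication semantics.
    "numa_bind",
    "numa_bind_nodes",
    "numa_bind_cpus",
}
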
6 changes: 5 additions & 1 deletion configs/vllm-container-deps.sh
@@ -2,4 +2,8 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

pip install msgpack

if [ -f /configs/patches/vllm_numa_bind_hash_fix.py ]; then
    python3 /configs/patches/vllm_numa_bind_hash_fix.py
fi
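
An illustrative sanity check (not part of this PR) that the hotfix actually landed in the installed vLLM tree; the path and field names are taken from the patch script above.

# Illustrative post-install check; the dist-packages path is the one assumed
# by vllm_numa_bind_hash_fix.py above.
from pathlib import Path

target = Path("/usr/local/lib/python3.12/dist-packages/vllm/config/parallel.py")
text = target.read_text()
for field in ("numa_bind", "numa_bind_nodes", "numa_bind_cpus"):
    assert f'"{field}",' in text, f"numa-bind hash fix missing: {field}"
print("numa-bind hash fix present in parallel.py")
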
@@ -0,0 +1,114 @@
name: "svf-vllm-disagg-gb200-2p1d-dep8-dep16"
model:
  path: "deepseekv4-fp4"
  container: "vllm/vllm-openai@sha256:2af012a17c2cee0bc1428c03a8a5e42b552f25dc6f73495ab5a29ccf4123c257"
  precision: "fp4"

dynamo:
  version: 1.0.2
  install: true

setup_script: vllm-container-deps.sh
resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 2
  decode_nodes: 2
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 8
  gpus_per_decode: 8
frontend:
  type: dynamo
  enable_multiple_frontends: false
backend:
  type: vllm
  connector: null
prefill_environment:
  TILELANG_CLEANUP_TEMP_FILES: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"
  VLLM_SERVER_DEV_MODE: "1"
  VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
  VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
  VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
  VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
  UCX_MEMTYPE_CACHE: "n"
  UCX_MEMTYPE_REG_WHOLE: "n"
  UCX_TLS: "cuda_copy,cuda_ipc,tcp"
  UCX_CUDA_IPC_ENABLE_MNNVL: "y"
  NCCL_P2P_LEVEL: NVL
decode_environment:
  TILELANG_CLEANUP_TEMP_FILES: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"
  VLLM_SERVER_DEV_MODE: "1"
  VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
  VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
  UCX_MEMTYPE_CACHE: "n"
  UCX_MEMTYPE_REG_WHOLE: "n"
  UCX_TLS: "cuda_copy,cuda_ipc,tcp"
  UCX_CUDA_IPC_ENABLE_MNNVL: "y"
  NCCL_P2P_LEVEL: NVL
vllm_config:
  prefill:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 8
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    enforce-eager: true
    max-model-len: 16384
    max-num-seqs: 16
    max-num-batched-tokens: 32768
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-flashinfer-autotune: true
    no-async-scheduling: true
    block-size: 256
    gpu-memory-utilization: 0.8
    no-disable-hybrid-kv-cache-manager: true
    enable-sleep-mode: true
    numa-bind: true
    offload-group-size: 3
    offload-num-in-group: 1
    offload-prefetch-step: 2
    # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts"
    tokenizer-mode: deepseek_v4
  decode:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 8
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 16384
    max-num-seqs: 128
    max-cudagraph-capture-size: 128
    max-num-batched-tokens: 128
    trust-remote-code: true
    no-enable-prefix-caching: true
    block-size: 256
    compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
    gpu-memory-utilization: 0.9
    stream-interval: 50
    no-disable-hybrid-kv-cache-manager: true
    enable-sleep-mode: true
    tokenizer-mode: deepseek_v4

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  concurrencies: "4x8x16x32x64x256x512x1024"
  req_rate: "inf"
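
The numa-bind: true flag in the prefill section above is what triggers the per-rank NUMA auto-detection described in the patch docstring; below is a toy, self-contained illustration (not vLLM code) of why rank-local values break a hash that all DP ranks must agree on.

# Toy demonstration: hashing rank-local factors yields rank-dependent digests,
# which is exactly the "Configuration mismatch detected" failure mode.
import hashlib

def config_hash(factors: dict) -> str:
    return hashlib.sha256(repr(sorted(factors.items())).encode()).hexdigest()

rank0 = {"data_parallel_size": 8, "numa_bind_nodes": "0"}  # rank on NUMA node 0
rank3 = {"data_parallel_size": 8, "numa_bind_nodes": "1"}  # rank on NUMA node 1
print(config_hash(rank0) == config_hash(rank3))  # False -> DP startup check fails
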
@@ -0,0 +1,113 @@
name: "svf-vllm-disagg-gb200-2p1d-dep8-dep16"
model:
  path: "deepseekv4-fp4"
  container: "vllm/vllm-openai@sha256:2af012a17c2cee0bc1428c03a8a5e42b552f25dc6f73495ab5a29ccf4123c257"
  precision: "fp4"

dynamo:
  version: 1.0.2
  install: true

setup_script: vllm-container-deps.sh
resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 6
  decode_nodes: 4
  prefill_workers: 3
  decode_workers: 1
  gpus_per_prefill: 8
  gpus_per_decode: 16
frontend:
  type: dynamo
  enable_multiple_frontends: false
backend:
  type: vllm
  connector: null
prefill_environment:
  TILELANG_CLEANUP_TEMP_FILES: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"
  VLLM_SERVER_DEV_MODE: "1"
  VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
  VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
  VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
  VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
  UCX_MEMTYPE_CACHE: "n"
  UCX_MEMTYPE_REG_WHOLE: "n"
  UCX_TLS: "cuda_copy,cuda_ipc,tcp"
  UCX_CUDA_IPC_ENABLE_MNNVL: "y"
  NCCL_P2P_LEVEL: NVL
decode_environment:
  TILELANG_CLEANUP_TEMP_FILES: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"
  VLLM_SERVER_DEV_MODE: "1"
  VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
  VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
  UCX_MEMTYPE_CACHE: "n"
  UCX_MEMTYPE_REG_WHOLE: "n"
  UCX_TLS: "cuda_copy,cuda_ipc,tcp"
  UCX_CUDA_IPC_ENABLE_MNNVL: "y"
  NCCL_P2P_LEVEL: NVL
vllm_config:
  prefill:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 8
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    enforce-eager: true
    max-model-len: 16384
    max-num-seqs: 16
    max-num-batched-tokens: 32768
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-flashinfer-autotune: true
    no-async-scheduling: true
    block-size: 256
    gpu-memory-utilization: 0.8
    no-disable-hybrid-kv-cache-manager: true
    enable-sleep-mode: true
    numa-bind: true
    offload-group-size: 3
    offload-num-in-group: 1
    offload-prefetch-step: 2
    # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts"
    tokenizer-mode: deepseek_v4
  decode:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 16
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 16384
    max-num-seqs: 256
    max-cudagraph-capture-size: 256
    max-num-batched-tokens: 256
    trust-remote-code: true
    no-enable-prefix-caching: true
    block-size: 256
    compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
    gpu-memory-utilization: 0.9
    stream-interval: 50
    no-disable-hybrid-kv-cache-manager: true
    enable-sleep-mode: true
    tokenizer-mode: deepseek_v4

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  concurrencies: "3072x4096"
  req_rate: "inf"