84 changes: 84 additions & 0 deletions configs/patches/vllm_numa_bind_hash_fix.py
@@ -0,0 +1,84 @@
"""
Patch vLLM's ParallelConfig.compute_hash to exclude NUMA-bind fields
(numa_bind / numa_bind_nodes / numa_bind_cpus) from the DP consistency hash.

Symptom (seen on GB300, 1 worker, DP=4, numa-bind=True):
RuntimeError: Configuration mismatch detected for engine 3.
All DP workers must have identical configurations for parameters that
affect collective communication ...

Root cause: when numa-bind is enabled, each DP rank auto-detects and stores
its own per-rank NUMA node in ParallelConfig.numa_bind_nodes. These per-rank
values enter compute_hash(), so ranks on different NUMA nodes produce
different hashes and fail the DP startup check. NUMA binding affects only
host-side memory locality, not collective-communication semantics, so it is
safe to exclude from the DP hash.

Reference: vllm/config/parallel.py, ParallelConfig.compute_hash(),
ignored_factors set.
"""

import sys
from pathlib import Path

TARGET = Path(
    "/usr/local/lib/python3.12/dist-packages/vllm/config/parallel.py"
)

# Idempotency: if any of our additions is already present, skip.
MARKER = '"numa_bind",'

# Anchor: the last entry of the existing ignored_factors set in the
# upstream compute_hash method. We insert the three numa fields just
# before the closing brace.
OLD = ' "_api_process_rank",\n }'

NEW = (
    ' "_api_process_rank",\n'
    ' # srt-slurm-sa hotfix: numa-bind fields are per-rank runtime\n'
    ' # topology, not collective-communication semantics.\n'
    ' "numa_bind",\n'
    ' "numa_bind_nodes",\n'
    ' "numa_bind_cpus",\n'
    ' }'
)


def main():
    if not TARGET.exists():
        print(f"[vllm-numa-bind-hash-fix] Target not found: {TARGET}", file=sys.stderr)
        sys.exit(1)

    content = TARGET.read_text()

    if MARKER in content:
        print("[vllm-numa-bind-hash-fix] Already patched, skipping.", file=sys.stderr)
        return

    count = content.count(OLD)
    if count == 0:
        print(
            "[vllm-numa-bind-hash-fix] Could not find ignored_factors anchor. "
            "vLLM version may have drifted; inspect ParallelConfig.compute_hash().",
            file=sys.stderr,
        )
        sys.exit(1)
    if count > 1:
        print(
            f"[vllm-numa-bind-hash-fix] Anchor is ambiguous ({count} occurrences); "
            "refusing to patch.",
            file=sys.stderr,
        )
        sys.exit(1)

    content = content.replace(OLD, NEW)
    TARGET.write_text(content)
    print(
        "[vllm-numa-bind-hash-fix] Added numa_bind/numa_bind_nodes/numa_bind_cpus "
        "to ParallelConfig.compute_hash ignored_factors.",
        file=sys.stderr,
    )


if __name__ == "__main__":
    main()
6 changes: 5 additions & 1 deletion configs/vllm-container-deps.sh
@@ -2,4 +2,8 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

pip install msgpack
pip install msgpack

if [ -f /configs/patches/vllm_numa_bind_hash_fix.py ]; then
    python3 /configs/patches/vllm_numa_bind_hash_fix.py
fi
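# Optional sanity check (a sketch; assumes the dist-packages path used by the
# patch script above):
#   grep -n 'numa_bind' /usr/local/lib/python3.12/dist-packages/vllm/config/parallel.py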
@@ -0,0 +1,114 @@
name: "svf-vllm-disagg-gb200-2p1d-dep8-dep16"
model:
path: "deepseekv4-fp4"
container: "vllm/vllm-openai@sha256:2af012a17c2cee0bc1428c03a8a5e42b552f25dc6f73495ab5a29ccf4123c257"
precision: "fp4"

dynamo:
hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
install: true

setup_script: vllm-container-deps.sh
resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 2
decode_nodes: 2
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 8
frontend:
type: dynamo
enable_multiple_frontends: false
backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enforce-eager: true
max-model-len: 16384
max-num-seqs: 16
max-num-batched-tokens: 32768
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
gpu-memory-utilization: 0.8
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
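    # numa-bind makes each DP rank auto-detect its own NUMA node; the
    # vllm_numa_bind_hash_fix.py patch applied by the setup script keeps those
    # per-rank values out of the DP consistency hash.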
    numa-bind: true
    offload-group-size: 3
    offload-num-in-group: 1
    offload-prefetch-step: 2
    # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts"
    tokenizer-mode: deepseek_v4
  decode:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 8
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 16384
    max-num-seqs: 128
    max-cudagraph-capture-size: 128
    max-num-batched-tokens: 128
    trust-remote-code: true
    no-enable-prefix-caching: true
    block-size: 256
    compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
    gpu-memory-utilization: 0.9
    stream-interval: 50
    no-disable-hybrid-kv-cache-manager: true
    enable-sleep-mode: true
    tokenizer-mode: deepseek_v4
benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  concurrencies: "4x8x16x32x64x256x512x1024"
  req_rate: "inf"
  tokenizer_mode: "deepseek_v4"
  use_chat_template: true
@@ -0,0 +1,114 @@
name: "svf-vllm-disagg-gb200-2p1d-dep8-dep16"
model:
path: "deepseekv4-fp4"
container: "vllm/vllm-openai@sha256:2af012a17c2cee0bc1428c03a8a5e42b552f25dc6f73495ab5a29ccf4123c257"
precision: "fp4"

dynamo:
hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
install: true

setup_script: vllm-container-deps.sh
resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 6
decode_nodes: 4
prefill_workers: 3
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 16
frontend:
type: dynamo
enable_multiple_frontends: false
backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enforce-eager: true
max-model-len: 16384
max-num-seqs: 16
max-num-batched-tokens: 32768
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
gpu-memory-utilization: 0.8
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
numa-bind: true
offload-group-size: 3
offload-num-in-group: 1
offload-prefetch-step: 2
# offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts"
tokenizer-mode: deepseek_v4
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 16
data-parallel-rpc-port: 13345
enable-expert-parallel: true
max-model-len: 16384
max-num-seqs: 256
max-cudagraph-capture-size: 256
max-num-batched-tokens: 256
trust-remote-code: true
no-enable-prefix-caching: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
gpu-memory-utilization: 0.9
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "4x8x16x32x64x256x512x1024"
req_rate: "inf"
tokenizer_mode: "deepseek_v4"
use_chat_template: true