diff --git a/.gitignore b/.gitignore
index fb5fbd86..04a9a266 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,6 +47,7 @@ configs/etcdctl
 configs/*.whl
 configs/*.deb
 configs/*.tar.gz
+configs/wheels/
 .ruff_cache/
 *.egg-info/
diff --git a/configs/install-ai-dynamo.sh b/configs/install-ai-dynamo.sh
new file mode 100755
index 00000000..a005d5eb
--- /dev/null
+++ b/configs/install-ai-dynamo.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+set -euo pipefail
+
+DYNAMO_VERSION="${DYNAMO_VERSION:-}"
+
+if [ -z "${DYNAMO_VERSION}" ]; then
+  echo "ERROR: DYNAMO_VERSION must be set for ai-dynamo wheel install" >&2
+  exit 1
+fi
+
+DYNAMO_PACKAGE="ai-dynamo==${DYNAMO_VERSION}"
+DYNAMO_RUNTIME_PACKAGE="ai-dynamo-runtime==${DYNAMO_VERSION}"
+DYNAMO_WHEEL_NAME="${DYNAMO_WHEEL_NAME:-ai_dynamo-${DYNAMO_VERSION}-py3-none-any.whl}"
+DYNAMO_RUNTIME_WHEEL_PATTERN="${DYNAMO_RUNTIME_WHEEL_PATTERN:-ai_dynamo_runtime-${DYNAMO_VERSION}-*.whl}"
+DYNAMO_WHEEL_DIRS="${DYNAMO_WHEEL_DIRS:-/configs/wheels /configs}"
+PYTHON_BIN="${PYTHON_BIN:-}"
+
+if [ -z "${PYTHON_BIN}" ]; then
+  if command -v python3 >/dev/null 2>&1; then
+    PYTHON_BIN="python3"
+  elif command -v python >/dev/null 2>&1; then
+    PYTHON_BIN="python"
+  else
+    echo "ERROR: neither python3 nor python found in PATH" >&2
+    exit 127
+  fi
+fi
+
+# Skip the install if ai-dynamo is already importable in this environment.
+if "${PYTHON_BIN}" - <<'PY' >/dev/null 2>&1
+import dynamo.llm  # noqa: F401
+PY
+then
+  echo "ai-dynamo already importable; skipping wheel install"
+  exit 0
+fi
+
+# Locate the staged wheels, taking the first directory that provides each.
+wheel_path=""
+runtime_wheel_path=""
+for wheel_dir in ${DYNAMO_WHEEL_DIRS}; do
+  if [ -z "${wheel_path}" ] && [ -f "${wheel_dir}/${DYNAMO_WHEEL_NAME}" ]; then
+    wheel_path="${wheel_dir}/${DYNAMO_WHEEL_NAME}"
+  fi
+  if [ -z "${runtime_wheel_path}" ]; then
+    runtime_wheel_path="$(find "${wheel_dir}" -maxdepth 1 -type f -name "${DYNAMO_RUNTIME_WHEEL_PATTERN}" -print -quit 2>/dev/null)"
+  fi
+done
+
+if [ -n "${wheel_path}" ] && [ -n "${runtime_wheel_path}" ]; then
+  echo "Installing ${DYNAMO_RUNTIME_PACKAGE} and ${DYNAMO_PACKAGE} from staged wheels"
+  "${PYTHON_BIN}" -m pip install "${runtime_wheel_path}" "${wheel_path}"
+else
+  echo "ERROR: ai-dynamo wheels not found in ${DYNAMO_WHEEL_DIRS}" >&2
+  echo "ERROR: expected ${DYNAMO_WHEEL_NAME} and ${DYNAMO_RUNTIME_WHEEL_PATTERN}" >&2
+  exit 1
+fi
+
+"${PYTHON_BIN}" - <<'PY'
+import dynamo.llm  # noqa: F401
+PY
diff --git a/configs/patches/vllm_cumem_expandable_segments_fix.py b/configs/patches/vllm_cumem_expandable_segments_fix.py
new file mode 100644
index 00000000..04474195
--- /dev/null
+++ b/configs/patches/vllm_cumem_expandable_segments_fix.py
@@ -0,0 +1,169 @@
+"""
+Patch vLLM's CuMemAllocator to be compatible with PyTorch expandable
+segments by temporarily toggling the allocator setting around the memory
+pool context (sleep mode), instead of hard-asserting at __init__ time.
+
+Backports vllm-project/vllm#40812 ("Auto-disable expandable_segments
+around cumem memory pool"). Without this patch, setting
+PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True together with
+enable-sleep-mode causes vLLM to abort during CuMemAllocator
+construction; with this patch, expandable segments stay on for normal
+allocations and are flipped off only for the duration of
+use_memory_pool().
+
+Reference: https://github.com/vllm-project/vllm/pull/40812
+Affected file: vllm/device_allocator/cumem.py
+"""
+
+import sys
+from pathlib import Path
+
+TARGET = Path("/usr/local/lib/python3.12/dist-packages/vllm/device_allocator/cumem.py")
+
+# Idempotency: the new use_memory_pool body introduces this exact line.
+MARKER = 'expandable_was_enabled = "expandable_segments:True" in conf'
+
+# --- Hunk 1: drop the __init__ assertion -------------------------------------
+
+INIT_OLD = (
+    "    def __init__(self):\n"
+    '        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n'
+    '        assert "expandable_segments:True" not in conf, (\n'
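+    # NOTE: the OLD blocks must match the installed cumem.py byte-for-byte,
+    # indentation included; main() refuses to patch when an anchor matches
+    # zero times or more than once, so upstream drift fails loudly instead
+    # of silently mis-patching.
+    '            "Expandable segments are not compatible with memory pool. 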
"\n' + ' "Please track https://github.com/pytorch/pytorch/issues/147851 "\n' + ' "for the latest updates."\n' + " )\n" + "\n" + " self.pointer_to_data: dict[int, AllocationData] = {}\n" +) + +INIT_NEW = ( + " def __init__(self):\n" + " self.pointer_to_data: dict[int, AllocationData] = {}\n" +) + +# --- Hunk 2: wrap use_memory_pool body in try/finally + toggle --------------- + +POOL_OLD = ( + " assert isinstance(tag, str)\n" + "\n" + " old_tag = self.current_tag\n" + " self.current_tag = tag\n" + " with use_memory_pool_with_allocator(\n" + " self.python_malloc_callback, self.python_free_callback\n" + " ) as data:\n" + " # start to hit another PyTorch bug in PyTorch 2.6,\n" + " # possibly because of gc-related issue w.r.t. the allocator and\n" + " # the memory pool.\n" + " # to avoid the issue, we keep a reference of the data.\n" + " # see https://github.com/pytorch/pytorch/issues/146431 .\n" + " self.allocator_and_pools[tag] = data\n" + " yield\n" + " # PyTorch's bug, calling torch.cuda.empty_cache() will error\n" + " # when using pluggable allocator, see\n" + " # https://github.com/pytorch/pytorch/issues/145168 .\n" + " # if we have some memory allocated and then freed,\n" + " # the memory will not be released, e.g. in online quantization,\n" + " # where the model is created in higher precision, and then\n" + " # quantized in lower precision.\n" + " # Find all unused allocations and manually release them.\n" + " # TODO: we should expose `empty_cache` method in the memory pool.\n" + " # TODO: ask for help from PyTorch team to expose this method.\n" + " allocations = data[0].snapshot()\n" + " for allocation in allocations:\n" + " if allocation[\"allocated_size\"] == 0:\n" + " handle = self._python_free_callback(allocation[\"address\"])\n" + " unmap_and_release(handle)\n" + " self.current_tag = old_tag\n" +) + +POOL_NEW = ( + " assert isinstance(tag, str)\n" + "\n" + " # Expandable segments are incompatible with the memory pool used for\n" + " # sleep mode (see https://github.com/pytorch/pytorch/issues/147851).\n" + " # If the user has enabled expandable segments via\n" + " # PYTORCH_CUDA_ALLOC_CONF, temporarily disable them for the duration\n" + " # of the memory pool context and restore on exit.\n" + ' conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n' + ' expandable_was_enabled = "expandable_segments:True" in conf\n' + " if expandable_was_enabled:\n" + ' torch.cuda.memory._set_allocator_settings("expandable_segments:False")\n' + "\n" + " old_tag = self.current_tag\n" + " self.current_tag = tag\n" + " try:\n" + " with use_memory_pool_with_allocator(\n" + " self.python_malloc_callback, self.python_free_callback\n" + " ) as data:\n" + " # start to hit another PyTorch bug in PyTorch 2.6,\n" + " # possibly because of gc-related issue w.r.t. the allocator\n" + " # and the memory pool.\n" + " # to avoid the issue, we keep a reference of the data.\n" + " # see https://github.com/pytorch/pytorch/issues/146431 .\n" + " self.allocator_and_pools[tag] = data\n" + " yield\n" + " # PyTorch's bug, calling torch.cuda.empty_cache() will error\n" + " # when using pluggable allocator, see\n" + " # https://github.com/pytorch/pytorch/issues/145168 .\n" + " # if we have some memory allocated and then freed,\n" + " # the memory will not be released, e.g. 
in online\n" + " # quantization, where the model is created in higher\n" + " # precision, and then quantized in lower precision.\n" + " # Find all unused allocations and manually release them.\n" + " # TODO: we should expose `empty_cache` method in the memory\n" + " # pool.\n" + " # TODO: ask for help from PyTorch team to expose this method.\n" + " allocations = data[0].snapshot()\n" + " for allocation in allocations:\n" + " if allocation[\"allocated_size\"] == 0:\n" + " handle = self._python_free_callback(allocation[\"address\"])\n" + " unmap_and_release(handle)\n" + " finally:\n" + " self.current_tag = old_tag\n" + " if expandable_was_enabled:\n" + ' torch.cuda.memory._set_allocator_settings("expandable_segments:True")\n' +) + +PATCHES = [ + ("CuMemAllocator.__init__ assertion removal", INIT_OLD, INIT_NEW), + ("CuMemAllocator.use_memory_pool toggle", POOL_OLD, POOL_NEW), +] + + +def main(): + if not TARGET.exists(): + print(f"[vllm-cumem-expandable-fix] Target not found: {TARGET}", file=sys.stderr) + sys.exit(1) + + content = TARGET.read_text() + if MARKER in content: + print("[vllm-cumem-expandable-fix] Already patched, skipping.", file=sys.stderr) + return + + new_content = content + for name, old, new in PATCHES: + count = new_content.count(old) + if count == 0: + print( + f"[vllm-cumem-expandable-fix] Anchor for {name!r} not found. " + "vLLM version may have drifted; inspect cumem.py.", + file=sys.stderr, + ) + sys.exit(1) + if count > 1: + print( + f"[vllm-cumem-expandable-fix] Anchor for {name!r} is ambiguous " + f"({count} matches); refusing to patch.", + file=sys.stderr, + ) + sys.exit(1) + new_content = new_content.replace(old, new, 1) + print(f"[vllm-cumem-expandable-fix] Patched {name}", file=sys.stderr) + + TARGET.write_text(new_content) + print("[vllm-cumem-expandable-fix] Done.", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py b/configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py new file mode 100644 index 00000000..44cbbb40 --- /dev/null +++ b/configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py @@ -0,0 +1,111 @@ +""" +Free original DeepSeek V4 MoE expert weights after MegaMoE finalize. + +Symptom (seen on GB200 decode, EP=8, VLLM_DEEPSEEK_V4_USE_MEGA_MOE=1): + torch.OutOfMemoryError: Tried to allocate 1008.00 MiB. + GPU 0 has a total capacity of 184.31 GiB of which 381.44 MiB is free. + 181.02 GiB allocated by PyTorch. + Stack ends in deep_gemm/mega/__init__.py interleave(): + torch.empty_like(t).copy_(torch.stack([gate, up], dim=2).reshape(...)) + +Root cause: DeepseekV4MegaMoEExperts.finalize_weights() builds +self._transformed_l1_weights / _transformed_l2_weights but does NOT release +the original self.w13_weight / w2_weight / *_weight_scale parameters. Both +copies stay resident on GPU through finalize iteration, and on EP=8 the +per-rank weight footprint (~125 GiB) plus this duplication leaves no +headroom for the per-layer interleave temporaries (~1 GiB peak). + +Forward path verified (deepseek_v4.py: _run_mega_moe, ~line 538-547) only +reads self._transformed_l1_weights / _transformed_l2_weights. Original +w13_weight / w2_weight / *_weight_scale are dead after finalize. + +Fix (mirrors upstream PR vllm-project/vllm#40860): at the end of +finalize_weights() of each expert module, drop the four original +Parameters by assigning them to None so they are removed from the module's +_parameters dict. 
transform_weights_for_mega_moe allocates fresh L1 + SF +tensors and only the L2 weight aliases the original w2_weight storage -- +_transformed_l2_weights still holds that reference, so the storage stays +live via refcount. PyTorch's caching allocator can then reuse the freed +storage for the NEXT layer's interleave temporaries within the same +finalize loop. + +Reference: vllm/model_executor/models/deepseek_v4.py, +DeepseekV4MegaMoEExperts.finalize_weights(). +""" + +import sys +from pathlib import Path + +TARGET = Path( + "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py" +) + +# Idempotency marker +MARKER = "srt-slurm-sa hotfix: free original MegaMoE expert weights" + +# Anchor: closing of the _transformed_l1/l2 assignment in finalize_weights(). +# The triple-`)` pattern is unique in the file. +OLD = ( + " self._transformed_l1_weights, self._transformed_l2_weights = (\n" + " deep_gemm.transform_weights_for_mega_moe(\n" + " (self.w13_weight.data.view(torch.int8).contiguous(), w13_scale),\n" + " (self.w2_weight.data.view(torch.int8).contiguous(), w2_scale),\n" + " )\n" + " )\n" +) + +NEW = ( + OLD + + " # srt-slurm-sa hotfix: free original MegaMoE expert weights.\n" + + " # Mirrors vllm-project/vllm#40860. transform_weights_for_mega_moe\n" + + " # allocates fresh L1 + SF tensors; only the L2 weight aliases the\n" + + " # original w2_weight storage, but _transformed_l2_weights holds that\n" + + " # reference, so dropping the Parameters is safe via refcount and the\n" + + " # freed storage returns to the caching allocator in time for the next\n" + + " # layer's interleave temp (~1 GiB).\n" + + " self.w13_weight = None\n" + + " self.w13_weight_scale = None\n" + + " self.w2_weight = None\n" + + " self.w2_weight_scale = None\n" +) + + +def main(): + if not TARGET.exists(): + print(f"[vllm-mega-moe-free-orig] Target not found: {TARGET}", file=sys.stderr) + sys.exit(1) + + content = TARGET.read_text() + + if MARKER in content: + print("[vllm-mega-moe-free-orig] Already patched, skipping.", file=sys.stderr) + return + + count = content.count(OLD) + if count == 0: + print( + "[vllm-mega-moe-free-orig] Could not find finalize_weights anchor. " + "vLLM version may have drifted; inspect " + "DeepseekV4MegaMoEExperts.finalize_weights().", + file=sys.stderr, + ) + sys.exit(1) + if count > 1: + print( + f"[vllm-mega-moe-free-orig] Anchor is ambiguous ({count} occurrences); " + "refusing to patch.", + file=sys.stderr, + ) + sys.exit(1) + + content = content.replace(OLD, NEW) + TARGET.write_text(content) + print( + "[vllm-mega-moe-free-orig] Freed original w13/w2 weights and scales " + "in DeepseekV4MegaMoEExperts.finalize_weights().", + file=sys.stderr, + ) + + +if __name__ == "__main__": + main() diff --git a/configs/patches/vllm_numa_bind_hash_fix.py b/configs/patches/vllm_numa_bind_hash_fix.py new file mode 100644 index 00000000..0759238c --- /dev/null +++ b/configs/patches/vllm_numa_bind_hash_fix.py @@ -0,0 +1,84 @@ +""" +Patch vLLM's ParallelConfig.compute_hash to exclude NUMA-bind fields +(numa_bind / numa_bind_nodes / numa_bind_cpus) from the DP consistency hash. + +Symptom (seen on GB300, 1 worker, DP=4, numa-bind=True): + RuntimeError: Configuration mismatch detected for engine 3. + All DP workers must have identical configurations for parameters that + affect collective communication ... + +Root cause: when numa-bind is enabled, each DP rank auto-detects and stores +its own per-rank NUMA node in ParallelConfig.numa_bind_nodes. 
These per-rank +values enter compute_hash(), so ranks on different NUMA nodes produce +different hashes and fail the DP startup check. NUMA binding affects only +host-side memory locality, not collective-communication semantics, so it is +safe to exclude from the DP hash. + +Reference: vllm/config/parallel.py, ParallelConfig.compute_hash(), +ignored_factors set. +""" + +import sys +from pathlib import Path + +TARGET = Path( + "/usr/local/lib/python3.12/dist-packages/vllm/config/parallel.py" +) + +# Idempotency: if any of our additions is already present, skip. +MARKER = '"numa_bind",' + +# Anchor: the last entry of the existing ignored_factors set in the +# upstream compute_hash method. We insert the three numa fields just +# before the closing brace. +OLD = ' "_api_process_rank",\n }' + +NEW = ( + ' "_api_process_rank",\n' + ' # srt-slurm-sa hotfix: numa-bind fields are per-rank runtime\n' + ' # topology, not collective-communication semantics.\n' + ' "numa_bind",\n' + ' "numa_bind_nodes",\n' + ' "numa_bind_cpus",\n' + ' }' +) + + +def main(): + if not TARGET.exists(): + print(f"[vllm-numa-bind-hash-fix] Target not found: {TARGET}", file=sys.stderr) + sys.exit(1) + + content = TARGET.read_text() + + if MARKER in content: + print("[vllm-numa-bind-hash-fix] Already patched, skipping.", file=sys.stderr) + return + + count = content.count(OLD) + if count == 0: + print( + "[vllm-numa-bind-hash-fix] Could not find ignored_factors anchor. " + "vLLM version may have drifted; inspect ParallelConfig.compute_hash().", + file=sys.stderr, + ) + sys.exit(1) + if count > 1: + print( + f"[vllm-numa-bind-hash-fix] Anchor is ambiguous ({count} occurrences); " + "refusing to patch.", + file=sys.stderr, + ) + sys.exit(1) + + content = content.replace(OLD, NEW) + TARGET.write_text(content) + print( + "[vllm-numa-bind-hash-fix] Added numa_bind/numa_bind_nodes/numa_bind_cpus " + "to ParallelConfig.compute_hash ignored_factors.", + file=sys.stderr, + ) + + +if __name__ == "__main__": + main() diff --git a/configs/patches/vllm_nvlink_one_sided_bf16_fix.py b/configs/patches/vllm_nvlink_one_sided_bf16_fix.py new file mode 100644 index 00000000..7f19ff8e --- /dev/null +++ b/configs/patches/vllm_nvlink_one_sided_bf16_fix.py @@ -0,0 +1,345 @@ +""" +Patch vLLM to backport Inferact/vllm-svf#180 — bf16 activation support for +the FlashInfer NVLink one-sided MoE all-to-all path. + +Without the patch, FlashInferNVLinkOneSidedPrepareAndFinalize hard-codes the +dispatch payload to nvfp4 (0.5 B/elem hidden + per-16-elem fp8 scales). That +crashes for experts that prefer to receive bf16 tokens and quantize +post-dispatch (e.g. trtllm_mxfp4_moe with mxfp8 activations) and for any +non-nvfp4 quant_dtype. 
+ +Affected files (from PR diff): + - vllm/distributed/device_communicators/all2all.py + - vllm/model_executor/layers/fused_moe/all2all_utils.py + - vllm/model_executor/layers/fused_moe/oracle/mxfp4.py + - vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py + +Reference: https://github.com/Inferact/vllm-svf/pull/180 +""" + +import sys +from pathlib import Path + +VLLM_ROOT = Path("/usr/local/lib/python3.12/dist-packages/vllm") + +# --- File 1: distributed/device_communicators/all2all.py ---------------------- + +ALL2ALL_TARGET = VLLM_ROOT / "distributed/device_communicators/all2all.py" + +ALL2ALL_OLD = ( + " top_k: int,\n" + " num_experts: int,\n" + " hidden_size: int,\n" + " ):\n" + ' """Initialize the MoeAlltoAll workspace."""\n' + " if self.initialized:\n" + " return\n" +) + +ALL2ALL_NEW = ( + " top_k: int,\n" + " num_experts: int,\n" + " hidden_size: int,\n" + " dispatch_dtype_bytes_per_elem: int = 0,\n" + " dispatch_has_fp8_scale: bool = True,\n" + " ):\n" + ' """Initialize the MoeAlltoAll workspace.\n' + "\n" + " dispatch_dtype_bytes_per_elem: bytes/elem for the dispatched hidden\n" + " states. Use 0 as a sentinel for sub-byte nvfp4 (0.5 B/elem); use\n" + " 1 for fp8, 2 for bf16/fp16.\n" + " dispatch_has_fp8_scale: whether a per-16-elem fp8 scale tensor is\n" + " dispatched alongside the hidden states (true for nvfp4/fp8,\n" + " false for bf16 passthrough).\n" + ' """\n' + " if self.initialized:\n" + " return\n" +) + +ALL2ALL_OLD_PAYLOAD = ( + " total_dispatch_payload_size_per_token = (\n" + " hidden_size // 2 # nvfp4 hidden states\n" + " + hidden_size // 16 # fp8 scaling factors\n" + " + top_k * 4 # int32 topks ids\n" + " + top_k * 4 # float32 topk weights\n" + " )\n" +) + +ALL2ALL_NEW_PAYLOAD = ( + " if dispatch_dtype_bytes_per_elem == 0:\n" + " hidden_bytes = hidden_size // 2 # nvfp4\n" + " else:\n" + " hidden_bytes = hidden_size * dispatch_dtype_bytes_per_elem\n" + " scale_bytes = hidden_size // 16 if dispatch_has_fp8_scale else 0\n" + " total_dispatch_payload_size_per_token = (\n" + " hidden_bytes\n" + " + scale_bytes\n" + " + top_k * 4 # int32 topks ids\n" + " + top_k * 4 # float32 topk weights\n" + " )\n" +) + +# --- File 2: fused_moe/all2all_utils.py --------------------------------------- + +ALL2ALL_UTILS_TARGET = VLLM_ROOT / "model_executor/layers/fused_moe/all2all_utils.py" + +ALL2ALL_UTILS_OLD_SIG = ( + " routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,\n" + " allow_new_interface: bool = False,\n" + " use_monolithic: bool = False,\n" + ") -> FusedMoEPrepareAndFinalize | None:\n" +) + +ALL2ALL_UTILS_NEW_SIG = ( + " routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,\n" + " allow_new_interface: bool = False,\n" + " use_monolithic: bool = False,\n" + " defer_input_quant: bool = False,\n" + ") -> FusedMoEPrepareAndFinalize | None:\n" +) + +ALL2ALL_UTILS_OLD_BUILD = ( + " prepare_finalize = FlashInferNVLinkOneSidedPrepareAndFinalize(\n" + " max_num_tokens=max_num_tokens,\n" + " top_k=moe.experts_per_token,\n" + " num_experts=moe.num_experts,\n" + " hidden_size=moe.hidden_dim,\n" + " num_dispatchers=all2all_manager.world_size,\n" + " )\n" +) + +ALL2ALL_UTILS_NEW_BUILD = ( + " if defer_input_quant or quant_config.quant_dtype is None:\n" + " # Experts (e.g. 
trtllm_mxfp4 with mxfp8 activations) quantize\n" + " # post-dispatch; ship bf16 tokens with no per-token scale payload.\n" + " dispatch_dtype_bytes_per_elem, dispatch_has_fp8_scale = 2, False\n" + ' elif quant_config.quant_dtype == "nvfp4":\n' + " dispatch_dtype_bytes_per_elem, dispatch_has_fp8_scale = 0, True\n" + " else:\n" + " raise NotImplementedError(\n" + ' "flashinfer_nvlink_one_sided dispatch only supports nvfp4, "\n' + ' "bf16, and defer_input_quant paths today; got "\n' + ' f"quant_dtype={quant_config.quant_dtype!r}"\n' + " )\n" + " prepare_finalize = FlashInferNVLinkOneSidedPrepareAndFinalize(\n" + " max_num_tokens=max_num_tokens,\n" + " top_k=moe.experts_per_token,\n" + " num_experts=moe.num_experts,\n" + " hidden_size=moe.hidden_dim,\n" + " num_dispatchers=all2all_manager.world_size,\n" + " dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,\n" + " dispatch_has_fp8_scale=dispatch_has_fp8_scale,\n" + " )\n" +) + +# --- File 3: fused_moe/oracle/mxfp4.py ---------------------------------------- + +MXFP4_TARGET = VLLM_ROOT / "model_executor/layers/fused_moe/oracle/mxfp4.py" + +MXFP4_OLD = ( + ' """Create a FusedMoEKernel for the given MXFP4 backend."""\n' + " is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic)\n" + "\n" + " # Create Prepare/Finalize.\n" + " prepare_finalize = maybe_make_prepare_finalize(\n" + " moe=moe_config,\n" +) + +MXFP4_NEW = ( + ' """Create a FusedMoEKernel for the given MXFP4 backend."""\n' + " is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic)\n" + "\n" + " # Some experts (trtllm_mxfp4 with mxfp8 activations) prefer bf16 tokens\n" + " # on dispatch and quantize internally; signal this to the prepare/finalize\n" + " # so workspace + prepare path ship bf16 instead of the quant_config dtype.\n" + " from vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe import (\n" + " TrtLlmMxfp4ExpertsBase,\n" + " )\n" + "\n" + " defer_input_quant = issubclass(experts_cls, TrtLlmMxfp4ExpertsBase)\n" + "\n" + " # Create Prepare/Finalize.\n" + " prepare_finalize = maybe_make_prepare_finalize(\n" + " moe=moe_config,\n" +) + +MXFP4_OLD_CALL = ( + " routing_tables=routing_tables,\n" + " allow_new_interface=True,\n" + " use_monolithic=is_monolithic,\n" + " )\n" + " assert prepare_finalize is not None\n" +) + +MXFP4_NEW_CALL = ( + " routing_tables=routing_tables,\n" + " allow_new_interface=True,\n" + " use_monolithic=is_monolithic,\n" + " defer_input_quant=defer_input_quant,\n" + " )\n" + " assert prepare_finalize is not None\n" +) + +# --- File 4: fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py -------- + +PREP_TARGET = VLLM_ROOT / ( + "model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py" +) + +PREP_OLD_INIT = ( + " num_experts: int,\n" + " hidden_size: int,\n" + " num_dispatchers: int = 1,\n" + " ):\n" + " super().__init__()\n" +) + +PREP_NEW_INIT = ( + " num_experts: int,\n" + " hidden_size: int,\n" + " num_dispatchers: int = 1,\n" + " dispatch_dtype_bytes_per_elem: int = 0,\n" + " dispatch_has_fp8_scale: bool = True,\n" + " ):\n" + " super().__init__()\n" +) + +PREP_OLD_CALL = ( + " top_k=self.top_k,\n" + " num_experts=self.num_experts,\n" + " hidden_size=self.hidden_size,\n" + " )\n" +) + +PREP_NEW_CALL = ( + " top_k=self.top_k,\n" + " num_experts=self.num_experts,\n" + " hidden_size=self.hidden_size,\n" + " dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,\n" + " dispatch_has_fp8_scale=dispatch_has_fp8_scale,\n" + " )\n" +) + +PREP_OLD_QUANT = ( + " a1q, a1q_scale = 
moe_kernel_quantize_input(\n" + " a1,\n" + " quant_config.a1_gscale,\n" + " quant_config.quant_dtype,\n" + " quant_config.per_act_token_quant,\n" + " quant_config.block_shape,\n" + " is_fp4_scale_swizzled=False, # delay swizzle to after comm\n" + " )\n" +) + +PREP_NEW_QUANT = ( + " if defer_input_quant:\n" + " # Experts (e.g. trtllm_mxfp4_moe with mxfp8 activations) will\n" + " # quantize post-dispatch. Ship bf16 tokens and skip scales.\n" + " a1q, a1q_scale = a1, None\n" + " else:\n" + " a1q, a1q_scale = moe_kernel_quantize_input(\n" + " a1,\n" + " quant_config.a1_gscale,\n" + " quant_config.quant_dtype,\n" + " quant_config.per_act_token_quant,\n" + " quant_config.block_shape,\n" + " is_fp4_scale_swizzled=False, # delay swizzle to after comm\n" + " )\n" +) + +# (target file, marker indicating already-patched, [(name, old, new), ...]) +FILES = [ + ( + ALL2ALL_TARGET, + "dispatch_dtype_bytes_per_elem", + [ + ("MoeAlltoAll.initialize signature", ALL2ALL_OLD, ALL2ALL_NEW), + ("MoeAlltoAll dispatch payload sizing", ALL2ALL_OLD_PAYLOAD, ALL2ALL_NEW_PAYLOAD), + ], + ), + ( + ALL2ALL_UTILS_TARGET, + # Note: bare "defer_input_quant" appears in a comment in the base + # file ("# Unquantized dispatch (e.g. AITER with defer_input_quant):"), + # so we anchor on a string we *introduce* — namely the parameter + # declaration in maybe_make_prepare_finalize's signature. + "defer_input_quant: bool = False,", + [ + ("maybe_make_prepare_finalize signature", ALL2ALL_UTILS_OLD_SIG, ALL2ALL_UTILS_NEW_SIG), + ( + "FlashInferNVLinkOneSided builder", + ALL2ALL_UTILS_OLD_BUILD, + ALL2ALL_UTILS_NEW_BUILD, + ), + ], + ), + ( + MXFP4_TARGET, + # Note: bare "TrtLlmMxfp4ExpertsBase" appears in a comment in the + # base file. Anchor on the assignment we introduce instead. + "defer_input_quant = issubclass(experts_cls, TrtLlmMxfp4ExpertsBase)", + [ + ("make_mxfp4_moe_kernel defer_input_quant detection", MXFP4_OLD, MXFP4_NEW), + ("make_mxfp4_moe_kernel pass-through", MXFP4_OLD_CALL, MXFP4_NEW_CALL), + ], + ), + ( + PREP_TARGET, + "dispatch_dtype_bytes_per_elem", + [ + ("FlashInferNVLinkOneSided __init__ signature", PREP_OLD_INIT, PREP_NEW_INIT), + ("FlashInferNVLinkOneSided initialize call", PREP_OLD_CALL, PREP_NEW_CALL), + ("FlashInferNVLinkOneSided prepare quant branch", PREP_OLD_QUANT, PREP_NEW_QUANT), + ], + ), +] + + +def patch_file(target: Path, marker: str, patches: list[tuple[str, str, str]]) -> bool: + if not target.exists(): + print(f"[nvlink-bf16-patch] Target not found: {target}", file=sys.stderr) + return False + + content = target.read_text() + if marker in content: + print(f"[nvlink-bf16-patch] {target.name}: already patched, skipping.", file=sys.stderr) + return True + + new_content = content + for name, old, new in patches: + count = new_content.count(old) + if count == 0: + print( + f"[nvlink-bf16-patch] {target.name}: anchor for {name!r} not found. 
" + "vLLM version may have drifted.", + file=sys.stderr, + ) + return False + if count > 1: + print( + f"[nvlink-bf16-patch] {target.name}: anchor for {name!r} is ambiguous " + f"({count} matches); refusing to patch.", + file=sys.stderr, + ) + return False + new_content = new_content.replace(old, new, 1) + print(f"[nvlink-bf16-patch] {target.name}: patched {name}", file=sys.stderr) + + target.write_text(new_content) + return True + + +def main(): + failures = 0 + for target, marker, patches in FILES: + if not patch_file(target, marker, patches): + failures += 1 + + if failures: + print(f"[nvlink-bf16-patch] {failures} file(s) failed to patch", file=sys.stderr) + sys.exit(1) + print("[nvlink-bf16-patch] Done.", file=sys.stderr) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/configs/patches/vllm_nvlink_one_sided_bf16_fix_v20.py b/configs/patches/vllm_nvlink_one_sided_bf16_fix_v20.py new file mode 100644 index 00000000..ce63d88a --- /dev/null +++ b/configs/patches/vllm_nvlink_one_sided_bf16_fix_v20.py @@ -0,0 +1,382 @@ +""" +Patch vLLM v0.20.0 to backport vllm-project/vllm#40960 — bf16 activation +support for the FlashInfer NVLink one-sided MoE all-to-all path. + +Without the patch, FlashInferNVLinkOneSidedPrepareAndFinalize hard-codes the +dispatch payload to nvfp4 (0.5 B/elem hidden + per-16-elem fp8 scales). That +crashes for experts that prefer to receive bf16 tokens and quantize +post-dispatch (e.g. trtllm_mxfp4_moe with mxfp8 activations) and for any +non-nvfp4 quant_dtype. + +Affected files (from PR diff): + - vllm/distributed/device_communicators/all2all.py + - vllm/model_executor/layers/fused_moe/all2all_utils.py + - vllm/model_executor/layers/fused_moe/oracle/mxfp4.py + - vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py + +Reference: https://github.com/vllm-project/vllm/pull/40960 +Target: vLLM v0.20.0 + +Additional local change (NOT in PR 40960): remove the early guard in +maybe_make_prepare_finalize that raises ValueError for any non-nvfp4 +quant_dtype on the flashinfer_nvlink_one_sided backend. The PR 40960 diff +does not touch this guard, so without removing it the new bf16/defer-quant +path is unreachable on v0.20.0. Reviewers on the PR flagged the same gap. +""" + +import sys +from pathlib import Path + +VLLM_ROOT = Path("/usr/local/lib/python3.12/dist-packages/vllm") + +# --- File 1: distributed/device_communicators/all2all.py ---------------------- + +ALL2ALL_TARGET = VLLM_ROOT / "distributed/device_communicators/all2all.py" + +ALL2ALL_OLD = ( + " top_k: int,\n" + " num_experts: int,\n" + " hidden_size: int,\n" + " ):\n" + ' """Initialize the MoeAlltoAll workspace."""\n' + " if self.initialized:\n" + " return\n" +) + +ALL2ALL_NEW = ( + " top_k: int,\n" + " num_experts: int,\n" + " hidden_size: int,\n" + " dispatch_dtype_bytes_per_elem: int = 0,\n" + " dispatch_has_fp8_scale: bool = True,\n" + " ):\n" + ' """Initialize the MoeAlltoAll workspace.\n' + "\n" + " dispatch_dtype_bytes_per_elem: bytes/elem for the dispatched hidden\n" + " states. 
Use 0 as a sentinel for sub-byte nvfp4 (0.5 B/elem); use\n" + " 1 for fp8, 2 for bf16/fp16.\n" + " dispatch_has_fp8_scale: whether a per-16-elem fp8 scale tensor is\n" + " dispatched alongside the hidden states (true for nvfp4/fp8,\n" + " false for bf16 passthrough).\n" + ' """\n' + " if self.initialized:\n" + " return\n" +) + +ALL2ALL_OLD_PAYLOAD = ( + " total_dispatch_payload_size_per_token = (\n" + " hidden_size // 2 # nvfp4 hidden states\n" + " + hidden_size // 16 # fp8 scaling factors\n" + " + top_k * 4 # int32 topks ids\n" + " + top_k * 4 # float32 topk weights\n" + " )\n" +) + +ALL2ALL_NEW_PAYLOAD = ( + " if dispatch_dtype_bytes_per_elem == 0:\n" + " hidden_bytes = hidden_size // 2 # nvfp4\n" + " else:\n" + " hidden_bytes = hidden_size * dispatch_dtype_bytes_per_elem\n" + " scale_bytes = hidden_size // 16 if dispatch_has_fp8_scale else 0\n" + " total_dispatch_payload_size_per_token = (\n" + " hidden_bytes\n" + " + scale_bytes\n" + " + top_k * 4 # int32 topks ids\n" + " + top_k * 4 # float32 topk weights\n" + " )\n" +) + +# --- File 2: fused_moe/all2all_utils.py --------------------------------------- + +ALL2ALL_UTILS_TARGET = VLLM_ROOT / "model_executor/layers/fused_moe/all2all_utils.py" + +ALL2ALL_UTILS_OLD_SIG = ( + " routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,\n" + " allow_new_interface: bool = False,\n" + " use_monolithic: bool = False,\n" + ") -> FusedMoEPrepareAndFinalize | None:\n" +) + +ALL2ALL_UTILS_NEW_SIG = ( + " routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,\n" + " allow_new_interface: bool = False,\n" + " use_monolithic: bool = False,\n" + " defer_input_quant: bool = False,\n" + ") -> FusedMoEPrepareAndFinalize | None:\n" +) + +# Local change: drop the nvfp4-only guard so the bf16/defer_input_quant path +# in the patched builder below is reachable. Not part of PR 40960. +ALL2ALL_UTILS_OLD_GUARD = ( + " elif moe.use_fi_nvl_one_sided_kernels:\n" + " assert quant_config is not None\n" + ' if quant_config.quant_dtype != "nvfp4":\n' + " raise ValueError(\n" + " \"The 'flashinfer_nvlink_one_sided' all2all backend only \"\n" + ' "supports nvfp4 activation quantization, but got "\n' + ' f"quant_dtype={quant_config.quant_dtype!r}. Use a different "\n' + " \"all2all backend (e.g. 'flashinfer_nvlink_two_sided' or \"\n" + " \"'allgather_reducescatter') for non-nvfp4 models.\"\n" + " )\n" + " max_num_tokens = (\n" + " get_current_vllm_config().scheduler_config.max_num_batched_tokens\n" + " )\n" +) + +ALL2ALL_UTILS_NEW_GUARD = ( + " elif moe.use_fi_nvl_one_sided_kernels:\n" + " assert quant_config is not None\n" + " max_num_tokens = (\n" + " get_current_vllm_config().scheduler_config.max_num_batched_tokens\n" + " )\n" +) + +ALL2ALL_UTILS_OLD_BUILD = ( + " prepare_finalize = FlashInferNVLinkOneSidedPrepareAndFinalize(\n" + " max_num_tokens=max_num_tokens,\n" + " top_k=moe.experts_per_token,\n" + " num_experts=moe.num_experts,\n" + " hidden_size=moe.hidden_dim,\n" + " num_dispatchers=all2all_manager.world_size,\n" + " )\n" +) + +ALL2ALL_UTILS_NEW_BUILD = ( + " if defer_input_quant or quant_config.quant_dtype is None:\n" + " # Experts (e.g. 
trtllm_mxfp4 with mxfp8 activations) quantize\n" + " # post-dispatch; ship bf16 tokens with no per-token scale payload.\n" + " dispatch_dtype_bytes_per_elem, dispatch_has_fp8_scale = 2, False\n" + ' elif quant_config.quant_dtype == "nvfp4":\n' + " dispatch_dtype_bytes_per_elem, dispatch_has_fp8_scale = 0, True\n" + " else:\n" + " raise NotImplementedError(\n" + ' "flashinfer_nvlink_one_sided dispatch only supports nvfp4, "\n' + ' "bf16, and defer_input_quant paths today; got "\n' + ' f"quant_dtype={quant_config.quant_dtype!r}"\n' + " )\n" + " prepare_finalize = FlashInferNVLinkOneSidedPrepareAndFinalize(\n" + " max_num_tokens=max_num_tokens,\n" + " top_k=moe.experts_per_token,\n" + " num_experts=moe.num_experts,\n" + " hidden_size=moe.hidden_dim,\n" + " num_dispatchers=all2all_manager.world_size,\n" + " dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,\n" + " dispatch_has_fp8_scale=dispatch_has_fp8_scale,\n" + " )\n" +) + +# --- File 3: fused_moe/oracle/mxfp4.py ---------------------------------------- + +MXFP4_TARGET = VLLM_ROOT / "model_executor/layers/fused_moe/oracle/mxfp4.py" + +MXFP4_OLD = ( + ' """Create a FusedMoEKernel for the given MXFP4 backend."""\n' + " is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic)\n" + "\n" + " # Create Prepare/Finalize.\n" + " prepare_finalize = maybe_make_prepare_finalize(\n" + " moe=moe_config,\n" +) + +MXFP4_NEW = ( + ' """Create a FusedMoEKernel for the given MXFP4 backend."""\n' + " is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic)\n" + "\n" + " # Some experts (trtllm_mxfp4 with mxfp8 activations) prefer bf16 tokens\n" + " # on dispatch and quantize internally; signal this to the prepare/finalize\n" + " # so workspace + prepare path ship bf16 instead of the quant_config dtype.\n" + " from vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe import (\n" + " TrtLlmMxfp4ExpertsBase,\n" + " )\n" + "\n" + " defer_input_quant = issubclass(experts_cls, TrtLlmMxfp4ExpertsBase)\n" + "\n" + " # Create Prepare/Finalize.\n" + " prepare_finalize = maybe_make_prepare_finalize(\n" + " moe=moe_config,\n" +) + +MXFP4_OLD_CALL = ( + " routing_tables=routing_tables,\n" + " allow_new_interface=True,\n" + " use_monolithic=is_monolithic,\n" + " )\n" + " assert prepare_finalize is not None\n" +) + +MXFP4_NEW_CALL = ( + " routing_tables=routing_tables,\n" + " allow_new_interface=True,\n" + " use_monolithic=is_monolithic,\n" + " defer_input_quant=defer_input_quant,\n" + " )\n" + " assert prepare_finalize is not None\n" +) + +# --- File 4: fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py -------- + +PREP_TARGET = VLLM_ROOT / ( + "model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py" +) + +PREP_OLD_INIT = ( + " num_experts: int,\n" + " hidden_size: int,\n" + " num_dispatchers: int = 1,\n" + " ):\n" + " super().__init__()\n" +) + +PREP_NEW_INIT = ( + " num_experts: int,\n" + " hidden_size: int,\n" + " num_dispatchers: int = 1,\n" + " dispatch_dtype_bytes_per_elem: int = 0,\n" + " dispatch_has_fp8_scale: bool = True,\n" + " ):\n" + " super().__init__()\n" +) + +PREP_OLD_CALL = ( + " top_k=self.top_k,\n" + " num_experts=self.num_experts,\n" + " hidden_size=self.hidden_size,\n" + " )\n" +) + +PREP_NEW_CALL = ( + " top_k=self.top_k,\n" + " num_experts=self.num_experts,\n" + " hidden_size=self.hidden_size,\n" + " dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,\n" + " dispatch_has_fp8_scale=dispatch_has_fp8_scale,\n" + " )\n" +) + +PREP_OLD_QUANT = ( + " a1q, a1q_scale = 
moe_kernel_quantize_input(\n" + " a1,\n" + " quant_config.a1_gscale,\n" + " quant_config.quant_dtype,\n" + " quant_config.per_act_token_quant,\n" + " quant_config.block_shape,\n" + " is_fp4_scale_swizzled=False, # delay swizzle to after comm\n" + " )\n" +) + +PREP_NEW_QUANT = ( + " if defer_input_quant:\n" + " # Experts (e.g. trtllm_mxfp4_moe with mxfp8 activations) will\n" + " # quantize post-dispatch. Ship bf16 tokens and skip scales.\n" + " a1q, a1q_scale = a1, None\n" + " else:\n" + " a1q, a1q_scale = moe_kernel_quantize_input(\n" + " a1,\n" + " quant_config.a1_gscale,\n" + " quant_config.quant_dtype,\n" + " quant_config.per_act_token_quant,\n" + " quant_config.block_shape,\n" + " is_fp4_scale_swizzled=False, # delay swizzle to after comm\n" + " )\n" +) + +# (target file, marker indicating already-patched, [(name, old, new), ...]) +FILES = [ + ( + ALL2ALL_TARGET, + "dispatch_dtype_bytes_per_elem", + [ + ("MoeAlltoAll.initialize signature", ALL2ALL_OLD, ALL2ALL_NEW), + ("MoeAlltoAll dispatch payload sizing", ALL2ALL_OLD_PAYLOAD, ALL2ALL_NEW_PAYLOAD), + ], + ), + ( + ALL2ALL_UTILS_TARGET, + # Anchor on a string this patch introduces — namely the new parameter + # in maybe_make_prepare_finalize's signature. Bare "defer_input_quant" + # appears in unrelated comments in the base file, so use the full + # declaration form for uniqueness. + "defer_input_quant: bool = False,", + [ + ("maybe_make_prepare_finalize signature", ALL2ALL_UTILS_OLD_SIG, ALL2ALL_UTILS_NEW_SIG), + ( + "remove nvfp4-only guard (local, not from PR 40960)", + ALL2ALL_UTILS_OLD_GUARD, + ALL2ALL_UTILS_NEW_GUARD, + ), + ( + "FlashInferNVLinkOneSided builder", + ALL2ALL_UTILS_OLD_BUILD, + ALL2ALL_UTILS_NEW_BUILD, + ), + ], + ), + ( + MXFP4_TARGET, + # Anchor on the assignment introduced by this patch. + "defer_input_quant = issubclass(experts_cls, TrtLlmMxfp4ExpertsBase)", + [ + ("make_mxfp4_moe_kernel defer_input_quant detection", MXFP4_OLD, MXFP4_NEW), + ("make_mxfp4_moe_kernel pass-through", MXFP4_OLD_CALL, MXFP4_NEW_CALL), + ], + ), + ( + PREP_TARGET, + "dispatch_dtype_bytes_per_elem", + [ + ("FlashInferNVLinkOneSided __init__ signature", PREP_OLD_INIT, PREP_NEW_INIT), + ("FlashInferNVLinkOneSided initialize call", PREP_OLD_CALL, PREP_NEW_CALL), + ("FlashInferNVLinkOneSided prepare quant branch", PREP_OLD_QUANT, PREP_NEW_QUANT), + ], + ), +] + + +def patch_file(target: Path, marker: str, patches: list[tuple[str, str, str]]) -> bool: + if not target.exists(): + print(f"[nvlink-bf16-patch] Target not found: {target}", file=sys.stderr) + return False + + content = target.read_text() + if marker in content: + print(f"[nvlink-bf16-patch] {target.name}: already patched, skipping.", file=sys.stderr) + return True + + new_content = content + for name, old, new in patches: + count = new_content.count(old) + if count == 0: + print( + f"[nvlink-bf16-patch] {target.name}: anchor for {name!r} not found. 
" + "vLLM version may have drifted.", + file=sys.stderr, + ) + return False + if count > 1: + print( + f"[nvlink-bf16-patch] {target.name}: anchor for {name!r} is ambiguous " + f"({count} matches); refusing to patch.", + file=sys.stderr, + ) + return False + new_content = new_content.replace(old, new, 1) + print(f"[nvlink-bf16-patch] {target.name}: patched {name}", file=sys.stderr) + + target.write_text(new_content) + return True + + +def main(): + failures = 0 + for target, marker, patches in FILES: + if not patch_file(target, marker, patches): + failures += 1 + + if failures: + print(f"[nvlink-bf16-patch] {failures} file(s) failed to patch", file=sys.stderr) + sys.exit(1) + print("[nvlink-bf16-patch] Done.", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/configs/prefetch-ai-dynamo-wheel.sh b/configs/prefetch-ai-dynamo-wheel.sh new file mode 100755 index 00000000..2a770493 --- /dev/null +++ b/configs/prefetch-ai-dynamo-wheel.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +DYNAMO_VERSION="${DYNAMO_VERSION:-}" + +if [ -z "${DYNAMO_VERSION}" ]; then + echo "ERROR: DYNAMO_VERSION must be set for ai-dynamo wheel prefetch" >&2 + exit 1 +fi + +DYNAMO_PACKAGE="ai-dynamo==${DYNAMO_VERSION}" +DYNAMO_RUNTIME_PACKAGE="ai-dynamo-runtime==${DYNAMO_VERSION}" +DYNAMO_WHEEL_NAME="${DYNAMO_WHEEL_NAME:-ai_dynamo-${DYNAMO_VERSION}-py3-none-any.whl}" +DYNAMO_RUNTIME_WHEEL_PATTERN="${DYNAMO_RUNTIME_WHEEL_PATTERN:-ai_dynamo_runtime-${DYNAMO_VERSION}-*.whl}" +DYNAMO_INDEX_URL="${DYNAMO_INDEX_URL:-https://pypi.org/simple}" +DYNAMO_EXTRA_INDEX_URL="${DYNAMO_EXTRA_INDEX_URL:-https://pypi.nvidia.com}" + +source_dir="${SRTCTL_SOURCE_DIR:-$(pwd)}" +wheel_dir="${DYNAMO_WHEEL_HOST_DIR:-${source_dir}/configs/wheels}" +wheel_path="${wheel_dir}/${DYNAMO_WHEEL_NAME}" +lock_path="${wheel_dir}/.${DYNAMO_WHEEL_NAME}.lock" + +mkdir -p "${wheel_dir}" + +runtime_wheel_path() { + find "${wheel_dir}" -maxdepth 1 -type f -name "${DYNAMO_RUNTIME_WHEEL_PATTERN}" -print -quit +} + +python_with_pip() { + if python3 -m pip --version >/dev/null 2>&1; then + command -v python3 + return + fi + + if ! command -v uv >/dev/null 2>&1; then + echo "ERROR: python3 does not provide pip, and uv is unavailable to create a pip-seeded prefetch venv" >&2 + return 1 + fi + + local prefetch_venv="${DYNAMO_PREFETCH_VENV:-${wheel_dir}/.prefetch-venv}" + uv venv --seed "${prefetch_venv}" >/dev/null + echo "${prefetch_venv}/bin/python" +} + +if [ -f "${wheel_path}" ] && [ -n "$(runtime_wheel_path)" ]; then + echo "ai-dynamo wheels already staged: ${wheel_dir}" + exit 0 +fi + +download_wheels() { + local python_bin + python_bin="$(python_with_pip)" + "${python_bin}" -m pip download \ + --no-deps \ + --pre \ + --only-binary=:all: \ + --index-url "${DYNAMO_INDEX_URL}" \ + --extra-index-url "${DYNAMO_EXTRA_INDEX_URL}" \ + --dest "${wheel_dir}" \ + "${DYNAMO_RUNTIME_PACKAGE}" \ + "${DYNAMO_PACKAGE}" +} + +if command -v flock >/dev/null 2>&1; then + ( + flock -x 9 + if [ ! -f "${wheel_path}" ] || [ -z "$(runtime_wheel_path)" ]; then + echo "Staging ai-dynamo wheels: ${DYNAMO_RUNTIME_PACKAGE} ${DYNAMO_PACKAGE} -> ${wheel_dir}" + download_wheels + fi + ) 9>"${lock_path}" +else + echo "Staging ai-dynamo wheels: ${DYNAMO_RUNTIME_PACKAGE} ${DYNAMO_PACKAGE} -> ${wheel_dir}" + download_wheels +fi + +if [ ! 
-f "${wheel_path}" ]; then + echo "ERROR: expected ${wheel_path} after download" >&2 + exit 1 +fi + +if [ -z "$(runtime_wheel_path)" ]; then + echo "ERROR: expected ${DYNAMO_RUNTIME_WHEEL_PATTERN} in ${wheel_dir} after download" >&2 + exit 1 +fi diff --git a/configs/vllm-container-deps-one-sided.sh b/configs/vllm-container-deps-one-sided.sh new file mode 100644 index 00000000..d716e84f --- /dev/null +++ b/configs/vllm-container-deps-one-sided.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +apt-get -y update && apt-get install -y --no-install-recommends --allow-change-held-packages numactl + +pip install msgpack + +if [ -n "${DYNAMO_VERSION:-}" ] || [ -n "${DYNAMO_WHEEL_NAME:-}" ]; then + if [ -f /configs/install-ai-dynamo.sh ]; then + bash /configs/install-ai-dynamo.sh + else + echo "ERROR: /configs/install-ai-dynamo.sh not found for ai-dynamo wheel install" >&2 + exit 1 + fi +fi + +# Upgrade FlashInfer for the NVLink one-sided all-to-all bf16 dispatch patch. +# flashinfer-python / flashinfer-cubin publish on PyPI; flashinfer-jit-cache is +# CUDA-specific and only on the cu130 index. --index-url replaces PyPI entirely, +# so split into two calls. +pip install --upgrade flashinfer-python==0.6.9 flashinfer-cubin==0.6.9 +pip install --upgrade flashinfer-jit-cache==0.6.9 --index-url https://flashinfer.ai/whl/cu130 + +if [ -f /configs/patches/vllm_numa_bind_hash_fix.py ]; then + python3 /configs/patches/vllm_numa_bind_hash_fix.py +fi + +if [ -f /configs/patches/vllm_nvlink_one_sided_bf16_fix.py ]; then + python3 /configs/patches/vllm_nvlink_one_sided_bf16_fix.py +fi diff --git a/configs/vllm-container-deps.sh b/configs/vllm-container-deps.sh index 43807255..1d48d023 100644 --- a/configs/vllm-container-deps.sh +++ b/configs/vllm-container-deps.sh @@ -2,4 +2,26 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -pip install msgpack \ No newline at end of file +pip install msgpack + +if [ -n "${DYNAMO_VERSION:-}" ] || [ -n "${DYNAMO_WHEEL_NAME:-}" ]; then + if [ -f /configs/install-ai-dynamo.sh ]; then + bash /configs/install-ai-dynamo.sh + else + echo "ERROR: /configs/install-ai-dynamo.sh not found for ai-dynamo wheel install" >&2 + exit 1 + fi +fi + +if [ -f /configs/patches/vllm_numa_bind_hash_fix.py ]; then + python3 /configs/patches/vllm_numa_bind_hash_fix.py +fi + +if [ -f /configs/patches/vllm_nvlink_one_sided_bf16_fix_v20.py ]; then + python3 /configs/patches/vllm_nvlink_one_sided_bf16_fix_v20.py +fi + +if [ -f /configs/patches/vllm_cumem_expandable_segments_fix.py ]; then + python3 /configs/patches/vllm_cumem_expandable_segments_fix.py +fi + diff --git a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p17d-tep4-tp4.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p17d-tep4-tp4.yaml new file mode 100644 index 00000000..0dea47db --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p17d-tep4-tp4.yaml @@ -0,0 +1,93 @@ +name: "vllm-disagg-gb300-1p17d-tep4-tp4" +model: + path: "dsv4_pro" + container: "vllm/vllm-openai:v0.20.0-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 17 + prefill_workers: 1 + decode_workers: 17 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "18x36x72" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git 
a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p6d-dep4-tp4.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p6d-dep4-tp4.yaml
new file mode 100644
index 00000000..c1d324ca
--- /dev/null
+++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p6d-dep4-tp4.yaml
@@ -0,0 +1,99 @@
+name: "vllm-disagg-gb300-1p6d-dep4-tp4"
+model:
+  path: "dsv4_pro"
+  container: "vllm/vllm-openai:v0.20.0-cu130"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+setup_script: vllm-container-deps.sh
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 6
+  prefill_workers: 1
+  decode_workers: 6
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      attention-config: '{"use_fp4_indexer_cache": true}'
+      moe-backend: "deep_gemm_mega_moe"
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 256
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      enable-ep-weight-filter: true
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-ep-weight-filter: true
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      no-enable-flashinfer-autotune: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "192x256"
+  req_rate: "inf"
+  tokenizer_mode: "deepseek_v4"
+  use_chat_template: true
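The dynamo: block above is what the wheel scripts earlier in this diff serve.
A minimal sketch of the intended flow, assuming the launcher maps the recipe's
wheel: value to DYNAMO_VERSION and bind-mounts the repo's configs/ directory at
/configs inside the container (neither mapping is shown in this diff):

  # Submit host: stage both wheels into configs/wheels/ (gitignored above).
  DYNAMO_VERSION=1.2.0.dev20260426 ./configs/prefetch-ai-dynamo-wheel.sh
  # Container: the setup_script then installs them offline from
  # DYNAMO_WHEEL_DIRS="/configs/wheels /configs" and applies the patches.
  DYNAMO_VERSION=1.2.0.dev20260426 bash /configs/vllm-container-deps.sh

diff --git a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml
new file mode 100644
index 00000000..83e78d46
--- /dev/null
+++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml
@@ -0,0 +1,108 @@
+name: "dsv4-vllm-disagg-gb300-4p1d-dep4-dep8"
+
+model:
+  path: "dsv4-pro"
+  container: "vllm/vllm-openai:v0.20.0-aarch64-cu130"
+  precision: "fp4"
+
+dynamo:
+  wheel: "1.2.0.dev20260426"
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 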
4 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml new file mode 100644 index 00000000..119fafcf --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml @@ -0,0 +1,108 @@ +name: "dsv4-vllm-disagg-gb300-5p1d-dep4-dep8" + +model: + path: "dsv4-pro" + container: "vllm/vllm-openai:v0.20.0-aarch64-cu130" + precision: "fp4" + +dynamo: + wheel: "1.2.0.dev20260426" + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + 
VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml new file mode 100644 index 00000000..03c72fa3 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml @@ -0,0 +1,108 @@ +name: "dsv4-vllm-disagg-gb300-6p1d-dep4-dep8" + +model: + path: "dsv4-pro" + container: "vllm/vllm-openai:v0.20.0-aarch64-cu130" + precision: "fp4" + +dynamo: + wheel: "1.2.0.dev20260426" + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + 
max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-7p2d-dep4-dep16.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-7p2d-dep4-dep16.yaml new file mode 100644 index 00000000..5278efc0 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-7p2d-dep4-dep16.yaml @@ -0,0 +1,104 @@ +name: "vllm-disagg-gb300-7p2d-dep4-dep16" +model: + path: "dsv4-pro" + container: "vllm/vllm-openai:v0.20.0-aarch64-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 7 + decode_nodes: 8 + prefill_workers: 7 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 +
data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048x3072" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml new file mode 100644 index 00000000..ed2004e5 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -0,0 +1,118 @@ +name: "svf-vllm-disagg-gb200-1p1d-dep8-dep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: false + +setup_script: vllm-container-deps.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + 
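+      # Editor's note (hedged): the offload-* knobs below are read here as
+      # host-offloading one parameter group in every three and prefetching two
+      # steps ahead; their exact semantics are not documented in this diff.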
offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64x128x256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml new file mode 100644 index 00000000..a29d51a0 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -0,0 +1,119 @@ +name: "svf-vllm-disagg-gb200-1p4d-dep8-tp8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + 
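+      # Editor's note (hedged): prefix caching is disabled below, presumably so
+      # every benchmark request pays the full prefill cost and runs stay
+      # comparable across recipes.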
no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml new file mode 100644 index 00000000..d79b78f4 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml @@ -0,0 +1,119 @@ +name: "svf-vllm-disagg-gb200-1p8d-dep8-tp8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 16 + prefill_workers: 1 + decode_workers: 8 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + 
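+      # Editor's note (hedged): "dep8" in these recipe names appears to denote
+      # tensor-parallel-size 1 with data-parallel-size 8 plus
+      # enable-expert-parallel, i.e. experts sharded across all 8 ranks.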
tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128x256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml new file mode 100644 index 00000000..2a175a5a --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml @@ -0,0 +1,115 @@ +name: "svf-vllm-disagg-gb200-2p1d-dep8-dep8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: false + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": 
"NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml new file mode 100644 index 00000000..fd3f4f36 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml @@ -0,0 +1,115 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep16-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: false + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + 
NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml new file mode 100644 index 00000000..417d8476 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml @@ -0,0 +1,115 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: false + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: 
"n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml new file mode 100644 index 00000000..83b5e6a4 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -0,0 +1,118 @@ +name: "svf-vllm-disagg-gb200-1p1d-dep8-dep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + 
VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml new file mode 100644 index 00000000..5943c41c --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -0,0 +1,119 @@ +name: "svf-vllm-disagg-gb200-1p1d-dep8-tep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + 
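+    # Editor's note (hedged): decode mirrors the prefill environment; the
+    # 3600 s ready timeout covers slow multi-node engine start-up, and
+    # VLLM_SERVER_DEV_MODE=1 exposes the development endpoints (e.g.
+    # sleep/wake_up) that pair with enable-sleep-mode in the engine flags.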
VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + attention-config: '{"use_fp4_indexer_cache":true}' + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml new file mode 100644 index 00000000..e35f9f62 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -0,0 +1,118 @@ +name: "svf-vllm-disagg-gb200-1p4d-dep8-tp8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +health_check: + max_attempts: 360 + interval_seconds: 10 + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" 
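+    # Editor's note (hedged): randomized DP dummy inputs plus the
+    # uniform_random routing simulation below appear intended to emulate
+    # balanced expert routing, so the benchmark measures best-case EP load
+    # balance rather than the real router's behavior.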
+ VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml new file mode 100644 index 00000000..dc2401f8 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml @@ -0,0 +1,118 @@ +name: "svf-vllm-disagg-gb200-1p8d-dep8-tp8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +health_check: + max_attempts: 360 + interval_seconds: 10 + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 16 + prefill_workers: 1 + decode_workers: 8 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + 
NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8x16x32x64x128x256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml new file mode 100644 index 00000000..ef6ad86e --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml @@ -0,0 +1,110 @@ +name: "svf-vllm-disagg-gb200-2p1d-dep8-dep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps-one-sided.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: 
false +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + all2all-backend: "flashinfer_nvlink_one_sided" + enable-ep-weight-filter: true + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml new file mode 100644 index 00000000..e7fa4a49 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml @@ -0,0 +1,114 @@ +name: "svf-vllm-disagg-gb200-2p1d-dep8-dep8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + 
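+    # Editor's note (hedged): VLLM_SPARSE_INDEXER_MAX_LOGITS_MB (above) and
+    # VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE (below) read as buffer caps -- 1024 MB
+    # of indexer logits, 2048 tokens per expert on the FP4 MoE path; both are
+    # vLLM-internal knobs not documented in this diff.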
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml new file mode 100644 index 00000000..0b456f83 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml @@ -0,0 +1,118 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep16" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + 
VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048x4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml new file mode 100644 index 00000000..043cd370 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml @@ -0,0 +1,114 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep16-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + 
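+    # Editor's note (hedged): TILELANG_CLEANUP_TEMP_FILES=1 is taken to clear
+    # TileLang JIT build artifacts between runs; the rest of this environment
+    # matches the other offload recipes in this directory.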
TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml new file mode 100644 index 00000000..df713575 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml @@ -0,0 +1,109 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps-one-sided.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + 
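+  # Editor's note: a single Dynamo frontend (enable_multiple_frontends: false)
+  # fronts 3 prefill workers x 8 GPUs = 24 GPUs over 6 nodes and 1 decode
+  # worker x 8 GPUs over 2 nodes (4 GPUs per node, per resources above).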
type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml new file mode 100644 index 00000000..e39ea0df --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml @@ -0,0 +1,114 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + 
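+    # Editor's note (hedged): the UCX_* settings below pin NIXL's UCX
+    # transports to CUDA copy/IPC and TCP for the KV-cache transfer path, and
+    # NCCL_P2P_LEVEL=NVL keeps CUDA peer-to-peer on NVLink-connected pairs.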
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml new file mode 100644 index 00000000..87475266 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml @@ -0,0 +1,109 @@ +name: "svf-vllm-disagg-gb200-7p1d-dep8-dep16" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps-one-sided.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 7 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: 
"3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + all2all-backend: "flashinfer_nvlink_one_sided" + enable-ep-weight-filter: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-dep8-tp4-c256-c512.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-dep8-tp4-c256-c512.yaml new file mode 100644 index 00000000..b8f7592d --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-dep8-tp4-c256-c512.yaml @@ -0,0 +1,111 @@ +name: "svf-vllm-disagg-gb300-1p12d-dep8-tp4" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 12 + prefill_workers: 1 + decode_workers: 12 + gpus_per_prefill: 8 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + 
UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 1024 + max-cudagraph-capture-size: 1024 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-tp4-tp4-c32-c128.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-tp4-tp4-c32-c128.yaml new file mode 100644 index 00000000..88571eb7 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-tp4-tp4-c32-c128.yaml @@ -0,0 +1,96 @@ +name: "svf-vllm-disagg-gb300-1p12d-tp4-tp4" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 12 + prefill_workers: 1 + decode_workers: 12 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: 
NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 10240 #auto + max-num-seqs: 128 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 1024 + max-cudagraph-capture-size: 1024 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x128" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-dep8-tep8-c1024.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-dep8-tep8-c1024.yaml new file mode 100644 index 00000000..eff5968d --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-dep8-tep8-c1024.yaml @@ -0,0 +1,112 @@ +name: "svf-vllm-disagg-gb300-1p4d-dep8-tep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + 
no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 4096 + max-cudagraph-capture-size: 4096 + max-num-batched-tokens: 4096 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-tp4-tp4-c128.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-tp4-tp4-c128.yaml new file mode 100644 index 00000000..df12dd7a --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-tp4-tp4-c128.yaml @@ -0,0 +1,89 @@ +name: "svf-vllm-disagg-gb300-1p4d-tp4-tp4" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 10240 + max-num-seqs: 128 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 10240 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: 
'{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-dep8-tp4-c512.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-dep8-tp4-c512.yaml new file mode 100644 index 00000000..2928a199 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-dep8-tp4-c512.yaml @@ -0,0 +1,111 @@ +name: "svf-vllm-disagg-gb300-1p8d-dep8-tp4" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 8 + gpus_per_prefill: 8 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 1024 + max-cudagraph-capture-size: 1024 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + 
isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-tp4-tp4-c8-c16-c128.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-tp4-tp4-c8-c16-c128.yaml new file mode 100644 index 00000000..3c2f1b9f --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-tp4-tp4-c8-c16-c128.yaml @@ -0,0 +1,96 @@ +name: "svf-vllm-disagg-gb300-1p8d-tp4-tp4" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 8 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 10240 + max-num-seqs: 128 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 1024 + max-cudagraph-capture-size: 1024 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8x16x128" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep16-offload-c512-c4096.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep16-offload-c512-c4096.yaml new file mode 100644 index 00000000..2aa2464e --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep16-offload-c512-c4096.yaml @@ -0,0 +1,109 @@ +name: "svf-vllm-disagg-gb300-2p1d-dep8-dep16-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: 
"1.2.0.dev20260426" +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep8-offload-c4096.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep8-offload-c4096.yaml new file mode 100644 index 00000000..21b3d196 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep8-offload-c4096.yaml @@ -0,0 +1,109 @@ +name: "svf-vllm-disagg-gb300-2p1d-dep8-dep8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb300" + gpus_per_node: 4 + 
prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/src/srtctl/benchmarks/sa_bench.py b/src/srtctl/benchmarks/sa_bench.py index 5f220393..e690cb19 100644 --- a/src/srtctl/benchmarks/sa_bench.py +++ b/src/srtctl/benchmarks/sa_bench.py @@ -101,5 +101,6 @@ def build_command( str(b.num_warmup_mult) if b.num_warmup_mult is not None else "2", b.custom_tokenizer or "", str(b.use_chat_template).lower(), + b.tokenizer_mode or "auto", ] return cmd diff --git a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py index 87f3f9ef..0014f221 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py +++ b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py @@ -629,10 +629,30 @@ def get_tokenizer( "to use mistral tokenizer mode." 
) from e return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path)) + if tokenizer_mode == "deepseek_v4": + try: + from vllm.tokenizers.deepseek_v4 import DeepseekV4Tokenizer + except ImportError as e: + raise ImportError( + "DeepseekV4Tokenizer requires vllm package.\n" + "Please install it with `pip install vllm` " + "to use deepseek_v4 tokenizer mode." + ) from e + return DeepseekV4Tokenizer.from_pretrained(str(pretrained_model_name_or_path)) if custom_tokenizer: if custom_tokenizer == "glm_moe_dsa": return _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path) + if custom_tokenizer == "deepseek_v4": + try: + from vllm.tokenizers.deepseek_v4 import DeepseekV4Tokenizer + except ImportError as e: + raise ImportError( + "DeepseekV4Tokenizer requires vllm package.\n" + "Please install it with `pip install vllm` " + "to use deepseek_v4 tokenizer." + ) from e + return DeepseekV4Tokenizer.from_pretrained(str(pretrained_model_name_or_path)) from importlib import import_module try: module_path, class_name = custom_tokenizer.rsplit('.', 1) diff --git a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh index acddf754..999705e0 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh +++ b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh @@ -64,6 +64,10 @@ NUM_PROMPTS_MULT=${13:-10} NUM_WARMUP_MULT=${14:-2} CUSTOM_TOKENIZER=${15:-} USE_CHAT_TEMPLATE=${16:-true} +TOKENIZER_MODE=${17:-auto} + +# Build optional tokenizer mode args +TOKENIZER_MODE_ARGS=(--tokenizer-mode "$TOKENIZER_MODE") # Build optional custom tokenizer args CUSTOM_TOKENIZER_ARGS=() @@ -136,6 +140,8 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do --percentile-metrics ttft,tpot,itl,e2el \ --max-concurrency "$concurrency" \ --trust-remote-code \ + "${TOKENIZER_MODE_ARGS[@]}" \ + "${CHAT_TEMPLATE_ARGS[@]}" \ "${CUSTOM_TOKENIZER_ARGS[@]}" num_prompts=$((concurrency * 10)) @@ -166,6 +172,7 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do --percentile-metrics ttft,tpot,itl,e2el \ --max-concurrency "$concurrency" \ --trust-remote-code \ + "${TOKENIZER_MODE_ARGS[@]}" \ "${CHAT_TEMPLATE_ARGS[@]}" \ "${CUSTOM_TOKENIZER_ARGS[@]}" \ --save-result --result-dir "$result_dir" --result-filename "$result_filename" @@ -179,4 +186,3 @@ done stop_all_profiling echo "SA-Bench complete. Results in $result_dir" - diff --git a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py index a5ea6490..952a8b23 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py +++ b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py @@ -1272,11 +1272,12 @@ def main(args: argparse.Namespace): "--tokenizer-mode", type=str, default="auto", - choices=["auto", "slow", "mistral", "custom"], + choices=["auto", "slow", "mistral", "custom", "deepseek_v4"], help='The tokenizer mode.\n\n* "auto" will use the ' 'fast tokenizer if available.\n* "slow" will ' "always use the slow tokenizer. \n* " - '"mistral" will always use the `mistral_common` tokenizer. \n*' + '"mistral" will always use the `mistral_common` tokenizer. \n* ' + '"deepseek_v4" will use vLLM\'s DeepSeek V4 tokenizer. 
\n* ' '"custom" will use --tokenizer to select the preregistered tokenizer.', ) diff --git a/src/srtctl/cli/submit.py b/src/srtctl/cli/submit.py index 21f26d9f..697f14e6 100644 --- a/src/srtctl/cli/submit.py +++ b/src/srtctl/cli/submit.py @@ -19,6 +19,7 @@ import logging import os import re +import shlex import shutil import subprocess import sys @@ -115,7 +116,8 @@ def show_config_details(config: SrtConfig) -> None: console.print(Panel(mounts_table, border_style="green")) # --- Environment Variables --- - has_env = bool(config.environment) + dynamo_environment = config.dynamo.get_wheel_environment() + has_env = bool(config.environment or dynamo_environment) backend = config.backend mode_envs: list[tuple[str, dict[str, str]]] = [] for mode_name, attr in [ @@ -134,6 +136,9 @@ def show_config_details(config: SrtConfig) -> None: env_table.add_column("Variable", style="yellow") env_table.add_column("Value", style="white") + for var, val in sorted(dynamo_environment.items()): + env_table.add_row("dynamo", var, val) + for var, val in sorted(config.environment.items()): env_table.add_row("global", var, val) @@ -207,6 +212,8 @@ def generate_minimal_sbatch_script( container_image = os.path.expandvars(config.model.container) job_name = get_job_name(config) + config_environment = config.dynamo.get_wheel_environment() + config_environment.update(config.environment) rendered = template.render( job_name=job_name, @@ -227,6 +234,7 @@ def generate_minimal_sbatch_script( srtctl_source=str(srtctl_source.resolve()), output_base=output_base, setup_script=setup_script, + config_environment={key: shlex.quote(str(value)) for key, value in config_environment.items()}, ) return rendered diff --git a/src/srtctl/core/runtime.py b/src/srtctl/core/runtime.py index 31195ed3..2a65c67b 100644 --- a/src/srtctl/core/runtime.py +++ b/src/srtctl/core/runtime.py @@ -242,6 +242,9 @@ def from_config( # Add FormattablePath mounts from config.container_mounts # These need to be expanded with the runtime context, so we create a # temporary context first and then update + environment = config.dynamo.get_wheel_environment() + environment.update(config.environment) + temp_context = cls( job_id=job_id, run_name=run_name, @@ -255,7 +258,7 @@ def from_config( network_interface=get_srtslurm_setting("network_interface", "eth0"), container_mounts={}, srun_options=dict(config.srun_options), - environment=dict(config.environment), + environment=environment, is_hf_model=is_hf_model, ) @@ -278,7 +281,7 @@ def from_config( network_interface=get_srtslurm_setting("network_interface", "eth0"), container_mounts=container_mounts, srun_options=dict(config.srun_options), - environment=dict(config.environment), + environment=environment, is_hf_model=is_hf_model, ) diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py index c535be39..a6e84bbd 100644 --- a/src/srtctl/core/schema.py +++ b/src/srtctl/core/schema.py @@ -14,6 +14,7 @@ import builtins import itertools import logging +import shlex from collections.abc import Iterator, Mapping from dataclasses import field from enum import Enum @@ -543,6 +544,7 @@ class BenchmarkConfig: num_warmup_mult: int | None = None # Multiplier for warmup prompts = concurrency * mult (default: 2) # Trace replay benchmark fields (uses aiperf with mooncake_trace dataset type) trace_file: str | None = None # Path to trace JSONL file (container path, e.g., /traces/dataset.jsonl) + tokenizer_mode: str | None = None # Tokenizer mode passed to SA-Bench (e.g., "auto", "deepseek_v4") custom_tokenizer: str | None = 
None # Custom tokenizer class (e.g., "module.path.ClassName") use_chat_template: bool = True # Pass --use-chat-template to benchmark (default: true) @@ -680,7 +682,7 @@ def get_nsys_prefix(self, output_file: str, *, frontend_type: str | None = None) class DynamoConfig: """Dynamo installation configuration. - Only one of version, hash, or top_of_tree should be specified. + Only one of version, hash, top_of_tree, or wheel should be specified. Defaults to version="0.8.0" (pip install). Options: @@ -689,31 +691,133 @@ class DynamoConfig: version: Install specific version from PyPI (e.g., "0.8.0") hash: Clone repo and checkout specific commit hash top_of_tree: Clone repo at HEAD (latest) + wheel: ai-dynamo package version to install via staged wheels. The + matching ai-dynamo-runtime wheel is installed automatically. - If top_of_tree or hash is set, version is automatically cleared. + If top_of_tree, hash, or wheel is set, version is automatically cleared. """ install: bool = True version: str | None = "0.8.0" hash: str | None = None top_of_tree: bool = False + wheel: str | None = None def __post_init__(self) -> None: - # Auto-clear version if hash or top_of_tree is set - if self.hash is not None or self.top_of_tree: + install_sources = [ + ("hash", self.hash is not None), + ("top_of_tree", self.top_of_tree), + ("wheel", self.wheel is not None), + ] + enabled_sources = [name for name, enabled in install_sources if enabled] + + # Auto-clear version if another install source is set. + if enabled_sources: object.__setattr__(self, "version", None) # Validate only one source option is set - if self.hash is not None and self.top_of_tree: - raise ValueError("Cannot specify both hash and top_of_tree") + if len(enabled_sources) > 1: + raise ValueError(f"Cannot specify both Dynamo install sources: {', '.join(enabled_sources)}") + + if self.wheel is not None: + if not self.wheel.strip(): + raise ValueError("dynamo.wheel must be a non-empty package version") + if Path(self.wheel).name.endswith(".whl") or "/" in self.wheel: + raise ValueError("dynamo.wheel must be a package version like '1.2.0.dev20260426', not a filename") @property def needs_source_install(self) -> bool: """Whether this config requires a source install (git clone + maturin).""" - return self.hash is not None or self.top_of_tree + return self.wheel is None and (self.hash is not None or self.top_of_tree) + + @property + def wheel_version(self) -> str | None: + """Package version requested for staged wheel installation.""" + return self.wheel + + @property + def wheel_name(self) -> str | None: + """Return the ai-dynamo wheel filename for the requested package version.""" + if not self.wheel: + return None + return f"ai_dynamo-{self.wheel}-py3-none-any.whl" + + def get_wheel_environment(self) -> dict[str, str]: + """Environment variables consumed by ai-dynamo prefetch/setup scripts.""" + if not self.wheel: + return {} + wheel_name = self.wheel_name + env = {"DYNAMO_WHEEL_NAME": wheel_name} if wheel_name else {} + version = self.wheel_version + if version: + env["DYNAMO_VERSION"] = version + return env + + @staticmethod + def _source_install_retry_helpers() -> str: + """Bash helpers for transient network failures during source installs.""" + return ( + "dynamo_retry_git_clone() { " + 'target="$1"; ' + 'attempts="${DYNAMO_INSTALL_RETRIES:-5}"; ' + 'delay="${DYNAMO_INSTALL_RETRY_DELAY:-10}"; ' + 'max_delay="${DYNAMO_INSTALL_RETRY_MAX_DELAY:-120}"; ' + 'jitter="${DYNAMO_INSTALL_RETRY_JITTER:-5}"; ' + "attempt=1; " + "while true; do " + 
'tmp_target="${target}.clone.$$.$attempt"; ' + 'rm -rf "$target" "$tmp_target"; ' + 'if git clone https://github.com/ai-dynamo/dynamo.git "$tmp_target"; then ' + 'rm -rf "$target" && mv "$tmp_target" "$target" && return 0; ' + "else " + "rc=$?; " + "fi; " + 'rm -rf "$tmp_target"; ' + 'if [ "$attempt" -ge "$attempts" ]; then ' + 'echo "Dynamo git clone failed after $attempts attempts" >&2; ' + 'return "$rc"; ' + "fi; " + 'sleep_for="$delay"; ' + 'if [ "$jitter" -gt 0 ]; then sleep_for=$((sleep_for + RANDOM % (jitter + 1))); fi; ' + 'echo "Dynamo git clone failed on attempt $attempt/$attempts (exit $rc); retrying in ${sleep_for}s" >&2; ' + 'sleep "$sleep_for"; ' + "attempt=$((attempt + 1)); " + "delay=$((delay * 2)); " + 'if [ "$delay" -gt "$max_delay" ]; then delay="$max_delay"; fi; ' + "done; " + "}; " + ) def get_install_commands(self) -> str: """Get the bash commands to install dynamo.""" + if self.wheel is not None: + wheel_name = self.wheel_name or Path(self.wheel).name + wheels_path_shell = shlex.quote(f"/configs/wheels/{wheel_name}") + configs_path_shell = shlex.quote(f"/configs/{wheel_name}") + version = self.wheel_version + if not version: + raise ValueError("dynamo.wheel must provide an exact package version") + runtime_package = f"ai-dynamo-runtime=={version}" + runtime_package_shell = shlex.quote(runtime_package) + start_message = shlex.quote(f"Installing ai-dynamo-runtime and ai-dynamo from wheel {wheel_name}...") + done_message = shlex.quote(f"ai-dynamo-runtime and ai-dynamo install path completed for {wheel_name}") + return ( + f"echo {start_message} && " + "if [ -f /configs/install-ai-dynamo.sh ]; then " + "bash /configs/install-ai-dynamo.sh; " + f"elif [ -f {wheels_path_shell} ]; then " + "python3 -m pip install --pre --no-deps --no-index " + f"--find-links /configs/wheels {runtime_package_shell} {wheels_path_shell}; " + f"elif [ -f {configs_path_shell} ]; then " + "python3 -m pip install --pre --no-deps --no-index " + f"--find-links /configs {runtime_package_shell} {configs_path_shell}; " + "else " + f"echo 'ERROR: exact ai-dynamo wheels for {version} were not found in /configs/wheels or /configs' >&2; " + "exit 1; " + "fi && " + f"echo {done_message}" + ) + if self.version is not None: return ( f"echo 'Installing dynamo {self.version}...' && " @@ -729,7 +833,7 @@ def get_install_commands(self) -> str: sglang = ( "apt-get update -qq && apt-get install -y -qq libclang-dev > /dev/null 2>&1 && " "cd /sgl-workspace/ && " - "git clone https://github.com/ai-dynamo/dynamo.git && " + "dynamo_retry_git_clone dynamo && " "cd dynamo && " f"{checkout_cmd + ' && ' if checkout_cmd else ''}" "cd lib/bindings/python/ && " @@ -751,7 +855,7 @@ def get_install_commands(self) -> str: "if ! command -v maturin &> /dev/null; then " "pip install --break-system-packages maturin; fi; fi && " "ORIG_DIR=$(pwd) && rm -rf /tmp/dynamo_build && mkdir -p /tmp/dynamo_build && cd /tmp/dynamo_build && " - "git clone https://github.com/ai-dynamo/dynamo.git && " + "dynamo_retry_git_clone dynamo && " "cd dynamo && " f"{checkout_cmd + ' && ' if checkout_cmd else ''}" "cd lib/bindings/python/ && " @@ -767,6 +871,7 @@ def get_install_commands(self) -> str: return ( f"echo 'Installing dynamo from source ({git_ref})...' 
&& " + f"{self._source_install_retry_helpers()}" f"if [ -d /sgl-workspace ]; then {sglang}; else {portable}; fi" ) diff --git a/src/srtctl/core/slurm.py b/src/srtctl/core/slurm.py index 6b5f4e58..7518ae50 100644 --- a/src/srtctl/core/slurm.py +++ b/src/srtctl/core/slurm.py @@ -245,15 +245,16 @@ def start_srun_process( # Build bash command with environment setup bash_parts = [] - # Add preamble if provided - if bash_preamble: - bash_parts.append(bash_preamble) - - # Export environment variables + # Export environment variables before the preamble so setup scripts can + # consume recipe-provided values. if env_to_set: for name, value in env_to_set.items(): bash_parts.append(f"export {name}={shlex.quote(value)}") + # Add preamble if provided + if bash_preamble: + bash_parts.append(bash_preamble) + # Add the main command bash_parts.append(shlex.join(command)) diff --git a/src/srtctl/core/topology.py b/src/srtctl/core/topology.py index f2a24e5d..472e377d 100644 --- a/src/srtctl/core/topology.py +++ b/src/srtctl/core/topology.py @@ -35,8 +35,8 @@ class NodePortAllocator: assignments per node and hands out the next available port. Port ranges (non-overlapping): - - kv_events_port: 5550+ (global) - ZMQ port for kv-events publishing - - nixl_port: 6550+ (global) - NIXL side channel for KV transfers (vLLM) + - kv_events_port: 20000+ (global) - ZMQ port for kv-events publishing + - nixl_port: 21000+ (global) - NIXL side channel for KV transfers (vLLM) - http_port: 30000+ (per node) - HTTP serving port - bootstrap_port: 31000+ (per node) - P/D coordination port (prefill only) @@ -53,8 +53,8 @@ class NodePortAllocator: base_http_port: int = 30000 base_bootstrap_port: int = 31000 - base_kv_events_port: int = 5550 - base_nixl_port: int = 6550 # NIXL side channel ports (must not overlap with kv_events) + base_kv_events_port: int = 20000 + base_nixl_port: int = 21000 # NIXL side channel ports (must not overlap with kv_events) _http_ports: dict[str, int] = field(default_factory=dict, repr=False) _bootstrap_ports: dict[str, int] = field(default_factory=dict, repr=False) diff --git a/src/srtctl/frontends/dynamo.py b/src/srtctl/frontends/dynamo.py index 5e5109a1..48b41ea2 100644 --- a/src/srtctl/frontends/dynamo.py +++ b/src/srtctl/frontends/dynamo.py @@ -83,6 +83,10 @@ def start_frontends( "DYN_REQUEST_PLANE": "nats", } + # Add global recipe environment, including values derived from + # dynamo.wheel, before frontend-specific overrides. + env_to_set.update(runtime.environment) + # Add frontend env from config if config.frontend.env: env_to_set.update(config.frontend.env) diff --git a/src/srtctl/templates/job_script_minimal.j2 b/src/srtctl/templates/job_script_minimal.j2 index 6c0fa9a0..b8c8f50b 100644 --- a/src/srtctl/templates/job_script_minimal.j2 +++ b/src/srtctl/templates/job_script_minimal.j2 @@ -75,18 +75,55 @@ echo "Head node: ${HEAD_NODE}" # Set source directory for container mounts (/configs) export SRTCTL_SOURCE_DIR="${SRTCTL_SOURCE}" +{% for key, value in config_environment.items() %} +export {{ key }}={{ value }} +{% endfor %} + echo "" echo "Preparing srtctl environment..." -# Install uv if not present (single binary, no dependencies) -if ! command -v uv &> /dev/null; then +# SLURM inherits the submitter environment by default. If srtctl was submitted +# from an activated virtualenv, that venv can point at a pipless or wrong-arch +# Python on the compute node. Drop it before selecting Python/uv for the job. 
+if [ -n "${VIRTUAL_ENV:-}" ]; then + echo "Ignoring inherited virtualenv: ${VIRTUAL_ENV}" + CLEAN_PATH="" + OLD_IFS="${IFS}" + IFS=":" + for PATH_ENTRY in ${PATH}; do + if [ "${PATH_ENTRY}" != "${VIRTUAL_ENV}/bin" ]; then + if [ -z "${CLEAN_PATH}" ]; then + CLEAN_PATH="${PATH_ENTRY}" + else + CLEAN_PATH="${CLEAN_PATH}:${PATH_ENTRY}" + fi + fi + done + IFS="${OLD_IFS}" + export PATH="${CLEAN_PATH}" + unset VIRTUAL_ENV +fi + +# Install a job-local uv so inherited submitter binaries cannot shadow the +# compute-node architecture. +UV_BIN_DIR="${OUTPUT_DIR}/uv-bin" +mkdir -p "${UV_BIN_DIR}" +if ! "${UV_BIN_DIR}/uv" --version >/dev/null 2>&1; then echo "Installing uv package manager..." - curl -LsSf https://astral.sh/uv/install.sh | sh - export PATH="$HOME/.local/bin:$PATH" + curl -LsSf https://astral.sh/uv/install.sh | env XDG_BIN_HOME="${UV_BIN_DIR}" INSTALLER_NO_MODIFY_PATH=1 sh fi +export PATH="${UV_BIN_DIR}:$PATH" echo "Using uv with Python 3.12..." +if [ -n "${DYNAMO_WHEEL_NAME:-}" ] || [ "${SRTCTL_PREFETCH_AI_DYNAMO:-0}" = "1" ]; then + if [ -f "${SRTCTL_SOURCE}/configs/prefetch-ai-dynamo-wheel.sh" ]; then + bash "${SRTCTL_SOURCE}/configs/prefetch-ai-dynamo-wheel.sh" + else + echo "WARNING: ${SRTCTL_SOURCE}/configs/prefetch-ai-dynamo-wheel.sh not found" + fi +fi + {% if setup_script %} # Custom setup script override from CLI export SRTCTL_SETUP_SCRIPT="{{ setup_script }}" diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index c15759b2..5a2b2d47 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -77,6 +77,36 @@ def test_validate_config_valid(self): errors = runner.validate_config(config) assert errors == [] + def test_build_command_includes_tokenizer_mode(self): + """Passes tokenizer mode through to the SA-Bench script.""" + from unittest.mock import MagicMock + + from srtctl.benchmarks.sa_bench import SABenchRunner + from srtctl.core.schema import BenchmarkConfig, ModelConfig, ResourceConfig, SrtConfig + + runner = SABenchRunner() + runtime = MagicMock() + runtime.frontend_port = 8000 + runtime.is_hf_model = False + + config = SrtConfig( + name="test", + model=ModelConfig(path="/model", container="/image", precision="fp4"), + resources=ResourceConfig(gpu_type="h100"), + benchmark=BenchmarkConfig( + type="sa-bench", + isl=1024, + osl=1024, + concurrencies=[4, 8], + tokenizer_mode="deepseek_v4", + use_chat_template=True, + ), + ) + + cmd = runner.build_command(config, runtime) + + assert cmd[-3:] == ["", "true", "deepseek_v4"] + class TestSGLangBenchRunner: """Test SGLang-Bench runner.""" diff --git a/tests/test_configs.py b/tests/test_configs.py index 0b4138d5..1c08d10f 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -125,6 +125,14 @@ def test_hash_install_command(self): assert config.needs_source_install cmd = config.get_install_commands() assert "git clone" in cmd + assert "dynamo_retry_git_clone dynamo" in cmd + assert "DYNAMO_INSTALL_RETRIES:-5" in cmd + assert "DYNAMO_INSTALL_RETRY_DELAY:-10" in cmd + assert "DYNAMO_INSTALL_RETRY_MAX_DELAY:-120" in cmd + assert "DYNAMO_INSTALL_RETRY_JITTER:-5" in cmd + assert "RANDOM % (jitter + 1)" in cmd + assert 'rm -rf "$target" "$tmp_target"' in cmd + assert "else rc=$?; fi" in cmd assert "git checkout abc123" in cmd assert "maturin build" in cmd assert "if [ -d /sgl-workspace ]" in cmd @@ -133,6 +141,60 @@ def test_hash_install_command(self): assert "if ! command -v cargo" in cmd assert "if ! 
command -v maturin" in cmd + def test_wheel_install_command(self): + """Wheel config installs ai-dynamo plus runtime without source build.""" + from srtctl.core.schema import DynamoConfig + + config = DynamoConfig(wheel="1.2.0.dev20260426") + cmd = config.get_install_commands() + + assert config.version is None + assert config.needs_source_install is False + assert "install-ai-dynamo.sh" in cmd + assert "ai_dynamo-1.2.0.dev20260426-py3-none-any.whl" in cmd + assert "--no-deps" in cmd + assert "ai-dynamo-runtime==1.2.0.dev20260426" in cmd + assert "--find-links /configs/wheels" in cmd + assert "--find-links /configs" in cmd + assert "--extra-index-url" not in cmd + assert "were not found" in cmd + assert "maturin" not in cmd + assert "git clone" not in cmd + + def test_source_install_clone_retry_helper_retries_and_cleans_partial_clone(self, tmp_path): + """Clone helper retries transient failures and cleans partial clone directories.""" + import subprocess + + from srtctl.core.schema import DynamoConfig + + script = f""" +set -euo pipefail +{DynamoConfig._source_install_retry_helpers()} +git() {{ + count=0 + if [ -f attempts ]; then count=$(cat attempts); fi + count=$((count + 1)) + echo "$count" > attempts + mkdir -p "$3" + echo "attempt-$count" > "$3/marker" + if [ "$count" -lt 3 ]; then + return 22 + fi + return 0 +}} +export DYNAMO_INSTALL_RETRIES=4 +export DYNAMO_INSTALL_RETRY_DELAY=0 +export DYNAMO_INSTALL_RETRY_JITTER=0 +dynamo_retry_git_clone dynamo +test "$(cat attempts)" = "3" +test "$(cat dynamo/marker)" = "attempt-3" +if find . -maxdepth 1 -type d -name 'dynamo.clone.*' | grep -q .; then + echo "leftover temp clone" >&2 + exit 1 +fi +""" + subprocess.run(["bash", "-c", script], cwd=tmp_path, check=True, capture_output=True, text=True) + def test_top_of_tree_install_command(self): """Top-of-tree config generates source install without checkout.""" from srtctl.core.schema import DynamoConfig @@ -156,6 +218,40 @@ def test_hash_and_top_of_tree_not_allowed(self): with pytest.raises(ValueError, match="Cannot specify both"): DynamoConfig(hash="abc123", top_of_tree=True) + def test_hash_and_wheel_not_allowed(self): + """Cannot specify both hash and wheel.""" + from srtctl.core.schema import DynamoConfig + + with pytest.raises(ValueError, match="Cannot specify both"): + DynamoConfig(hash="abc123", wheel="1.2.0.dev20260426") + + def test_wheel_filename_not_allowed(self): + """Wheel config takes a package version, not an artifact filename.""" + from srtctl.core.schema import DynamoConfig + + with pytest.raises(ValueError, match="package version"): + DynamoConfig(wheel="ai_dynamo-1.2.0.dev20260426-py3-none-any.whl") + + def test_wheel_version_required(self): + """Wheel config must provide an exact package version.""" + from srtctl.core.schema import DynamoConfig + + with pytest.raises(ValueError, match="non-empty package version"): + DynamoConfig(wheel="") + + def test_wheel_environment_from_version(self): + """Wheel version is converted to setup/prefetch environment.""" + from srtctl.core.schema import DynamoConfig + + config = DynamoConfig(wheel="1.2.0.dev20260426") + + assert config.wheel_version == "1.2.0.dev20260426" + assert config.wheel_name == "ai_dynamo-1.2.0.dev20260426-py3-none-any.whl" + assert config.get_wheel_environment() == { + "DYNAMO_VERSION": "1.2.0.dev20260426", + "DYNAMO_WHEEL_NAME": "ai_dynamo-1.2.0.dev20260426-py3-none-any.whl", + } + class TestSGLangProtocol: """Tests for SGLangProtocol.""" @@ -479,6 +575,32 @@ def 
test_sbatch_template_includes_setup_script_env_var(self): ) assert 'export SRTCTL_SETUP_SCRIPT="install-sglang-main.sh"' in script + def test_sbatch_template_prefetches_dynamo_wheel(self): + """Test that dynamo.wheel is exported and prefetched before orchestrator launch.""" + from pathlib import Path + + from srtctl.cli.submit import generate_minimal_sbatch_script + from srtctl.core.schema import DynamoConfig, ModelConfig, ResourceConfig, SrtConfig + + config = SrtConfig( + name="test", + model=ModelConfig(path="/model", container="/container.sqsh", precision="fp8"), + resources=ResourceConfig(gpu_type="h100", gpus_per_node=8, agg_nodes=1), + dynamo=DynamoConfig( + install=True, + wheel="1.2.0.dev20260426", + ), + ) + + script = generate_minimal_sbatch_script(config, Path("/tmp/test.yaml")) + + assert "export DYNAMO_VERSION=1.2.0.dev20260426" in script + assert "export DYNAMO_WHEEL_NAME=ai_dynamo-1.2.0.dev20260426-py3-none-any.whl" in script + assert "Ignoring inherited virtualenv" in script + assert 'unset VIRTUAL_ENV' in script + assert 'UV_BIN_DIR="${OUTPUT_DIR}/uv-bin"' in script + assert "configs/prefetch-ai-dynamo-wheel.sh" in script + def test_setup_script_env_var_override(self, monkeypatch): """Test that SRTCTL_SETUP_SCRIPT env var overrides config.""" import os @@ -1323,8 +1445,6 @@ def test_connector_lmcache_generates_kv_transfer_config(self): def test_connector_custom_json_passthrough(self): """connector set to a raw JSON string is passed through as-is.""" - import json - custom = '{"kv_connector":"MyCustomConnector","kv_role":"kv_both"}' cmd = self._build_cmd_with_connector(custom) idx = cmd.index("--kv-transfer-config")
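
The staged-wheel install path added in schema.py is what the recipes above opt into with dynamo: install: true, wheel: "1.2.0.dev20260426". A minimal usage sketch of the config surface, mirroring the assertions in test_configs.py; nothing here goes beyond what the hunks define:

    from srtctl.core.schema import DynamoConfig

    cfg = DynamoConfig(wheel="1.2.0.dev20260426")

    assert cfg.version is None                 # wheel clears the PyPI default
    assert cfg.needs_source_install is False   # no git clone / maturin build
    assert cfg.wheel_name == "ai_dynamo-1.2.0.dev20260426-py3-none-any.whl"

    # Exported into the sbatch script (and shown in the submit env table)
    # so the prefetch/setup scripts can locate the staged wheels:
    assert cfg.get_wheel_environment() == {
        "DYNAMO_WHEEL_NAME": "ai_dynamo-1.2.0.dev20260426-py3-none-any.whl",
        "DYNAMO_VERSION": "1.2.0.dev20260426",
    }

At job runtime, get_install_commands prefers /configs/install-ai-dynamo.sh when present, then falls back to pip-installing the exact ai-dynamo wheel plus ai-dynamo-runtime from /configs/wheels, then from /configs, and exits with an error if no staged wheel matches the pinned version.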
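
dynamo_retry_git_clone retries transient clone failures with capped exponential backoff plus bounded jitter, and clones each attempt into a unique temp directory that is moved into place only on success, so a partial clone never masquerades as a good checkout. A back-of-envelope model of the sleep schedule under the helper's defaults (DYNAMO_INSTALL_RETRIES=5, 10 s initial delay doubling to a 120 s cap, 0-5 s jitter); the Python below is illustrative only, the authoritative logic is the inline bash above:

    import random

    def sleep_schedule(attempts=5, delay=10, max_delay=120, jitter=5):
        """Sleeps occur between failed attempts; none after the last one."""
        sleeps = []
        for _ in range(attempts - 1):
            # Mirrors the bash: sleep_for = delay + RANDOM % (jitter + 1)
            sleeps.append(delay + random.randint(0, jitter))
            delay = min(delay * 2, max_delay)
        return sleeps

    # Four possible retries under defaults: roughly [10-15, 20-25, 40-45, 80-85] s;
    # the 120 s cap only binds if DYNAMO_INSTALL_RETRIES is raised.
    print(sleep_schedule())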
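
The tokenizer-mode plumbing crosses three layers: BenchmarkConfig.tokenizer_mode is appended by sa_bench.build_command as the script's 17th positional argument, bench.sh reads it as ${17:-auto} and forwards it as --tokenizer-mode, and benchmark_serving.py accepts the new deepseek_v4 choice. A sketch of just the handoff, using only values visible in the hunks:

    tokenizer_mode = "deepseek_v4"         # BenchmarkConfig.tokenizer_mode
    use_chat_template = True
    custom_tokenizer = None

    # sa_bench.build_command ends the command with three positionals:
    cmd_tail = [
        custom_tokenizer or "",            # ${15} CUSTOM_TOKENIZER
        str(use_chat_template).lower(),    # ${16} USE_CHAT_TEMPLATE
        tokenizer_mode or "auto",          # ${17} TOKENIZER_MODE (new)
    ]
    assert cmd_tail == ["", "true", "deepseek_v4"]

    # bench.sh then passes it unconditionally on every run:
    #   TOKENIZER_MODE=${17:-auto}
    #   TOKENIZER_MODE_ARGS=(--tokenizer-mode "$TOKENIZER_MODE")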
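
The NodePortAllocator rebase moves the two global ranges (kv-events ZMQ and the NIXL side channel) from 5550/6550 up to 20000/21000 while leaving the per-node HTTP and bootstrap ranges at 30000/31000, so the four port classes stay non-overlapping. Assuming the next-available-port handout the docstring describes (the allocator's accessor methods are not shown in this hunk), the i-th global consumer lands at:

    # Base values come from the diff; the sequential "+ i" handout is inferred
    # from the docstring's "hands out the next available port".
    BASE_KV_EVENTS = 20000   # global: ZMQ kv-events publishing
    BASE_NIXL = 21000        # global: NIXL side channel for KV transfers (vLLM)
    BASE_HTTP = 30000        # per node: HTTP serving
    BASE_BOOTSTRAP = 31000   # per node: P/D coordination (prefill only)

    for i in range(3):
        print(f"global consumer {i}: kv_events={BASE_KV_EVENTS + i}, "
              f"nixl={BASE_NIXL + i}")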
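
The slurm.py reorder is behavioral, not cosmetic: recipe environment (including DYNAMO_VERSION and DYNAMO_WHEEL_NAME derived from dynamo.wheel) is now exported before the bash preamble runs, so setup scripts launched from the preamble can consume those values. A sketch of how the parts line up, with a placeholder preamble and command (how slurm.py joins bash_parts is not shown in this hunk):

    import shlex

    env_to_set = {"DYNAMO_VERSION": "1.2.0.dev20260426"}
    bash_preamble = "bash /configs/install-ai-dynamo.sh"   # illustrative preamble
    command = ["echo", "worker ready"]                     # illustrative command

    bash_parts = []
    for name, value in env_to_set.items():                 # exports come first now
        bash_parts.append(f"export {name}={shlex.quote(value)}")
    if bash_preamble:
        bash_parts.append(bash_preamble)                   # preamble sees the exports
    bash_parts.append(shlex.join(command))

    print("\n".join(bash_parts))  # the actual joiner in slurm.py is an assumption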