diff --git a/recipes/b200-fp4/1k1k.yaml b/recipes/b200-fp4/1k1k.yaml new file mode 100644 index 00000000..b08193bc --- /dev/null +++ b/recipes/b200-fp4/1k1k.yaml @@ -0,0 +1,259 @@ +# B200-FP4 1k1k — STP and MTP in one file +# +# Two inference modes distinguished by override key names: +# zip_override_stp_* — standard token prediction (no speculative decoding) +# zip_override_mtp_* — multi-token prediction (EAGLE speculative decoding) +# +# Low-latency variants: tep8 decode (DP=1), dep4 prefill (DP=4 TP=4) +# Max-throughput variants: dep8 decode (DP=8), adds SGLANG_MOE_NVFP4_DISPATCH and SGLANG_FLASHINFER_FP4_GEMM_BACKEND +# +# Note: max-tpt 1d has max-running-requests=1024; max-tpt 2d keeps 512. +# MTP max-tpt 1d additionally uses mem-fraction=0.75 for decode. +# +# Usage: +# srtctl apply -f recipes/b200-fp4/1k1k.yaml # all 8 variants +# srtctl apply -f "recipes/b200-fp4/1k1k.yaml:*stp*" # all STP variants +# srtctl apply -f "recipes/b200-fp4/1k1k.yaml:*mtp*" # all MTP variants +# srtctl apply -f "recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]" # STP 1p5d only +# srtctl dry-run -f recipes/b200-fp4/1k1k.yaml # preview + +base: + name: "b200-fp4-stp-1k1k" + + model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + + resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 5 + decode_workers: 5 + gpus_per_node: 8 + + backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + decode_environment: + 
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + 
data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + health_check: + max_attempts: 360 + interval_seconds: 10 + + benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + req_rate: "inf" + + +# STP low-latency: tep8 decode (DP=1), scale sweep 1p5d and 1p6d +zip_override_stp_lowlat: + name: + - "b200-fp4-stp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-stp-low-latency-dep4-1p-tep8-6d" + resources: + decode_nodes: [5, 6] + decode_workers: [5, 6] + benchmark: + concurrencies: ["16x128", "32x64x256"] + + +# MTP low-latency: same scales as STP, adds EAGLE speculative decoding + fp4-gemm-backend +zip_override_mtp_lowlat: + name: + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-6d" + resources: + decode_nodes: [5, 6] + decode_workers: [5, 6] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + fp4-gemm-backend: "flashinfer_trtllm" + decode: + fp4-gemm-backend: "flashinfer_trtllm" + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["16x512", "32x64x256x512"] + + +# STP max-throughput: dep8 decode (DP=8), scale sweep 1p1d and 1p2d +# Adds SGLANG_MOE_NVFP4_DISPATCH + SGLANG_FLASHINFER_FP4_GEMM_BACKEND env vars +# 1d: max-running-requests=1024; 2d: keeps 512 +zip_override_stp_maxtpt: + name: + - "b200-fp4-stp-max-tpt-dep4-1p-dep8-1d" + - "b200-fp4-stp-max-tpt-dep4-1p-dep8-2d" + resources: + decode_nodes: [1, 2] + decode_workers: [1, 2] + backend: + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + sglang_config: + prefill: + 
max-running-requests: [1024, 512] + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: [1024, 512] + cuda-graph-max-bs: [1024, 512] + benchmark: + concurrencies: ["512", "512"] + + +# MTP max-throughput: dep8 decode, scale sweep 1p1d and 1p2d, adds EAGLE speculative decoding +# Adds SGLANG_MOE_NVFP4_DISPATCH + SGLANG_FLASHINFER_FP4_GEMM_BACKEND + fp4-gemm-backend +# 1d: max-running-requests=1024, mem-fraction=0.75 for decode; 2d: keeps 512/0.85 +zip_override_mtp_maxtpt: + name: + - "b200-fp4-mtp-max-tpt-dep4-1p-dep8-1d" + - "b200-fp4-mtp-max-tpt-dep4-1p-dep8-2d" + resources: + decode_nodes: [1, 2] + decode_workers: [1, 2] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + fp4-gemm-backend: "flashinfer_trtllm" + max-running-requests: [1024, 512] + decode: + fp4-gemm-backend: "flashinfer_trtllm" + mem-fraction-static: [0.75, 0.85] + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: [1024, 512] + cuda-graph-max-bs: [1024, 512] + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["512x1024", "512"] diff --git a/recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml deleted file mode 100644 index ba299dc8..00000000 --- a/recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml +++ /dev/null @@ -1,149 +0,0 @@ -name: "b200-fp4-low-latency-dep4-1p-tep8-5d" - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 5 - decode_workers: 5 - gpus_per_node: 8 - 
-backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # 
MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "16x512" - req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-6d.yaml b/recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-6d.yaml deleted file mode 100644 index 449ab625..00000000 --- a/recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-6d.yaml +++ /dev/null @@ -1,149 +0,0 @@ -name: "b200-fp4-low-latency-dep4-1p-tep8-6d" - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 6 - decode_workers: 6 - gpus_per_node: 8 - -backend: - prefill_environment: - 
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: 
"flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32x64x256x512" - req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-1d.yaml b/recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-1d.yaml deleted file mode 100644 index 522df0f0..00000000 --- a/recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-1d.yaml +++ /dev/null @@ -1,153 +0,0 @@ -name: "b200-fp4-max-tpt-dep4-1p-dep8-1d" - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: 
"1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 1024 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: 
"fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.75 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 1024 - cuda-graph-max-bs: 1024 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x1024" - req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-2d.yaml b/recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-2d.yaml deleted file mode 100644 index a56a3fd1..00000000 --- a/recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-2d.yaml +++ /dev/null @@ -1,153 +0,0 @@ -name: "b200-fp4-max-tpt-dep4-1p-dep8-2d" - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 2 - decode_workers: 2 - 
gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - 
enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512" - req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml deleted file mode 100644 index 0f219d07..00000000 --- a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml +++ /dev/null @@ -1,139 +0,0 @@ -name: "b200-fp4-low-latency-dep4-1p-tep8-5d" - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" 
- prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 5 - decode_workers: 5 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - 
attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "16x128" - req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml deleted file mode 100644 index 55347d69..00000000 --- a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml +++ /dev/null @@ -1,139 +0,0 @@ -name: "b200-fp4-low-latency-dep4-1p-tep8-6d" - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 6 - decode_workers: 6 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - 
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model 
configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32x64x256" - req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml deleted file mode 100644 index 7e617cb2..00000000 --- a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml +++ /dev/null @@ -1,143 +0,0 @@ -name: "b200-fp4-max-tpt-dep4-1p-dep8-1d" - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - 
NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 1024 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token 
limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 1024 - cuda-graph-max-bs: 1024 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512" - req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml deleted file mode 100644 index 51051ce4..00000000 --- a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml +++ /dev/null @@ -1,143 +0,0 @@ -name: "b200-fp4-max-tpt-dep4-1p-dep8-2d" - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 2 - decode_workers: 2 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - 
PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - 
tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512" - req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k.yaml b/recipes/b200-fp4/8k1k.yaml new file mode 100644 index 00000000..f5bfc964 --- /dev/null +++ b/recipes/b200-fp4/8k1k.yaml @@ -0,0 +1,351 @@ +# B200-FP4 8k1k — STP and MTP in one file +# +# Three modes distinguished by override key names: +# override_stp_tp4 / override_mtp_tp4: TP4 prefill (DP=1, EP=1) — low-latency single-node +# zip_override_stp_lowlat / zip_override_mtp_lowlat: dep4 prefill + tep8 decode (DP=1) +# override_stp_maxtpt_7p2d / override_mtp_maxtpt_7p2d: dep4 prefill + dep8 decode, 7p2d +# override_mtp_maxtpt_4p1d: MTP-only 4p1d, no frontends, env-var FP4 backend +# +# Usage: +# srtctl apply -f recipes/b200-fp4/8k1k.yaml # all 11 variants +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:*stp* # all STP variants +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:*mtp* # all MTP variants +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:override_stp_tp4 # STP tp4 only +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0] # STP 1p1d only +# srtctl dry-run -f recipes/b200-fp4/8k1k.yaml # preview + +base: + name: "b200-fp4-stp-8k1k" + + dynamo: + version: 0.8.1 + + model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + + frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + + resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 1 + 
decode_workers: 1 + gpus_per_node: 8 + + backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + 
moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + fp4-gemm-backend: "flashinfer_trtllm" + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + health_check: + max_attempts: 360 + interval_seconds: 10 + + benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: "inf" + + +# STP TP4 prefill mode: TP4 (DP=1, EP=1) instead of dep4 — low-latency single-node +override_stp_tp4: + name: "b200-fp4-stp-low-latency-tp4-1p-tp8-1d" + frontend: + num_additional_frontends: 2 + backend: + sglang_config: + prefill: + data-parallel-size: 1 + expert-parallel-size: 1 + enable-dp-attention: null + enable-dp-lm-head: null + decode: + expert-parallel-size: 1 + benchmark: + concurrencies: "4x8x16x64" + + +# MTP TP4 prefill mode: same as STP tp4 but adds EAGLE speculative decoding +override_mtp_tp4: + name: "b200-fp4-mtp-low-latency-tp4-1p-tp8-1d" + frontend: + num_additional_frontends: 2 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + data-parallel-size: 1 + 
expert-parallel-size: 1 + enable-dp-attention: null + enable-dp-lm-head: null + decode: + expert-parallel-size: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: "4x8x16x64" + + +# STP low-latency: dep4 prefill + tep8 decode (DP=1), scale sweep 1p1d/1p5d/2p5d +zip_override_stp_lowlat: + name: + - "b200-fp4-stp-low-latency-dep4-1p-tep8-1d" + - "b200-fp4-stp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-stp-low-latency-dep4-2p-tep8-5d" + resources: + prefill_nodes: [1, 1, 2] + prefill_workers: [1, 1, 2] + decode_nodes: [1, 5, 5] + decode_workers: [1, 5, 5] + benchmark: + concurrencies: ["64x128", "8", "4x128"] + + +# MTP low-latency: same scales as STP, adds EAGLE speculative decoding +zip_override_mtp_lowlat: + name: + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-1d" + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-mtp-low-latency-dep4-2p-tep8-5d" + resources: + prefill_nodes: [1, 1, 2] + prefill_workers: [1, 1, 2] + decode_nodes: [1, 5, 5] + decode_workers: [1, 5, 5] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + decode: + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["64x128", "8", "4x128"] + + +# STP max-throughput 7p2d: dep4 prefill + dep8 decode, flashinfer_cutlass backend +override_stp_maxtpt_7p2d: + name: "b200-fp4-stp-max-tpt-dep4-7p-dep8-2d" + resources: + prefill_nodes: 7 + prefill_workers: 7 + decode_nodes: 2 + decode_workers: 2 + backend: + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + sglang_config: + prefill: + max-prefill-tokens: 65536 + chunked-prefill-size: 65536 + max-running-requests: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + 
max-running-requests: 2048 + cuda-graph-max-bs: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + benchmark: + concurrencies: "1024x2048" + + +# MTP max-throughput 7p2d: same as STP but adds EAGLE speculative decoding +override_mtp_maxtpt_7p2d: + name: "b200-fp4-mtp-max-tpt-dep4-7p-dep8-2d" + resources: + prefill_nodes: 7 + prefill_workers: 7 + decode_nodes: 2 + decode_workers: 2 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + max-prefill-tokens: 65536 + chunked-prefill-size: 65536 + max-running-requests: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 2048 + cuda-graph-max-bs: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: "1024x2048" + + +# MTP-only: 4p1d, no frontends, SGLANG_FLASHINFER_FP4_GEMM_BACKEND env var (fp4-gemm-backend: null +# removes the sglang_config key), mem-fraction=0.75 for decode +override_mtp_maxtpt_4p1d: + name: "b200-fp4-mtp-max-tpt-dep4-4p-dep8-1d" + dynamo: null + frontend: null + resources: + prefill_nodes: 4 + prefill_workers: 4 + decode_nodes: 1 + decode_workers: 1 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + max-running-requests: 1024 + fp4-gemm-backend: null + decode: + mem-fraction-static: 0.75 + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + fp4-gemm-backend: null + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + 
speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: "1024" diff --git a/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-1d.yaml b/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-1d.yaml deleted file mode 100644 index 26a76a4b..00000000 --- a/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-1d.yaml +++ /dev/null @@ -1,157 +0,0 @@ -name: "b200-fp4-low-latency-dep4-1p-tep8-1d" - -dynamo: - version: 0.8.1 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 4 - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - 
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - 
speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64x128" - req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml deleted file mode 100644 index 995ce55b..00000000 --- a/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml +++ /dev/null @@ -1,157 +0,0 @@ -name: "b200-fp4-low-latency-dep4-1p-tep8-5d" - -dynamo: - version: 0.8.1 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 4 - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 5 - decode_workers: 5 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - 
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: 
"flashinfer_trtllm" - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "8" - req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-2p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-2p-tep8-5d.yaml deleted file mode 100644 index 2264db05..00000000 --- a/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-2p-tep8-5d.yaml +++ /dev/null @@ -1,157 +0,0 @@ -name: "b200-fp4-low-latency-dep4-2p-tep8-5d" - -dynamo: - version: 0.8.1 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 4 - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 2 - prefill_workers: 2 - gpus_per_prefill: 4 - decode_nodes: 5 - decode_workers: 5 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - 
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - 
enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x128" - req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/mtp/low-latency-tp4-1p-tp8-1d.yaml b/recipes/b200-fp4/8k1k/mtp/low-latency-tp4-1p-tp8-1d.yaml deleted file mode 100644 index a950c3b9..00000000 --- a/recipes/b200-fp4/8k1k/mtp/low-latency-tp4-1p-tp8-1d.yaml +++ /dev/null @@ -1,157 +0,0 @@ -name: "b200-fp4-low-latency-tp4-1p-tp8-1d" - -dynamo: - version: 0.8.1 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 2 - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - 
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 1 - expert-parallel-size: 1 -# enable-dp-attention: false -# enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 1 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # 
moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16x64" - req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/mtp/max-tpt-dep4-4p-dep8-1d.yaml b/recipes/b200-fp4/8k1k/mtp/max-tpt-dep4-4p-dep8-1d.yaml deleted file mode 100644 index 82c44bac..00000000 --- a/recipes/b200-fp4/8k1k/mtp/max-tpt-dep4-4p-dep8-1d.yaml +++ /dev/null @@ -1,151 +0,0 @@ -name: "b200-fp4-max-tpt-dep4-4p-dep8-1d" - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 4 - prefill_workers: 4 - gpus_per_prefill: 4 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: 
"100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 1024 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.75 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 1024 - cuda-graph-max-bs: 1024 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE 
- moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1024" - req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/mtp/max-tpt-dep4-7p-dep8-2d.yaml b/recipes/b200-fp4/8k1k/mtp/max-tpt-dep4-7p-dep8-2d.yaml deleted file mode 100644 index bc47ad28..00000000 --- a/recipes/b200-fp4/8k1k/mtp/max-tpt-dep4-7p-dep8-2d.yaml +++ /dev/null @@ -1,160 +0,0 @@ -name: "b200-fp4-max-tpt-dep4-7p-dep8-2d" - -dynamo: - version: 0.8.1 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 4 - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 7 - prefill_workers: 7 - gpus_per_prefill: 4 - decode_nodes: 2 - decode_workers: 2 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - 
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - SGLANG_MOE_NVFP4_DISPATCH: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 65536 - chunked-prefill-size: 65536 - context-length: 9600 - max-running-requests: 1024 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_cutlass" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 2048 - cuda-graph-max-bs: 1024 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - 
enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_cutlass" - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1024x2048" - req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml deleted file mode 100644 index 03a930d5..00000000 --- a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml +++ /dev/null @@ -1,149 +0,0 @@ -name: "b200-fp4-low-latency-dep4-1p-tep8-1d" - -dynamo: - version: 0.8.1 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 4 - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: 
- TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism 
- tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64x128" - req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml deleted file mode 100644 index ca4684d7..00000000 --- a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml +++ /dev/null @@ -1,149 +0,0 @@ -name: "b200-fp4-low-latency-dep4-1p-tep8-5d" - -dynamo: - version: 0.8.1 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 4 - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 5 - decode_workers: 5 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - 
SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - 
attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "8" - req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml deleted file mode 100644 index 450fbcba..00000000 --- a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml +++ /dev/null @@ -1,149 +0,0 @@ -name: "b200-fp4-low-latency-dep4-2p-tep8-5d" - -dynamo: - version: 0.8.1 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 4 - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 2 - prefill_workers: 2 - gpus_per_prefill: 4 - decode_nodes: 5 - decode_workers: 5 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: 
"flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x128" - req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml deleted file mode 100644 index f1e3c39f..00000000 --- a/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml +++ /dev/null @@ -1,149 +0,0 @@ -name: "b200-fp4-low-latency-tp4-1p-tp8-1d" - -dynamo: - version: 0.8.1 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 2 - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - 
SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 1 - expert-parallel-size: 1 -# enable-dp-attention: false -# enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 1 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 
- enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_trtllm" - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16x64" - req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml deleted file mode 100644 index a9f0d01e..00000000 --- a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml +++ /dev/null @@ -1,152 +0,0 @@ -name: "b200-fp4-max-tpt-dep4-7p-dep8-2d" - -dynamo: - version: 0.8.1 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 4 - -model: - path: "dsr1" - container: "dynamo-sglang" - precision: "fp4" - -resources: - gpu_type: "b200" - prefill_nodes: 7 - prefill_workers: 7 - gpus_per_prefill: 4 - decode_nodes: 2 - decode_workers: 2 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - 
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - SGLANG_MOE_NVFP4_DISPATCH: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 65536 - chunked-prefill-size: 65536 - context-length: 9600 - max-running-requests: 1024 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: "flashinfer_cutlass" - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "modelopt_fp4" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 2048 - cuda-graph-max-bs: 1024 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true 
- disable-radix-cache: true - fp4-gemm-backend: "flashinfer_cutlass" - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1024x2048" - req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k.yaml b/recipes/b200-fp8/1k1k.yaml new file mode 100644 index 00000000..7489586a --- /dev/null +++ b/recipes/b200-fp8/1k1k.yaml @@ -0,0 +1,281 @@ +# B200-FP8 1k1k — STP and MTP in one file +# +# Two inference modes distinguished by override key names: +# zip_override_stp_* — standard token prediction (no speculative decoding) +# zip_override_mtp_* — multi-token prediction (EAGLE speculative decoding) +# +# Low-latency variants: tep8 decode (DP=1) +# Max-throughput variants: dep8 decode (DP=8) +# +# Usage: +# srtctl apply -f recipes/b200-fp8/1k1k.yaml # all 10 variants +# srtctl apply -f recipes/b200-fp8/1k1k.yaml:*stp* # all STP variants +# srtctl apply -f recipes/b200-fp8/1k1k.yaml:*mtp* # all MTP variants +# srtctl apply -f recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0] # STP 1p1d only +# srtctl dry-run -f recipes/b200-fp8/1k1k.yaml # preview + +base: + name: "b200-fp8-stp-1k1k" + + model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + + resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + + backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + decode_environment: + 
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + 
+ # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + # disable-chunked-prefix-cache: true + + health_check: + max_attempts: 360 + interval_seconds: 10 + + benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + req_rate: "inf" + + +# STP low-latency: tep8 decode (DP=1), scale sweep 1p1d and 1p3d +zip_override_stp_lowlat: + name: + - "b200-fp8-stp-low-latency-tep8-1p-1d" + - "b200-fp8-stp-low-latency-tep8-1p-3d" + resources: + decode_nodes: [1, 3] + decode_workers: [1, 3] + benchmark: + concurrencies: ["4", "16x32x64x128x256"] + + +# MTP low-latency: same scales as STP, adds EAGLE speculative decoding +zip_override_mtp_lowlat: + name: + - "b200-fp8-mtp-low-latency-tep8-1p-1d" + - "b200-fp8-mtp-low-latency-tep8-1p-3d" + resources: + decode_nodes: [1, 3] + decode_workers: [1, 3] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + moe-dense-tp-size: 1 + decode: + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["4x64", "4x8x16x32x128"] + + +# STP max-throughput: dep8 decode (DP=8), scale sweep 1p5d and 2p5d +zip_override_stp_maxtpt: + name: + - "b200-fp8-stp-max-tpt-dep8-1p-5d" + - "b200-fp8-stp-max-tpt-dep8-2p-5d" + resources: + prefill_nodes: [1, 2] + prefill_workers: [1, 2] + decode_nodes: [5, 5] + decode_workers: [5, 5] + backend: + sglang_config: + prefill: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 1024 + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + benchmark: + concurrencies: ["1024", "2048"] + + +# MTP 
max-throughput: dep8 decode, scale sweep 1p1d/1p5d/2p5d, adds EAGLE speculative decoding +# Note: max-running-requests stays at 512 for MTP (unlike STP which raises to 1024) +zip_override_mtp_maxtpt: + name: + - "b200-fp8-mtp-max-tpt-dep8-1p-1d" + - "b200-fp8-mtp-max-tpt-dep8-1p-5d" + - "b200-fp8-mtp-max-tpt-dep8-2p-5d" + resources: + prefill_nodes: [1, 1, 2] + prefill_workers: [1, 1, 2] + decode_nodes: [1, 5, 5] + decode_workers: [1, 5, 5] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["512x1024x2048x4096", "512x4096", "1024x2048x4096"] + + +# MTP special case: 1p2d uses speculative-num-steps=1 and draft-tokens=2 (vs 2/3 for all others) +override_mtp_maxtpt_1p2d: + name: "b200-fp8-mtp-max-tpt-dep8-1p-2d" + resources: + decode_nodes: 2 + decode_workers: 2 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 1 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 2 + benchmark: + concurrencies: "512x1024x2048" diff --git a/recipes/b200-fp8/1k1k/mtp/low-latency-tep8-1p1d.yaml b/recipes/b200-fp8/1k1k/mtp/low-latency-tep8-1p1d.yaml deleted file mode 100644 index 0d29c128..00000000 --- a/recipes/b200-fp8/1k1k/mtp/low-latency-tep8-1p1d.yaml +++ /dev/null @@ 
-1,143 +0,0 @@ -name: "b200-fp8-low-latency-tep8-1p-1d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 
- expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x64" - req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/mtp/low-latency-tep8-1p3d.yaml b/recipes/b200-fp8/1k1k/mtp/low-latency-tep8-1p3d.yaml deleted file mode 100644 index 9410f4c7..00000000 --- a/recipes/b200-fp8/1k1k/mtp/low-latency-tep8-1p3d.yaml +++ /dev/null @@ -1,143 +0,0 @@ -name: "b200-fp8-low-latency-tep8-1p-3d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 3 - decode_workers: 3 - gpus_per_node: 8 - 
-backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - 
enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x32x128" - req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p1d.yaml b/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p1d.yaml deleted file mode 100644 index 5fb4f20f..00000000 --- a/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p1d.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: "b200-fp8-max-tpt-dep8-1p-1d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - 
trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x1024x2048x4096" - req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p2d.yaml b/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p2d.yaml deleted file mode 100644 index 52f15d3a..00000000 --- a/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p2d.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: "b200-fp8-max-tpt-dep8-1p-2d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 2 - decode_workers: 2 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 
"True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - 
disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 1 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 2 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x1024x2048" - req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p5d.yaml b/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p5d.yaml deleted file mode 100644 index 6c2dec24..00000000 --- a/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p5d.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: "b200-fp8-max-tpt-dep8-1p-5d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 5 - decode_workers: 5 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - 
MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - 
chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x4096" - req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-2p5d.yaml b/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-2p5d.yaml deleted file mode 100644 index ef3bb2da..00000000 --- a/recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-2p5d.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: "b200-fp8-max-tpt-dep8-2p-5d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 2 - prefill_workers: 2 - decode_nodes: 5 - decode_workers: 5 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - 
decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - 
tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048x4096" - req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p1d.yaml b/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p1d.yaml deleted file mode 100644 index 214f07f7..00000000 --- a/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p1d.yaml +++ /dev/null @@ -1,134 +0,0 @@ -name: "b200-fp8-low-latency-tep8-1p-1d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - 
SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - 
stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4" - req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p3d.yaml b/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p3d.yaml deleted file mode 100644 index 2cb6757a..00000000 --- a/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p3d.yaml +++ /dev/null @@ -1,134 +0,0 @@ -name: "b200-fp8-low-latency-tep8-1p-3d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 3 - decode_workers: 3 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model 
configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "16x32x64x128x256" - req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-1p5d.yaml b/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-1p5d.yaml deleted file mode 100644 index 50657865..00000000 --- a/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-1p5d.yaml +++ 
/dev/null @@ -1,138 +0,0 @@ -name: "b200-fp8-max-tpt-dep8-1p-5d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 5 - decode_workers: 5 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 1024 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - 
enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 1024 - cuda-graph-max-bs: 1024 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024" - req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-2p5d.yaml b/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-2p5d.yaml deleted file mode 100644 index b3292d13..00000000 --- a/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-2p5d.yaml +++ /dev/null @@ -1,138 +0,0 @@ -name: "b200-fp8-max-tpt-dep8-2p-5d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 2 - prefill_workers: 2 - decode_nodes: 5 - decode_workers: 5 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: 
"1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 1024 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" 
- trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 1024 - cuda-graph-max-bs: 1024 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2048" - req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k.yaml b/recipes/b200-fp8/8k1k.yaml new file mode 100644 index 00000000..881b13bf --- /dev/null +++ b/recipes/b200-fp8/8k1k.yaml @@ -0,0 +1,253 @@ +# B200-FP8 8k1k — STP and MTP in one file +# +# Two inference modes distinguished by override key names: +# zip_override_stp_* — standard token prediction (no speculative decoding) +# zip_override_mtp_* — multi-token prediction (EAGLE speculative decoding) +# +# Low-latency variants: tep8 decode (DP=1) +# Max-throughput variants: dep8 decode (DP=8) +# +# Usage: +# srtctl apply -f recipes/b200-fp8/8k1k.yaml # all 11 variants +# srtctl apply -f recipes/b200-fp8/8k1k.yaml:*stp* # all STP variants +# srtctl apply -f recipes/b200-fp8/8k1k.yaml:*mtp* # all MTP variants +# srtctl apply -f recipes/b200-fp8/8k1k.yaml:zip_override_stp_lowlat[0] # STP 1p1d only +# srtctl dry-run -f recipes/b200-fp8/8k1k.yaml # preview + +base: + name: "b200-fp8-stp-8k1k" + + model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + + resources: + gpu_type: "b200" +
prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + + backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + 
watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + # disable-chunked-prefix-cache: true + + health_check: + max_attempts: 360 + interval_seconds: 10 + + benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: "inf" + + +# STP low-latency: tep8 decode (DP=1), scale sweep 1p1d/1p4d/1p6d +zip_override_stp_lowlat: + name: + - "b200-fp8-stp-low-latency-tep8-1p-1d" + - "b200-fp8-stp-low-latency-tep8-1p-4d" + - "b200-fp8-stp-low-latency-tep8-1p-6d" + resources: + decode_nodes: [1, 4, 6] + decode_workers: [1, 4, 6] + benchmark: + concurrencies: ["4x32x64", "64", "32"] + + +# MTP low-latency: same scales as STP, adds EAGLE speculative decoding +zip_override_mtp_lowlat: + name: + - "b200-fp8-mtp-low-latency-tep8-1p-1d" + - "b200-fp8-mtp-low-latency-tep8-1p-4d" + - "b200-fp8-mtp-low-latency-tep8-1p-6d" + resources: + decode_nodes: [1, 4, 6] + decode_workers: [1, 4, 6] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + moe-dense-tp-size: 1 + decode: + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 
1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["16x32x64", "8x256", "4x8x16x256"] + + +# STP max-throughput: dep8 decode (DP=8), scale sweep 1p1d and 2p1d +zip_override_stp_maxtpt: + name: + - "b200-fp8-stp-max-tpt-dep8-1p-1d" + - "b200-fp8-stp-max-tpt-dep8-2p-1d" + resources: + prefill_nodes: [1, 2] + prefill_workers: [1, 2] + decode_nodes: [1, 1] + decode_workers: [1, 1] + backend: + sglang_config: + prefill: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 1024 + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + benchmark: + concurrencies: ["128", "256"] + + +# MTP max-throughput: dep8 decode, scale sweep 1p1d/1p2d/2p1d, adds EAGLE speculative decoding +# Note: max-running-requests stays at 512 for MTP (unlike STP which raises to 1024) +zip_override_mtp_maxtpt: + name: + - "b200-fp8-mtp-max-tpt-dep8-1p-1d" + - "b200-fp8-mtp-max-tpt-dep8-1p-2d" + - "b200-fp8-mtp-max-tpt-dep8-2p-1d" + resources: + prefill_nodes: [1, 1, 2] + prefill_workers: [1, 1, 2] + decode_nodes: [1, 2, 1] + decode_workers: [1, 2, 1] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["256", "128x256x512x1024", "128x512"] diff --git a/recipes/b200-fp8/8k1k/mtp/low-latency-tep8-1p1d.yaml b/recipes/b200-fp8/8k1k/mtp/low-latency-tep8-1p1d.yaml deleted file mode 100644 index 29306154..00000000 --- 
a/recipes/b200-fp8/8k1k/mtp/low-latency-tep8-1p1d.yaml +++ /dev/null @@ -1,143 +0,0 @@ -name: "b200-fp8-low-latency-tep8-1p-1d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: 
true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "16x32x64" - req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/mtp/low-latency-tep8-1p4d.yaml b/recipes/b200-fp8/8k1k/mtp/low-latency-tep8-1p4d.yaml deleted file mode 100644 index 417f58fe..00000000 --- a/recipes/b200-fp8/8k1k/mtp/low-latency-tep8-1p4d.yaml +++ /dev/null @@ -1,143 +0,0 @@ -name: "b200-fp8-low-latency-tep8-1p-4d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - 
prefill_workers: 1 - decode_nodes: 4 - decode_workers: 4 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # 
Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "8x256" - req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/mtp/low-latency-tep8-1p6d.yaml b/recipes/b200-fp8/8k1k/mtp/low-latency-tep8-1p6d.yaml deleted file mode 100644 index a1602de4..00000000 --- a/recipes/b200-fp8/8k1k/mtp/low-latency-tep8-1p6d.yaml +++ /dev/null @@ -1,143 +0,0 @@ -name: "b200-fp8-low-latency-tep8-1p-6d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 6 - decode_workers: 6 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - 
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - 
trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16x256" - req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/mtp/max-tpt-dep8-1p1d.yaml b/recipes/b200-fp8/8k1k/mtp/max-tpt-dep8-1p1d.yaml deleted file mode 100644 index 4d05406f..00000000 --- a/recipes/b200-fp8/8k1k/mtp/max-tpt-dep8-1p1d.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: "b200-fp8-max-tpt-dep8-1p-1d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - 
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - 
mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256" - req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/mtp/max-tpt-dep8-1p2d.yaml b/recipes/b200-fp8/8k1k/mtp/max-tpt-dep8-1p2d.yaml deleted file mode 100644 index 62d7f861..00000000 --- a/recipes/b200-fp8/8k1k/mtp/max-tpt-dep8-1p2d.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: "b200-fp8-max-tpt-dep8-1p-2d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 2 - decode_workers: 2 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: 
nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - 
cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128x256x512x1024" - req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/mtp/max-tpt-dep8-2p1d.yaml b/recipes/b200-fp8/8k1k/mtp/max-tpt-dep8-2p1d.yaml deleted file mode 100644 index 0fa0c3f8..00000000 --- a/recipes/b200-fp8/8k1k/mtp/max-tpt-dep8-2p1d.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: "b200-fp8-max-tpt-dep8-2p-1d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 2 - prefill_workers: 2 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" 
- DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: 
true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - # disable-chunked-prefix-cache: true - - # MTP settings - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128x512" - req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p1d.yaml b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p1d.yaml deleted file mode 100644 index cb5b9d23..00000000 --- a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p1d.yaml +++ /dev/null @@ -1,134 +0,0 @@ -name: "b200-fp8-low-latency-tep8-1p-1d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - 
disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x32x64" - req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p4d.yaml b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p4d.yaml deleted file mode 100644 index e3a2c810..00000000 --- a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p4d.yaml +++ /dev/null @@ -1,134 +0,0 @@ -name: "b200-fp8-low-latency-tep8-1p-4d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 4 - decode_workers: 4 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - 
quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64" - req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p6d.yaml b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p6d.yaml deleted file mode 100644 index c863a0c8..00000000 --- a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p6d.yaml +++ /dev/null @@ -1,134 +0,0 @@ -name: "b200-fp8-low-latency-tep8-1p-6d" - -model: - path: 
"dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 6 - decode_workers: 6 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - 
moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 512 - cuda-graph-max-bs: 512 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - # moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "32" - req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-1p1d.yaml b/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-1p1d.yaml deleted file mode 100644 index 06a969f4..00000000 --- a/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-1p1d.yaml +++ /dev/null @@ -1,138 +0,0 @@ -name: "b200-fp8-max-tpt-dep8-1p-1d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: 
"100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 1024 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - 
mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 1024 - cuda-graph-max-bs: 1024 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128" - req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-2p1d.yaml b/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-2p1d.yaml deleted file mode 100644 index 6885cd5f..00000000 --- a/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-2p1d.yaml +++ /dev/null @@ -1,138 +0,0 @@ -name: "b200-fp8-max-tpt-dep8-2p-1d" - -model: - path: "dsr1-fp8" - container: "dynamo-sglang" - precision: "fp8" - -resources: - gpu_type: "b200" - prefill_nodes: 2 - prefill_workers: 2 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_ENABLE_JIT_DEEPGEMM: "false" - 
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - DYN_REQUEST_PLANE: nats - - sglang_config: - prefill: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 1024 - disable-cuda-graph: true - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - - decode: - # Model configuration - served-model-name: "deepseek-ai/DeepSeek-R1" - trust-remote-code: true - quantization: "fp8" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 9600 - max-running-requests: 1024 - cuda-graph-max-bs: 1024 - - # Parallelism - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true - - # Attention - attention-backend: "trtllm_mla" - kv-cache-dtype: "fp8_e4m3" - - # MoE - moe-runner-backend: 
"flashinfer_trtllm" - moe-dense-tp-size: 1 - - # Other flags - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256" - req_rate: "inf" diff --git a/tests/test_override.py b/tests/test_override.py index 5aed0e42..f498267b 100644 --- a/tests/test_override.py +++ b/tests/test_override.py @@ -109,7 +109,7 @@ def test_full_expansion(self) -> None: suffixes = [s for s, _ in variants] assert suffixes == ["small", "tp_0", "tp_1"] - # override auto-name and deep-merge + # override auto-name and deep-merge (no explicit name in override) assert variants[0][1]["name"] == "base-job_small" assert variants[0][1]["resources"]["decode_nodes"] == 2