Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
name: "b200-fp4-low-latency-dep4-1p-tep8-5d"

model:
path: "dsfp4"
container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp4"

resources:
gpu_type: "b200"
prefill_nodes: 1
prefill_workers: 1
gpus_per_prefill: 4
decode_nodes: 5
decode_workers: 5
gpus_per_node: 8

backend:
prefill_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
PYTHONUNBUFFERED: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
DYN_REQUEST_PLANE: "nats"

decode_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
PYTHONUNBUFFERED: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
DYN_REQUEST_PLANE: "nats"

sglang_config:
prefill:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
trust-remote-code: true
quantization: "modelopt_fp4"

# Disaggregation mode
disaggregation-mode: "prefill"
disaggregation-transfer-backend: nixl

# Memory and token limits
mem-fraction-static: 0.85
max-prefill-tokens: 32768
chunked-prefill-size: 32768
context-length: 2200
max-running-requests: 512
disable-cuda-graph: true

# Parallelism
tensor-parallel-size: 4
data-parallel-size: 4
expert-parallel-size: 4
enable-dp-attention: true
enable-dp-lm-head: true

# Attention
attention-backend: "trtllm_mla"
kv-cache-dtype: "fp8_e4m3"

# MoE
moe-runner-backend: "flashinfer_trtllm"
moe-dense-tp-size: 1

# Other flags
stream-interval: 30
watchdog-timeout: 1000000
enable-flashinfer-allreduce-fusion: true
disable-radix-cache: true

decode:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
trust-remote-code: true
quantization: "modelopt_fp4"

# Disaggregation mode
disaggregation-mode: "decode"
disaggregation-transfer-backend: nixl

# Memory and token limits
mem-fraction-static: 0.85
max-prefill-tokens: 32768
chunked-prefill-size: 32768
context-length: 2200
max-running-requests: 512
cuda-graph-max-bs: 512

# Parallelism
tensor-parallel-size: 8
data-parallel-size: 1
expert-parallel-size: 8

# Attention
attention-backend: "trtllm_mla"
kv-cache-dtype: "fp8_e4m3"

# MoE
moe-runner-backend: "flashinfer_trtllm"
# moe-dense-tp-size: 1

# Other flags
stream-interval: 30
watchdog-timeout: 1000000
enable-flashinfer-allreduce-fusion: true
disable-radix-cache: true

health_check:
max_attempts: 360
interval_seconds: 10

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "16x128x512"
req_rate: "inf"
139 changes: 139 additions & 0 deletions recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
name: "b200-fp4-low-latency-dep4-1p-tep8-6d"

model:
path: "dsfp4"
container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp4"

resources:
gpu_type: "b200"
prefill_nodes: 1
prefill_workers: 1
gpus_per_prefill: 4
decode_nodes: 6
decode_workers: 6
gpus_per_node: 8

backend:
prefill_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
PYTHONUNBUFFERED: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
DYN_REQUEST_PLANE: "nats"

decode_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
PYTHONUNBUFFERED: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
DYN_REQUEST_PLANE: "nats"

sglang_config:
prefill:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
trust-remote-code: true
quantization: "modelopt_fp4"

# Disaggregation mode
disaggregation-mode: "prefill"
disaggregation-transfer-backend: nixl

# Memory and token limits
mem-fraction-static: 0.85
max-prefill-tokens: 32768
chunked-prefill-size: 32768
context-length: 2200
max-running-requests: 512
disable-cuda-graph: true

# Parallelism
tensor-parallel-size: 4
data-parallel-size: 4
expert-parallel-size: 4
enable-dp-attention: true
enable-dp-lm-head: true

# Attention
attention-backend: "trtllm_mla"
kv-cache-dtype: "fp8_e4m3"

# MoE
moe-runner-backend: "flashinfer_trtllm"
moe-dense-tp-size: 1

# Other flags
stream-interval: 30
watchdog-timeout: 1000000
enable-flashinfer-allreduce-fusion: true
disable-radix-cache: true

decode:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
trust-remote-code: true
quantization: "modelopt_fp4"

# Disaggregation mode
disaggregation-mode: "decode"
disaggregation-transfer-backend: nixl

# Memory and token limits
mem-fraction-static: 0.85
max-prefill-tokens: 32768
chunked-prefill-size: 32768
context-length: 2200
max-running-requests: 512
cuda-graph-max-bs: 512

# Parallelism
tensor-parallel-size: 8
data-parallel-size: 1
expert-parallel-size: 8

# Attention
attention-backend: "trtllm_mla"
kv-cache-dtype: "fp8_e4m3"

# MoE
moe-runner-backend: "flashinfer_trtllm"
# moe-dense-tp-size: 1

# Other flags
stream-interval: 30
watchdog-timeout: 1000000
enable-flashinfer-allreduce-fusion: true
disable-radix-cache: true

health_check:
max_attempts: 360
interval_seconds: 10

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "32x64x256x512"
req_rate: "inf"
Loading
Loading