diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
new file mode 100644
index 00000000..15c552f0
--- /dev/null
+++ b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
@@ -0,0 +1,139 @@
+name: "b200-fp4-low-latency-dep4-1p-tep8-5d"
+
+model:
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 5
+  decode_workers: 5
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "16x128x512"
+  req_rate: "inf"
diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml
new file mode 100644
index 00000000..d70696ed
--- /dev/null
+++ b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml
@@ -0,0 +1,139 @@
+name: "b200-fp4-low-latency-dep4-1p-tep8-6d"
+
+model:
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 6
+  decode_workers: 6
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "32x64x256x512"
+  req_rate: "inf"
diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml
new file mode 100644
index 00000000..6be9fbf8
--- /dev/null
+++ b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml
@@ -0,0 +1,143 @@
+name: "b200-fp4-max-tpt-dep4-1p-dep8-1d"
+
+model:
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 1
+  decode_workers: 1
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    SGLANG_MOE_NVFP4_DISPATCH: "1"
+    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 1024
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x1024"
+  req_rate: "inf"
diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml
new file mode 100644
index 00000000..d3d1ad0f
--- /dev/null
+++ b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml
@@ -0,0 +1,143 @@
+name: "b200-fp4-max-tpt-dep4-1p-dep8-2d"
+
+model:
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 2
+  decode_workers: 2
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    SGLANG_MOE_NVFP4_DISPATCH: "1"
+    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512"
+  req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml
new file mode 100644
index 00000000..a0c39932
--- /dev/null
+++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml
@@ -0,0 +1,149 @@
+name: "b200-fp4-low-latency-dep4-1p-tep8-1d"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
+model:
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8.post1-cu130"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 1
+  decode_workers: 1
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "64x128"
+  req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
new file mode 100644
index 00000000..f0ee8950
--- /dev/null
+++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
@@ -0,0 +1,149 @@
+name: "b200-fp4-low-latency-dep4-1p-tep8-5d"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
+model:
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8.post1-cu130"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 5
+  decode_workers: 5
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "8"
+  req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml
new file mode 100644
index 00000000..d085f621
--- /dev/null
+++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml
@@ -0,0 +1,149 @@
+name: "b200-fp4-low-latency-dep4-2p-tep8-5d"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
+model:
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8.post1-cu130"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+  decode_nodes: 5
+  decode_workers: 5
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x128"
+  req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml
new file mode 100644
index 00000000..74a0b8b8
--- /dev/null
+++ b/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml
@@ -0,0 +1,149 @@
+name: "b200-fp4-low-latency-tp4-1p-tp8-1d"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 2
+
+model:
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8.post1-cu130"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 1
+  decode_workers: 1
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 4
+      data-parallel-size: 1
+      expert-parallel-size: 1
+#      enable-dp-attention: false
+#      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x8x16x64"
+  req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml
new file mode 100644
index 00000000..b61f50b1
--- /dev/null
+++ b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml
@@ -0,0 +1,143 @@
+name: "b200-fp4-max-tpt-dep4-4p-dep8-1d"
+
+model:
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 4
+  prefill_workers: 4
+  gpus_per_prefill: 4
+  decode_nodes: 1
+  decode_workers: 1
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    SGLANG_MOE_NVFP4_DISPATCH: "1"
+    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 1024
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1024"
+  req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml
new file mode 100644
index 00000000..04373acf
--- /dev/null
+++ b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml
@@ -0,0 +1,152 @@
+name: "b200-fp4-max-tpt-dep4-7p-dep8-2d"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
+model:
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8.post1-cu130"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 7
+  prefill_workers: 7
+  gpus_per_prefill: 4
+  decode_nodes: 2
+  decode_workers: 2
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
+    SGLANG_MOE_NVFP4_DISPATCH: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 65536
+      chunked-prefill-size: 65536
+      context-length: 9600
+      max-running-requests: 1024
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_cutlass"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 2048
+      cuda-graph-max-bs: 1024
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_cutlass"
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1024x2048"
+  req_rate: "inf"
diff --git a/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p1d.yaml b/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p1d.yaml
new file mode 100644
index 00000000..892a8cab
--- /dev/null
+++ b/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p1d.yaml
@@ -0,0 +1,134 @@
+name: "b200-fp8-low-latency-tep8-1p-1d"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 1
+  decode_workers: 1
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4"
+  req_rate: "inf"
diff --git a/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p3d.yaml b/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p3d.yaml
new file mode 100644
index 00000000..ec0a691c
--- /dev/null
+++ b/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p3d.yaml
@@ -0,0 +1,134 @@
+name: "b200-fp8-low-latency-tep8-1p-3d"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 3
+  decode_workers: 3
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "16x32x64x128x256"
+  req_rate: "inf"
diff --git a/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-1p5d.yaml b/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-1p5d.yaml
new file mode 100644
index 00000000..852990c2
--- /dev/null
+++ b/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-1p5d.yaml
@@ -0,0 +1,138 @@
+name: "b200-fp8-max-tpt-dep8-1p-5d"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 5
+  decode_workers: 5
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 1024
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1024"
+  req_rate: "inf"
diff --git a/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-2p5d.yaml b/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-2p5d.yaml
new file mode 100644
index 00000000..27150ce1
--- /dev/null
+++ b/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-2p5d.yaml
@@ -0,0 +1,138 @@
+name: "b200-fp8-max-tpt-dep8-2p-5d"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 2
+  prefill_workers: 2
+  decode_nodes: 5
+  decode_workers: 5
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 1024
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 2200
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2048"
+  req_rate: "inf"
diff --git a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p1d.yaml b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p1d.yaml
new file mode 100644
index 00000000..6cedfa72
--- /dev/null
+++ b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p1d.yaml
@@ -0,0 +1,134 @@
+name: "b200-fp8-low-latency-tep8-1p-1d"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 1
+  decode_workers: 1
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x32x64"
+  req_rate: "inf"
diff --git a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p4d.yaml b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p4d.yaml
new file mode 100644
index 00000000..6eac575e
--- /dev/null
+++ b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p4d.yaml
@@ -0,0 +1,134 @@
+name: "b200-fp8-low-latency-tep8-1p-4d"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 4
+  decode_workers: 4
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "64"
+  req_rate: "inf"
diff --git a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p6d.yaml b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p6d.yaml
new file mode 100644
index 00000000..69804809
--- /dev/null
+++ b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p6d.yaml
@@ -0,0 +1,134 @@
+name: "b200-fp8-low-latency-tep8-1p-6d"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 6
+  decode_workers: 6
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 8
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      # moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "32"
+  req_rate: "inf"
diff --git a/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-1p1d.yaml b/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-1p1d.yaml
new file mode 100644
index 00000000..3f3fcf6f
--- /dev/null
+++ b/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-1p1d.yaml
@@ -0,0 +1,138 @@
+name: "b200-fp8-max-tpt-dep8-1p-1d"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 1
+  decode_workers: 1
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 1024
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "128"
+  req_rate: "inf"
diff --git a/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-2p1d.yaml b/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-2p1d.yaml
new file mode 100644
index 00000000..0549b9dd
--- /dev/null
+++ b/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-2p1d.yaml
@@ -0,0 +1,138 @@
+name: "b200-fp8-max-tpt-dep8-2p-1d"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b200"
+  prefill_nodes: 2
+  prefill_workers: 2
+  decode_nodes: 1
+  decode_workers: 1
+  gpus_per_node: 8
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 1024
+      disable-cuda-graph: true
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      quantization: "fp8"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.85
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+      context-length: 9600
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+
+      # Parallelism
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      # Attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # MoE
+      moe-runner-backend: "flashinfer_trtllm"
+      moe-dense-tp-size: 1
+
+      # Other flags
+      stream-interval: 30
+      watchdog-timeout: 1000000
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256"
+  req_rate: "inf"