Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
109 changes: 109 additions & 0 deletions recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
---
# Recipe: H100 FP8, 1 prefill worker (2 nodes) + 1 decode worker (2 nodes),
# max data/expert parallelism on decode, with MTP (EAGLE speculative decoding).
# Benchmarked at ISL/OSL 1024/1024.
#
# NOTE(review): indentation reconstructed from a flattened paste; the nesting of
# sglang_config under backend mirrors the other recipes in this change —
# confirm against the repo's recipe schema.
name: "h100-fp8-1p1d-max-dep-mtp"

model:
  path: "dsfp8"
  container: "lmsysorg/sglang:v0.5.8-cu130"
  precision: "fp8"

resources:
  gpu_type: "h100"
  prefill_nodes: 2
  prefill_workers: 1
  decode_nodes: 2
  decode_workers: 1
  gpus_per_node: 8

backend:

  # Prefill-specific environment variables
  # NOTE(review): the 1p2d MTP recipe also sets SGLANG_ENABLE_SPEC_V2: "1" on
  # both roles — confirm whether its absence here is intentional.
  prefill_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

  # Decode-specific environment variables
  decode_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

  sglang_config:
    prefill:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: pure TP across both prefill nodes (16 GPUs)
      tp-size: 16
      dp-size: 1
      ep-size: 1
      enable-dp-attention: false

      # KV cache and attention
      attention-backend: "flashinfer"

      # Radix cache disabled
      disable-radix-cache: true

      # Prefill capacity
      max-running-requests: 4

      # Prefill-specific disaggregation mode
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      mem-fraction-static: 0.6
      max-prefill-tokens: 2048
      chunked-prefill-size: 2048

      # Request handling
      load-balance-method: "round_robin"

      # MTP (Multi-Token Prediction)
      speculative-algorithm: "EAGLE"
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

    decode:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: max DP/EP (one rank per GPU) with DP attention
      tp-size: 16
      dp-size: 16
      ep-size: 16
      enable-dp-attention: true

      # KV cache and attention
      attention-backend: "flashinfer"

      # Other flags
      disable-radix-cache: true
      stream-interval: 1

      # Disaggregation (decode side)
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "decode"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      # NOTE(review): 0.85 here vs 0.9 in the non-MTP 1p1d recipe — presumably
      # headroom for the EAGLE draft model; confirm this is intended.
      mem-fraction-static: 0.85
      max-running-requests: 64
      cuda-graph-max-bs: 64

      # MTP
      speculative-algorithm: "EAGLE"
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "1x2x4x8x16x32x64"
  req_rate: "inf"
111 changes: 111 additions & 0 deletions recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
---
# Recipe: H100 FP8, 1 prefill worker (2 nodes) + 2 decode workers (4 nodes),
# max tensor parallelism (no DP attention), with MTP (EAGLE speculative
# decoding, spec v2 enabled). Benchmarked at ISL/OSL 1024/1024.
#
# NOTE(review): indentation reconstructed from a flattened paste; the nesting of
# sglang_config under backend mirrors the other recipes in this change —
# confirm against the repo's recipe schema.
name: "h100-fp8-1p2d-max-tp-mtp"

model:
  path: "dsfp8"
  container: "lmsysorg/sglang:v0.5.8-cu130"
  precision: "fp8"

resources:
  gpu_type: "h100"
  prefill_nodes: 2
  prefill_workers: 1
  decode_nodes: 4
  decode_workers: 2
  gpus_per_node: 8

backend:

  # Prefill-specific environment variables
  prefill_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_ENABLE_SPEC_V2: "1"

  # Decode-specific environment variables
  decode_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_ENABLE_SPEC_V2: "1"

  sglang_config:
    prefill:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: pure TP across both prefill nodes (16 GPUs)
      tp-size: 16
      dp-size: 1
      ep-size: 1
      enable-dp-attention: false

      # KV cache and attention
      attention-backend: "flashinfer"

      # Radix cache disabled
      disable-radix-cache: true

      # Other flags
      max-running-requests: 2

      # Prefill-specific disaggregation mode
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      mem-fraction-static: 0.6
      max-prefill-tokens: 2048
      chunked-prefill-size: 2048

      # Request handling
      load-balance-method: "round_robin"

      # MTP (Multi-Token Prediction)
      speculative-algorithm: "EAGLE"
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

    decode:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: pure TP per decode worker (16 GPUs each)
      tp-size: 16
      dp-size: 1
      ep-size: 1
      enable-dp-attention: false

      # KV cache and attention
      attention-backend: "flashinfer"

      # Other flags
      disable-radix-cache: true
      stream-interval: 1

      # Disaggregation (decode side)
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "decode"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      mem-fraction-static: 0.9
      max-running-requests: 128
      cuda-graph-max-bs: 128

      # MTP
      speculative-algorithm: "EAGLE"
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "1x2x4x8x16x32x64x128"
  req_rate: "inf"
97 changes: 97 additions & 0 deletions recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
---
# Recipe: H100 FP8, 1 prefill worker (2 nodes) + 1 decode worker (2 nodes),
# max data/expert parallelism on decode, standard token prediction (no MTP).
# Benchmarked at ISL/OSL 1024/1024.
#
# NOTE(review): indentation reconstructed from a flattened paste; the nesting of
# sglang_config under backend mirrors the other recipes in this change —
# confirm against the repo's recipe schema.
name: "h100-fp8-1p1d-max-dep"

model:
  path: "dsfp8"
  container: "lmsysorg/sglang:v0.5.8-cu130"
  precision: "fp8"

resources:
  gpu_type: "h100"
  prefill_nodes: 2
  prefill_workers: 1
  decode_nodes: 2
  decode_workers: 1
  gpus_per_node: 8

backend:

  # Prefill-specific environment variables
  prefill_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

  # Decode-specific environment variables
  decode_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

  sglang_config:
    prefill:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: pure TP across both prefill nodes (16 GPUs)
      tp-size: 16
      dp-size: 1
      ep-size: 1
      enable-dp-attention: false

      # KV cache and attention
      attention-backend: "flashinfer"

      # Radix cache disabled
      disable-radix-cache: true

      # Prefill capacity
      max-running-requests: 4

      # Prefill-specific disaggregation mode
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      mem-fraction-static: 0.6
      max-prefill-tokens: 2048
      chunked-prefill-size: 2048

      # Request handling
      load-balance-method: "round_robin"

    decode:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: max DP/EP (one rank per GPU) with DP attention
      tp-size: 16
      dp-size: 16
      ep-size: 16
      enable-dp-attention: true

      # KV cache and attention
      attention-backend: "flashinfer"

      # Other flags
      disable-radix-cache: true
      stream-interval: 1

      # Disaggregation (decode side)
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "decode"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      mem-fraction-static: 0.9
      max-running-requests: 64
      cuda-graph-max-bs: 64

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "1x2x4x8x16x32x64"
  req_rate: "inf"
Loading