diff --git a/recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml new file mode 100644 index 00000000..57cc4630 --- /dev/null +++ b/recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml @@ -0,0 +1,111 @@ +name: "h100-fp8-1p1d-max-dep-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + 
stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-running-requests: 64 + cuda-graph-max-bs: 64 + + # MTP + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml b/recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml new file mode 100644 index 00000000..39c102aa --- /dev/null +++ b/recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml @@ -0,0 +1,111 @@ +name: "h100-fp8-1p2d-max-tp-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 2 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + 
load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # MTP + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128" + req_rate: "inf" diff --git a/recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml b/recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml new file mode 100644 index 00000000..501d972d --- /dev/null +++ b/recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml @@ -0,0 +1,97 @@ +name: "h100-fp8-1p1d-max-dep" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + 
ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml b/recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml new file mode 100644 index 00000000..460c7bb1 --- /dev/null +++ b/recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml @@ -0,0 +1,97 @@ +name: "h100-fp8-1p2d-max-tp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 2 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model 
configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128" + req_rate: "inf" diff --git a/recipes/h100/1k8k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/recipes/h100/1k8k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml new file mode 100644 index 00000000..ea5b978f --- /dev/null +++ b/recipes/h100/1k8k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml @@ -0,0 +1,111 @@ +name: "h100-fp8-1p1d-max-dep-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + 
prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-running-requests: 64 + cuda-graph-max-bs: 64 + + # MTP + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git 
a/recipes/h100/1k8k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml b/recipes/h100/1k8k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml new file mode 100644 index 00000000..430511c2 --- /dev/null +++ b/recipes/h100/1k8k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml @@ -0,0 +1,111 @@ +name: "h100-fp8-1p2d-max-tp-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 2 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other 
flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + + # MTP + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/recipes/h100/1k8k/stp/h100-fp8-1p1d-max-dep.yaml b/recipes/h100/1k8k/stp/h100-fp8-1p1d-max-dep.yaml new file mode 100644 index 00000000..d071cfad --- /dev/null +++ b/recipes/h100/1k8k/stp/h100-fp8-1p1d-max-dep.yaml @@ -0,0 +1,97 @@ +name: "h100-fp8-1p1d-max-dep" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + 
decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "1x2x4x8" + req_rate: "inf" diff --git a/recipes/h100/1k8k/stp/h100-fp8-1p2d-max-tp.yaml b/recipes/h100/1k8k/stp/h100-fp8-1p2d-max-tp.yaml new file mode 100644 index 00000000..8a6313b6 --- /dev/null +++ b/recipes/h100/1k8k/stp/h100-fp8-1p2d-max-tp.yaml @@ -0,0 +1,97 @@ +name: "h100-fp8-1p2d-max-tp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 2 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # 
Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 32 + cuda-graph-max-bs: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "1x2x4x8x16x32" + req_rate: "inf" diff --git a/recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml new file mode 100644 index 00000000..ec6fb1c4 --- /dev/null +++ b/recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml @@ -0,0 +1,111 @@ +name: "h100-fp8-1p1d-max-dep-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: 
"flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-running-requests: 64 + cuda-graph-max-bs: 64 + + # MTP + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml b/recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml new file mode 100644 index 00000000..902d4282 --- /dev/null +++ b/recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml @@ -0,0 +1,111 @@ +name: "h100-fp8-1p1d-max-tp-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment 
variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: 
"1x2x4x8x16x32x64x128" + req_rate: "inf" diff --git a/recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml b/recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml new file mode 100644 index 00000000..7e558619 --- /dev/null +++ b/recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml @@ -0,0 +1,97 @@ +name: "h100-fp8-1p1d-max-dep" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + 
disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml b/recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml new file mode 100644 index 00000000..c6bcb18a --- /dev/null +++ b/recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml @@ -0,0 +1,97 @@ +name: "h100-fp8-1p1d-max-tp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + 
attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128" + req_rate: "inf"