diff --git a/recipes/h200/1k1k/bs128-agg-tp-mtp.yaml b/recipes/h200/1k1k/bs128-agg-tp-mtp.yaml new file mode 100644 index 00000000..45fa8871 --- /dev/null +++ b/recipes/h200/1k1k/bs128-agg-tp-mtp.yaml @@ -0,0 +1,66 @@ +name: "agg-tp-h200-fp8-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + agg_nodes: 1 + agg_workers: 1 + gpus_per_node: 8 + +backend: + + # Aggregated environment variables + aggregated_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + aggregated: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 10 + max-running-requests: 128 # sum of all dp + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # CUDA graphs + cuda-graph-max-bs: 128 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x16x32x64x128x256x512" + req_rate: "inf" diff --git a/recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml b/recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml new file mode 100644 index 00000000..7c85acbc --- /dev/null +++ b/recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml @@ -0,0 +1,115 @@ +name: "bs256-1p6d-dep-h200-fp8-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" +
prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 512 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 65536 + chunked-prefill-size: 262144 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + 
disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x512x1024x2048" + req_rate: "inf" diff --git a/recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml b/recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml new file mode 100644 index 00000000..2d1d3626 --- /dev/null +++ b/recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml @@ -0,0 +1,115 @@ +name: "bs256-1p6d-h200-fp8-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 512 + + + # Prefill-specific mode + 
disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.7 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + # concurrencies: "128x256x512" + concurrencies: "512x1024x2048" + req_rate: "inf" diff --git a/recipes/h200/1k1k/low-latency-1p9d-mtp.yaml b/recipes/h200/1k1k/low-latency-1p9d-mtp.yaml new file mode 100644 index 00000000..2f5938bc --- /dev/null +++ b/recipes/h200/1k1k/low-latency-1p9d-mtp.yaml @@ -0,0 +1,113 @@ +name: "low-latency-1p9d-h200-fp8-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 9 + decode_workers: 9 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # 
Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 256 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 64 + cuda-graph-max-bs: 64 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64x128x256" + req_rate: "inf" diff --git a/recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml 
b/recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml new file mode 100644 index 00000000..323b10e1 --- /dev/null +++ b/recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml @@ -0,0 +1,115 @@ +name: "bs128-1p1d-dep-h200-fp8-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # 
Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-running-requests: 192 + cuda-graph-max-bs: 192 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128x256x512" + req_rate: "inf" diff --git a/recipes/h200/8k1k/bs128-agg-tp-mtp.yaml b/recipes/h200/8k1k/bs128-agg-tp-mtp.yaml new file mode 100644 index 00000000..8e2e8fe9 --- /dev/null +++ b/recipes/h200/8k1k/bs128-agg-tp-mtp.yaml @@ -0,0 +1,66 @@ +name: "agg-tp-h200-fp8-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + agg_nodes: 1 + agg_workers: 1 + gpus_per_node: 8 + +backend: + + # Aggregated environment variables + aggregated_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + aggregated: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 10 + max-running-requests: 32 # sum of all dp + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 
+ + # CUDA graphs + cuda-graph-max-bs: 32 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x16x32x64x128x256" + req_rate: "inf" diff --git a/recipes/h200/8k1k/bs16-1p3d-mtp.yaml b/recipes/h200/8k1k/bs16-1p3d-mtp.yaml new file mode 100644 index 00000000..97bba3be --- /dev/null +++ b/recipes/h200/8k1k/bs16-1p3d-mtp.yaml @@ -0,0 +1,113 @@ +name: "bs16-1p3d-h200-fp8-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + 
mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 32 + cuda-graph-max-bs: 32 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64" + req_rate: "inf" diff --git a/recipes/h200/8k1k/bs4-1p7d-mtp.yaml b/recipes/h200/8k1k/bs4-1p7d-mtp.yaml new file mode 100644 index 00000000..381bfaa9 --- /dev/null +++ b/recipes/h200/8k1k/bs4-1p7d-mtp.yaml @@ -0,0 +1,113 @@ +name: "bs4-1p7d-h200-fp8-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 7 + decode_workers: 7 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 2 + cuda-graph-max-bs: 2 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8" + req_rate: "inf" diff --git a/recipes/h200/8k1k/bs64-2p3d-mtp.yaml b/recipes/h200/8k1k/bs64-2p3d-mtp.yaml new file mode 100644 index 00000000..75535b80 --- /dev/null +++ b/recipes/h200/8k1k/bs64-2p3d-mtp.yaml @@ -0,0 +1,122 @@ +name: "bs64-2p3d-h200-fp8-mtp" + +model: + path: "dsfp8" + container: 
"lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + 
disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + context-length: 72000 + max-total-tokens: 128000 + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 16 + cuda-graph-max-bs: 16 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128" + req_rate: "inf" + +# benchmark: +# type: "gpqa" +# num_examples: 198 +# repeat: 4 +# num_threads: 32 +# max_tokens: 64000 diff --git a/recipes/h200/8k1k/bs8-1p6d-mtp.yaml b/recipes/h200/8k1k/bs8-1p6d-mtp.yaml new file mode 100644 index 00000000..d3d61d70 --- /dev/null +++ b/recipes/h200/8k1k/bs8-1p6d-mtp.yaml @@ -0,0 +1,114 @@ +name: "bs8-1p6d-h200-fp8-mtp" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + 
disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 16 + cuda-graph-max-bs: 16 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2x4x8x16x32" + req_rate: "inf"