diff --git a/.gitignore b/.gitignore index 5a522a4d..fb5fbd86 100644 --- a/.gitignore +++ b/.gitignore @@ -51,4 +51,8 @@ configs/*.tar.gz .ruff_cache/ *.egg-info/ -.coverage \ No newline at end of file +.coverage + +configs/dg-* +configs/flashinfer-cache/ +outputs/* diff --git a/recipies/gb200-fp4/1k8k/low-latency.yaml b/recipies/gb200-fp4/1k8k/low-latency.yaml index 119be5ca..6c2a9536 100644 --- a/recipies/gb200-fp4/1k8k/low-latency.yaml +++ b/recipies/gb200-fp4/1k8k/low-latency.yaml @@ -1,8 +1,16 @@ name: "gb200-fp4-1p2d" +dynamo: + version: 0.7.0 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + model: path: "dsr1" - container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030" + container: "lmsysorg/sglang:v0.5.5.post2" precision: "fp4" resources: @@ -24,8 +32,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -43,8 +49,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -64,7 +68,7 @@ backend: moe-runner-backend: "flashinfer_trtllm" stream-interval: 10 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 mem-fraction-static: 0.95 max-total-tokens: 8192 chunked-prefill-size: 8192 @@ -77,7 +81,6 @@ backend: data-parallel-size: 1 tensor-parallel-size: 4 expert-parallel-size: 1 - disaggregation-transfer-backend: nixl decode: disaggregation-mode: "decode" @@ -92,7 +95,7 @@ backend: disaggregation-bootstrap-port: 30001 stream-interval: 10 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 mem-fraction-static: 0.95 chunked-prefill-size: 8192 cuda-graph-max-bs: 256 @@ -100,7 +103,6 @@ backend: moe-dense-tp-size: 1 tensor-parallel-size: 4 expert-parallel-size: 1 - disaggregation-transfer-backend: nixl benchmark: type: "sa-bench" diff --git a/recipies/gb200-fp4/1k8k/max-tpt.yaml b/recipies/gb200-fp4/1k8k/max-tpt.yaml index f6ad6141..d2c46140 100644 --- a/recipies/gb200-fp4/1k8k/max-tpt.yaml +++ b/recipies/gb200-fp4/1k8k/max-tpt.yaml @@ -1,10 +1,16 @@ -# 4P1D, with 12 Decode Nodes. Uses single batch overlap - name: "gb200-fp4-max-tpt" +dynamo: + version: 0.7.0 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + model: path: "dsr1" - container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030" + container: "lmsysorg/sglang:v0.5.5.post2" precision: "fp4" resources: @@ -56,13 +62,13 @@ backend: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: # Model configuration served-model-name: "deepseek-ai/DeepSeek-R1" trust-remote-code: true - disaggregation-transfer-backend: nixl # KV cache and attention kv-cache-dtype: "fp8_e4m3" @@ -80,7 +86,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" disaggregation-bootstrap-port: 30001 @@ -112,7 +118,6 @@ backend: # Model configuration served-model-name: "deepseek-ai/DeepSeek-R1" trust-remote-code: true - disaggregation-transfer-backend: nixl # KV cache and attention kv-cache-dtype: "fp8_e4m3" @@ -130,7 +135,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" disaggregation-bootstrap-port: 30001 @@ -228,7 +233,6 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true - fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 48 diff --git a/recipies/gb200-fp4/1k8k/mid-curve.yaml b/recipies/gb200-fp4/1k8k/mid-curve.yaml index bd5f8a23..bf455b72 100644 --- a/recipies/gb200-fp4/1k8k/mid-curve.yaml +++ b/recipies/gb200-fp4/1k8k/mid-curve.yaml @@ -1,11 +1,16 @@ -# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher -# per gpu throughput +name: "gb200-fp4-mid-curve" -name: "gb200-fp4-max-tpt-2" +dynamo: + version: 0.7.0 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 model: path: "dsr1" - container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030" + container: "lmsysorg/sglang:v0.5.5.post2" precision: "fp4" resources: @@ -57,6 +62,7 @@ backend: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -67,7 +73,6 @@ backend: # KV cache and attention kv-cache-dtype: "fp8_e4m3" attention-backend: "trtllm_mla" - disaggregation-transfer-backend: nixl # Quantization quantization: "modelopt_fp4" @@ -81,7 +86,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" disaggregation-bootstrap-port: 30001 @@ -117,7 +122,6 @@ backend: # KV cache and attention kv-cache-dtype: "fp8_e4m3" attention-backend: "trtllm_mla" - disaggregation-transfer-backend: nixl # Quantization quantization: "modelopt_fp4" @@ -131,7 +135,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" disaggregation-bootstrap-port: 30001 @@ -228,7 +232,6 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true - fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 32 diff --git a/recipies/h200/1k1k/bs128-agg-tp.yaml b/recipies/h200/1k1k/bs128-agg-tp.yaml new file mode 100644 index 00000000..c036f948 --- /dev/null +++ b/recipies/h200/1k1k/bs128-agg-tp.yaml @@ -0,0 +1,59 @@ +name: "agg-tp-h200-fp8" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + agg_nodes: 1 + agg_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + # Decode-specific environment variables + decode_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + sglang_config: + aggregated: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 10 + max-running-requests: 512 # sum of all dp + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # CUDA graphs + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x16x32x64x128x256x512" + req_rate: "inf" diff --git a/recipies/h200/1k1k/bs256-1p6d-dep.yaml b/recipies/h200/1k1k/bs256-1p6d-dep.yaml new file mode 100644 index 00000000..76b58665 --- /dev/null +++ b/recipies/h200/1k1k/bs256-1p6d-dep.yaml @@ -0,0 +1,100 @@ +name: "bs256-1p6d-h200-fp8" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + # Decode-specific environment variables + decode_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 512 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 65536 + chunked-prefill-size: 262144 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 512 + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x512x1024x2048" + req_rate: "inf" + diff --git a/recipies/h200/1k1k/bs256-1p6d-tp.yaml b/recipies/h200/1k1k/bs256-1p6d-tp.yaml new file mode 100644 index 00000000..1214d55b --- /dev/null +++ b/recipies/h200/1k1k/bs256-1p6d-tp.yaml @@ -0,0 +1,100 @@ +name: "bs256-1p6d-h200-fp8" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + # Decode-specific environment variables + decode_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 512 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.7 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 512 + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + # concurrencies: "128x256x512" + concurrencies: "512x1024x2048" + req_rate: "inf" + diff --git a/recipies/h200/1k1k/low-latency-1p9d.yaml b/recipies/h200/1k1k/low-latency-1p9d.yaml new file mode 100644 index 00000000..5e88422b --- /dev/null +++ b/recipies/h200/1k1k/low-latency-1p9d.yaml @@ -0,0 +1,97 @@ +name: "low-latency-1p9d-h200-fp8" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 9 + decode_workers: 9 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + # Decode-specific environment variables + decode_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 256 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 256 + cuda-graph-max-bs: 256 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64x128x256" + req_rate: "inf" diff --git a/recipies/h200/8k1k/bs128-1p1d-dep.yaml b/recipies/h200/8k1k/bs128-1p1d-dep.yaml new file mode 100644 index 00000000..1a08a8ca --- /dev/null +++ b/recipies/h200/8k1k/bs128-1p1d-dep.yaml @@ -0,0 +1,100 @@ +name: "bs128-1p1d-dep-h200-fp8" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + # Decode-specific environment variables + decode_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.88 + max-running-requests: 256 + cuda-graph-max-bs: 256 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64x128x256" + req_rate: "inf" + diff --git a/recipies/h200/8k1k/bs128-agg-tp.yaml b/recipies/h200/8k1k/bs128-agg-tp.yaml new file mode 100644 index 00000000..9191f8a7 --- /dev/null +++ b/recipies/h200/8k1k/bs128-agg-tp.yaml @@ -0,0 +1,60 @@ +name: "agg-tp-h200-fp8" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + agg_nodes: 1 + agg_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + # Decode-specific environment variables + decode_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + sglang_config: + aggregated: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 10 + max-running-requests: 256 # sum of all dp + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # CUDA graphs + cuda-graph-max-bs: 256 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x16x32x64x128x256" + req_rate: "inf" + diff --git a/recipies/h200/8k1k/bs16-1p3d.yaml b/recipies/h200/8k1k/bs16-1p3d.yaml new file mode 100644 index 00000000..95f756dd --- /dev/null +++ b/recipies/h200/8k1k/bs16-1p3d.yaml @@ -0,0 +1,98 @@ +name: "bs16-1p3d-h200-fp8" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + # Decode-specific environment variables + decode_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 32 + cuda-graph-max-bs: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8x16x32" + req_rate: "inf" + diff --git a/recipies/h200/8k1k/bs4-1p7d.yaml b/recipies/h200/8k1k/bs4-1p7d.yaml new file mode 100644 index 00000000..75fe19f9 --- /dev/null +++ b/recipies/h200/8k1k/bs4-1p7d.yaml @@ -0,0 +1,98 @@ +name: "bs4-1p7d-h200-fp8" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 7 + decode_workers: 7 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + # Decode-specific environment variables + decode_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 8 + cuda-graph-max-bs: 8 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8" + req_rate: "inf" + diff --git a/recipies/h200/8k1k/bs64-2p3d.yaml b/recipies/h200/8k1k/bs64-2p3d.yaml new file mode 100644 index 00000000..23b1a3d9 --- /dev/null +++ b/recipies/h200/8k1k/bs64-2p3d.yaml @@ -0,0 +1,106 @@ +name: "bs64-2p3d-h200-fp8" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + # Decode-specific environment variables + decode_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + context-length: 72000 + max-total-tokens: 128000 + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 128 + cuda-graph-max-bs: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128" + req_rate: "inf" + +# benchmark: +# type: "gpqa" +# num_examples: 198 +# repeat: 4 +# num_threads: 32 +# max_tokens: 64000 \ No newline at end of file diff --git a/recipies/h200/8k1k/bs8-1p6d.yaml b/recipies/h200/8k1k/bs8-1p6d.yaml new file mode 100644 index 00000000..8b18c479 --- /dev/null +++ b/recipies/h200/8k1k/bs8-1p6d.yaml @@ -0,0 +1,99 @@ +name: "bs8-1p6d-h200-fp8" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + # Decode-specific environment variables + decode_environment: + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 16 + cuda-graph-max-bs: 16 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16" + req_rate: "inf" + diff --git a/src/srtctl/backends/trtllm.py b/src/srtctl/backends/trtllm.py index 25f1673c..2553860f 100644 --- a/src/srtctl/backends/trtllm.py +++ b/src/srtctl/backends/trtllm.py @@ -16,6 +16,7 @@ # Type alias for worker modes WorkerMode = Literal["prefill", "decode", "agg"] + @dataclass(frozen=True) class TRTLLMServerConfig: """SGLang server CLI configuration per mode (prefill/decode/aggregated). @@ -30,6 +31,7 @@ class TRTLLMServerConfig: Schema: ClassVar[type[Schema]] = Schema + @dataclass(frozen=True) class TRTLLMProtocol: """TRTLLM protocol - implements BackendProtocol. @@ -169,7 +171,7 @@ def build_worker_command( "--extra-engine-args", str(container_config_path), "--request-plane", - "nats" + "nats", ] return cmd diff --git a/src/srtctl/cli/mixins/frontend_stage.py b/src/srtctl/cli/mixins/frontend_stage.py index 17dd33a2..72111c6b 100644 --- a/src/srtctl/cli/mixins/frontend_stage.py +++ b/src/srtctl/cli/mixins/frontend_stage.py @@ -146,6 +146,9 @@ def _start_nginx(self, topology: FrontendTopology) -> ManagedProcess: container_image=str(self.runtime.container_image), container_mounts=self.runtime.container_mounts, use_bash_wrapper=False, # Already wrapped in bash -c + srun_options={ + "container-remap-root": "", + }, ) return ManagedProcess( diff --git a/src/srtctl/cli/mixins/worker_stage.py b/src/srtctl/cli/mixins/worker_stage.py index bbae37ae..fc4347e9 100644 --- a/src/srtctl/cli/mixins/worker_stage.py +++ b/src/srtctl/cli/mixins/worker_stage.py @@ -137,7 +137,6 @@ def __missing__(self, key: str) -> str: formatted_value = value.format_map(SafeDict(template_vars)) env_to_set[key] = formatted_value - # Add profiling environment variables if profiling.enabled: profile_dir = str(self.runtime.log_dir / "profiles") diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py index eabdc5c6..7e338f0c 100644 --- a/src/srtctl/core/schema.py +++ b/src/srtctl/core/schema.py @@ -585,7 +585,7 @@ def get_install_commands(self) -> str: if self.version is not None: return ( f"echo 'Installing dynamo {self.version}...' && " - f"pip install --quiet ai-dynamo-runtime=={self.version} ai-dynamo=={self.version} && " + f"pip install --break-system-packages --quiet ai-dynamo-runtime=={self.version} ai-dynamo=={self.version} && " f"echo 'Dynamo {self.version} installed'" ) @@ -600,7 +600,7 @@ def get_install_commands(self) -> str: "cd dynamo && " f"{checkout_cmd + ' && ' if checkout_cmd else ''}" "cd lib/bindings/python/ && " - "export RUSTFLAGS=\"${RUSTFLAGS:-} -C target-cpu=native\" && " + 'export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native" && ' "maturin build -o /tmp && " "pip install /tmp/ai_dynamo_runtime*.whl && " "cd /sgl-workspace/dynamo/ && "