diff --git a/recipies/gb200-fp4/1k8k/low-latency.yaml b/recipies/gb200-fp4/1k8k/low-latency.yaml index a40ed9ba..b89d0982 100644 --- a/recipies/gb200-fp4/1k8k/low-latency.yaml +++ b/recipies/gb200-fp4/1k8k/low-latency.yaml @@ -2,7 +2,7 @@ name: "gb200-fp4-1p2d" model: path: "dsr1" - container: "lmsysorg/sglang:dev-cu13" + container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030" precision: "fp4" resources: diff --git a/recipies/gb200-fp4/1k8k/max-tpt.yaml b/recipies/gb200-fp4/1k8k/max-tpt.yaml index adbc3855..97f0f3f1 100644 --- a/recipies/gb200-fp4/1k8k/max-tpt.yaml +++ b/recipies/gb200-fp4/1k8k/max-tpt.yaml @@ -4,7 +4,7 @@ name: "gb200-fp4-max-tpt" model: path: "dsr1" - container: "lmsysorg/sglang:dev-cu13" + container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030" precision: "fp4" resources: @@ -55,7 +55,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -230,6 +229,7 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 48 diff --git a/recipies/gb200-fp4/1k8k/mid-curve.yaml b/recipies/gb200-fp4/1k8k/mid-curve.yaml index 358126cb..1e5771f2 100644 --- a/recipies/gb200-fp4/1k8k/mid-curve.yaml +++ b/recipies/gb200-fp4/1k8k/mid-curve.yaml @@ -5,7 +5,7 @@ name: "gb200-fp4-max-tpt-2" model: path: "dsr1" - container: "lmsysorg/sglang:dev-cu13" + container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030" precision: "fp4" resources: @@ -56,7 +56,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -230,6 +229,7 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 32