diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 1c961b5983da..00fc5db9a741 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -6626,6 +6626,14 @@ def check_server_args(self): f"Got: {self.pp_max_micro_batch_size}" ) + assert not (self.disable_cuda_graph_padding and self.enable_torch_compile), ( + "--disable-cuda-graph-padding is incompatible with --enable-torch-compile. " + "With padding disabled, every distinct batch size gets its own torch.compile + " + "Triton autotune cycle (O(max_batch_size) compilations) instead of the small fixed " + "set of padded bucket sizes, causing engine initialisation to stall for many minutes. " + "Remove --disable-cuda-graph-padding or --enable-torch-compile." + ) + if self.pp_size > 1: assert ( self.disable_overlap_schedule and self.speculative_algorithm is None