diff --git a/python/sglang/srt/model_executor/breakable_cuda_graph_runner.py b/python/sglang/srt/model_executor/breakable_cuda_graph_runner.py index 31206ce3c932..69426f29f359 100644 --- a/python/sglang/srt/model_executor/breakable_cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/breakable_cuda_graph_runner.py @@ -122,12 +122,17 @@ def __init__(self, model_runner: ModelRunner): language_model = getattr( model_runner.model, "language_model", model_runner.model ) - self.layer_model = ( - language_model.model - if hasattr(language_model, "model") - and hasattr(language_model.model, "layers") - else language_model - ) + if hasattr(language_model, "model") and hasattr(language_model.model, "layers"): + self.layer_model = language_model.model + else: + # If we can't find the inner layer_model, disable BCG. + self.layer_model = None + logger.warning( + "[BCG] Could not resolve inner layer_model on %s. BCG is " + "disabled for this model; prefill will fall back to eager.", + type(language_model).__name__, + ) + return # Memory pool if get_global_graph_memory_pool() is None: @@ -330,6 +335,8 @@ def _capture_all(self): self.output_buffers[num_tokens] = output def can_run(self, forward_batch: "ForwardBatch"): + if self.layer_model is None: + return False if forward_batch.forward_mode.is_target_verify(): return False if forward_batch.input_embeds is not None: