From d04bb6b2bab86d2c207a28dfbc8c071efa00eb29 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Thu, 11 Dec 2025 00:19:02 +0900 Subject: [PATCH 1/8] fix --- python/sglang/srt/managers/tp_worker.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index f37138a72749..9822ef285e5c 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -239,8 +239,11 @@ def __init__( is_draft_model=is_draft_worker, ) + # Init DLLM algorithm if server_args.dllm_algorithm is not None: self.dllm_algorithm = DllmAlgorithm.from_server_args(server_args) + else: + self.dllm_algorithm = None self._model_runner = ModelRunner( model_config=self.model_config, @@ -349,7 +352,7 @@ def get_worker_info(self): ) def is_dllm(self): - return hasattr(self, "dllm_algorithm") + return self.dllm_config is not None def forward_batch_generation( self, From cf4000518f4f940510ff764a25860f89c7de6a6f Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Thu, 11 Dec 2025 00:27:01 +0900 Subject: [PATCH 2/8] fix --- python/sglang/srt/managers/schedule_batch.py | 28 +++++++++----------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index f712fe0164e4..72dbec7a0ae3 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -710,7 +710,6 @@ def __init__( self.dimensions = dimensions # For diffusion LLM - self.dllm_ids = [] self.dllm_block_offset = 0 self.dllm_config = dllm_config @@ -786,22 +785,21 @@ def finished(self) -> bool: def is_dllm(self): return self.dllm_config is not None + def _get_fill_ids_for_dllm(self): + if not self.fill_ids: + dllm_ids = ( + self.origin_input_ids + + [self.dllm_config.mask_id] * self.dllm_config.block_size + ) + else: + self.dllm_block_offset += self.dllm_config.block_size + dllm_ids += [self.dllm_config.mask_id] * self.dllm_config.block_size + + return dllm_ids + def init_next_round_input(self, tree_cache: Optional[BasePrefixCache] = None): if self.is_dllm(): - if not self.fill_ids: - self.dllm_ids = ( - self.origin_input_ids - + [ - self.dllm_config.mask_id, - ] - * self.dllm_config.block_size - ) - else: - self.dllm_block_offset += self.dllm_config.block_size - self.dllm_ids += [ - self.dllm_config.mask_id - ] * self.dllm_config.block_size - self.fill_ids = self.dllm_ids + self.fill_ids = self._get_fill_ids_for_dllm() else: self.fill_ids = self.origin_input_ids + self.output_ids From 38d6f3c2c21af1a30d750a20e183fdc9bf241adf Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Thu, 11 Dec 2025 00:28:05 +0900 Subject: [PATCH 3/8] fix --- python/sglang/srt/managers/tp_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 9822ef285e5c..215728bb9800 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -352,7 +352,7 @@ def get_worker_info(self): ) def is_dllm(self): - return self.dllm_config is not None + return self.dllm_algorithm is not None def forward_batch_generation( self, From 87d99fd5b6b8efe54dc7719e4e04a4365e822e9c Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Thu, 11 Dec 2025 00:29:19 +0900 Subject: [PATCH 4/8] fix --- python/sglang/srt/managers/schedule_batch.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 72dbec7a0ae3..db438f73bca2 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -1320,9 +1320,11 @@ def prepare_encoder_info_extend(self, input_ids: List[int], seq_lens: List[int]) ), f"Expected {len(self.out_cache_loc)}, got {self.extend_num_tokens}" def prepare_for_extend(self): - self.forward_mode = ( - ForwardMode.DLLM_EXTEND if self.is_dllm() else ForwardMode.EXTEND - ) + self.forward_mode = ForwardMode.EXTEND + + if self.is_dllm(): + # For DLLM, we use a separate forward mode + self.forward_mode = ForwardMode.DLLM_EXTEND # Init tensors reqs = self.reqs From 3784037d05509a5b8ffe34192575880db3d2343e Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Thu, 11 Dec 2025 00:37:50 +0900 Subject: [PATCH 5/8] fix --- .../srt/model_executor/cuda_graph_runner.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 1fd483da3752..9d67023ceed9 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -832,16 +832,17 @@ def replay( graph_key = self.bs self.graphs[graph_key].replay() output = self.output_buffers[graph_key] + if isinstance(output, LogitsProcessorOutput): + if self.is_dllm: + next_token_logits = full_logits = None + else: + next_token_logits = output.next_token_logits[: self.raw_num_token] + full_logits = output.full_logits[: self.raw_num_token] + return LogitsProcessorOutput( - next_token_logits=( - output.next_token_logits[: self.raw_num_token] - if not self.is_dllm - else None - ), - full_logits=( - output.full_logits[: self.raw_num_token] if self.is_dllm else None - ), + next_token_logits=next_token_logits, + full_logits=full_logits, hidden_states=( output.hidden_states[: self.raw_num_token] if output.hidden_states is not None From c974c5277e45b545aa522c907b86cf599cd82d2c Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Thu, 11 Dec 2025 00:44:50 +0900 Subject: [PATCH 6/8] fix --- python/sglang/srt/managers/tp_worker.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 215728bb9800..758f0ffc9571 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -354,6 +354,18 @@ def get_worker_info(self): def is_dllm(self): return self.dllm_algorithm is not None + def _forward_batch_generation_dllm( + self, forward_batch: ForwardBatch + ) -> GenerationBatchResult: + logits_output, next_token_ids, can_run_cuda_graph = self.dllm_algorithm.run( + self.model_runner, forward_batch + ) + return GenerationBatchResult( + logits_output=logits_output, + next_token_ids=next_token_ids, + can_run_cuda_graph=can_run_cuda_graph, + ) + def forward_batch_generation( self, model_worker_batch: ModelWorkerBatch, @@ -383,14 +395,7 @@ def forward_batch_generation( if self.pp_group.is_last_rank: if self.is_dllm(): - logits_output, next_token_ids, can_run_cuda_graph = ( - self.dllm_algorithm.run(self.model_runner, forward_batch) - ) - return GenerationBatchResult( - logits_output=logits_output, - next_token_ids=next_token_ids, - can_run_cuda_graph=can_run_cuda_graph, - ) + return self._forward_batch_generation_dllm(forward_batch) logits_output, can_run_cuda_graph = self.model_runner.forward( forward_batch, From a95a5a43ec0e2c21f195ac630c2f8b326c87d8ca Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Thu, 11 Dec 2025 00:56:45 +0900 Subject: [PATCH 7/8] fix --- python/sglang/srt/model_executor/cuda_graph_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 9d67023ceed9..06726733aaeb 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -835,10 +835,11 @@ def replay( if isinstance(output, LogitsProcessorOutput): if self.is_dllm: - next_token_logits = full_logits = None + next_token_logits = None + full_logits = output.full_logits[: self.raw_num_token] else: + full_logits = None next_token_logits = output.next_token_logits[: self.raw_num_token] - full_logits = output.full_logits[: self.raw_num_token] return LogitsProcessorOutput( next_token_logits=next_token_logits, From 8558e4979661389dafc730ef49c3e26d02ce8069 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Thu, 11 Dec 2025 01:16:14 +0900 Subject: [PATCH 8/8] fix --- python/sglang/srt/managers/schedule_batch.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index db438f73bca2..baeba5298182 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -785,21 +785,19 @@ def finished(self) -> bool: def is_dllm(self): return self.dllm_config is not None - def _get_fill_ids_for_dllm(self): + def _init_fill_ids_for_dllm(self): if not self.fill_ids: - dllm_ids = ( + self.fill_ids = ( self.origin_input_ids + [self.dllm_config.mask_id] * self.dllm_config.block_size ) else: self.dllm_block_offset += self.dllm_config.block_size - dllm_ids += [self.dllm_config.mask_id] * self.dllm_config.block_size - - return dllm_ids + self.fill_ids += [self.dllm_config.mask_id] * self.dllm_config.block_size def init_next_round_input(self, tree_cache: Optional[BasePrefixCache] = None): if self.is_dllm(): - self.fill_ids = self._get_fill_ids_for_dllm() + self._init_fill_ids_for_dllm() else: self.fill_ids = self.origin_input_ids + self.output_ids