@@ -169,7 +169,6 @@ def __init__(self,
         self.draft_model_engine = draft_model_engine

         # enqueue and _fetch_new_requests used data
-        self.active = True
         self.next_req_id = max_batch_size  # The first max_batch_size request IDs are reserved for dummy requests
         self.max_beam_width = max_beam_width
         self.max_draft_len = max_draft_len
@@ -196,7 +195,6 @@ def __init__(self,
         self.max_num_active_requests = model_engine.get_max_num_sequences()
         self.active_requests: List[LlmRequest] = []
         self.expected_num_active_requests = 0
-        self.has_context_request = False
         self.ctx_in_transmission_requests = []
         self.previous_batch: Optional[BatchState] = None
         self.num_scheduled_requests: int = 0
@@ -1148,7 +1146,7 @@ def _check_disagg_gen_transfer_status(self):
     @nvtx_range("_pad_attention_dp_dummy_request")
     def _pad_attention_dp_dummy_request(self):
         """
-        Pad with a dummy request, if required, to ensure every attention_dp rank has at least one active request.
+        Pad with a generation dummy request, if required, to ensure every attention_dp rank has at least one active request.
         """
         if not self.enable_attention_dp:
             return
@@ -1166,8 +1164,8 @@ def _pad_attention_dp_dummy_request(self):
         if self.expected_num_active_requests - num_active_request > 0 and num_active_request == 0:
             llm_request = self.kv_cache_manager.add_dummy_requests(
                 request_ids=[0],
-                is_gen=not self.has_context_request,
-                prepare_resource=not self.has_context_request,
+                is_gen=True,
+                prepare_resource=True,
                 max_num_draft_tokens=self.max_draft_len,
             )[0]
             llm_request.is_attention_dp_dummy = True
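For context, here is a minimal sketch of what the padding path does after this change. The `DummyRequest` and `FakeKVCacheManager` classes below are hypothetical stand-ins (the real request and resource-manager classes in TensorRT-LLM are not reproduced here); only the `if` condition and the `add_dummy_requests(...)` call mirror the diff above. The point of the change is that an idle attention-DP rank now always pads with a generation-mode dummy whose resources are prepared eagerly, instead of branching on the removed `has_context_request` flag.

```python
# Hypothetical, simplified stand-ins for the real TensorRT-LLM classes.
from dataclasses import dataclass
from typing import List


@dataclass
class DummyRequest:
    request_id: int
    is_gen: bool                 # generation-mode dummy (no context phase)
    resources_prepared: bool     # KV-cache resources allocated up front
    is_attention_dp_dummy: bool = False


class FakeKVCacheManager:
    """Toy stand-in that only records what add_dummy_requests was asked for."""

    def add_dummy_requests(self, request_ids: List[int], is_gen: bool,
                           prepare_resource: bool,
                           max_num_draft_tokens: int) -> List[DummyRequest]:
        return [
            DummyRequest(request_id=rid, is_gen=is_gen,
                         resources_prepared=prepare_resource)
            for rid in request_ids
        ]


def pad_attention_dp_rank(active_requests: list, kv_cache_manager,
                          expected_num_active_requests: int,
                          max_draft_len: int) -> list:
    """Mirrors the post-change logic: a rank with no active requests always
    pads with a generation dummy, regardless of what other ranks hold."""
    num_active_request = len(active_requests)
    if expected_num_active_requests - num_active_request > 0 and num_active_request == 0:
        llm_request = kv_cache_manager.add_dummy_requests(
            request_ids=[0],
            is_gen=True,             # previously: not self.has_context_request
            prepare_resource=True,   # previously: not self.has_context_request
            max_num_draft_tokens=max_draft_len,
        )[0]
        llm_request.is_attention_dp_dummy = True
        active_requests.append(llm_request)
    return active_requests


if __name__ == "__main__":
    # An idle rank (no active requests) while the batch expects 2 active ones.
    padded = pad_attention_dp_rank([], FakeKVCacheManager(),
                                   expected_num_active_requests=2,
                                   max_draft_len=0)
    assert padded[0].is_gen and padded[0].resources_prepared
    print(padded[0])
```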