Commit f14d1ae

netanel-haber authored and dominicshanshan committed
Reintroduce with perf fixes: feature: unify new_tokens format sample state to trtllm sampler tokens format (NVIDIA#5513)
58a8a8f: these changes were previously merged to main. 6aef149: the changes were temporarily reverted in main due to a significant perf regression in models using the TorchSampler (observed by @byshiue). This PR re-merges those changes along with a fix that prevents the regression. The first commit of this PR is just the reverted revert; filter it out of the changes to see the previously unmerged changes. Signed-off-by: Netanel Haber <[email protected]>
1 parent e44bb56 commit f14d1ae
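
For orientation: the model_engine.py hunks below read sampler output as new_tokens_device[:max_draft_len + 1, seq_slots, :BEAM_WIDTH], i.e. the unified "TRT-LLM sampler" layout of (token step, sequence slot, beam) addressed by each request's seq_slot rather than by a serially assigned batch index. A minimal, illustrative torch sketch of that layout follows; the names and sizes here are made up for the example, not taken from the PR.

    # Illustrative only: gather the tokens for a batch out of a new_tokens tensor
    # laid out as [max_draft_tokens + 1, max_num_sequences, beam_width] and
    # indexed by sequence slot.
    import torch

    max_draft_tokens, max_num_sequences, beam_width = 2, 8, 1
    new_tokens = torch.arange(
        (max_draft_tokens + 1) * max_num_sequences * beam_width
    ).reshape(max_draft_tokens + 1, max_num_sequences, beam_width)

    # Requests occupy arbitrary sequence slots; their tokens are read by slot,
    # not by the order in which they were scheduled this iteration.
    seq_slots = torch.tensor([5, 0, 3])                       # slots of the active requests
    tokens_for_batch = new_tokens[:, seq_slots, :beam_width]  # (steps, batch, beam)
    print(tokens_for_batch.shape)                             # torch.Size([3, 3, 1])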

File tree

12 files changed (+427, -422 lines)

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 15 additions & 2 deletions
@@ -5,6 +5,7 @@
 import torch
 from torch._prims_common import DeviceLikeType

+from tensorrt_llm._torch.pyexecutor.seq_slot_manager import SeqSlotManager
 from tensorrt_llm._utils import nvtx_range

 from ...._utils import mpi_rank, mpi_world_size

@@ -256,6 +257,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
     assert isinstance(executor_config.pytorch_backend_config, LlmArgs), msg
     ad_config: LlmArgs = executor_config.pytorch_backend_config

+    max_num_sequences = ad_config.max_batch_size * dist_mapping.pp_size
     # some derivative properties
     max_draft_tokens = (
         0 if ad_config.speculative_config is None else ad_config.speculative_config.max_draft_tokens

@@ -272,7 +274,13 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
         max_seq_len=ad_config.max_seq_len,
         max_batch_size=ad_config.max_batch_size,
     )
-    resource_manager = ResourceManager({ResourceManagerType.KV_CACHE_MANAGER: kv_cache_manager})
+    seq_slot_manager = SeqSlotManager(max_num_sequences=max_num_sequences)
+    resource_manager = ResourceManager(
+        {
+            ResourceManagerType.KV_CACHE_MANAGER: kv_cache_manager,
+            ResourceManagerType.SEQ_SLOT_MANAGER: seq_slot_manager,
+        }
+    )
     resource_manager.resource_managers.move_to_end(ResourceManagerType.KV_CACHE_MANAGER, last=True)

     # scheduling

@@ -287,10 +295,14 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
     # https://github.com/NVIDIA/TensorRT-LLM/issues/5254
     # We should expose mixed_sample to our build_and_run_ad script so we can configure this
     # correctly for models as needed.
-    sampler = TorchSampler(
+    sampler_args = TorchSampler.Args(
         max_seq_len=ad_config.max_seq_len,
+        max_draft_tokens=max_draft_tokens,
+        max_num_sequences=max_num_sequences,
+        max_beam_width=executor_config.max_beam_width,
         mixed_sampler=ad_config.mixed_sampler,
     )
+    sampler = TorchSampler(sampler_args)

     # creating the executor object
     py_executor = PyExecutor(

@@ -299,6 +311,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
         model_engine=engine,
         sampler=sampler,
         dist=mpi_dist,
+        max_num_sequences=max_num_sequences,
         disable_overlap_scheduler=ad_config.disable_overlap_scheduler,
         max_input_len=ad_config.max_input_len,
         max_batch_size=ad_config.max_batch_size,
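
The hunks above switch the AutoDeploy path to the new TorchSampler.Args bundle. The real Args class lives in tensorrt_llm/_torch/pyexecutor/sampler.py and is not part of this excerpt; the following sketch only illustrates the general pattern, assuming a frozen dataclass nested inside the sampler, with field names taken from the call sites in this diff.

    # Sketch of the "Args bundle" pattern (illustrative, not the TRT-LLM class).
    from dataclasses import dataclass


    class MySampler:

        @dataclass(frozen=True)
        class Args:
            max_seq_len: int
            max_draft_tokens: int
            max_num_sequences: int
            max_beam_width: int
            mixed_sampler: bool = False

        def __init__(self, args: "MySampler.Args"):
            # All sizing/config travels in one object, so constructor signatures
            # stay stable as fields are added.
            self.args = args


    args = MySampler.Args(max_seq_len=4096,
                          max_draft_tokens=0,
                          max_num_sequences=16,
                          max_beam_width=1)
    sampler = MySampler(args)

Bundling the arguments this way lets both call sites in this PR (create_autodeploy_executor and instantiate_sampler) build the same argument set once and hand it to either TorchSampler or a spec decoder.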

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 31 additions & 14 deletions
@@ -26,8 +26,7 @@
 from .resource_manager import (KVCacheManager, MambaHybridCacheManager,
                                PeftCacheManager, ResourceManager,
                                ResourceManagerType)
-from .sampler import (EarlyStopSampler, TorchSampler, TorchStarAttentionSampler,
-                      TRTLLMSampler)
+from .sampler import EarlyStopSampler, TorchSampler, TRTLLMSampler
 from .scheduler import (BindCapacityScheduler, BindMicroBatchScheduler,
                         SimpleScheduler)
 from .seq_slot_manager import SeqSlotManager

@@ -514,6 +513,7 @@ def create_py_executor_instance(
         sampler=sampler,
         drafter=drafter,
         dist=dist,
+        max_num_sequences=max_num_sequences,
         disable_overlap_scheduler=pytorch_backend_config.
         disable_overlap_scheduler,
         max_batch_size=executor_config.max_batch_size,

@@ -525,27 +525,44 @@
         garbage_collection_gen0_threshold=garbage_collection_gen0_threshold)


-def instantiate_sampler(model_engine: PyTorchModelEngine,
+def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
+                              *, max_seq_len: int, mixed_sampler: bool):
+    max_num_sequences = executor_config.max_batch_size * mapping.pp_size
+    max_draft_tokens = (0 if executor_config.speculative_config is None else
+                        executor_config.speculative_config.max_draft_tokens)
+    return TorchSampler.Args(
+        max_seq_len=max_seq_len,
+        max_draft_tokens=max_draft_tokens,
+        max_num_sequences=max_num_sequences,
+        max_beam_width=executor_config.max_beam_width,
+        mixed_sampler=mixed_sampler,
+    )
+
+
+def instantiate_sampler(engine: PyTorchModelEngine,
                         executor_config: ExecutorConfig,
                         pytorch_backend_config: PyTorchConfig,
                         mapping: Mapping):
+    sampler_args = create_torch_sampler_args(
+        executor_config,
+        mapping,
+        max_seq_len=engine.max_seq_len,
+        mixed_sampler=pytorch_backend_config.mixed_sampler)
     if mapping.cp_config.get('cp_type') == 'star_attention':
         assert pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'"
-        return TorchStarAttentionSampler(max_seq_len=model_engine.max_seq_len)
-    spec_config = model_engine.spec_config
-    if spec_config is not None and spec_config.spec_dec_mode.has_spec_decoder():
-        return get_spec_decoder(max_seq_len=model_engine.max_seq_len,
-                                spec_config=spec_config)
+        return TorchSampler(sampler_args)
+    if engine.spec_config is not None and engine.spec_config.spec_dec_mode.has_spec_decoder(
+    ):
+        return get_spec_decoder(sampler_args, engine.spec_config)
     if pytorch_backend_config.enable_trtllm_sampler:
-        return TRTLLMSampler(executor_config, model_engine.model,
-                             model_engine.dtype, mapping,
-                             get_decoding_mode(executor_config),
+        decoding_mode = get_decoding_mode(executor_config)
+        return TRTLLMSampler(executor_config, engine.model, engine.dtype,
+                             mapping, decoding_mode,
                              pytorch_backend_config.disable_overlap_scheduler)
-    elif not model_engine.model.model_config.is_generation:
+    if not engine.model.model_config.is_generation:
         # NOTE: choose sampler based on model type
         return EarlyStopSampler()
-    return TorchSampler(max_seq_len=model_engine.max_seq_len,
-                        mixed_sampler=pytorch_backend_config.mixed_sampler)
+    return TorchSampler(sampler_args)


 def get_decoding_mode(executor_config: ExecutorConfig) -> DecodingMode:
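
create_torch_sampler_args centralizes the derived sizes: max_num_sequences = max_batch_size * pp_size and max_draft_tokens taken from the speculative config (or 0). A rough sizing sketch follows, under the assumption that the sampler preallocates its new_tokens buffer as [max_draft_tokens + 1, max_num_sequences, max_beam_width]; the exact buffer layout lives in sampler.py and is not shown in this excerpt.

    # Illustrative sizing only (numbers are made up).
    import torch

    max_batch_size, pp_size = 32, 2
    max_num_sequences = max_batch_size * pp_size   # as in create_torch_sampler_args
    max_draft_tokens = 3                           # from speculative_config, else 0
    max_beam_width = 1

    new_tokens = torch.zeros(max_draft_tokens + 1,  # accepted token + draft tokens
                             max_num_sequences,     # one row per sequence slot
                             max_beam_width,
                             dtype=torch.int32)
    print(new_tokens.shape)                         # torch.Size([4, 64, 1])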

tensorrt_llm/_torch/pyexecutor/guided_decoder.py

Lines changed: 2 additions & 6 deletions
@@ -1,4 +1,3 @@
-import itertools
 import math
 from typing import List, Optional

@@ -52,8 +51,7 @@ def bitmask_size(self) -> int:

     def build(self, scheduled_requests: ScheduledRequests,
               resource_manager: SeqSlotManager) -> None:
-        for llm_req in itertools.chain(scheduled_requests.context_requests,
-                                       scheduled_requests.generation_requests):
+        for llm_req in scheduled_requests.all_requests():
             if llm_req.guided_decoding_params is None:
                 continue
             slot = resource_manager.slot_manager.get_slot(llm_req.request_id)

@@ -84,9 +82,7 @@ def execute(self, scheduled_requests: ScheduledRequests,
         torch.cuda.current_stream().wait_stream(self._stream)

         batched_logits, batched_bitmask = [], []
-        for i, llm_req in enumerate(
-                itertools.chain(scheduled_requests.context_requests,
-                                scheduled_requests.generation_requests)):
+        for i, llm_req in enumerate(scheduled_requests.all_requests()):
             if llm_req.guided_decoding_params is None:
                 continue
             if llm_req.is_context_init_state and not llm_req.is_last_context_chunk:
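
Both loops now go through scheduled_requests.all_requests() instead of chaining the two request lists inline. The helper itself is defined elsewhere in the PR; the sketch below is an assumption about its shape (it may just as well return a concatenated list rather than an iterator), kept here only to show the intended iteration order.

    # Presumed shape of the helper the call sites switch to (illustrative).
    import itertools
    from dataclasses import dataclass, field
    from typing import Iterable, List


    @dataclass
    class ScheduledRequestsSketch:
        context_requests: List[object] = field(default_factory=list)
        generation_requests: List[object] = field(default_factory=list)

        def all_requests(self) -> Iterable[object]:
            # Context requests first, then generation requests — the same order
            # the replaced itertools.chain(...) call sites relied on.
            return itertools.chain(self.context_requests,
                                   self.generation_requests)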

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 2 additions & 0 deletions
@@ -254,6 +254,7 @@ def __init__(
             exclude_last_generation_logits: bool = False,
             return_perf_metrics: bool = False,
             stop_words_list: list[list[int]] | None = None,
+            is_draft: bool = False,
             **kwargs):
         self.py_logits_post_processors = kwargs.pop("py_logits_post_processors",
                                                     None)

@@ -288,6 +289,7 @@ def __init__(
         self.py_return_context_logits = return_context_logits
         self.py_return_generation_logits = return_generation_logits
         self.py_return_logits_device_memory = return_logits_device_memory
+        self.py_is_draft = is_draft

         # TODO: remove this when use DynamicDecodeOp in pytorch flow.
         # currently, keep py_stop_words_list as python list, rather than tensor.
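
The new is_draft constructor flag is only stored as py_is_draft in this file; how it is consumed is not visible in this excerpt. A purely hypothetical consumer, just to show the kind of branch such a flag enables:

    # Hypothetical example — not code from the PR.
    def should_emit_output(llm_req) -> bool:
        # A downstream component could skip per-request output handling for
        # internal draft-model requests.
        return not getattr(llm_req, "py_is_draft", False)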

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 51 additions & 54 deletions
@@ -4,7 +4,6 @@
 import gc
 import glob
 import inspect
-import itertools
 import math
 import multiprocessing
 import os

@@ -21,6 +20,7 @@
 import torch._dynamo.config

 import tensorrt_llm.bindings.internal.userbuffers as ub
+from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest
 from tensorrt_llm._torch.pyexecutor.sampler import SampleStateTensors
 from tensorrt_llm._torch.speculative.mtp import SampleStateTensorsMTP
 from tensorrt_llm._utils import (is_trace_enabled, local_mpi_rank,

@@ -319,6 +319,7 @@ def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],


 class PyTorchModelEngine(ModelEngine):
+    BEAM_WIDTH = 1

     def __init__(
         self,

@@ -659,13 +660,12 @@ def get_autotune_warmup_request():
             return result

         @contextlib.contextmanager
-        def release_batch(result):
+        def release_batch(result: ScheduledRequests | None):
             try:
                 yield result
             finally:
                 if result is not None:
-                    for req in itertools.chain(result.generation_requests,
-                                               result.context_requests):
+                    for req in result.all_requests():
                         kv_cache_manager.free_resources(req)
                         if spec_resource_manager is not None:
                             spec_resource_manager.free_resources(req)

@@ -1153,7 +1153,15 @@ def _prepare_tp_inputs(
         draft_lens = []
         mrope_config = defaultdict(list)

-        batch_idx = 0
+        mtp_batch_idx = 0  # Temporary: MTP (and Eagle3OneModel) remain the only samplers to index new_tokens serially
+
+        def py_batch_idx(request: LlmRequest) -> int:
+            if not self.without_logits:
+                return request.seq_slot
+            nonlocal mtp_batch_idx
+            batch_idx = mtp_batch_idx
+            mtp_batch_idx += 1
+            return batch_idx

         for request in scheduled_requests.context_requests:
             request_ids.append(request.py_request_id)

@@ -1184,10 +1192,9 @@
                 ) if mrope_rotary_cos_sin.device == 'cpu' else mrope_rotary_cos_sin
                 mrope_config['mrope_rotary_cos_sin'].append(
                     mrope_rotary_cos_sin.to('cuda', non_blocking=True))
-            request.py_batch_idx = batch_idx
-            batch_idx += 1
+            request.py_batch_idx = py_batch_idx(request)

-        num_ctx_requests = batch_idx
+        num_ctx_requests = len(scheduled_requests.context_requests)
         num_ctx_tokens = len(input_ids)
         new_tokens_device, new_tokens_lens_device, next_draft_tokens_device = None, None, None
         if new_tensors_device is not None:

@@ -1227,7 +1234,7 @@
             assert spec_dec_mode.support_overlap_scheduler(
             ), f"{self.spec_config.spec_dec_name} does not support overlap scheduler"

-        # will contain previous batch incices of generation requests
+        # will contain previous batch indices of generation requests
         previous_batch_indices = []
         previous_pos_indices = []
         for request in extend_requests:

@@ -1272,8 +1279,7 @@
             else:
                 # update batch index
                 previous_batch_idx = request.py_batch_idx
-                request.py_batch_idx = batch_idx
-                batch_idx += 1
+                request.py_batch_idx = py_batch_idx(request)
                 # inputs
                 # overlap scheduler can only support the speculative decoding
                 # methods with a fixed number of draft tokens

@@ -1324,8 +1330,18 @@
             prompt_lengths.append(request.py_prompt_len)
             draft_lens.append(0)

-            request.py_batch_idx = batch_idx
-            batch_idx += 1
+            request.py_batch_idx = py_batch_idx(request)
+
+        previous_batch_len = len(previous_batch_indices)
+
+        def previous_seq_slots_device():
+            previous_batch_indices_host = torch.tensor(previous_batch_indices,
+                                                       dtype=torch.int,
+                                                       pin_memory=True)
+            previous_slots = self.previous_batch_indices_cuda[:
+                                                              previous_batch_len]
+            previous_slots.copy_(previous_batch_indices_host, non_blocking=True)
+            return previous_slots

         num_tokens = len(input_ids)
         num_draft_tokens = len(draft_tokens)

@@ -1347,29 +1363,22 @@
             self.draft_tokens_cuda[:len(draft_tokens)].copy_(draft_tokens,
                                                              non_blocking=True)
         if next_draft_tokens_device is not None:
-            if len(previous_batch_indices) > 0:
-                previous_batch_indices = torch.tensor(previous_batch_indices,
-                                                      dtype=torch.int,
-                                                      pin_memory=True)
-                self.previous_batch_indices_cuda[:previous_batchs].copy_(
-                    previous_batch_indices, non_blocking=True)
+            if previous_batch_len > 0:
+                previous_slots = previous_seq_slots_device()
                 # previous input ids
-                previous_batch_tokens = previous_batchs * (1 +
-                                                           self.max_draft_len)
-                self.input_ids_cuda[
-                    num_tokens:num_tokens +
-                    previous_batch_tokens].copy_(new_tokens_device[
-                        self.previous_batch_indices_cuda[:previous_batchs], :].
-                                                 flatten(),
-                                                 non_blocking=True)
+                previous_batch_tokens = previous_batch_len * (
+                    1 + self.max_draft_len)
+                new_tokens = new_tokens_device[previous_slots, :].flatten()
+                self.input_ids_cuda[num_tokens:num_tokens +
+                                    previous_batch_tokens].copy_(
+                                        new_tokens, non_blocking=True)
                 # previous draft tokens
-                previous_batch_draft_tokens = previous_batchs * self.max_draft_len
-                self.draft_tokens_cuda[
-                    num_draft_tokens:num_draft_tokens +
-                    previous_batch_draft_tokens].copy_(next_draft_tokens_device[
-                        self.previous_batch_indices_cuda[:previous_batchs], :].
-                                                       flatten(),
-                                                       non_blocking=True)
+                previous_batch_draft_tokens = previous_batch_len * self.max_draft_len
+                self.draft_tokens_cuda[num_draft_tokens:num_draft_tokens +
+                                       previous_batch_draft_tokens].copy_(
+                                           next_draft_tokens_device[
+                                               previous_slots, :].flatten(),
+                                           non_blocking=True)
                 # prepare data for the preprocess inputs
                 kv_len_offsets_device = new_tokens_lens_device - self.max_draft_len - 1
                 previous_pos_indices = torch.tensor(previous_pos_indices,

@@ -1398,16 +1407,13 @@
                 self.previous_pos_id_offsets_cuda *= 0
                 self.previous_kv_lens_offsets_cuda *= 0
         elif new_tokens_device is not None:
-            previous_batch_tokens = len(previous_batch_indices)
-            previous_batch_indices = torch.tensor(previous_batch_indices,
-                                                  dtype=torch.int,
-                                                  pin_memory=True)
-            self.previous_batch_indices_cuda[:previous_batch_tokens].copy_(
-                previous_batch_indices, non_blocking=True)
-            self.input_ids_cuda[num_tokens:num_tokens + previous_batchs].copy_(
-                new_tokens_device[
-                    self.previous_batch_indices_cuda[:previous_batchs]],
-                non_blocking=True)
+            seq_slots_device = previous_seq_slots_device()
+            max_draft_len = max(draft_lens)
+            new_tokens = new_tokens_device[:max_draft_len + 1,
+                                           seq_slots_device, :self.BEAM_WIDTH]
+            self.input_ids_cuda[num_tokens:num_tokens +
+                                previous_batch_len].copy_(new_tokens.flatten(),
+                                                          non_blocking=True)

         position_ids = torch.tensor(position_ids,
                                     dtype=torch.int,

@@ -1645,7 +1651,6 @@ def _prepare_star_attention_inputs(self,
         # for star attention, we need customized block ids
         block_ids_per_seq = []
         num_cached_tokens_per_seq = []
-        output_token_idx = 0
         for request in scheduled_requests.context_requests:
             request_ids.append(request.py_request_id)
             prompt_lengths.append(request.py_prompt_len)

@@ -1702,8 +1707,6 @@
             sequence_lengths.append(len(input_id))
             block_ids_per_seq.extend([all_cache_indices])
             num_cached_tokens_per_seq.append(past_seen_token_num)
-            request.output_token_idx = output_token_idx
-            output_token_idx += 1
         num_contexts = len(sequence_lengths)
         for request in scheduled_requests.context_requests:
             ctx_iter = request.ctx_iters

@@ -1743,8 +1746,6 @@
             sequence_lengths.append(len(input_id))
             block_ids_per_seq.extend([all_cache_indices])
             num_cached_tokens_per_seq.append(past_seen_token_num)
-            request.output_token_idx = output_token_idx
-            output_token_idx += 1
         num_queries = len(sequence_lengths) - num_contexts

         # Requests with draft tokens are treated like extend requests.

@@ -1802,8 +1803,6 @@
             position_ids.append(last_query_pos_id + request.gen_iters + 1)
             block_ids_per_seq.extend([all_cache_indices])
             num_cached_tokens_per_seq.append(past_seen_token_num)
-            request.output_token_idx = output_token_idx
-            output_token_idx += 1

         num_tokens = len(input_ids)
         assert num_tokens <= self.max_num_tokens, (

@@ -2171,9 +2170,7 @@ def _execute_logit_post_processors(self,
         num_ctx_req = len(scheduled_requests.context_requests)
         logits_tensor = outputs["logits"]

-        for idx, request in enumerate(
-                itertools.chain(scheduled_requests.context_requests,
-                                scheduled_requests.generation_requests)):
+        for idx, request in enumerate(scheduled_requests.all_requests()):
             logits_processors = getattr(request, "py_logits_post_processors",
                                         None)
             if not logits_processors:
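
The perf-relevant core of this file's changes: py_batch_idx now returns request.seq_slot whenever the engine produces logits (the general TorchSampler path), while MTP / Eagle3OneModel (self.without_logits) keep the old serial numbering, and previous-batch tokens are gathered on-device by sequence slot. Below is a condensed, standalone rendering of that indexing policy with simplified names; the real code closes over self and the scheduled requests.

    # Standalone sketch of the serial-vs-slot indexing introduced in _prepare_tp_inputs.
    from dataclasses import dataclass


    @dataclass
    class Req:
        seq_slot: int


    def make_py_batch_idx(without_logits: bool):
        mtp_batch_idx = 0  # only MTP / Eagle3OneModel samplers still index serially

        def py_batch_idx(request: Req) -> int:
            nonlocal mtp_batch_idx
            if not without_logits:
                return request.seq_slot      # TorchSampler path: row == sequence slot
            idx = mtp_batch_idx              # MTP path: dense, order-of-scheduling rows
            mtp_batch_idx += 1
            return idx

        return py_batch_idx


    idx_fn = make_py_batch_idx(without_logits=False)
    print([idx_fn(Req(seq_slot=s)) for s in (7, 2, 4)])   # [7, 2, 4]

    idx_fn = make_py_batch_idx(without_logits=True)
    print([idx_fn(Req(seq_slot=s)) for s in (7, 2, 4)])   # [0, 1, 2]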
