1 | 1 | import os |
2 | 2 | import sys |
3 | 3 | import unittest |
4 | | -from unittest.mock import patch |
| 4 | +from unittest.mock import Mock, patch |
5 | 5 |
6 | 6 | import pytest |
7 | 7 | import torch |
15 | 15 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) |
16 | 16 |
17 | 17 |
| 18 | +@pytest.fixture(scope="function") |
| 19 | +def enforce_single_worker(monkeypatch): |
| 20 | + monkeypatch.setenv("TLLM_WORKER_USE_SINGLE_PROCESS", "1") |
| 21 | + yield |
| 22 | + |
| 23 | + |
18 | 24 | @pytest.mark.parametrize("disable_overlap_scheduler", [True, False]) |
19 | 25 | @pytest.mark.high_cuda_memory |
20 | | -def test_dynamic_spec_decode(disable_overlap_scheduler: bool): |
21 | | - # Store original value and set environment variable |
22 | | - original_value = os.environ.get("TLLM_WORKER_USE_SINGLE_PROCESS") |
| 26 | +def test_dynamic_spec_decode(enforce_single_worker, |
| 27 | + disable_overlap_scheduler: bool): |
23 | 28 | # mock_should_use_spec_decode doesn't work with multiple processes, |
24 | | - # so we set the environment variable to 1 in this test. |
25 | | - os.environ["TLLM_WORKER_USE_SINGLE_PROCESS"] = "1" |
26 | | - |
27 | | - try: |
28 | | - total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9 |
29 | | - if total_mem_gb < 35: |
30 | | - pytest.skip("Not enough memory to load target + draft model") |
31 | | - |
32 | | - models_path = llm_models_root() |
33 | | - eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B" |
34 | | - target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct" |
35 | | - |
36 | | - max_batch_size = 1 |
37 | | - max_draft_len = 4 |
38 | | - kv_cache_config = KvCacheConfig(enable_block_reuse=True, |
39 | | - max_tokens=8192) |
40 | | - cuda_graph_config = CudaGraphConfig(batch_sizes=[1]) |
41 | | - |
42 | | - llm_common_config = dict( |
43 | | - model=target_model_dir, |
44 | | - attn_backend="TRTLLM", |
45 | | - disable_overlap_scheduler=disable_overlap_scheduler, |
46 | | - cuda_graph_config=cuda_graph_config, |
47 | | - max_batch_size=max_batch_size, |
48 | | - kv_cache_config=kv_cache_config, |
49 | | - # This max_seq_len is larger than the one specified |
50 | | - # in the llama 3 8B eagle's config. We want to make sure |
51 | | - # that the draft model won't go above its max in warmup |
52 | | - # in this test. |
53 | | - max_seq_len=8192, |
54 | | - ) |
55 | | - |
56 | | - spec_config = EagleDecodingConfig( |
57 | | - max_draft_len=max_draft_len, |
58 | | - speculative_model_dir=eagle_model_dir, |
59 | | - # Llama 3 does not support one model eagle. |
60 | | - eagle3_one_model=False, |
61 | | - ) |
62 | | - |
63 | | - # Mock should_use_spec_decode to turn on/off spec decode dynamically. |
64 | | - def mock_should_use_spec_decode(requests, max_batch_size, |
65 | | - max_num_tokens, max_draft_len): |
66 | | - if not hasattr(mock_should_use_spec_decode, 'call_count'): |
67 | | - mock_should_use_spec_decode.call_count = 0 |
68 | | - |
69 | | - for req in requests: |
70 | | - if req.state != LlmRequestState.GENERATION_IN_PROGRESS: |
71 | | - continue |
72 | | - |
73 | | - mock_should_use_spec_decode.call_count += 1 |
74 | | -                # Turn spec decode off once it has been called 5 times. |
75 | | -                # At the 5th call there are 2 accepted draft tokens in this scenario, |
76 | | -                # which gives better coverage of switching spec decode on and off. |
77 | | - if mock_should_use_spec_decode.call_count > 5: |
78 | | - return False |
79 | | - return True |
80 | | - |
81 | | - llm_spec = LLM(**llm_common_config, speculative_config=spec_config) |
82 | | - sampling_params = SamplingParams(max_tokens=128, temperature=0) |
83 | | - |
84 | | - # Output tests |
85 | | - prompts = [ |
86 | | - "The president of the United States is", |
87 | | - ] |
88 | | - sampling_params = SamplingParams(max_tokens=20, temperature=0) |
89 | | - |
90 | | - with patch( |
91 | | - 'tensorrt_llm._torch.speculative.model_drafter.ModelDrafter.should_use_spec_decode', |
92 | | - side_effect=mock_should_use_spec_decode): |
93 | | - results_spec = llm_spec.generate(prompts, sampling_params) |
94 | | - generated_text_spec = [ |
95 | | - result.outputs[0].text for result in results_spec |
96 | | - ] |
97 | | - llm_spec.shutdown() |
98 | | - |
99 | | - llm_ref = LLM(**llm_common_config) |
100 | | - results_ref = llm_ref.generate(prompts, sampling_params) |
101 | | - generated_text_ref = [result.outputs[0].text for result in results_ref] |
102 | | - llm_ref.shutdown() |
103 | | - |
104 | | - for text_spec, text_ref in zip(generated_text_spec, generated_text_ref): |
105 | | - # The spec decode algorithm currently guarantees identical results |
106 | | - assert text_spec == text_ref |
107 | | - finally: |
108 | | - # Restore original environment variable value |
109 | | - if original_value is None: |
110 | | - os.environ.pop("TLLM_WORKER_USE_SINGLE_PROCESS", None) |
111 | | - else: |
112 | | - os.environ["TLLM_WORKER_USE_SINGLE_PROCESS"] = original_value |
| 29 | + # so we use the enforce_single_worker fixture to set the environment variable. |
| 30 | + total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9 |
| 31 | + if total_mem_gb < 35: |
| 32 | + pytest.skip("Not enough memory to load target + draft model") |
| 33 | + |
| 34 | + models_path = llm_models_root() |
| 35 | + eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B" |
| 36 | + target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct" |
| 37 | + |
| 38 | + max_batch_size = 1 |
| 39 | + max_draft_len = 4 |
| 40 | + kv_cache_config = KvCacheConfig(enable_block_reuse=True, max_tokens=8192) |
| 41 | + cuda_graph_config = CudaGraphConfig(batch_sizes=[1]) |
| 42 | + |
| 43 | + llm_common_config = dict( |
| 44 | + model=target_model_dir, |
| 45 | + attn_backend="TRTLLM", |
| 46 | + disable_overlap_scheduler=disable_overlap_scheduler, |
| 47 | + cuda_graph_config=cuda_graph_config, |
| 48 | + max_batch_size=max_batch_size, |
| 49 | + kv_cache_config=kv_cache_config, |
| 50 | + # This max_seq_len is larger than the one specified |
| 51 | + # in the llama 3 8B eagle's config. We want to make sure |
| 52 | + # that the draft model won't go above its max in warmup |
| 53 | + # in this test. |
| 54 | + max_seq_len=8192, |
| 55 | + ) |
| 56 | + |
| 57 | + spec_config = EagleDecodingConfig( |
| 58 | + max_draft_len=max_draft_len, |
| 59 | + speculative_model_dir=eagle_model_dir, |
| 60 | + # Llama 3 does not support one model eagle. |
| 61 | + eagle3_one_model=False, |
| 62 | + ) |
| 63 | + |
| 64 | + llm_spec = LLM(**llm_common_config, speculative_config=spec_config) |
| 65 | + sampling_params = SamplingParams(max_tokens=128, temperature=0) |
| 66 | + |
| 67 | + # Output tests |
| 68 | + prompts = [ |
| 69 | + "The president of the United States is", |
| 70 | + ] |
| 71 | + sampling_params = SamplingParams(max_tokens=20, temperature=0) |
| 72 | + |
| 73 | + # Mock should_use_spec_decode to turn on/off spec decode dynamically. |
| 74 | + def mock_should_use_spec_decode(requests, max_batch_size, max_num_tokens, |
| 75 | + max_draft_len): |
| 76 | + for req in requests: |
| 77 | + if req.state != LlmRequestState.GENERATION_IN_PROGRESS: |
| 78 | + continue |
| 79 | + |
| 80 | + mock_should_use_spec_decode.call_count += 1 |
| 81 | +            # Turn spec decode off once it has been called 5 times. |
| 82 | +            # At the 5th call there are 2 accepted draft tokens in this scenario, |
| 83 | +            # which gives better coverage of switching spec decode on and off. |
| 84 | + if mock_should_use_spec_decode.call_count > 5: |
| 85 | + return False |
| 86 | + return True |
| 87 | + |
| 88 | + # Create a Mock object with the mock function as side_effect |
| 89 | + mock_should_use_spec_decode = Mock(side_effect=mock_should_use_spec_decode) |
| 90 | + # Reset mock state before using it |
| 91 | + mock_should_use_spec_decode.reset_mock() |
| 92 | + mock_should_use_spec_decode.call_count = 0 |
| 93 | + |
| 94 | + with patch( |
| 95 | + 'tensorrt_llm._torch.speculative.model_drafter.ModelDrafter.should_use_spec_decode', |
| 96 | + mock_should_use_spec_decode): |
| 97 | + results_spec = llm_spec.generate(prompts, sampling_params) |
| 98 | + generated_text_spec = [result.outputs[0].text for result in results_spec] |
| 99 | + llm_spec.shutdown() |
| 100 | + |
| 101 | + llm_ref = LLM(**llm_common_config) |
| 102 | + results_ref = llm_ref.generate(prompts, sampling_params) |
| 103 | + generated_text_ref = [result.outputs[0].text for result in results_ref] |
| 104 | + llm_ref.shutdown() |
| 105 | + |
| 106 | + for text_spec, text_ref in zip(generated_text_spec, generated_text_ref): |
| 107 | + # The spec decode algorithm currently guarantees identical results |
| 108 | + assert text_spec == text_ref |
113 | 109 |
114 | 110 |
115 | 111 | def test_should_use_spec_decode(): |
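Below is a minimal, self-contained sketch of the two patterns this change relies on, using hypothetical names (single_process_env, SOME_ENV_FLAG, flip_after_three) and a stand-in patch target (os.path.exists) instead of the TensorRT-LLM APIs: a function-scoped fixture that sets an environment variable through pytest's monkeypatch, which undoes the change at teardown and so replaces the manual try/finally restore logic removed above, and a plain function wrapped in unittest.mock.Mock as its side_effect so that patch can track calls while the function decides the return value.

import os
from unittest.mock import Mock, patch

import pytest


@pytest.fixture(scope="function")
def single_process_env(monkeypatch):
    # monkeypatch.setenv restores the previous value (or removes the variable)
    # automatically at teardown, so no try/finally cleanup is needed.
    monkeypatch.setenv("SOME_ENV_FLAG", "1")
    yield


def flip_after_three(_path):
    # Stand-in decision function: True for the first 3 calls, False afterwards.
    flip_after_three.calls = getattr(flip_after_three, "calls", 0) + 1
    return flip_after_three.calls <= 3


def test_patch_with_mock_side_effect(single_process_env):
    assert os.environ["SOME_ENV_FLAG"] == "1"

    mock_decide = Mock(side_effect=flip_after_three)
    # Patch a real callable (os.path.exists, purely for illustration) with the
    # Mock: the Mock records calls, the side_effect supplies the return values.
    with patch("os.path.exists", mock_decide):
        results = [os.path.exists("ignored") for _ in range(5)]

    assert results == [True, True, True, False, False]
    assert mock_decide.call_count == 5

The Mock records every call while the side_effect decides the result, which is the same division of labor the patched should_use_spec_decode uses in the test above.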