Commit ad4d9d4

[https://nvbugs/5513423][fix] Correctly respect min_tokens in PyTorch workflow using TorchSampler
- Added `py_min_length` attribute to `LlmRequest` to store the minimum length configuration.
- Implemented `_apply_min_length_penalty` method in `TorchSampler` to adjust logits based on minimum length requirements (mimics `PenaltyLayer`).
- Updated the test case for `min_tokens` to reflect the new maximum sequence length and output length constraints from the model.

Signed-off-by: Stefan Niebler <[email protected]>
1 parent c4abca3 commit ad4d9d4
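
For context, the fix follows the same idea as the `PenaltyLayer` mentioned in the commit message: while a request has generated fewer than `min_tokens` tokens, the end-of-sequence logit is forced to negative infinity so EOS can be neither sampled nor greedily selected. Below is a minimal, self-contained sketch of that idea, not the committed code; `end_id`, `min_tokens` and the single-sequence batch shape are illustrative assumptions.

import torch

def mask_end_token(logits: torch.Tensor, generated_len: int,
                   min_tokens: int, end_id: int) -> torch.Tensor:
    # Forbid EOS until at least `min_tokens` tokens have been generated.
    if generated_len < min_tokens:
        logits = logits.clone()              # do not mutate the model output in place
        logits[..., end_id] = float('-inf')  # EOS probability becomes 0 after softmax
    return logits

# Toy usage: EOS (id 2) is blocked because only 3 of the required 5 tokens exist so far.
logits = torch.zeros(1, 8)
masked = mask_end_token(logits, generated_len=3, min_tokens=5, end_id=2)
assert masked[0, 2].item() == float('-inf')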

File tree

3 files changed: +18 −3 lines changed

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 1 addition & 0 deletions
@@ -329,6 +329,7 @@ def __init__(
         self.py_prompt_len = self.prompt_len
         self.py_orig_prompt_len = self.orig_prompt_len
         self.py_max_new_tokens = self.max_new_tokens
+        self.py_min_length = self.sampling_config.min_length
         self.py_batch_idx = None
         self.py_draft_pages_allocated = 0
         self.py_rewind_len = 0
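
Note on the new attribute: `sampling_config.min_length` behaves as an optional, list-like per-request value, which is why the sampler hunk below checks truthiness before reading `py_min_length[0]`. A hedged stand-in illustrating that check; the function name and types here are assumptions for the example, not the real `SamplingConfig` API.

from typing import List, Optional

def needs_eos_mask(py_min_length: Optional[List[int]], tokens_so_far: int) -> bool:
    # None or an empty list means min_tokens was not requested for this request.
    return bool(py_min_length) and tokens_so_far < py_min_length[0]

assert needs_eos_mask([16], 3) is True    # min length 16, only 3 tokens so far -> keep masking EOS
assert needs_eos_mask(None, 3) is False   # no min_tokens -> nothing to do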

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 15 additions & 0 deletions
@@ -690,6 +690,20 @@ def _apply_embedding_bias(

         return logits

+    @staticmethod
+    def _apply_min_length_penalty(logits: torch.Tensor,
+                                  requests: list[LlmRequest]):
+
+        if not any(
+                r.py_min_length and r.max_beam_num_tokens < r.py_min_length[0]
+                for r in requests):
+            return logits
+        logits = logits.clone()
+        for index, r in enumerate(requests):
+            if r.py_min_length and r.max_beam_num_tokens < r.py_min_length[0]:
+                logits[index, [r.py_end_id]] = float('-inf')
+        return logits
+
     @staticmethod
     def _longest_stop_word_len(requests: Iterable[LlmRequest]) -> int:
         max_stop_word_len = 0

@@ -905,6 +919,7 @@ def _process_requests(self,
         raw_logits = model_outputs["logits"]

         requests = scheduled_requests.all_requests()
+        raw_logits = self._apply_min_length_penalty(raw_logits, requests)
         num_steps = [1 + get_draft_token_length(req) for req in requests]
         sum_steps = sum(num_steps)
         no_draft_tokens = len(requests) == sum_steps
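
To make the new helper's behavior concrete, here is a hedged, self-contained illustration using stand-in request objects. `FakeRequest` is invented for the example; the real `LlmRequest` carries many more fields, but the helper only reads `py_min_length`, `max_beam_num_tokens` and `py_end_id`.

import torch
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class FakeRequest:
    py_min_length: Optional[List[int]]  # e.g. [32]; None/empty means no min_tokens requested
    max_beam_num_tokens: int            # tokens generated so far on the longest beam
    py_end_id: int                      # end-of-sequence token id

def apply_min_length_penalty(logits: torch.Tensor, requests) -> torch.Tensor:
    # Fast path: if no request is still below its minimum length, return logits unchanged.
    if not any(r.py_min_length and r.max_beam_num_tokens < r.py_min_length[0]
               for r in requests):
        return logits
    logits = logits.clone()
    for index, r in enumerate(requests):
        if r.py_min_length and r.max_beam_num_tokens < r.py_min_length[0]:
            logits[index, [r.py_end_id]] = float('-inf')  # block EOS for this request's row
    return logits

reqs = [FakeRequest(py_min_length=[4], max_beam_num_tokens=2, py_end_id=0),   # EOS masked
        FakeRequest(py_min_length=None, max_beam_num_tokens=2, py_end_id=0)]  # untouched
out = apply_min_length_penalty(torch.zeros(2, 8), reqs)
print(out[0, 0].item(), out[1, 0].item())  # -inf 0.0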

tests/unittest/llmapi/test_llm_pytorch.py

Lines changed: 2 additions & 3 deletions
@@ -861,15 +861,14 @@ def test_llm_with_proxy_error():


 @pytest.mark.part0
-@pytest.mark.xfail(reason="https://nvbugs/5513423")
 def test_min_tokens():
     """Check min_tokens is respected."""
     llm = LLM(model=llama_model_path,
               kv_cache_config=global_kvcache_config,
               enable_mixed_sampler=True,
-              max_seq_len=20000)
+              max_seq_len=2048)

-    output_len = 5000
+    output_len = 2000
     sampling_params = SamplingParams(max_tokens=output_len,
                                      min_tokens=output_len,
                                      temperature=1)
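
The hunk above only shows the test setup, not its assertion. A typical check for such a test might look like the sketch below; the prompt string and the exact output interface (`RequestOutput.outputs` with per-completion `token_ids`) are assumptions for illustration, not part of this commit.

outputs = llm.generate(["The future of AI is"], sampling_params)
for request_output in outputs:
    for completion in request_output.outputs:
        # min_tokens == max_tokens == output_len, so at least output_len tokens must be produced.
        assert len(completion.token_ids) >= output_len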
