 class GuidedDecoder:
     bitmask_dtype = torch.int32
 
-    def __init__(self, guided_decoding_config: GuidedDecodingConfig,
-                 max_num_sequences: int, vocab_size_padded: int):
+    def __init__(self,
+                 guided_decoding_config: GuidedDecodingConfig,
+                 max_num_sequences: int,
+                 vocab_size_padded: int,
+                 max_num_draft_tokens: int = 0):
         self.guided_decoding_backend = guided_decoding_config.backend
         self.max_num_sequences = max_num_sequences
         self.vocab_size_padded = vocab_size_padded
+        self.max_num_draft_tokens = max_num_draft_tokens
 
         self.grammar_matcher_factory: Optional[GrammarMatcherFactory] = None
         self.grammar_matchers: List[
             Optional[GrammarMatcher]] = [None] * self.max_num_sequences
 
         if self.guided_decoding_backend == GuidedDecodingConfig.GuidedDecodingBackend.XGRAMMAR:
             self.grammar_matcher_factory = XGrammarMatcherFactory(
-                guided_decoding_config, vocab_size_padded)
+                guided_decoding_config,
+                vocab_size_padded,
+                max_num_draft_tokens=max_num_draft_tokens)
         elif self.guided_decoding_backend == GuidedDecodingConfig.GuidedDecodingBackend.LLGUIDANCE:
             self.grammar_matcher_factory = LLGuidanceMatcherFactory(
                 guided_decoding_config, vocab_size_padded)
@@ -35,14 +41,16 @@ def __init__(self, guided_decoding_config: GuidedDecodingConfig,
             )
 
         self.bitmask = torch.empty(self.max_num_sequences,
+                                   self.max_num_draft_tokens + 1,
                                    self.bitmask_size,
                                    dtype=self.bitmask_dtype,
                                    device='cuda')
         self.bitmask_host = torch.empty(self.max_num_sequences,
+                                        self.max_num_draft_tokens + 1,
                                         self.bitmask_size,
                                         dtype=self.bitmask_dtype,
                                         pin_memory=True)
-
+        self.num_guided_tokens: List[int] = [0] * self.max_num_sequences
         self._stream = torch.cuda.Stream()
 
     @property
@@ -52,44 +60,77 @@ def bitmask_size(self) -> int:
     @nvtx_range("GuidedDecoder.build")
     def build(self, scheduled_requests: ScheduledRequests) -> None:
         for llm_req in scheduled_requests.all_requests():
-            if llm_req.guided_decoding_params is None:
-                continue
-            slot = llm_req.py_seq_slot
-            if llm_req.is_context_init_state and llm_req.context_current_position == llm_req.prepopulated_prompt_len:
-                self.grammar_matchers[
-                    slot] = self.grammar_matcher_factory.create(
-                        llm_req.guided_decoding_params)
+            slot: int = llm_req.py_seq_slot
+            require_guided: bool = True
 
-            elif llm_req.is_generation_in_progress_state:
-                # The request is in a generation forward step.
-                # Currently, guided decoding does not support with beam search.
-                self.grammar_matchers[slot].accept_token(
-                    llm_req.get_last_tokens(0))
+            if llm_req.guided_decoding_params is None:
+                require_guided = False
             else:
-                continue
-
-            # Fill the bitmask on host and asynchorously copy to device.
-            self.grammar_matchers[slot].fill_next_token_bitmask(
-                self.bitmask_host, slot)
-            with torch.cuda.stream(self._stream):
-                self.bitmask[slot].copy_(self.bitmask_host[slot],
-                                         non_blocking=True)
+                if llm_req.is_context_init_state and llm_req.is_last_context_chunk:
+                    # The request is in the last chunk of a context forward step.
+                    matcher = self.grammar_matcher_factory.create(
+                        llm_req.guided_decoding_params)
+                    self.grammar_matchers[slot] = matcher
+                elif llm_req.is_generation_in_progress_state:
+                    # The request is in a generation forward step.
+                    matcher = self.grammar_matchers[slot]
+                    # Roll back the grammar matcher to the last accepted token.
+                    num_rollback_tokens = self.num_guided_tokens[slot] - (
+                        1 + llm_req.py_num_accepted_draft_tokens)
+                    assert num_rollback_tokens >= 0
+                    matcher.rollback(num_rollback_tokens)
+
+                    # Currently, guided decoding does not support beam search.
+                    accepted = matcher.accept_token(llm_req.get_last_tokens(0))
+                    # TODO: Make this an error response.
+                    if not accepted:
+                        raise ValueError(
+                            f"Failed to accept new token: {llm_req.get_last_tokens(0)}."
+                        )
+                else:
+                    require_guided = False
+
+            num_guided_tokens: int = 0
+            if require_guided:
+                if not matcher.is_terminated():
+                    matcher.fill_next_token_bitmask(self.bitmask_host[slot], 0)
+                    num_guided_tokens += 1
+                    # Process draft tokens
+                    for i, tid in enumerate(llm_req.py_draft_tokens, 1):
+                        accepted = matcher.accept_token(tid)
+                        if matcher.is_terminated():
+                            matcher.rollback(1)
+                            accepted = False
+                        if accepted:
+                            matcher.fill_next_token_bitmask(self.bitmask_host[slot],
+                                                            i)
+                            num_guided_tokens += 1
+                        else:
+                            break
+
+            self.num_guided_tokens[slot] = num_guided_tokens
+            if num_guided_tokens > 0:
+                with torch.cuda.stream(self._stream):
+                    self.bitmask[slot, :num_guided_tokens].copy_(
+                        self.bitmask_host[slot, :num_guided_tokens],
+                        non_blocking=True)
 
     @nvtx_range("GuidedDecoder.execute")
     def execute(self, scheduled_requests: ScheduledRequests,
                 logits: torch.Tensor) -> None:
-        assert logits.size(0) == len(scheduled_requests.context_requests) + len(
-            scheduled_requests.generation_requests)
         torch.cuda.current_stream().wait_stream(self._stream)
 
         batched_logits, batched_bitmask = [], []
-        for i, llm_req in enumerate(scheduled_requests.all_requests()):
-            if llm_req.guided_decoding_params is None:
-                continue
-            if llm_req.is_context_init_state and not llm_req.is_last_context_chunk:
-                continue
-            batched_logits.append(logits[i])
-            batched_bitmask.append(self.bitmask[llm_req.py_seq_slot])
+        offset = 0
+        for llm_req in scheduled_requests.all_requests():
+            slot: int = llm_req.py_seq_slot
+            num_guided_tokens: int = self.num_guided_tokens[slot]
+            for i in range(num_guided_tokens):
+                batched_logits.append(logits[offset + i])
+                batched_bitmask.append(self.bitmask[slot, i])
+            offset += len(llm_req.py_draft_tokens) + 1
+
+        assert offset == logits.size(0)
 
         if len(batched_logits) > 0:
             torch.ops.trtllm.logits_bitmask(batched_logits, batched_bitmask)
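
The subtlest part of this change is the rollback bookkeeping in `build()`: in each iteration the matcher may have speculatively advanced through draft tokens that the target model later rejected, so it must first be rolled back by `num_guided_tokens - (1 + py_num_accepted_draft_tokens)` before the newly sampled token is accepted. The minimal sketch below walks through that arithmetic; `ToyMatcher` and its counter are hypothetical stand-ins for the real `GrammarMatcher` interface (it accepts any token), not part of this PR.

```python
# Illustrative sketch only, NOT the TensorRT-LLM API.
class ToyMatcher:

    def __init__(self) -> None:
        self.num_advanced = 0  # tokens this matcher has accepted so far

    def accept_token(self, token: int) -> bool:
        self.num_advanced += 1
        return True

    def rollback(self, n: int) -> None:
        assert 0 <= n <= self.num_advanced
        self.num_advanced -= n


matcher = ToyMatcher()

# Iteration 1: build() fills a bitmask for the next token (position 0), then
# speculatively advances the matcher through two draft tokens, filling one
# bitmask per draft position.
draft_tokens = [11, 12]
num_guided_tokens = 1  # bitmask for the newly sampled token
for tid in draft_tokens:
    if matcher.accept_token(tid):
        num_guided_tokens += 1  # bitmask for this draft position
assert matcher.num_advanced == 2 and num_guided_tokens == 3

# Iteration 2: suppose the target model accepted only one of the two drafts.
# The matcher advanced past both, so it is rolled back by
# num_guided_tokens - (1 + num_accepted_draft_tokens) = 3 - 2 = 1
# before the newly sampled token is accepted, mirroring build() above.
num_accepted_draft_tokens = 1
matcher.rollback(num_guided_tokens - (1 + num_accepted_draft_tokens))
assert matcher.num_advanced == 1
matcher.accept_token(42)  # the token actually sampled at the last position
assert matcher.num_advanced == 2
```

`execute()` then consumes exactly `num_guided_tokens` bitmask rows per request while stepping through the logits tensor in strides of `len(py_draft_tokens) + 1`, which is why the final `assert offset == logits.size(0)` holds.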