From f3f85f27ef6fca42617485f04bde9a92500afc56 Mon Sep 17 00:00:00 2001
From: yihonglie <hyi@amd.com>
Date: Fri, 8 May 2026 05:16:53 +0000
Subject: [PATCH 1/2] [Kimi] support Eagle3 speculative decoding for Kimi K2.5

Adds Eagle3 spec decode for Kimi K2.5 (MLA target + standard MHA draft):

- Eagle3LlamaModel: 1-layer Llama draft (dual-norm input, wide QKV,
  independent embed/lm_head) matching the lightseekorg/kimi-k2.5-eagle3
  checkpoint
- Eagle3DraftBuilder: implements the post-#659 builder protocol
  (compute_block_bytes / allocate_kv_cache_tensors / build_kv_cache_tensor)
  for the draft's independent non-MLA KV cache, attached to the runner from
  EagleProposer.__init__ via runner.eagle3_draft_builder. ModelRunner
  delegates KV pool sizing, allocation, and per-module binding through this
  hook; the runner KV path only checks for the hook's presence
  (hasattr(self, "eagle3_draft_builder")) and carries no eagle3 layout logic
- Aux hidden state pipeline: target forward returns
  (hidden, aux_hidden_states), captured through CUDAGraph via
  graph_aux_hidden and fed to the draft's combine_hidden_states (fc) as
  input
- SpeculativeConfig: --method eagle3 + --draft-model CLI; eagle3 vs MTP
  branching at construction time; fail-fast if draft is MLA
- Scheduler: spec_stats is only updated when speculation actually ran
  (num_new_token > 0 and num_new_token + num_rejected > 1; matches vLLM's
  gating)
- propose: draft-perspective predicate `draft_uses_mha = hasattr(runner,
  "eagle3_draft_builder")` drives all three metadata-flow special cases
  (slot_mapping re-slice, context_lens += 1, tuple-unpack of the draft
  return value); the is_eagle3 string comparison is gone from the hot path

Result on Kimi-K2.5-MXFP4 + kimi-k2.5-eagle3, 8x MI355X, gsm8k 5-shot:
acceptance 67.85%, accuracy 93.78%.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 atom/config.py                    |  28 +++
 atom/model_engine/arg_utils.py    |  30 ++-
 atom/model_engine/model_runner.py | 135 +++++++++++---
 atom/model_engine/scheduler.py    |  13 +-
 atom/model_ops/attention_mha.py   |  15 +-
 atom/model_ops/linear.py          |  18 +-
 atom/models/deepseek_v2.py        |  28 ++-
 atom/models/eagle3_llama.py       | 301 ++++++++++++++++++++++++++++++
 atom/models/kimi_k25.py           |   6 +
 atom/spec_decode/eagle.py         | 240 ++++++++++++++++++++++--
 10 files changed, 752 insertions(+), 62 deletions(-)
 create mode 100644 atom/models/eagle3_llama.py

diff --git a/atom/config.py b/atom/config.py
index 2f42471389..5fc9e700c9 100644
--- a/atom/config.py
+++ b/atom/config.py
@@ -723,6 +723,8 @@ class SpeculativeConfig:
     model: Optional[str] = None
     num_speculative_tokens: Optional[int] = None
     draft_model_hf_config: Optional[PretrainedConfig] = None
+    use_aux_hidden_state: bool = False
+    eagle3_aux_layer_ids: list[int] = field(default_factory=list)
 
     # model_type → mtp_model_type mapping
     _MTP_TYPE_MAP: ClassVar[dict[str, str]] = {
@@ -753,8 +755,34 @@ def __post_init__(self):
             self.draft_model_hf_config = self.draft_model_hf_config.text_config
         self.hf_config_override(self.draft_model_hf_config)
 
+        if self.method == "eagle3":
+            if getattr(self.draft_model_hf_config, "kv_lora_rank", None):
+                raise NotImplementedError(
+                    "Eagle3 draft model with MLA attention is not supported"
+                )
+            # Aux hidden state layers: prefer the draft checkpoint's
+            # eagle_config; if absent or the list is empty, ModelRunner
+            # falls back to model.get_eagle3_aux_hidden_state_layers(),
+            # which defaults to 3 layers — early / middle / late
+            # (see DeepseekV2ForCausalLM.get_eagle3_aux_hidden_state_layers,
+            # returns `(2, num_layers // 2, num_layers - 3)`, aligned with vLLM).
+            eagle_cfg = getattr(self.draft_model_hf_config, "eagle_config", None)
+            if eagle_cfg:
+                self.use_aux_hidden_state = eagle_cfg.get("use_aux_hidden_state", False)
+                if self.use_aux_hidden_state and not self.eagle3_aux_layer_ids:
+                    self.eagle3_aux_layer_ids = eagle_cfg.get(
+                        "eagle_aux_hidden_state_layer_ids", []
+                    )
+            else:
+                self.use_aux_hidden_state = True
+
     @staticmethod
     def hf_config_override(hf_config: PretrainedConfig) -> None:
+        # Eagle3 architecture mapping (architecture-level, not model_type)
+        arch = (getattr(hf_config, "architectures", None) or [""])[0]
+        if arch == "LlamaForCausalLMEagle3":
+            hf_config.architectures = ["Eagle3LlamaModel"]
+
         # Step 1: resolve model_type → mtp model_type
         mtp_type = SpeculativeConfig._MTP_TYPE_MAP.get(hf_config.model_type)
         if mtp_type is not None:
diff --git a/atom/model_engine/arg_utils.py b/atom/model_engine/arg_utils.py
index 1358c04052..e5970603f6 100644
--- a/atom/model_engine/arg_utils.py
+++ b/atom/model_engine/arg_utils.py
@@ -51,6 +51,7 @@ class EngineArgs:
     method: Optional[str] = None
     num_speculative_tokens: int = 1
     kv_transfer_config: str = "{}"
+    draft_model: Optional[str] = None
     mark_trace: bool = False
 
     @staticmethod
@@ -163,7 +164,7 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
             "--method",
             type=str,
             default=None,
-            choices=["mtp"],
+            choices=["mtp", "eagle3"],
             help="Speculative method",
         )
         parser.add_argument(
@@ -172,6 +173,12 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
             default=1,
             help="Number of speculative tokens to generate per iteration (draft model runs this many times autoregressively)",
         )
+        parser.add_argument(
+            "--draft-model",
+            type=str,
+            default=None,
+            help="Path to external Eagle3 draft model. Required when --method eagle3.",
+        )
         parser.add_argument(
             "--max-num-batched-tokens",
             type=int,
@@ -243,14 +250,25 @@ def _get_engine_kwargs(self) -> dict:
             ),
         )
         if self.method and self.num_speculative_tokens > 0:
-            kwargs["speculative_config"] = SpeculativeConfig(
-                method=kwargs.pop("method"),
-                model=self.model,
-                num_speculative_tokens=kwargs.pop("num_speculative_tokens"),
-            )
+            method = kwargs.pop("method")
+            num_spec_tokens = kwargs.pop("num_speculative_tokens")
+            draft_model = kwargs.pop("draft_model")
+            if method == "eagle3":
+                kwargs["speculative_config"] = SpeculativeConfig(
+                    method=method,
+                    model=draft_model,
+                    num_speculative_tokens=num_spec_tokens,
+                )
+            else:
+                kwargs["speculative_config"] = SpeculativeConfig(
+                    method=method,
+                    model=self.model,
+                    num_speculative_tokens=num_spec_tokens,
+                )
         else:
             kwargs.pop("method")
             kwargs.pop("num_speculative_tokens")
+            kwargs.pop("draft_model")
             kwargs["speculative_config"] = None
 
         # --enable-tbo [prefill|all] → enable_tbo + enable_tbo_decode
diff --git a/atom/model_engine/model_runner.py b/atom/model_engine/model_runner.py
index ce607625df..3561d27b52 100644
--- a/atom/model_engine/model_runner.py
+++ b/atom/model_engine/model_runner.py
@@ -559,6 +559,13 @@ def __init__(self, rank: int, config: Config):
         self.num_spec_tokens = (
             self.config.speculative_config.num_speculative_tokens if use_spec else 0
         )
+        self.eagle3_mode = (
+            self.config.speculative_config is not None
+            and self.config.speculative_config.method == "eagle3"
+        )
+
+        self.use_aux_hidden_state_outputs = False
+        self._aux_hidden_states = None
         self.tokenID_processor = tokenIDProcessor(
             self,
             self.config.max_num_batched_tokens,
@@ -621,6 +628,18 @@ def __init__(self, rank: int, config: Config):
             torch.set_default_device(None)
             logger.info("Loading drafter model...")
             self.drafter.load_model(self.model)
+
+        if self.eagle3_mode and self.config.speculative_config.use_aux_hidden_state:
+            aux_ids = self.config.speculative_config.eagle3_aux_layer_ids
+            if not aux_ids and hasattr(
+                self.model, "get_eagle3_aux_hidden_state_layers"
+            ):
+                aux_ids = list(self.model.get_eagle3_aux_hidden_state_layers())
+            if aux_ids:
+                self.model.set_aux_hidden_state_layers(tuple(aux_ids))
+                self.use_aux_hidden_state_outputs = True
+                logger.info(f"Eagle3 aux hidden state layers: {aux_ids}")
+
         torch.set_default_device(self.device)
         self.async_execute_stream = torch.cuda.Stream(self.device)
         self.allocate_forward_vars()
@@ -1075,24 +1094,35 @@ def _get_num_kv_heads(self):
             return 1
 
     def _get_total_num_layers(self):
-        """Return total layer count including draft (MTP) layers."""
+        """Return total layer count including draft (MTP) layers.
+
+        Drafts that own an independent KV cache via their own builder
+        (e.g. Eagle3 MHA draft on an MLA target) account for their layers
+        through that builder, so they are NOT added here. Only MTP-style
+        drafts that share the target's KV pool contribute.
+        """
         total = self.config.hf_config.num_hidden_layers
         if self.config.speculative_config and hasattr(self, "drafter"):
-            draft_hf = self.config.speculative_config.draft_model_hf_config
-            total += getattr(draft_hf, "num_nextn_predict_layers", 1)
+            if not hasattr(self, "eagle3_draft_builder"):
+                draft_hf = self.config.speculative_config.draft_model_hf_config
+                total += getattr(draft_hf, "num_nextn_predict_layers", 1)
         return total
 
     def _compute_block_bytes(self):
         """Per-block bytes for the unified KV pool budget.
 
-        Delegates to the attention builder, which knows its own tensor
-        layout (MLA 576-dim packed, GDN-hybrid full-attn-only, MiMo-V2
-        per-layer-type, standard MHA split-K/V). Mirror of
-        `attn_metadata_builder.allocate_kv_cache_tensors()` so the budget
-        math matches what's actually allocated. Per-request cache bytes
-        are accounted for separately via `compute_per_req_cache_bytes()`.
+        Sum across all attention builders attached to this runner: the
+        target builder always, plus an optional `eagle3_draft_builder`
+        when a heterogeneous spec-decode draft owns its own KV pool. Each
+        builder knows its own tensor layout (MLA 576-dim packed, GDN-hybrid
+        full-attn-only, MiMo-V2 per-layer-type, standard MHA split-K/V,
+        Eagle3 independent MHA). Per-request cache bytes are accounted
+        for separately via `compute_per_req_cache_bytes()`.
         """
-        return self.attn_metadata_builder.compute_block_bytes()
+        block_bytes = self.attn_metadata_builder.compute_block_bytes()
+        if hasattr(self, "eagle3_draft_builder"):
+            block_bytes += self.eagle3_draft_builder.compute_block_bytes()
+        return block_bytes
 
     def _estimate_cudagraph_overhead(self):
         """Estimate GPU memory consumed by CUDA graph capture.
@@ -1255,13 +1285,24 @@ def allocate_kv_cache(self, num_kvcache_blocks):
         num_draft_layers = 0
         if self.config.speculative_config and hasattr(self, "drafter"):
             draft_hf_config = self.config.speculative_config.draft_model_hf_config
-            # For MTP, use num_nextn_predict_layers instead of num_hidden_layers
-            num_draft_layers = getattr(draft_hf_config, "num_nextn_predict_layers", 1)
-            total_num_layers += num_draft_layers
-            logger.info(
-                f"Allocating KV cache for {hf_config.num_hidden_layers} target layers + "
-                f"{num_draft_layers} draft (MTP) layers = {total_num_layers} total layers"
-            )
+            if hasattr(self, "eagle3_draft_builder"):
+                # Heterogeneous draft (e.g. Eagle3 MHA on MLA target) owns
+                # its own KV pool via its builder; don't add to target's count.
+                num_draft_layers = draft_hf_config.num_hidden_layers
+                logger.info(
+                    f"Allocating KV cache for {hf_config.num_hidden_layers} target layers + "
+                    f"{num_draft_layers} Eagle3 draft layers (separate non-MLA cache)"
+                )
+            else:
+                # For MTP, use num_nextn_predict_layers instead of num_hidden_layers
+                num_draft_layers = getattr(
+                    draft_hf_config, "num_nextn_predict_layers", 1
+                )
+                total_num_layers += num_draft_layers
+                logger.info(
+                    f"Allocating KV cache for {hf_config.num_hidden_layers} target layers + "
+                    f"{num_draft_layers} draft (MTP) layers = {total_num_layers} total layers"
+                )
 
         # Primary KV cache allocation (model-agnostic, delegated to the
         # attention builder). Each builder owns its tensor layout: MLA →
@@ -1277,6 +1318,16 @@ def allocate_kv_cache(self, num_kvcache_blocks):
         for name, value in main_kv.items():
             setattr(self, name, value)
 
+        # Heterogeneous draft (e.g. Eagle3 MHA alongside an MLA target) owns
+        # its own KV pool through a sibling builder; same protocol as above,
+        # tensors land under namespaced keys (eagle3_kv_cache, eagle3_kv_scale).
+        if hasattr(self, "eagle3_draft_builder"):
+            draft_kv = self.eagle3_draft_builder.allocate_kv_cache_tensors(
+                num_kv_heads, num_draft_layers
+            )
+            for name, value in draft_kv.items():
+                setattr(self, name, value)
+
         # Per-request cache allocation (model-agnostic, delegated to the
         # attention metadata builder). For GDN this returns
         # `{"mamba_k_cache": ..., "mamba_v_cache": ...}`; for stateless
@@ -1302,10 +1353,12 @@ def allocate_kv_cache(self, num_kvcache_blocks):
         kv_cache_tensors = []
         layer_id = 0
         # Promote to self so the attention builder's build_kv_cache_tensor()
-        # can access it without recomputing from drafter state.
+        # can access it without recomputing from drafter state. Heterogeneous
+        # drafts (Eagle3) own their own layer space via their builder, so
+        # leave mtp_start_layer_idx at hf_config.num_hidden_layers in that mode.
         self.mtp_start_layer_idx = (
             self.drafter.model.model.mtp_start_layer_idx
-            if hasattr(self, "drafter")
+            if hasattr(self, "drafter") and not hasattr(self, "eagle3_draft_builder")
             else hf_config.num_hidden_layers
         )
         for model_name, model in models_to_bind:
@@ -1314,6 +1367,18 @@ def allocate_kv_cache(self, num_kvcache_blocks):
             )
 
             for module in model.modules():
+                # Drafts that own an independent KV pool (Eagle3) bind through
+                # their sibling builder first; for unrecognized modules it
+                # returns None and we fall through to the target builder.
+                if model_name == "draft" and hasattr(self, "eagle3_draft_builder"):
+                    kv_cache_tensor = self.eagle3_draft_builder.build_kv_cache_tensor(
+                        layer_id, module
+                    )
+                    if kv_cache_tensor is not None:
+                        kv_cache_tensors.append(kv_cache_tensor)
+                        layer_id += 1
+                        continue
+
                 # Per-attention-type binding is owned by the attention
                 # metadata builder; ModelRunner only walks modules and
                 # collects the resulting KVCacheTensor entries. The builder
@@ -1625,7 +1690,12 @@ def run_model(
                 label += f" tok={batch.total_tokens_num} ctx={ctx_str}"
             label += "]"
             with record_function(label):
-                hidden_states = self.model(input_ids, positions)
+                model_output = self.model(input_ids, positions)
+                if self.use_aux_hidden_state_outputs:
+                    hidden_states, self._aux_hidden_states = model_output
+                else:
+                    hidden_states = model_output
+                    self._aux_hidden_states = None
                 logits = self.model.compute_logits(hidden_states)
         else:
             # decode[bs=128 tok=128 d=128]  or  decode[bs=128 tok=128 p=2 d=126 spec=3]
@@ -1645,6 +1715,12 @@ def run_model(
                 self.graphs[graph_key].replay()
                 num_tokens = context.batch_size * max_q_len
                 hidden_states = self.forward_vars["outputs"][:num_tokens]
+                if graph_key in self.graph_aux_hidden:
+                    self._aux_hidden_states = [
+                        aux[:num_tokens] for aux in self.graph_aux_hidden[graph_key]
+                    ]
+                else:
+                    self._aux_hidden_states = None
                 if self.logits_in_graph:
                     logits = self.graph_logits[graph_key][:num_tokens]
                 else:
@@ -1833,6 +1909,7 @@ def propose_draft_token_ids(
             num_reject_tokens=num_reject_tokens,
             next_token_ids=next_token_ids,
             last_token_indices=last_token_indices,
+            aux_hidden_states=self._aux_hidden_states,
         )
         return self.tokenID_processor.prepare_draft_ids(batch, draft_token)
 
@@ -1880,6 +1957,7 @@ def capture_cudagraph(self):
 
         self.graphs: dict[tuple[int, int], torch.cuda.CUDAGraph] = dict()
         self.graph_logits: dict[tuple[int, int], torch.Tensor] = dict()
+        self.graph_aux_hidden: dict[tuple[int, int], list[torch.Tensor]] = dict()
         self.graph_pool = None
         is_tbo = self.config.enable_tbo and isinstance(self.model, UBatchWrapper)
         # TBO graphs don't capture compute_logits, so disable logits_in_graph.
@@ -1930,9 +2008,13 @@ def capture_cudagraph(self):
                 )
 
                 # Warmup
-                outputs[:num_tokens] = self.model(
+                model_output = self.model(
                     input_ids[:num_tokens], positions[:num_tokens]
                 )
+                if self.use_aux_hidden_state_outputs:
+                    outputs[:num_tokens] = model_output[0]
+                else:
+                    outputs[:num_tokens] = model_output
                 if self.logits_in_graph:
                     self.model.compute_logits(outputs[:num_tokens])
 
@@ -1951,13 +2033,20 @@ def capture_cudagraph(self):
                             gc.stream,
                             output_buffer=outputs[:num_tokens],
                         )
+                        graph_aux = None
                     else:
                         # Standard single-stream capture
                         graph = torch.cuda.CUDAGraph()
                         with torch.cuda.graph(graph, self.graph_pool, stream=gc.stream):
-                            outputs[:num_tokens] = self.model(
+                            model_output = self.model(
                                 input_ids[:num_tokens], positions[:num_tokens]
                             )
+                            if self.use_aux_hidden_state_outputs:
+                                outputs[:num_tokens] = model_output[0]
+                                graph_aux = model_output[1]
+                            else:
+                                outputs[:num_tokens] = model_output
+                                graph_aux = None
                             if self.logits_in_graph:
                                 graph_logits = self.model.compute_logits(
                                     outputs[:num_tokens]
@@ -1967,6 +2056,8 @@ def capture_cudagraph(self):
                 self.graphs[(bs, max_q_len)] = graph
                 if self.logits_in_graph and ubatch_slices is None:
                     self.graph_logits[(bs, max_q_len)] = graph_logits
+                if graph_aux is not None:
+                    self.graph_aux_hidden[(bs, max_q_len)] = graph_aux
                 torch.cuda.synchronize()
         self.graph_bs.sort(reverse=False)
 
diff --git a/atom/model_engine/scheduler.py b/atom/model_engine/scheduler.py
index 9e9acd6ced..3d65369d4a 100644
--- a/atom/model_engine/scheduler.py
+++ b/atom/model_engine/scheduler.py
@@ -676,12 +676,21 @@ def postprocess(
                 continue
             token_ids = prev_token_ids[idx]
             num_new_token = len(token_ids)
-            if self.spec_stats:
-                self.spec_stats.update(num_new_token)
             if is_deferred_out or self.use_spec:
                 num_rejected = fwd_output.num_rejected[idx]
                 num_bonus = fwd_output.num_bonus[idx]
                 offset = 0 if (num_new_token + num_rejected) == 1 else self.mtp_k
+                # Align stats with vLLM: only count steps that actually ran
+                # speculation (drafts proposed and validated). Skip the
+                # prefill-only step where no draft tokens were scored against
+                # the target — vLLM gates this via
+                # `if scheduled_spec_token_ids and generated_token_ids`.
+                if (
+                    self.spec_stats
+                    and num_new_token > 0
+                    and (num_new_token + num_rejected) > 1
+                ):
+                    self.spec_stats.update(num_new_token)
                 seq.num_rejected = num_rejected
                 seq.num_bonus_tokens = num_bonus
                 for i, el in enumerate(token_ids):
diff --git a/atom/model_ops/attention_mha.py b/atom/model_ops/attention_mha.py
index c3ebd525a8..dd65fd9f33 100644
--- a/atom/model_ops/attention_mha.py
+++ b/atom/model_ops/attention_mha.py
@@ -127,7 +127,12 @@ def rope_cache(self, q, k, v, qkv, position, fwd_ctx: ForwardContext):
         k_scale = kv_cache_data[f"layer_{self.layer_num}"].k_scale
         v_scale = kv_cache_data[f"layer_{self.layer_num}"].v_scale
 
-        use_triton_attn = self.sliding_window != -1 or self.head_dim != 128
+        # MTP MHA (num_heads == num_kv_heads) must go through triton/gluon; the aiter ASM non-persistent path may behave unexpectedly.
+        use_triton_attn = (
+            self.sliding_window != -1
+            or self.head_dim != 128
+            or self.num_heads == self.num_kv_heads
+        )
         self.use_triton_attn = use_triton_attn
 
         if (
@@ -501,9 +506,7 @@ def prefill_attention(
         # variable lenth attention use key value as input
         attn_metadata = fwd_ctx.attn_metadata
         sliding_window = (
-            (self.sliding_window, 0, 0)
-            if self.sliding_window is not None
-            else (-1, -1, 0)
+            (self.sliding_window, 0, 0) if self.sliding_window > 0 else (-1, -1, 0)
         )
         o = aiter.flash_attn_varlen_func(
             q,
@@ -545,9 +548,7 @@ def prefill_attention_triton(
         o = torch.empty_like(q)
         descale_shape = (attn_metadata.cu_seqlens_q.shape[0] - 1, k.shape[1])
         sliding_window = (
-            (self.sliding_window - 1, 0)
-            if self.sliding_window is not None
-            else (-1, -1)
+            (self.sliding_window - 1, 0) if self.sliding_window > 0 else (-1, -1)
         )
         unified_attention(
             q,
diff --git a/atom/model_ops/linear.py b/atom/model_ops/linear.py
index c3a09e829e..8441b06094 100644
--- a/atom/model_ops/linear.py
+++ b/atom/model_ops/linear.py
@@ -318,11 +318,19 @@ def weight_loader_process(
         loaded_weight: torch.Tensor,
         post_process_func: Callable = lambda a: a,
     ):
-        if (
-            param.data.dtype != loaded_weight.dtype
-            and param.data.element_size() == loaded_weight.element_size()
-        ):
-            param.data = param.data.view(loaded_weight.dtype)
+        if param.data.dtype != loaded_weight.dtype:
+            if param.data.element_size() == loaded_weight.element_size():
+                # Same byte-width: use view for raw-bit-compatible pairs
+                # (e.g. fp8 variants) but convert for semantically different
+                # formats (float16 ↔ bfloat16) where bit reinterpretation
+                # would corrupt values.
+                incompatible = {torch.float16, torch.bfloat16}
+                if {param.data.dtype, loaded_weight.dtype} == incompatible:
+                    loaded_weight = loaded_weight.to(param.data.dtype)
+                else:
+                    param.data = param.data.view(loaded_weight.dtype)
+            else:
+                loaded_weight = loaded_weight.to(param.data.dtype)
         loaded_weight = post_process_func(loaded_weight)
         if (
             loaded_weight.shape != param.data.shape
diff --git a/atom/models/deepseek_v2.py b/atom/models/deepseek_v2.py
index c9ad8e53f4..1ef7c7387f 100644
--- a/atom/models/deepseek_v2.py
+++ b/atom/models/deepseek_v2.py
@@ -1837,6 +1837,8 @@ def __init__(
             )
         else:
             self.norm = PPMissingLayer()
+        self.aux_hidden_state_layers: tuple[int, ...] = tuple()
+
         self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
             ["hidden_states", "residual"], config.hidden_size
         )
@@ -1850,7 +1852,9 @@ def forward(
         positions: torch.Tensor,
         intermediate_tensors: Optional[IntermediateTensors],
         inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> Union[
+        torch.Tensor, IntermediateTensors, Tuple[torch.Tensor, list[torch.Tensor]]
+    ]:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -1862,7 +1866,13 @@ def forward(
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        for layer in self.layers[self.start_layer : self.end_layer]:
+        aux_hidden_states = []
+        for idx in range(self.start_layer, self.end_layer):
+            layer = self.layers[idx]
+            if idx in self.aux_hidden_state_layers:
+                aux_hidden_states.append(
+                    hidden_states if residual is None else hidden_states + residual
+                )
             hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
@@ -1871,6 +1881,9 @@ def forward(
             )
 
         hidden_states, _ = self.norm(hidden_states, residual)
+
+        if aux_hidden_states:
+            return hidden_states, aux_hidden_states
         return hidden_states
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
@@ -1971,6 +1984,17 @@ def make_empty_intermediate_tensors(
             }
         )
 
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        self.model.aux_hidden_state_layers = layers
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        """Default Eagle3 aux hidden-state layer ids: early / middle / late
+        of the target model. Aligned with vLLM's default (see
+        vllm/model_executor/models/deepseek_v2.py).
+        """
+        num_layers = len(self.model.layers)
+        return (2, num_layers // 2, num_layers - 3)
+
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return self.model.get_expert_mapping()
 
diff --git a/atom/models/eagle3_llama.py b/atom/models/eagle3_llama.py
new file mode 100644
index 0000000000..67837907b4
--- /dev/null
+++ b/atom/models/eagle3_llama.py
@@ -0,0 +1,301 @@
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+"""Eagle3 draft model (Llama full-attention) for speculative decoding.
+
+Implements the Eagle3 draft model matching the lightseekorg/kimi-k2.5-eagle3
+checkpoint layout:
+
+    embed_tokens.weight   — independent embedding
+    fc.weight             — aux fusion projection (hidden*3 -> hidden)
+    midlayer.*            — single decoder layer (dual-norm, wide QKV)
+    norm.weight           — final RMSNorm
+    lm_head.weight        — independent lm_head
+
+Weight keys map directly to model attribute paths; no key rewriting needed.
+"""
+
+import torch
+from aiter.dist.parallel_state import get_tensor_model_parallel_world_size
+from aiter.rotary_embedding import get_rope
+from atom.config import Config
+from atom.model_ops.activation import SiluAndMul
+from atom.model_ops.base_attention import Attention
+from atom.model_ops.embed_head import ParallelLMHead, VocabParallelEmbedding
+from atom.model_ops.layernorm import RMSNorm
+from atom.model_ops.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from atom.utils.decorators import support_torch_compile
+from torch import nn
+
+
+class Eagle3LlamaAttention(nn.Module):
+    """Llama full-attention with input_size = hidden_size * 2.
+
+    The QKV projection accepts the concatenation of normalized embeddings
+    and fc output, hence input_size is doubled compared to standard Llama.
+    """
+
+    def __init__(
+        self,
+        config,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        cache_config: str = "bf16",
+        prefix: str = "",
+        layer_num: int = 0,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        head_dim = getattr(config, "head_dim", None)
+        if head_dim is None:
+            head_dim = hidden_size // self.total_num_heads
+        self.head_dim = head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        # QKV input_size = hidden_size * 2 (concat of embed + fc_output)
+        attn_input_size = hidden_size * 2
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=attn_input_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=False,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=hidden_size,
+            bias=False,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        rope_theta = getattr(config, "rope_theta", 10000)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            is_neox_style=True,
+        )
+
+        sliding_window = -1
+        if getattr(config, "use_sliding_window", False) and getattr(
+            config, "sliding_window", None
+        ):
+            sliding_window = config.sliding_window
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            kv_cache_dtype=cache_config,
+            layer_num=layer_num,
+            prefix=f"{prefix}.attn",
+            rotary_emb=self.rotary_emb,
+            per_layer_sliding_window=sliding_window,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv = self.qkv_proj(hidden_states)
+        q, k, v = torch.split(qkv, [self.q_size, self.kv_size, self.kv_size], dim=-1)
+        attn_output = self.attn(q, k, v, positions)
+        output = self.o_proj(attn_output)
+        return output
+
+
+class Eagle3LlamaDecoderLayer(nn.Module):
+    """Single decoder layer for Eagle3 with dual-norm input.
+
+    Unlike standard LlamaDecoderLayer, this layer has:
+    - input_layernorm: normalizes the embedding input
+    - hidden_norm: normalizes the fc output (projected aux hidden states)
+    - Attention input is concat(normed_embed, normed_hidden) -> [N, hidden*2]
+    """
+
+    def __init__(
+        self,
+        config,
+        cache_config: str = "bf16",
+        prefix: str = "",
+        layer_num: int = 0,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = Eagle3LlamaAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
+            cache_config=cache_config,
+            prefix=f"{prefix}.self_attn",
+            layer_num=layer_num,
+        )
+
+        self.mlp = Eagle3LlamaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            prefix=f"{prefix}.mlp",
+        )
+
+        # Dual norms matching checkpoint keys: midlayer.input_layernorm, midlayer.hidden_norm
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        embeds: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        normed_embeds = self.input_layernorm(embeds)
+        normed_hidden = self.hidden_norm(hidden_states)
+        # Concat for attention input: [N, hidden*2]
+        attn_input = torch.cat([normed_embeds, normed_hidden], dim=-1)
+        attn_output = self.self_attn(positions, attn_input)
+        # Residual connection on hidden_states
+        hidden_states = hidden_states + attn_output
+        # MLP with pre-norm + residual
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Eagle3LlamaMLP(nn.Module):
+    """Simple Llama MLP (gate+up fused, silu activation, down projection)."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=hidden_size,
+            output_sizes=[intermediate_size] * 2,
+            bias=False,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=False,
+            prefix=f"{prefix}.down_proj",
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x = self.down_proj(x)
+        return x
+
+
+@support_torch_compile
+class Eagle3LlamaModel(nn.Module):
+    """Eagle3 draft model (Llama full-attention, single decoder layer).
+
+    Matches the lightseekorg/kimi-k2.5-eagle3 checkpoint layout:
+        embed_tokens.weight   [163840, 7168]  independent embedding
+        fc.weight             [7168, 21504]   aux fusion (hidden*3 -> hidden)
+        midlayer.*            single decoder layer
+        norm.weight           final RMSNorm
+        lm_head.weight        [163840, 7168]  independent lm_head
+    """
+
+    packed_modules_mapping = {
+        "q_proj": ("qkv_proj", "q"),
+        "k_proj": ("qkv_proj", "k"),
+        "v_proj": ("qkv_proj", "v"),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(self, atom_config: Config, prefix: str = "", layer_offset: int = 0):
+        super().__init__()
+        config = atom_config.hf_config
+        cache_config = atom_config.kv_cache_dtype
+        self.config = config
+
+        # Independent embedding (vocab matches target model)
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+
+        # Aux fusion: concatenated aux hidden states [N, hidden*3] -> [N, hidden]
+        self.fc = ReplicatedLinear(
+            config.hidden_size * 3, config.hidden_size, bias=False
+        )
+
+        # Draft attention layer_num must start from the target model's layer
+        # count so kv_cache_data["layer_N"] maps to the correct cache entry.
+        self.midlayer = Eagle3LlamaDecoderLayer(
+            config=config,
+            cache_config=cache_config,
+            prefix="midlayer",
+            layer_num=layer_offset,
+        )
+
+        # Final norm
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        # Independent lm_head (not shared with target model)
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+
+    def combine_hidden_states(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Project concatenated aux hidden states through fc.
+
+        Args:
+            hidden_states: [N, hidden_size * 3] (3 aux layers concatenated)
+
+        Returns:
+            [N, hidden_size] projected hidden states
+        """
+        return self.fc(hidden_states)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        embeds = self.embed_tokens(input_ids)
+        hidden_states = self.midlayer(positions, embeds, hidden_states)
+        hidden_states_prenorm = hidden_states
+        hidden_states = self.norm(hidden_states)
+        return hidden_states, hidden_states_prenorm
+
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.lm_head(hidden_states)
diff --git a/atom/models/kimi_k25.py b/atom/models/kimi_k25.py
index eb4aaafb61..c0a002a3c8 100644
--- a/atom/models/kimi_k25.py
+++ b/atom/models/kimi_k25.py
@@ -88,5 +88,11 @@ def compute_logits(
     ) -> Optional[torch.Tensor]:
         return self.language_model.compute_logits(hidden_states)
 
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        self.language_model.set_aux_hidden_state_layers(layers)
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        return self.language_model.get_eagle3_aux_hidden_state_layers()
+
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return self.language_model.get_expert_mapping()
diff --git a/atom/spec_decode/eagle.py b/atom/spec_decode/eagle.py
index aa3aa4c8b1..bfa5fffe57 100644
--- a/atom/spec_decode/eagle.py
+++ b/atom/spec_decode/eagle.py
@@ -1,10 +1,13 @@
+import copy
 import logging
+from typing import Optional
 
 import numpy as np
 import torch
 import torch.nn as nn
+from aiter import dtypes
 from aiter.dist.parallel_state import get_pp_group
-from atom.config import CompilationLevel, Config
+from atom.config import CompilationLevel, Config, KVCacheTensor
 from atom.model_loader.loader import load_model
 from atom.utils import CpuGpuBuffer, resolve_obj_by_qualname
 from atom.utils.forward_context import SpecDecodeMetadata, get_forward_context
@@ -18,9 +21,137 @@
     "Qwen3NextMTPModel": "atom.models.qwen3_next_mtp.Qwen3NextMTP",
     "MiMoV2FlashMTPModel": "atom.models.mimo_v2_flash_mtp.MiMoV2FlashMTP",
     "Qwen3_5MTPModel": "atom.models.qwen3_5_mtp.Qwen3_5MTP",
+    "Eagle3LlamaModel": "atom.models.eagle3_llama.Eagle3LlamaModel",
 }
 
 
+class Eagle3DraftBuilder:
+    """KV cache subsystem for an Eagle3 MHA draft alongside a non-MHA target.
+
+    Implements the same subset of `AttentionMetadataBuilder` hooks that
+    ModelRunner consults during KV pool sizing and per-module binding —
+    `compute_block_bytes`, `allocate_kv_cache_tensors`, and
+    `build_kv_cache_tensor` — so the draft's independent non-MLA cache
+    fits the post-#659 builder protocol without leaking into the target's
+    builder. The draft does NOT drive prepare_decode/prepare_prefill;
+    it piggybacks on the target builder's metadata flow during propose.
+    """
+
+    def __init__(self, model_runner, draft_hf):
+        self.model_runner = model_runner
+        self.draft_hf = draft_hf
+        self.block_size = model_runner.block_size
+        self.num_kv_heads = draft_hf.num_key_value_heads // model_runner.world_size
+        self.num_layers = draft_hf.num_hidden_layers
+        self.head_dim = draft_hf.head_dim
+        self._next_layer_id = 0  # consumed by build_kv_cache_tensor
+        self.num_blocks = 0  # set in allocate_kv_cache_tensors
+
+    def compute_block_bytes(self) -> int:
+        """Per-block bytes for the draft's independent non-MLA KV cache."""
+        kv_dtype_size = dtypes.d_dtypes[
+            self.model_runner.config.kv_cache_dtype
+        ].itemsize
+        bb = (
+            2
+            * self.num_layers
+            * self.block_size
+            * self.num_kv_heads
+            * self.head_dim
+            * kv_dtype_size
+        )
+        if self.model_runner.config.kv_cache_dtype == "fp8":
+            # fp8 KV cache needs an extra scale tensor (one fp32 per K/V,
+            # layer, token slot and kv_head) to dequantize fp8 → bf16 at
+            # attention time. Reserve that space alongside the cache.
+            bb += (
+                2
+                * self.num_layers
+                * self.block_size
+                * self.num_kv_heads
+                * dtypes.fp32.itemsize
+            )
+        return bb
+
+    def allocate_kv_cache_tensors(self, num_kv_heads, num_draft_layers) -> dict:
+        """Allocate the draft's [2, L, blocks, block_size, kv_heads, head_dim]
+        cache and matching fp32 scale; ModelRunner setattr's both onto itself
+        under namespaced keys so they don't collide with the target builder's
+        `kv_cache` / `kv_scale`.
+        """
+        runner = self.model_runner
+        config = runner.config
+        # Draft's block budget scales with the target pool: same total token
+        # capacity, just paged at the draft's own block size.
+        self.num_blocks = (
+            config.num_kvcache_blocks * runner.block_size // self.block_size
+        )
+        cache = torch.zeros(
+            2,
+            self.num_layers,
+            self.num_blocks,
+            self.block_size,
+            self.num_kv_heads,
+            self.head_dim,
+            dtype=dtypes.d_dtypes[config.kv_cache_dtype],
+            device="cuda",
+        )
+        scale = torch.zeros(
+            2,
+            self.num_layers,
+            self.num_blocks,
+            self.num_kv_heads,
+            self.block_size,
+            dtype=dtypes.fp32,
+            device="cuda",
+        )
+        logger.info(f"Allocated Eagle3 draft KV cache: {cache.shape}")
+        return {"eagle3_kv_cache": cache, "eagle3_kv_scale": scale}
+
+    def build_kv_cache_tensor(self, layer_id: int, module):
+        """Bind one Eagle3 draft attention module to its slice of the
+        independent draft KV cache. Returns None for non-MHA modules so
+        ModelRunner falls through to the target builder.
+        """
+        if not (
+            hasattr(module, "base_attention")
+            and hasattr(module, "use_mla")
+            and not module.use_mla
+        ):
+            return None
+        runner = self.model_runner
+        idx = self._next_layer_id
+        self._next_layer_id += 1
+        cache = runner.eagle3_kv_cache
+        x = 16 // cache.element_size()
+        k_cache = cache[0, idx].view(
+            self.num_blocks,
+            self.num_kv_heads,
+            self.head_dim // x,
+            self.block_size,
+            x,
+        )
+        v_cache = cache[1, idx].view(
+            self.num_blocks,
+            self.num_kv_heads,
+            self.head_dim,
+            self.block_size,
+        )
+        module.max_model_len = runner.config.max_model_len
+        if runner.config.kv_cache_dtype == "fp8":
+            module.k_scale = runner.eagle3_kv_scale[0, idx]
+            module.v_scale = runner.eagle3_kv_scale[1, idx]
+        module.k_cache = k_cache
+        module.v_cache = v_cache
+        return KVCacheTensor(
+            layer_num=layer_id,
+            k_cache=k_cache,
+            v_cache=v_cache,
+            k_scale=getattr(module, "k_scale", None),
+            v_scale=getattr(module, "v_scale", None),
+        )
+
+
 class EagleProposer:
 
     def __init__(
@@ -49,7 +180,30 @@ def __init__(
         self.device = device
         draft_model_hf_config = self.speculative_config.draft_model_hf_config
         model_class = resolve_obj_by_qualname(support_eagle_model_arch_dict[draft_model_hf_config.architectures[0]])  # type: ignore
-        self.model = model_class(self.config)
+
+        if self.speculative_config.method == "eagle3":
+            # Eagle3 draft model has its own architecture (Llama, not MLA),
+            # so it must be constructed with the draft model's hf_config.
+            # Also disable torch.compile for the draft model to avoid
+            # Dynamo tracing issues with the separate KV cache binding.
+            draft_atom_config = copy.deepcopy(atom_config)
+            draft_atom_config.hf_config = draft_model_hf_config
+            draft_atom_config.compilation_config.level = CompilationLevel.NO_COMPILATION
+            # Draft attention layer_num must continue from the target model's
+            # layer count so it maps to the correct kv_cache_data entry.
+            self.model = model_class(
+                draft_atom_config,
+                layer_offset=atom_config.hf_config.num_hidden_layers,
+            )
+            # Attach the draft's KV-cache builder to the runner. ModelRunner
+            # consults `runner.eagle3_draft_builder` from `_compute_block_bytes`
+            # / `allocate_kv_cache` to size + allocate + bind the draft's
+            # independent non-MLA cache through the standard builder protocol.
+            runner.eagle3_draft_builder = Eagle3DraftBuilder(
+                runner, draft_model_hf_config
+            )
+        else:
+            self.model = model_class(self.config)
 
         i32_kwargs = {"dtype": torch.int32, "device": self.device}
         i64_kwargs = {"dtype": torch.int64, "device": self.device}
@@ -78,6 +232,23 @@ def _share_if_not_loaded(
             setattr(owner, attr, source)
 
     def load_model(self, target_model: nn.Module) -> None:
+        if self.speculative_config.method == "eagle3":
+            # Eagle3: load from a separate draft model checkpoint with
+            # independent embed_tokens and lm_head (no sharing).
+            load_model(
+                self.model,
+                self.speculative_config.model,
+                self.speculative_config.draft_model_hf_config,
+                self.config.load_dummy,
+                False,
+            )
+            logger.info(
+                "Eagle3 draft model loaded from %s (independent embed/lm_head)",
+                self.speculative_config.model,
+            )
+            return
+
+        # MTP: load from the target model checkpoint and share embeddings/lm_head.
         loaded = load_model(
             self.model,
             self.config.model,
@@ -101,11 +272,6 @@ def load_model(self, target_model: nn.Module) -> None:
             )
             del self.model.model.embed_tokens
             self.model.model.embed_tokens = target_base.model.embed_tokens
-        else:
-            logger.info(
-                "The EAGLE head's vocab embedding will be loaded separately"
-                " from the target model."
-            )
 
         # Share lm_head from target if not loaded from checkpoint.
         # Case 1: per-layer shared_head.head (DeepSeek MTP)
@@ -148,6 +314,7 @@ def propose(
         num_reject_tokens: torch.Tensor,
         next_token_ids: torch.Tensor,
         last_token_indices: torch.Tensor,
+        aux_hidden_states: Optional[list[torch.Tensor]] = None,
     ) -> torch.Tensor:
 
         forward_context = get_forward_context()
@@ -161,21 +328,48 @@ def propose(
         # input_ids[last_token_indices] = next_token_ids
         input_ids.scatter_(0, last_token_indices, next_token_ids)
         positions = target_positions + 1
-        hidden_states = target_hidden_states
+
+        # Eagle3: project concatenated aux hidden states through fc
+        if aux_hidden_states is not None:
+            concat_aux = torch.cat(aux_hidden_states, dim=-1)
+            hidden_states = self.model.combine_hidden_states(concat_aux)
+        else:
+            hidden_states = target_hidden_states
 
         draft_token_ids = torch.empty(
             bs, self.mtp_k, dtype=next_token_ids.dtype, device=next_token_ids.device
         )
-        # return draft_token_ids.fill_(1) # for debug
         var = self.runner.forward_vars
-        use_mla = self.runner.use_mla
+        target_uses_mla = self.runner.use_mla
+        # Eagle3 only supports MHA currently
+        draft_uses_mha = hasattr(self.runner, "eagle3_draft_builder")
+
+        # Eagle3 MLA: re-slice slot_mapping to len(input_ids).
+        # Target's MLA prepare_decode sized it
+        # to bs*max_q_len; after rejection len(input_ids) may be smaller,
+        # and the MHA cache-write kernel asserts slot_mapping <= q.
+        # Other fields (block_tables, context_lens, slot_mapping values
+        # themselves) are already in a draft-compatible format because
+        # MLA's prepare_decode uses the same runner.block_size as the draft.
+        if draft_uses_mha:
+            attn_metadata.slot_mapping = var["slot_mapping"].gpu[: len(input_ids)]
+
         for i in range(self.mtp_k):
             with record_function(f"draft[{i}/{self.mtp_k} bs={bs}]"):
-                ret_hidden_states = self.model(
+                model_output = self.model(
                     input_ids=input_ids,
                     positions=positions,
                     hidden_states=hidden_states,
                 )
+                # Eagle3 draft (the only draft_uses_mha case under narrow
+                # semantics) returns (post_norm, pre_norm); MTP drafts return
+                # a single hidden tensor.
+                if draft_uses_mha:
+                    ret_hidden_states, ret_hidden_prenorm = model_output
+                else:
+                    ret_hidden_states = model_output
+                    ret_hidden_prenorm = None
+
                 sample_hidden_states = (
                     torch.index_select(ret_hidden_states, 0, last_token_indices)
                     if i == 0
@@ -204,15 +398,15 @@ def propose(
                         attn_metadata.kv_indices = kv_indices
                         attn_metadata.cu_seqlens_q = cu_seqlens_q
                         attn_metadata.slot_mapping = slot_mapping
-                        if use_mla:
+                        if target_uses_mla:
                             kv_last_page_lens = var["kv_last_page_lens"].gpu[:bs]
                             attn_metadata.kv_last_page_lens = kv_last_page_lens
-                        else:
-                            # MHA needs block_tables and context_lens
+                        if not target_uses_mla or draft_uses_mha:
+                            # An MHA draft must set these itself: an MLA target wouldn't handle it, but an MHA target does.
                             attn_metadata.block_tables = var["block_tables"].gpu[:bs]
                             attn_metadata.context_lens = var["context_lens"].gpu[:bs]
                         cu_seqlens_q[: bs + 1] = self.arrange_bs[: bs + 1]
-                        if use_mla:
+                        if target_uses_mla:
                             # MLA: block_size=1, kv_indptr tracks tokens
                             kv_indptr[1 : bs + 1] -= torch.cumsum(
                                 num_reject_tokens, dim=0
@@ -222,8 +416,8 @@ def propose(
 
                     # update metadata
                     attn_metadata.max_seqlen_k += 1
-                    if not use_mla:
-                        # MHA: update context_lens for this draft step
+                    # An MHA draft must update context_lens itself, as an MLA target wouldn't handle it
+                    if draft_uses_mha or not target_uses_mla:
                         attn_metadata.context_lens[:bs] += 1
                     workinfos = self.runner.attn_metadata_builder.prepare_mtp_decode(
                         bs,
@@ -239,9 +433,19 @@ def propose(
                     for k, v in workinfos.items():
                         attn_metadata.__dict__[k] = v
                     slot_mapping[:] = kv_indices[kv_indptr[1 : bs + 1] - 1]
+
                     input_ids = new_draft_ids
                     positions += 1
-                    hidden_states = sample_hidden_states
+                    if ret_hidden_prenorm is not None:
+                        hidden_states = (
+                            torch.index_select(
+                                ret_hidden_prenorm, 0, last_token_indices
+                            )
+                            if i == 0
+                            else ret_hidden_prenorm
+                        )
+                    else:
+                        hidden_states = sample_hidden_states
 
         # self.runner.debug(f"final {draft_token_ids=}")
         # [batch_size, mtp_k]

From f551b68524d7179d935a9b5c31097d230cf998d2 Mon Sep 17 00:00:00 2001
From: yihonglie <hyi@amd.com>
Date: Fri, 8 May 2026 16:18:21 +0000
Subject: [PATCH 2/2] ci: add nightly Eagle3 spec-decode accuracy test for
 Kimi-K2.5

Reuses the base Kimi-K2.5-MXFP4 model + lightseekorg/kimi-k2.5-eagle3
draft, runs at TP=8 (Eagle3 draft KV needs full 8-rank sharding) under
nightly schedule. Local case_verify_v9_gluon measured GSM8K 5-shot
flexible-extract = 0.9257 (vLLM = 0.9280); threshold set to 0.91 with
~1.5pp noise headroom.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/benchmark/models_accuracy.json | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/benchmark/models_accuracy.json b/.github/benchmark/models_accuracy.json
index a86ca34914..c1e84579b9 100644
--- a/.github/benchmark/models_accuracy.json
+++ b/.github/benchmark/models_accuracy.json
@@ -155,6 +155,18 @@
     "accuracy_baseline_model": "moonshotai/Kimi-K2.5",
     "_baseline_note": "HF: amd/Kimi-K2.5-MXFP4 card shows Kimi-K2.5 baseline=0.9409"
   },
+  {
+    "model_name": "Kimi-K2.5-MXFP4 Eagle3",
+    "model_path": "amd/Kimi-K2.5-MXFP4",
+    "extraArgs": "--kv_cache_dtype fp8 -tp 8 --trust-remote-code --method eagle3 --draft-model lightseekorg/kimi-k2.5-eagle3 --num-speculative-tokens 3",
+    "env_vars": "HSA_NO_SCRATCH_RECLAIM=1",
+    "runner": "atom-mi355-8gpu.predownload",
+    "test_level": "nightly",
+    "accuracy_threshold": 0.91,
+    "accuracy_baseline": 0.9257,
+    "accuracy_baseline_model": "amd/Kimi-K2.5-MXFP4 + lightseekorg/kimi-k2.5-eagle3",
+    "_baseline_note": "Eagle3 spec decode on Kimi-K2.5-MXFP4. Local case_verify_v9_gluon GSM8K 5-shot flexible-extract=0.9257 (vLLM=0.9280, within ±0.71% se). Threshold 0.91 leaves ~1.5pp headroom for noise. -tp 8 (vs base entry's tp=4) because Eagle3 draft KV needs the full 8-rank sharding."
+  },
   {
     "model_name": "GLM-5-FP8",
     "model_path": "zai-org/GLM-5-FP8",