From f3f85f27ef6fca42617485f04bde9a92500afc56 Mon Sep 17 00:00:00 2001 From: yihonglie Date: Fri, 8 May 2026 05:16:53 +0000 Subject: [PATCH 1/2] [Kimi] support Eagle3 speculative decoding for Kimi K2.5 Adds Eagle3 spec decode for Kimi K2.5 (MLA target + standard MHA draft): - Eagle3LlamaModel: 1-layer Llama draft (dual-norm input, wide QKV, independent embed/lm_head) matching the lightseekorg/kimi-k2.5-eagle3 checkpoint - Eagle3DraftBuilder: implements the post-#659 builder protocol (compute_block_bytes / allocate_kv_cache_tensors / build_kv_cache_tensor) for the draft's independent non-MLA KV cache, attached to the runner from EagleProposer.__init__ via runner.eagle3_draft_builder. ModelRunner delegates KV pool sizing, allocation, and per-module binding through this hook, with no eagle3-specific code in the runner KV path beyond the `hasattr(self, "eagle3_draft_builder")` guards - Aux hidden state pipeline: target forward returns (hidden, aux_hidden_states), captured through CUDAGraph via graph_aux_hidden and fed to the draft's combine_hidden_states (fc) as input - SpeculativeConfig: --method eagle3 + --draft-model CLI; eagle3 vs MTP branching at construction time; fail-fast if draft is MLA - Scheduler: spec_stats only updated when speculation actually ran (matches vLLM's gating) - propose: draft-perspective predicate `draft_uses_mha = hasattr(runner, "eagle3_draft_builder")` drives all of the metadata-flow special-cases (slot_mapping re-slice, context_lens += 1, tuple-unpack of the draft return value); is_eagle3 string comparison is gone from the hot path Result on Kimi-K2.5-MXFP4 + kimi-k2.5-eagle3, 8x MI355X, gsm8k 5-shot: acceptance 67.85%, accuracy 93.78%. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- atom/config.py | 28 +++ atom/model_engine/arg_utils.py | 30 ++- atom/model_engine/model_runner.py | 135 +++++++++++--- atom/model_engine/scheduler.py | 13 +- atom/model_ops/attention_mha.py | 15 +- atom/model_ops/linear.py | 18 +- atom/models/deepseek_v2.py | 28 ++- atom/models/eagle3_llama.py | 301 ++++++++++++++++++++++++++++++ atom/models/kimi_k25.py | 6 + atom/spec_decode/eagle.py | 240 ++++++++++++++++++++++-- 10 files changed, 752 insertions(+), 62 deletions(-) create mode 100644 atom/models/eagle3_llama.py diff --git a/atom/config.py b/atom/config.py index 2f42471389..5fc9e700c9 100644 --- a/atom/config.py +++ b/atom/config.py @@ -723,6 +723,8 @@ class SpeculativeConfig: model: Optional[str] = None num_speculative_tokens: Optional[int] = None draft_model_hf_config: Optional[PretrainedConfig] = None + use_aux_hidden_state: bool = False + eagle3_aux_layer_ids: list[int] = field(default_factory=list) # model_type → mtp_model_type mapping _MTP_TYPE_MAP: ClassVar[dict[str, str]] = { @@ -753,8 +755,34 @@ def __post_init__(self): self.draft_model_hf_config = self.draft_model_hf_config.text_config self.hf_config_override(self.draft_model_hf_config) + if self.method == "eagle3": + if getattr(self.draft_model_hf_config, "kv_lora_rank", None): + raise NotImplementedError( + "Eagle3 draft model with MLA attention is not supported" + ) + # Aux hidden state layers: prefer the draft checkpoint's + # eagle_config; if absent or the list is empty, ModelRunner + # falls back to model.get_eagle3_aux_hidden_state_layers(), + # which defaults to 3 layers — early / middle / late + # (see DeepseekV2ForCausalLM.get_eagle3_aux_hidden_state_layers, + # returns `(2, num_layers // 2, num_layers - 3)`, aligned with vLLM). 
+ eagle_cfg = getattr(self.draft_model_hf_config, "eagle_config", None) + if eagle_cfg: + self.use_aux_hidden_state = eagle_cfg.get("use_aux_hidden_state", False) + if self.use_aux_hidden_state and not self.eagle3_aux_layer_ids: + self.eagle3_aux_layer_ids = eagle_cfg.get( + "eagle_aux_hidden_state_layer_ids", [] + ) + else: + self.use_aux_hidden_state = True + @staticmethod def hf_config_override(hf_config: PretrainedConfig) -> None: + # Eagle3 architecture mapping (architecture-level, not model_type) + arch = (getattr(hf_config, "architectures", None) or [""])[0] + if arch == "LlamaForCausalLMEagle3": + hf_config.architectures = ["Eagle3LlamaModel"] + # Step 1: resolve model_type → mtp model_type mtp_type = SpeculativeConfig._MTP_TYPE_MAP.get(hf_config.model_type) if mtp_type is not None: diff --git a/atom/model_engine/arg_utils.py b/atom/model_engine/arg_utils.py index 1358c04052..e5970603f6 100644 --- a/atom/model_engine/arg_utils.py +++ b/atom/model_engine/arg_utils.py @@ -51,6 +51,7 @@ class EngineArgs: method: Optional[str] = None num_speculative_tokens: int = 1 kv_transfer_config: str = "{}" + draft_model: Optional[str] = None mark_trace: bool = False @staticmethod @@ -163,7 +164,7 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: "--method", type=str, default=None, - choices=["mtp"], + choices=["mtp", "eagle3"], help="Speculative method", ) parser.add_argument( @@ -172,6 +173,12 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: default=1, help="Number of speculative tokens to generate per iteration (draft model runs this many times autoregressively)", ) + parser.add_argument( + "--draft-model", + type=str, + default=None, + help="Path to external Eagle3 draft model. 
Required when --method eagle3.", + ) parser.add_argument( "--max-num-batched-tokens", type=int, @@ -243,14 +250,25 @@ def _get_engine_kwargs(self) -> dict: ), ) if self.method and self.num_speculative_tokens > 0: - kwargs["speculative_config"] = SpeculativeConfig( - method=kwargs.pop("method"), - model=self.model, - num_speculative_tokens=kwargs.pop("num_speculative_tokens"), - ) + method = kwargs.pop("method") + num_spec_tokens = kwargs.pop("num_speculative_tokens") + draft_model = kwargs.pop("draft_model") + if method == "eagle3": + kwargs["speculative_config"] = SpeculativeConfig( + method=method, + model=draft_model, + num_speculative_tokens=num_spec_tokens, + ) + else: + kwargs["speculative_config"] = SpeculativeConfig( + method=method, + model=self.model, + num_speculative_tokens=num_spec_tokens, + ) else: kwargs.pop("method") kwargs.pop("num_speculative_tokens") + kwargs.pop("draft_model") kwargs["speculative_config"] = None # --enable-tbo [prefill|all] → enable_tbo + enable_tbo_decode diff --git a/atom/model_engine/model_runner.py b/atom/model_engine/model_runner.py index ce607625df..3561d27b52 100644 --- a/atom/model_engine/model_runner.py +++ b/atom/model_engine/model_runner.py @@ -559,6 +559,13 @@ def __init__(self, rank: int, config: Config): self.num_spec_tokens = ( self.config.speculative_config.num_speculative_tokens if use_spec else 0 ) + self.eagle3_mode = ( + self.config.speculative_config is not None + and self.config.speculative_config.method == "eagle3" + ) + + self.use_aux_hidden_state_outputs = False + self._aux_hidden_states = None self.tokenID_processor = tokenIDProcessor( self, self.config.max_num_batched_tokens, @@ -621,6 +628,18 @@ def __init__(self, rank: int, config: Config): torch.set_default_device(None) logger.info("Loading drafter model...") self.drafter.load_model(self.model) + + if self.eagle3_mode and self.config.speculative_config.use_aux_hidden_state: + aux_ids = self.config.speculative_config.eagle3_aux_layer_ids + if not 
aux_ids and hasattr( + self.model, "get_eagle3_aux_hidden_state_layers" + ): + aux_ids = list(self.model.get_eagle3_aux_hidden_state_layers()) + if aux_ids: + self.model.set_aux_hidden_state_layers(tuple(aux_ids)) + self.use_aux_hidden_state_outputs = True + logger.info(f"Eagle3 aux hidden state layers: {aux_ids}") + torch.set_default_device(self.device) self.async_execute_stream = torch.cuda.Stream(self.device) self.allocate_forward_vars() @@ -1075,24 +1094,35 @@ def _get_num_kv_heads(self): return 1 def _get_total_num_layers(self): - """Return total layer count including draft (MTP) layers.""" + """Return total layer count including draft (MTP) layers. + + Drafts that own an independent KV cache via their own builder + (e.g. Eagle3 MHA draft on an MLA target) account for their layers + through that builder, so they are NOT added here. Only MTP-style + drafts that share the target's KV pool contribute. + """ total = self.config.hf_config.num_hidden_layers if self.config.speculative_config and hasattr(self, "drafter"): - draft_hf = self.config.speculative_config.draft_model_hf_config - total += getattr(draft_hf, "num_nextn_predict_layers", 1) + if not hasattr(self, "eagle3_draft_builder"): + draft_hf = self.config.speculative_config.draft_model_hf_config + total += getattr(draft_hf, "num_nextn_predict_layers", 1) return total def _compute_block_bytes(self): """Per-block bytes for the unified KV pool budget. - Delegates to the attention builder, which knows its own tensor - layout (MLA 576-dim packed, GDN-hybrid full-attn-only, MiMo-V2 - per-layer-type, standard MHA split-K/V). Mirror of - `attn_metadata_builder.allocate_kv_cache_tensors()` so the budget - math matches what's actually allocated. Per-request cache bytes - are accounted for separately via `compute_per_req_cache_bytes()`. 
+ Sum across all attention builders attached to this runner: the + target builder always, plus an optional `eagle3_draft_builder` + when a heterogeneous spec-decode draft owns its own KV pool. Each + builder knows its own tensor layout (MLA 576-dim packed, GDN-hybrid + full-attn-only, MiMo-V2 per-layer-type, standard MHA split-K/V, + Eagle3 independent MHA). Per-request cache bytes are accounted + for separately via `compute_per_req_cache_bytes()`. """ - return self.attn_metadata_builder.compute_block_bytes() + block_bytes = self.attn_metadata_builder.compute_block_bytes() + if hasattr(self, "eagle3_draft_builder"): + block_bytes += self.eagle3_draft_builder.compute_block_bytes() + return block_bytes def _estimate_cudagraph_overhead(self): """Estimate GPU memory consumed by CUDA graph capture. @@ -1255,13 +1285,24 @@ def allocate_kv_cache(self, num_kvcache_blocks): num_draft_layers = 0 if self.config.speculative_config and hasattr(self, "drafter"): draft_hf_config = self.config.speculative_config.draft_model_hf_config - # For MTP, use num_nextn_predict_layers instead of num_hidden_layers - num_draft_layers = getattr(draft_hf_config, "num_nextn_predict_layers", 1) - total_num_layers += num_draft_layers - logger.info( - f"Allocating KV cache for {hf_config.num_hidden_layers} target layers + " - f"{num_draft_layers} draft (MTP) layers = {total_num_layers} total layers" - ) + if hasattr(self, "eagle3_draft_builder"): + # Heterogeneous draft (e.g. Eagle3 MHA on MLA target) owns + # its own KV pool via its builder; don't add to target's count. 
+ num_draft_layers = draft_hf_config.num_hidden_layers + logger.info( + f"Allocating KV cache for {hf_config.num_hidden_layers} target layers + " + f"{num_draft_layers} Eagle3 draft layers (separate non-MLA cache)" + ) + else: + # For MTP, use num_nextn_predict_layers instead of num_hidden_layers + num_draft_layers = getattr( + draft_hf_config, "num_nextn_predict_layers", 1 + ) + total_num_layers += num_draft_layers + logger.info( + f"Allocating KV cache for {hf_config.num_hidden_layers} target layers + " + f"{num_draft_layers} draft (MTP) layers = {total_num_layers} total layers" + ) # Primary KV cache allocation (model-agnostic, delegated to the # attention builder). Each builder owns its tensor layout: MLA → @@ -1277,6 +1318,16 @@ def allocate_kv_cache(self, num_kvcache_blocks): for name, value in main_kv.items(): setattr(self, name, value) + # Heterogeneous draft (e.g. Eagle3 MHA alongside an MLA target) owns + # its own KV pool through a sibling builder; same protocol as above, + # tensors land under namespaced keys (eagle3_kv_cache, eagle3_kv_scale). + if hasattr(self, "eagle3_draft_builder"): + draft_kv = self.eagle3_draft_builder.allocate_kv_cache_tensors( + num_kv_heads, num_draft_layers + ) + for name, value in draft_kv.items(): + setattr(self, name, value) + # Per-request cache allocation (model-agnostic, delegated to the # attention metadata builder). For GDN this returns # `{"mamba_k_cache": ..., "mamba_v_cache": ...}`; for stateless @@ -1302,10 +1353,12 @@ def allocate_kv_cache(self, num_kvcache_blocks): kv_cache_tensors = [] layer_id = 0 # Promote to self so the attention builder's build_kv_cache_tensor() - # can access it without recomputing from drafter state. + # can access it without recomputing from drafter state. Heterogeneous + # drafts (Eagle3) own their own layer space via their builder, so + # leave mtp_start_layer_idx at hf_config.num_hidden_layers in that mode. 
self.mtp_start_layer_idx = ( self.drafter.model.model.mtp_start_layer_idx - if hasattr(self, "drafter") + if hasattr(self, "drafter") and not hasattr(self, "eagle3_draft_builder") else hf_config.num_hidden_layers ) for model_name, model in models_to_bind: @@ -1314,6 +1367,18 @@ def allocate_kv_cache(self, num_kvcache_blocks): ) for module in model.modules(): + # Drafts that own an independent KV pool (Eagle3) bind through + # their sibling builder first; for unrecognized modules it + # returns None and we fall through to the target builder. + if model_name == "draft" and hasattr(self, "eagle3_draft_builder"): + kv_cache_tensor = self.eagle3_draft_builder.build_kv_cache_tensor( + layer_id, module + ) + if kv_cache_tensor is not None: + kv_cache_tensors.append(kv_cache_tensor) + layer_id += 1 + continue + # Per-attention-type binding is owned by the attention # metadata builder; ModelRunner only walks modules and # collects the resulting KVCacheTensor entries. The builder @@ -1625,7 +1690,12 @@ def run_model( label += f" tok={batch.total_tokens_num} ctx={ctx_str}" label += "]" with record_function(label): - hidden_states = self.model(input_ids, positions) + model_output = self.model(input_ids, positions) + if self.use_aux_hidden_state_outputs: + hidden_states, self._aux_hidden_states = model_output + else: + hidden_states = model_output + self._aux_hidden_states = None logits = self.model.compute_logits(hidden_states) else: # decode[bs=128 tok=128 d=128] or decode[bs=128 tok=128 p=2 d=126 spec=3] @@ -1645,6 +1715,12 @@ def run_model( self.graphs[graph_key].replay() num_tokens = context.batch_size * max_q_len hidden_states = self.forward_vars["outputs"][:num_tokens] + if graph_key in self.graph_aux_hidden: + self._aux_hidden_states = [ + aux[:num_tokens] for aux in self.graph_aux_hidden[graph_key] + ] + else: + self._aux_hidden_states = None if self.logits_in_graph: logits = self.graph_logits[graph_key][:num_tokens] else: @@ -1833,6 +1909,7 @@ def 
propose_draft_token_ids( num_reject_tokens=num_reject_tokens, next_token_ids=next_token_ids, last_token_indices=last_token_indices, + aux_hidden_states=self._aux_hidden_states, ) return self.tokenID_processor.prepare_draft_ids(batch, draft_token) @@ -1880,6 +1957,7 @@ def capture_cudagraph(self): self.graphs: dict[tuple[int, int], torch.cuda.CUDAGraph] = dict() self.graph_logits: dict[tuple[int, int], torch.Tensor] = dict() + self.graph_aux_hidden: dict[tuple[int, int], list[torch.Tensor]] = dict() self.graph_pool = None is_tbo = self.config.enable_tbo and isinstance(self.model, UBatchWrapper) # TBO graphs don't capture compute_logits, so disable logits_in_graph. @@ -1930,9 +2008,13 @@ def capture_cudagraph(self): ) # Warmup - outputs[:num_tokens] = self.model( + model_output = self.model( input_ids[:num_tokens], positions[:num_tokens] ) + if self.use_aux_hidden_state_outputs: + outputs[:num_tokens] = model_output[0] + else: + outputs[:num_tokens] = model_output if self.logits_in_graph: self.model.compute_logits(outputs[:num_tokens]) @@ -1951,13 +2033,20 @@ def capture_cudagraph(self): gc.stream, output_buffer=outputs[:num_tokens], ) + graph_aux = None else: # Standard single-stream capture graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, self.graph_pool, stream=gc.stream): - outputs[:num_tokens] = self.model( + model_output = self.model( input_ids[:num_tokens], positions[:num_tokens] ) + if self.use_aux_hidden_state_outputs: + outputs[:num_tokens] = model_output[0] + graph_aux = model_output[1] + else: + outputs[:num_tokens] = model_output + graph_aux = None if self.logits_in_graph: graph_logits = self.model.compute_logits( outputs[:num_tokens] @@ -1967,6 +2056,8 @@ def capture_cudagraph(self): self.graphs[(bs, max_q_len)] = graph if self.logits_in_graph and ubatch_slices is None: self.graph_logits[(bs, max_q_len)] = graph_logits + if graph_aux is not None: + self.graph_aux_hidden[(bs, max_q_len)] = graph_aux torch.cuda.synchronize() 
self.graph_bs.sort(reverse=False) diff --git a/atom/model_engine/scheduler.py b/atom/model_engine/scheduler.py index 9e9acd6ced..3d65369d4a 100644 --- a/atom/model_engine/scheduler.py +++ b/atom/model_engine/scheduler.py @@ -676,12 +676,21 @@ def postprocess( continue token_ids = prev_token_ids[idx] num_new_token = len(token_ids) - if self.spec_stats: - self.spec_stats.update(num_new_token) if is_deferred_out or self.use_spec: num_rejected = fwd_output.num_rejected[idx] num_bonus = fwd_output.num_bonus[idx] offset = 0 if (num_new_token + num_rejected) == 1 else self.mtp_k + # Align stats with vLLM: only count steps that actually ran + # speculation (drafts proposed and validated). Skip the + # prefill-only step where no draft tokens were scored against + # the target — vLLM gates this via + # `if scheduled_spec_token_ids and generated_token_ids`. + if ( + self.spec_stats + and num_new_token > 0 + and (num_new_token + num_rejected) > 1 + ): + self.spec_stats.update(num_new_token) seq.num_rejected = num_rejected seq.num_bonus_tokens = num_bonus for i, el in enumerate(token_ids): diff --git a/atom/model_ops/attention_mha.py b/atom/model_ops/attention_mha.py index c3ebd525a8..dd65fd9f33 100644 --- a/atom/model_ops/attention_mha.py +++ b/atom/model_ops/attention_mha.py @@ -127,7 +127,12 @@ def rope_cache(self, q, k, v, qkv, position, fwd_ctx: ForwardContext): k_scale = kv_cache_data[f"layer_{self.layer_num}"].k_scale v_scale = kv_cache_data[f"layer_{self.layer_num}"].v_scale - use_triton_attn = self.sliding_window != -1 or self.head_dim != 128 + # MTP MHA must go through triton/gluon; aiter ASM non-persistent path may have some unexpected behavior. 
+ use_triton_attn = ( + self.sliding_window != -1 + or self.head_dim != 128 + or self.num_heads == self.num_kv_heads + ) self.use_triton_attn = use_triton_attn if ( @@ -501,9 +506,7 @@ def prefill_attention( # variable lenth attention use key value as input attn_metadata = fwd_ctx.attn_metadata sliding_window = ( - (self.sliding_window, 0, 0) - if self.sliding_window is not None - else (-1, -1, 0) + (self.sliding_window, 0, 0) if self.sliding_window > 0 else (-1, -1, 0) ) o = aiter.flash_attn_varlen_func( q, @@ -545,9 +548,7 @@ def prefill_attention_triton( o = torch.empty_like(q) descale_shape = (attn_metadata.cu_seqlens_q.shape[0] - 1, k.shape[1]) sliding_window = ( - (self.sliding_window - 1, 0) - if self.sliding_window is not None - else (-1, -1) + (self.sliding_window - 1, 0) if self.sliding_window > 0 else (-1, -1) ) unified_attention( q, diff --git a/atom/model_ops/linear.py b/atom/model_ops/linear.py index c3a09e829e..8441b06094 100644 --- a/atom/model_ops/linear.py +++ b/atom/model_ops/linear.py @@ -318,11 +318,19 @@ def weight_loader_process( loaded_weight: torch.Tensor, post_process_func: Callable = lambda a: a, ): - if ( - param.data.dtype != loaded_weight.dtype - and param.data.element_size() == loaded_weight.element_size() - ): - param.data = param.data.view(loaded_weight.dtype) + if param.data.dtype != loaded_weight.dtype: + if param.data.element_size() == loaded_weight.element_size(): + # Same byte-width: use view for raw-bit-compatible pairs + # (e.g. fp8 variants) but convert for semantically different + # formats (float16 ↔ bfloat16) where bit reinterpretation + # would corrupt values. 
+ incompatible = {torch.float16, torch.bfloat16} + if {param.data.dtype, loaded_weight.dtype} == incompatible: + loaded_weight = loaded_weight.to(param.data.dtype) + else: + param.data = param.data.view(loaded_weight.dtype) + else: + loaded_weight = loaded_weight.to(param.data.dtype) loaded_weight = post_process_func(loaded_weight) if ( loaded_weight.shape != param.data.shape diff --git a/atom/models/deepseek_v2.py b/atom/models/deepseek_v2.py index c9ad8e53f4..1ef7c7387f 100644 --- a/atom/models/deepseek_v2.py +++ b/atom/models/deepseek_v2.py @@ -1837,6 +1837,8 @@ def __init__( ) else: self.norm = PPMissingLayer() + self.aux_hidden_state_layers: tuple[int, ...] = tuple() + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size ) @@ -1850,7 +1852,9 @@ def forward( positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: + ) -> Union[ + torch.Tensor, IntermediateTensors, Tuple[torch.Tensor, list[torch.Tensor]] + ]: if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds @@ -1862,7 +1866,13 @@ def forward( hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer : self.end_layer]: + aux_hidden_states = [] + for idx in range(self.start_layer, self.end_layer): + layer = self.layers[idx] + if idx in self.aux_hidden_state_layers: + aux_hidden_states.append( + hidden_states if residual is None else hidden_states + residual + ) hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: @@ -1871,6 +1881,9 @@ def forward( ) hidden_states, _ = self.norm(hidden_states, residual) + + if aux_hidden_states: + return hidden_states, aux_hidden_states return hidden_states def get_expert_mapping(self) -> list[tuple[str, str, int, 
str]]: @@ -1971,6 +1984,17 @@ def make_empty_intermediate_tensors( } ) + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.model.aux_hidden_state_layers = layers + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + """Default Eagle3 aux hidden-state layer ids: early / middle / late + of the target model. Aligned with vLLM's default (see + vllm/model_executor/models/deepseek_v2.py). + """ + num_layers = len(self.model.layers) + return (2, num_layers // 2, num_layers - 3) + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: return self.model.get_expert_mapping() diff --git a/atom/models/eagle3_llama.py b/atom/models/eagle3_llama.py new file mode 100644 index 0000000000..67837907b4 --- /dev/null +++ b/atom/models/eagle3_llama.py @@ -0,0 +1,301 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +"""Eagle3 draft model (Llama full-attention) for speculative decoding. + +Implements the Eagle3 draft model matching the lightseekorg/kimi-k2.5-eagle3 +checkpoint layout: + + embed_tokens.weight — independent embedding + fc.weight — aux fusion projection (hidden*3 -> hidden) + midlayer.* — single decoder layer (dual-norm, wide QKV) + norm.weight — final RMSNorm + lm_head.weight — independent lm_head + +Weight keys map directly to model attribute paths; no key rewriting needed. 
+""" + +import torch +from aiter.dist.parallel_state import get_tensor_model_parallel_world_size +from aiter.rotary_embedding import get_rope +from atom.config import Config +from atom.model_ops.activation import SiluAndMul +from atom.model_ops.base_attention import Attention +from atom.model_ops.embed_head import ParallelLMHead, VocabParallelEmbedding +from atom.model_ops.layernorm import RMSNorm +from atom.model_ops.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) +from atom.utils.decorators import support_torch_compile +from torch import nn + + +class Eagle3LlamaAttention(nn.Module): + """Llama full-attention with input_size = hidden_size * 2. + + The QKV projection accepts the concatenation of normalized embeddings + and fc output, hence input_size is doubled compared to standard Llama. + """ + + def __init__( + self, + config, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + cache_config: str = "bf16", + prefix: str = "", + layer_num: int = 0, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + assert self.total_num_kv_heads % tp_size == 0 + else: + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + head_dim = getattr(config, "head_dim", None) + if head_dim is None: + head_dim = hidden_size // self.total_num_heads + self.head_dim = head_dim + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + + # QKV input_size = hidden_size * 2 (concat of embed + fc_output) + attn_input_size = hidden_size * 2 + self.qkv_proj = QKVParallelLinear( + hidden_size=attn_input_size, + 
head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=False, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=False, + prefix=f"{prefix}.o_proj", + ) + + rope_theta = getattr(config, "rope_theta", 10000) + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + is_neox_style=True, + ) + + sliding_window = -1 + if getattr(config, "use_sliding_window", False) and getattr( + config, "sliding_window", None + ): + sliding_window = config.sliding_window + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + kv_cache_dtype=cache_config, + layer_num=layer_num, + prefix=f"{prefix}.attn", + rotary_emb=self.rotary_emb, + per_layer_sliding_window=sliding_window, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv = self.qkv_proj(hidden_states) + q, k, v = torch.split(qkv, [self.q_size, self.kv_size, self.kv_size], dim=-1) + attn_output = self.attn(q, k, v, positions) + output = self.o_proj(attn_output) + return output + + +class Eagle3LlamaDecoderLayer(nn.Module): + """Single decoder layer for Eagle3 with dual-norm input. 
+ + Unlike standard LlamaDecoderLayer, this layer has: + - input_layernorm: normalizes the embedding input + - hidden_norm: normalizes the fc output (projected aux hidden states) + - Attention input is concat(normed_embed, normed_hidden) -> [N, hidden*2] + """ + + def __init__( + self, + config, + cache_config: str = "bf16", + prefix: str = "", + layer_num: int = 0, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = Eagle3LlamaAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr( + config, "num_key_value_heads", config.num_attention_heads + ), + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + layer_num=layer_num, + ) + + self.mlp = Eagle3LlamaMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + prefix=f"{prefix}.mlp", + ) + + # Dual norms matching checkpoint keys: midlayer.input_layernorm, midlayer.hidden_norm + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + embeds: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + normed_embeds = self.input_layernorm(embeds) + normed_hidden = self.hidden_norm(hidden_states) + # Concat for attention input: [N, hidden*2] + attn_input = torch.cat([normed_embeds, normed_hidden], dim=-1) + attn_output = self.self_attn(positions, attn_input) + # Residual connection on hidden_states + hidden_states = hidden_states + attn_output + # MLP with pre-norm + residual + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +class Eagle3LlamaMLP(nn.Module): + """Simple Llama MLP 
(gate+up fused, silu activation, down projection).""" + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + input_size=hidden_size, + output_sizes=[intermediate_size] * 2, + bias=False, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=False, + prefix=f"{prefix}.down_proj", + ) + self.act_fn = SiluAndMul() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + gate_up = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x = self.down_proj(x) + return x + + +@support_torch_compile +class Eagle3LlamaModel(nn.Module): + """Eagle3 draft model (Llama full-attention, single decoder layer). + + Matches the lightseekorg/kimi-k2.5-eagle3 checkpoint layout: + embed_tokens.weight [163840, 7168] independent embedding + fc.weight [7168, 21504] aux fusion (hidden*3 -> hidden) + midlayer.* single decoder layer + norm.weight final RMSNorm + lm_head.weight [163840, 7168] independent lm_head + """ + + packed_modules_mapping = { + "q_proj": ("qkv_proj", "q"), + "k_proj": ("qkv_proj", "k"), + "v_proj": ("qkv_proj", "v"), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + + def __init__(self, atom_config: Config, prefix: str = "", layer_offset: int = 0): + super().__init__() + config = atom_config.hf_config + cache_config = atom_config.kv_cache_dtype + self.config = config + + # Independent embedding (vocab matches target model) + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, config.hidden_size + ) + + # Aux fusion: concatenated aux hidden states [N, hidden*3] -> [N, hidden] + self.fc = ReplicatedLinear( + config.hidden_size * 3, config.hidden_size, bias=False + ) + + # Draft attention layer_num must start from the target model's layer + # count so kv_cache_data["layer_N"] maps to the correct cache entry. 
+ self.midlayer = Eagle3LlamaDecoderLayer( + config=config, + cache_config=cache_config, + prefix="midlayer", + layer_num=layer_offset, + ) + + # Final norm + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + # Independent lm_head (not shared with target model) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + + def combine_hidden_states(self, hidden_states: torch.Tensor) -> torch.Tensor: + """Project concatenated aux hidden states through fc. + + Args: + hidden_states: [N, hidden_size * 3] (3 aux layers concatenated) + + Returns: + [N, hidden_size] projected hidden states + """ + return self.fc(hidden_states) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + embeds = self.embed_tokens(input_ids) + hidden_states = self.midlayer(positions, embeds, hidden_states) + hidden_states_prenorm = hidden_states + hidden_states = self.norm(hidden_states) + return hidden_states, hidden_states_prenorm + + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + return self.lm_head(hidden_states) diff --git a/atom/models/kimi_k25.py b/atom/models/kimi_k25.py index eb4aaafb61..c0a002a3c8 100644 --- a/atom/models/kimi_k25.py +++ b/atom/models/kimi_k25.py @@ -88,5 +88,11 @@ def compute_logits( ) -> Optional[torch.Tensor]: return self.language_model.compute_logits(hidden_states) + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.language_model.set_aux_hidden_state_layers(layers) + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + return self.language_model.get_eagle3_aux_hidden_state_layers() + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: return self.language_model.get_expert_mapping() diff --git a/atom/spec_decode/eagle.py b/atom/spec_decode/eagle.py index aa3aa4c8b1..bfa5fffe57 100644 --- a/atom/spec_decode/eagle.py +++ b/atom/spec_decode/eagle.py @@ 
-1,10 +1,13 @@ +import copy import logging +from typing import Optional import numpy as np import torch import torch.nn as nn +from aiter import dtypes from aiter.dist.parallel_state import get_pp_group -from atom.config import CompilationLevel, Config +from atom.config import CompilationLevel, Config, KVCacheTensor from atom.model_loader.loader import load_model from atom.utils import CpuGpuBuffer, resolve_obj_by_qualname from atom.utils.forward_context import SpecDecodeMetadata, get_forward_context @@ -18,9 +21,137 @@ "Qwen3NextMTPModel": "atom.models.qwen3_next_mtp.Qwen3NextMTP", "MiMoV2FlashMTPModel": "atom.models.mimo_v2_flash_mtp.MiMoV2FlashMTP", "Qwen3_5MTPModel": "atom.models.qwen3_5_mtp.Qwen3_5MTP", + "Eagle3LlamaModel": "atom.models.eagle3_llama.Eagle3LlamaModel", } +class Eagle3DraftBuilder: + """KV cache subsystem for an Eagle3 MHA draft alongside a non-MHA target. + + Implements the same subset of `AttentionMetadataBuilder` hooks that + ModelRunner consults during KV pool sizing and per-module binding — + `compute_block_bytes`, `allocate_kv_cache_tensors`, and + `build_kv_cache_tensor` — so the draft's independent non-MLA cache + fits the post-#659 builder protocol without leaking into the target's + builder. The draft does NOT drive prepare_decode/prepare_prefill; + it piggybacks on the target builder's metadata flow during propose. 
+ """ + + def __init__(self, model_runner, draft_hf): + self.model_runner = model_runner + self.draft_hf = draft_hf + self.block_size = model_runner.block_size + self.num_kv_heads = draft_hf.num_key_value_heads // model_runner.world_size + self.num_layers = draft_hf.num_hidden_layers + self.head_dim = draft_hf.head_dim + self._next_layer_id = 0 # consumed by build_kv_cache_tensor + self.num_blocks = 0 # set in allocate_kv_cache_tensors + + def compute_block_bytes(self) -> int: + """Per-block bytes for the draft's independent non-MLA KV cache.""" + kv_dtype_size = dtypes.d_dtypes[ + self.model_runner.config.kv_cache_dtype + ].itemsize + bb = ( + 2 + * self.num_layers + * self.block_size + * self.num_kv_heads + * self.head_dim + * kv_dtype_size + ) + if self.model_runner.config.kv_cache_dtype == "fp8": + # fp8 KV cache needs an extra fp32 scale tensor: one scale per + # (K/V, layer, token slot, kv_head), used to dequantize fp8 → bf16 at + # attention time. Reserve that space alongside the cache. + bb += ( + 2 + * self.num_layers + * self.block_size + * self.num_kv_heads + * dtypes.fp32.itemsize + ) + return bb + + def allocate_kv_cache_tensors(self, num_kv_heads, num_draft_layers) -> dict: + """Allocate the draft's [2, L, blocks, block_size, kv_heads, head_dim] + cache and matching fp32 scale; ModelRunner setattr's both onto itself + under namespaced keys so they don't collide with the target builder's + `kv_cache` / `kv_scale`. + """ + runner = self.model_runner + config = runner.config + # Draft's block budget scales with the target pool: same total token + # capacity, just paged at the draft's own block size. 
+ self.num_blocks = ( + config.num_kvcache_blocks * runner.block_size // self.block_size + ) + cache = torch.zeros( + 2, + self.num_layers, + self.num_blocks, + self.block_size, + self.num_kv_heads, + self.head_dim, + dtype=dtypes.d_dtypes[config.kv_cache_dtype], + device="cuda", + ) + scale = torch.zeros( + 2, + self.num_layers, + self.num_blocks, + self.num_kv_heads, + self.block_size, + dtype=dtypes.fp32, + device="cuda", + ) + logger.info(f"Allocated Eagle3 draft KV cache: {cache.shape}") + return {"eagle3_kv_cache": cache, "eagle3_kv_scale": scale} + + def build_kv_cache_tensor(self, layer_id: int, module): + """Bind one Eagle3 draft attention module to its slice of the + independent draft KV cache. Returns None for non-MHA modules so + ModelRunner falls through to the target builder. + """ + if not ( + hasattr(module, "base_attention") + and hasattr(module, "use_mla") + and not module.use_mla + ): + return None + runner = self.model_runner + idx = self._next_layer_id + self._next_layer_id += 1 + cache = runner.eagle3_kv_cache + x = 16 // cache.element_size() + k_cache = cache[0, idx].view( + self.num_blocks, + self.num_kv_heads, + self.head_dim // x, + self.block_size, + x, + ) + v_cache = cache[1, idx].view( + self.num_blocks, + self.num_kv_heads, + self.head_dim, + self.block_size, + ) + module.max_model_len = runner.config.max_model_len + if runner.config.kv_cache_dtype == "fp8": + module.k_scale = runner.eagle3_kv_scale[0, idx] + module.v_scale = runner.eagle3_kv_scale[1, idx] + module.k_cache = k_cache + module.v_cache = v_cache + return KVCacheTensor( + layer_num=layer_id, + k_cache=k_cache, + v_cache=v_cache, + k_scale=getattr(module, "k_scale", None), + v_scale=getattr(module, "v_scale", None), + ) + + class EagleProposer: def __init__( @@ -49,7 +180,30 @@ def __init__( self.device = device draft_model_hf_config = self.speculative_config.draft_model_hf_config model_class = 
resolve_obj_by_qualname(support_eagle_model_arch_dict[draft_model_hf_config.architectures[0]]) # type: ignore - self.model = model_class(self.config) + + if self.speculative_config.method == "eagle3": + # Eagle3 draft model has its own architecture (Llama, not MLA), + # so it must be constructed with the draft model's hf_config. + # Also disable torch.compile for the draft model to avoid + # Dynamo tracing issues with the separate KV cache binding. + draft_atom_config = copy.deepcopy(atom_config) + draft_atom_config.hf_config = draft_model_hf_config + draft_atom_config.compilation_config.level = CompilationLevel.NO_COMPILATION + # Draft attention layer_num must continue from the target model's + # layer count so it maps to the correct kv_cache_data entry. + self.model = model_class( + draft_atom_config, + layer_offset=atom_config.hf_config.num_hidden_layers, + ) + # Attach the draft's KV-cache builder to the runner. ModelRunner + # consults `runner.eagle3_draft_builder` from `_compute_block_bytes` + # / `allocate_kv_cache` to size + allocate + bind the draft's + # independent non-MLA cache through the standard builder protocol. + runner.eagle3_draft_builder = Eagle3DraftBuilder( + runner, draft_model_hf_config + ) + else: + self.model = model_class(self.config) i32_kwargs = {"dtype": torch.int32, "device": self.device} i64_kwargs = {"dtype": torch.int64, "device": self.device} @@ -78,6 +232,23 @@ def _share_if_not_loaded( setattr(owner, attr, source) def load_model(self, target_model: nn.Module) -> None: + if self.speculative_config.method == "eagle3": + # Eagle3: load from a separate draft model checkpoint with + # independent embed_tokens and lm_head (no sharing). 
+ load_model( + self.model, + self.speculative_config.model, + self.speculative_config.draft_model_hf_config, + self.config.load_dummy, + False, + ) + logger.info( + "Eagle3 draft model loaded from %s (independent embed/lm_head)", + self.speculative_config.model, + ) + return + + # MTP: load from the target model checkpoint and share embeddings/lm_head. loaded = load_model( self.model, self.config.model, @@ -101,11 +272,6 @@ def load_model(self, target_model: nn.Module) -> None: ) del self.model.model.embed_tokens self.model.model.embed_tokens = target_base.model.embed_tokens - else: - logger.info( - "The EAGLE head's vocab embedding will be loaded separately" - " from the target model." - ) # Share lm_head from target if not loaded from checkpoint. # Case 1: per-layer shared_head.head (DeepSeek MTP) @@ -148,6 +314,7 @@ def propose( num_reject_tokens: torch.Tensor, next_token_ids: torch.Tensor, last_token_indices: torch.Tensor, + aux_hidden_states: Optional[list[torch.Tensor]] = None, ) -> torch.Tensor: forward_context = get_forward_context() @@ -161,21 +328,48 @@ def propose( # input_ids[last_token_indices] = next_token_ids input_ids.scatter_(0, last_token_indices, next_token_ids) positions = target_positions + 1 - hidden_states = target_hidden_states + + # Eagle3: project concatenated aux hidden states through fc + if aux_hidden_states is not None: + concat_aux = torch.cat(aux_hidden_states, dim=-1) + hidden_states = self.model.combine_hidden_states(concat_aux) + else: + hidden_states = target_hidden_states draft_token_ids = torch.empty( bs, self.mtp_k, dtype=next_token_ids.dtype, device=next_token_ids.device ) - # return draft_token_ids.fill_(1) # for debug var = self.runner.forward_vars - use_mla = self.runner.use_mla + target_uses_mla = self.runner.use_mla + # Eagle3 only supports MHA currently + draft_uses_mha = hasattr(self.runner, "eagle3_draft_builder") + + # Eagle3 MLA: re-slice slot_mapping to len(input_ids). 
+ # Target's MLA prepare_decode sized it + # to bs*max_q_len; after rejection len(input_ids) may be smaller, + # and the MHA cache-write kernel asserts slot_mapping <= q. + # Other fields (block_tables, context_lens, slot_mapping values + # themselves) are already in a draft-compatible format because + # MLA's prepare_decode uses the same runner.block_size as the draft. + if draft_uses_mha: + attn_metadata.slot_mapping = var["slot_mapping"].gpu[: len(input_ids)] + for i in range(self.mtp_k): with record_function(f"draft[{i}/{self.mtp_k} bs={bs}]"): - ret_hidden_states = self.model( + model_output = self.model( input_ids=input_ids, positions=positions, hidden_states=hidden_states, ) + # Eagle3 draft (the only draft_uses_mha case under narrow + # semantics) returns (post_norm, pre_norm); MTP drafts return + # a single hidden tensor. + if draft_uses_mha: + ret_hidden_states, ret_hidden_prenorm = model_output + else: + ret_hidden_states = model_output + ret_hidden_prenorm = None + sample_hidden_states = ( torch.index_select(ret_hidden_states, 0, last_token_indices) if i == 0 @@ -204,15 +398,15 @@ def propose( attn_metadata.kv_indices = kv_indices attn_metadata.cu_seqlens_q = cu_seqlens_q attn_metadata.slot_mapping = slot_mapping - if use_mla: + if target_uses_mla: kv_last_page_lens = var["kv_last_page_lens"].gpu[:bs] attn_metadata.kv_last_page_lens = kv_last_page_lens - else: - # MHA needs block_tables and context_lens + if not target_uses_mla or draft_uses_mha: + # An MHA draft must set these itself: an MLA target doesn't handle them, but an MHA target does. 
+ attn_metadata.block_tables = var["block_tables"].gpu[:bs] attn_metadata.context_lens = var["context_lens"].gpu[:bs] cu_seqlens_q[: bs + 1] = self.arrange_bs[: bs + 1] - if use_mla: + if target_uses_mla: # MLA: block_size=1, kv_indptr tracks tokens kv_indptr[1 : bs + 1] -= torch.cumsum( num_reject_tokens, dim=0 @@ -222,8 +416,8 @@ def propose( # update metadata attn_metadata.max_seqlen_k += 1 - if not use_mla: - # MHA: update context_lens for this draft step + # An MHA draft must update context_lens itself, as an MLA target doesn't handle it + if draft_uses_mha or not target_uses_mla: attn_metadata.context_lens[:bs] += 1 workinfos = self.runner.attn_metadata_builder.prepare_mtp_decode( bs, @@ -239,9 +433,19 @@ def propose( for k, v in workinfos.items(): attn_metadata.__dict__[k] = v slot_mapping[:] = kv_indices[kv_indptr[1 : bs + 1] - 1] + input_ids = new_draft_ids positions += 1 - hidden_states = sample_hidden_states + if ret_hidden_prenorm is not None: + hidden_states = ( + torch.index_select( + ret_hidden_prenorm, 0, last_token_indices + ) + if i == 0 + else ret_hidden_prenorm + ) + else: + hidden_states = sample_hidden_states # self.runner.debug(f"final {draft_token_ids=}") # [batch_size, mtp_k] From f551b68524d7179d935a9b5c31097d230cf998d2 Mon Sep 17 00:00:00 2001 From: yihonglie Date: Fri, 8 May 2026 16:18:21 +0000 Subject: [PATCH 2/2] ci: add nightly Eagle3 spec-decode accuracy test for Kimi-K2.5 Reuses the base Kimi-K2.5-MXFP4 model + lightseekorg/kimi-k2.5-eagle3 draft, runs at TP=8 (Eagle3 draft KV needs full 8-rank sharding) under nightly schedule. Local case_verify_v9_gluon measured GSM8K 5-shot flexible-extract = 0.9257 (vLLM = 0.9280); threshold set to 0.91 with ~1.5pp noise headroom. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/benchmark/models_accuracy.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/benchmark/models_accuracy.json b/.github/benchmark/models_accuracy.json index a86ca34914..c1e84579b9 100644 --- a/.github/benchmark/models_accuracy.json +++ b/.github/benchmark/models_accuracy.json @@ -155,6 +155,18 @@ "accuracy_baseline_model": "moonshotai/Kimi-K2.5", "_baseline_note": "HF: amd/Kimi-K2.5-MXFP4 card shows Kimi-K2.5 baseline=0.9409" }, + { + "model_name": "Kimi-K2.5-MXFP4 Eagle3", + "model_path": "amd/Kimi-K2.5-MXFP4", + "extraArgs": "--kv_cache_dtype fp8 -tp 8 --trust-remote-code --method eagle3 --draft-model lightseekorg/kimi-k2.5-eagle3 --num-speculative-tokens 3", + "env_vars": "HSA_NO_SCRATCH_RECLAIM=1", + "runner": "atom-mi355-8gpu.predownload", + "test_level": "nightly", + "accuracy_threshold": 0.91, + "accuracy_baseline": 0.9257, + "accuracy_baseline_model": "amd/Kimi-K2.5-MXFP4 + lightseekorg/kimi-k2.5-eagle3", + "_baseline_note": "Eagle3 spec decode on Kimi-K2.5-MXFP4. Local case_verify_v9_gluon GSM8K 5-shot flexible-extract=0.9257 (vLLM=0.9280, within ±0.71% se). Threshold 0.91 leaves ~1.5pp headroom for noise. -tp 8 (vs base entry's tp=4) because Eagle3 draft KV needs the full 8-rank sharding." + }, { "model_name": "GLM-5-FP8", "model_path": "zai-org/GLM-5-FP8",