From 8fac4b1cc8689bcddd7816889f63c34ef2121232 Mon Sep 17 00:00:00 2001 From: ruixiangw Date: Sun, 14 Dec 2025 18:12:33 +0000 Subject: [PATCH 1/9] feat: add EAGLE3 speculative decoding support EAGLE3 is an encoder-decoder based speculative decoding method: - Extracts features from target model at specific layers - Uses feature fusion layer to compress target features - Generates draft tokens with single-layer decoder - Maps draft vocabulary to target vocabulary via d2t tensor Key changes: - Add LLM_ARCH_EAGLE3 architecture - Add EAGLE3 encoder/decoder graph (src/models/eagle3.cpp) - Add feature extraction from target model layers - Add g_embeddings handling for decoder input - Add GGML_TENSOR_FLAG_SYNC for GPU synchronization - Add --eagle3 flag for speculative-simple example - Add EAGLE3 model conversion in convert_hf_to_gguf.py --- common/arg.cpp | 7 + common/common.h | 2 + common/speculative.cpp | 199 +++++++++++++++++ common/speculative.h | 7 + convert_hf_to_gguf.py | 120 +++++++++- .../speculative-simple/speculative-simple.cpp | 145 ++++++++++-- ggml/include/ggml.h | 2 + ggml/src/ggml-backend.cpp | 14 ++ ggml/src/ggml.c | 4 + gguf-py/gguf/constants.py | 29 +++ include/llama.h | 24 ++ src/CMakeLists.txt | 1 + src/llama-arch.cpp | 32 +++ src/llama-arch.h | 7 + src/llama-context.cpp | 208 +++++++++++++++++- src/llama-context.h | 12 + src/llama-cparams.h | 1 + src/llama-graph.cpp | 1 + src/llama-graph.h | 26 +++ src/llama-hparams.h | 7 + src/llama-model.cpp | 87 ++++++++ src/llama-model.h | 10 + src/models/eagle3.cpp | 187 ++++++++++++++++ src/models/llama.cpp | 10 + src/models/models.h | 8 + 25 files changed, 1119 insertions(+), 31 deletions(-) create mode 100644 src/models/eagle3.cpp diff --git a/common/arg.cpp b/common/arg.cpp index aaa7b92a2e9..de8f0355db1 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3007,6 +3007,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.p_min = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN")); + add_opt(common_arg( + {"--eagle3"}, + "use EAGLE3 speculative decoding with the draft model", + [](common_params & params) { + params.speculative.eagle3 = true; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-cd", "--ctx-size-draft"}, "N", string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), diff --git a/common/common.h b/common/common.h index 4edb74b7066..7ba288f188f 100644 --- a/common/common.h +++ b/common/common.h @@ -241,6 +241,8 @@ struct common_params_speculative { int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) float p_split = 0.1f; // speculative decoding split probability float p_min = 0.75f; // minimum speculative decoding probability (greedy) + + bool eagle3 = false; // use EAGLE3 speculative decoding std::vector> replacements; // main to speculative model replacements std::vector tensor_buft_overrides; diff --git a/common/speculative.cpp b/common/speculative.cpp index 1e12383ae6b..058e75b7961 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -22,6 +22,11 @@ struct common_speculative { llama_tokens prompt_dft; bool vocab_dft_compatible = true; // whether retokenization is needed std::map tgt_dft_replacements = {}; + + // EAGLE3 specific + struct llama_context * eagle3_encoder = nullptr; + struct llama_context * eagle3_decoder = 
nullptr; + int32_t eagle3_n_past = 0; // number of verified positions in decoder KV cache }; struct common_speculative * common_speculative_init( @@ -74,6 +79,35 @@ struct common_speculative * common_speculative_init( return result; } +struct common_speculative * common_speculative_init_eagle3( + struct llama_context * ctx_tgt, + struct llama_context * ctx_encoder, + struct llama_context * ctx_decoder) { + + auto * result = new common_speculative { + /* .ctx_tgt = */ ctx_tgt, + /* .ctx_dft = */ nullptr, // Not used for EAGLE3 + /* .smpl = */ nullptr, + /* .batch = */ llama_batch_init(llama_n_batch(ctx_decoder), 0, 1), + /* .prompt_dft = */ {}, + /* .vocab_dft_compatible = */ true, // EAGLE3 uses same vocab + /* .tgt_dft_replacements = */ {}, + /* .eagle3_encoder = */ ctx_encoder, + /* .eagle3_decoder = */ ctx_decoder, + }; + + // Initialize sampler for EAGLE3 decoder + { + common_params_sampling params; + params.no_perf = false; + params.top_k = 10; // set 1 for greedy sampling (argmax) to match vLLM's default behavior but >1 always gets higher acceptance rate for eagle3 + params.samplers = { COMMON_SAMPLER_TYPE_TOP_K }; + result->smpl = common_sampler_init(llama_get_model(ctx_decoder), params); + } + + return result; +} + void common_speculative_free(struct common_speculative * spec) { if (spec == nullptr) { return; @@ -81,6 +115,14 @@ void common_speculative_free(struct common_speculative * spec) { common_sampler_free(spec->smpl); + // EAGLE3 cleanup + if (spec->eagle3_encoder) { + llama_free(spec->eagle3_encoder); + } + if (spec->eagle3_decoder) { + llama_free(spec->eagle3_decoder); + } + llama_batch_free(spec->batch); delete spec; @@ -181,12 +223,169 @@ static std::string replace_to_tgt( return result; } +// EAGLE3 Draft Generation with KV Cache Reuse +// +// ============================================================================ +// EXAMPLE: Two rounds of speculative decoding +// ============================================================================ +// +// ROUND 1 (Initial): +// Prompt: [t0, t1, t2, t3, t4], target generates t5 +// prompt_tgt = [t0, t1, t2, t3, t4], id_last = t5 (GENERATED) +// n = 5, n_past = 0, n_new = 5 +// +// Step 1: Encoder +// features: [f0, f1, f2, f3, f4] → g_embeddings: [g0, g1, g2, g3, g4] +// +// Step 2: Decoder batch (positions 0-4) +// tokens: [t1, t2, t3, t4, t5] ← prompt[1:] + id_last +// g_embd: [g0, g1, g2, g3, g4] +// positions: [0, 1, 2, 3, 4 ] +// → KV cache: [0, 1, 2, 3, 4] +// → sample d1 from logits[4] +// +// Step 3: Autoregressive (positions 5, 6, ...) +// pos 5: token=d1, g_embd=prenorm[4] → KV cache: [0,1,2,3,4,5] → d2 +// pos 6: token=d2, g_embd=prenorm → KV cache: [0,1,2,3,4,5,6] → d3 +// +// Output: [d1, d2, d3] +// Update: n_past = 5 (verified positions from batch decode) +// +// ROUND 2 (assuming d1 accepted, d2/d3 rejected): +// prompt_tgt = [t0, t1, t2, t3, t4, t5, d1], id_last = t6 (new target output) +// n = 7, n_past = 5, n_new = 2 +// +// Step 1: Clear KV cache [5, inf) - remove draft positions +// KV cache: [0, 1, 2, 3, 4] (reuse from round 1!) +// +// Step 2: Encoder (only new tokens) +// features: [f5, f6] → g_embeddings: [g5, g6] +// +// Step 3: Decoder batch (only new positions 5-6) +// tokens: [d1, t6] (prompt_tgt[6], id_last) +// g_embd: [g5, g6] +// positions: [5, 6 ] +// → KV cache: [0,1,2,3,4] + [5,6] = [0,1,2,3,4,5,6] +// → sample d1' from logits[1] (last position in batch) +// +// Step 4: Autoregressive... 
+// +// ============================================================================ +// +// Key insight: Decoder KV cache stores K/V computed from (tok_embd + g_embd). +// For verified positions, both tok_embd and g_embd are fixed (encoder output), +// so KV cache can be reused. Draft positions use prenorm as g_embd, which +// differs from encoder output, so they must be cleared and recomputed. +// +static llama_tokens gen_eagle3_draft( + struct common_speculative * spec, + struct common_speculative_params params, + const llama_tokens & prompt_tgt, + llama_token id_last) { + + auto * ctx_tgt = spec->ctx_tgt; + auto * ctx_encoder = spec->eagle3_encoder; + auto * ctx_decoder = spec->eagle3_decoder; + auto * smpl = spec->smpl; + auto & batch = spec->batch; + + const int n_embd = llama_model_n_embd(llama_get_model(ctx_encoder)); + const int n = (int)prompt_tgt.size(); + const int n_new = n - spec->eagle3_n_past; + + GGML_ASSERT(n >= 1 && "prompt_tgt is empty"); + GGML_ASSERT(n_new >= 1 && "must have at least 1 new token"); + + // Clear draft positions from decoder KV cache [n_past, inf) + llama_memory_seq_rm(llama_get_memory(ctx_decoder), 0, spec->eagle3_n_past, -1); + + // Encoder: features → g_embeddings + const float * features = llama_get_eagle3_target_features(ctx_tgt); + GGML_ASSERT(features && "no target features"); + + llama_batch enc_batch = { + /*.n_tokens =*/ n_new, + /*.token =*/ nullptr, + /*.embd =*/ const_cast(features), + /*.pos =*/ nullptr, + /*.n_seq_id =*/ nullptr, + /*.seq_id =*/ nullptr, + /*.logits =*/ nullptr, + }; + GGML_ASSERT(llama_encode(ctx_encoder, enc_batch) == 0); + + const float * g_embd = llama_get_embeddings(ctx_encoder); + GGML_ASSERT(g_embd && "encoder output failed"); + + // Decoder batch: process new tokens with KV cache reuse + llama_set_eagle3_g_embeddings(ctx_decoder, g_embd, n_embd, n_new); + + common_batch_clear(batch); + for (int i = 0; i < n_new; i++) { + const int pos = spec->eagle3_n_past + i; + const llama_token tok = (pos < n - 1) ? 
prompt_tgt[pos + 1] : id_last; + common_batch_add(batch, tok, pos, {0}, true); + } + + GGML_ASSERT(llama_decode(ctx_decoder, batch) == 0); + + spec->eagle3_n_past = n; // update verified positions + + // Sample draft tokens + llama_tokens result; + common_sampler_reset(smpl); + + // Sample and check probability (consistent with standard speculative decoding) + auto sample_and_check = [&](int idx) -> bool { + common_sampler_sample(smpl, ctx_decoder, idx); + + const auto * cur_p = common_sampler_get_candidates(smpl, true); + const llama_token id = cur_p->data[0].id; + + common_sampler_accept(smpl, id, true); + result.push_back(id); + + return cur_p->data[0].p >= params.p_min; + }; + + // First draft token from batch decode + if (!sample_and_check(n_new - 1)) { + return result; + } + + // Autoregressive: use prenorm as g_embd (-1 = last output) + const float * prenorm = llama_get_embeddings_ith(ctx_decoder, -1); + + for (int i = 1; i < params.n_draft; i++) { + GGML_ASSERT(prenorm && "prenorm failed"); + llama_set_eagle3_g_embeddings(ctx_decoder, prenorm, n_embd, 1); + + common_batch_clear(batch); + common_batch_add(batch, result.back(), n - 1 + i, {0}, true); + GGML_ASSERT(llama_decode(ctx_decoder, batch) == 0); + + prenorm = llama_get_embeddings_ith(ctx_decoder, -1); + + if (!sample_and_check(0)) { + break; + } + } + + return result; +} llama_tokens common_speculative_gen_draft( struct common_speculative * spec, struct common_speculative_params params, const llama_tokens & prompt_tgt_main_model, // specified in target model vocab llama_token id_last) { + + // EAGLE3 path + if (spec->eagle3_encoder && spec->eagle3_decoder) { + return gen_eagle3_draft(spec, params, prompt_tgt_main_model, id_last); + } + + // Standard draft model path auto & batch = spec->batch; auto & ctx_tgt = spec->ctx_tgt; auto & ctx_dft = spec->ctx_dft; diff --git a/common/speculative.h b/common/speculative.h index e69d7aaa1eb..feef3c768fa 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -17,6 +17,13 @@ struct common_speculative * common_speculative_init( struct llama_context * ctx_dft ); +// EAGLE3: Initialize speculative decoding with EAGLE3 encoder and decoder contexts +struct common_speculative * common_speculative_init_eagle3( + struct llama_context * ctx_tgt, + struct llama_context * ctx_encoder, + struct llama_context * ctx_decoder +); + void common_speculative_free(struct common_speculative * spec); bool common_speculative_are_compatible( diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3f861f2a6a5..0f29fbd3fed 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -97,6 +97,7 @@ class ModelBase: metadata_override: Path | None dir_model_card: Path remote_hf_model_id: str | None + target_model_dir: Path | None # subclasses should define this! 
model_arch: gguf.MODEL_ARCH @@ -116,7 +117,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, disable_mistral_community_chat_template: bool = False, - sentence_transformers_dense_modules: bool = False): + sentence_transformers_dense_modules: bool = False, target_model_dir: Path | None = None): if type(self) is ModelBase or \ type(self) is TextModel or \ type(self) is MmprojModel: @@ -135,6 +136,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.dry_run = dry_run self.remote_hf_model_id = remote_hf_model_id self.sentence_transformers_dense_modules = sentence_transformers_dense_modules + self.target_model_dir = target_model_dir self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {} self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) @@ -2373,7 +2375,55 @@ def __init__(self, *args, **kwargs): if self.hf_arch == "VLlama3ForCausalLM": self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + # detect EAGLE-3 llama checkpoint + if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: + self.is_eagle3 = True + self.model_arch = gguf.MODEL_ARCH.EAGLE3 + logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture") + # Re-initialize tensor_map with EAGLE3 architecture + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + # Update gguf_writer architecture + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] + self.gguf_writer.add_architecture() + if not hasattr(self, 'target_model_dir') or not self.target_model_dir: + raise ValueError( + "EAGLE3 model requires --target-model-dir to be specified. 
" + "Please provide the path to the target model directory to read config.json" + ) + # Read both EAGLE3 raw config and target model config + with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f: + eagle3_raw_config = json.load(f) + with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f: + target_config = json.load(f) + + # EAGLE3 extract_layers + target_num_layers = target_config["num_hidden_layers"] + extract_layers = [2, target_num_layers // 2, target_num_layers - 3] + logger.info(f"EAGLE3: extract_layers = {extract_layers} (target model has {target_num_layers} layers)") + self.gguf_writer.add_array(f"{self.gguf_writer.arch}.extract_layers", extract_layers) + + # EAGLE3 target_hidden_size: prefer EAGLE3 config, fallback to target config + if "target_hidden_size" in eagle3_raw_config and eagle3_raw_config["target_hidden_size"] is not None: + target_hidden_size = eagle3_raw_config["target_hidden_size"] + logger.info(f"EAGLE3: target_hidden_size = {target_hidden_size} (from EAGLE3 config)") + else: + target_hidden_size = target_config["hidden_size"] + logger.info(f"EAGLE3: target_hidden_size = {target_hidden_size} (from target model config)") + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size) + def set_vocab(self): + # For EAGLE-3 models, use tokenizer from target model if provided + if hasattr(self, 'is_eagle3') and self.is_eagle3: + if self.target_model_dir is None: + raise ValueError( + "EAGLE-3 draft model requires --target-model-dir to be specified. " + "Please provide the path to the target model directory containing the tokenizer." + ) + logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}") + # Temporarily swap dir_model to load tokenizer from target model + original_dir_model = self.dir_model + self.dir_model = self.target_model_dir + if self.is_mistral_format: return self._set_vocab_mistral() @@ -2391,6 +2441,10 @@ def set_vocab(self): # Llama 3 self._set_vocab_gpt2() + # Restore original dir_model for EAGLE-3 + if hasattr(self, 'is_eagle3') and self.is_eagle3: + self.dir_model = original_dir_model + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) if self.hparams.get("vocab_size", 32000) == 32016: special_vocab = gguf.SpecialVocab( @@ -2435,7 +2489,45 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None): _experts: list[dict[str, Tensor]] | None = None + def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: + tensors = super().index_tensors(remote_hf_model_id) + # EAGLE-3 detection: check hparams directly (before self.is_eagle3 is set) + if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: + logger.info("EAGLE-3: Renaming midlayer.* to model.layers.0.*") + new_tensors = {} + # EAGLE-3: rename midlayer.* to model.layers.0.* for compatibility with llama model + for name, gen in tensors.items(): + if name.startswith("midlayer."): + new_name = "model.layers.0." 
+ name[len("midlayer."):] + new_tensors[new_name] = gen + else: + new_tensors[name] = gen + return new_tensors + else: + return tensors + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # Eagle-3 llama checkpoint special handling + if hasattr(self, 'is_eagle3') and self.is_eagle3: + # Eagle-3 llama checkpoint special weights handling + # fc.weight: feature fusion layer + if name == "fc.weight": + return [(name, data_torch)] + # d2t: draft to target vocabulary mapping + elif name == "d2t": + # Skip parent class processing (store for manual handling in prepare_tensors) + if not hasattr(self, '_eagle3_int_tensors'): + self._eagle3_int_tensors = {} + self._eagle3_int_tensors[name] = data_torch + return [] + # t2d: target to draft vocabulary mapping (not used, skip completely) + elif name == "t2d": + return [] + # hidden_norm: EAGLE-3 specific layer normalization + elif name == "model.layers.0.hidden_norm.weight": + return [("blk.0.hidden_norm.weight", data_torch)] + n_head = self.find_hparam(["n_heads", "num_attention_heads"]) n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"]) @@ -2538,8 +2630,26 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) def prepare_tensors(self): + # EAGLE-3: collect original dtypes BEFORE parent class converts them to F32 + eagle3_original_dtypes = {} + if hasattr(self, 'is_eagle3') and self.is_eagle3: + for name, data_torch in self.get_tensors(): + if name == "d2t": + eagle3_original_dtypes[name] = data_torch.dtype + super().prepare_tensors() + if hasattr(self, 'is_eagle3') and self.is_eagle3 and hasattr(self, '_eagle3_int_tensors'): + for name, data_torch in self._eagle3_int_tensors.items(): + old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype) + # Keep as int64 to match original torch tensor dtype + data = data_torch.to(torch.int64).numpy() + data_qtype = gguf.GGMLQuantizationType.I64 + + shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}" + logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) + if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] @@ -10125,6 +10235,7 @@ class LazyTorchTensor(gguf.LazyBase): torch.float16: np.float16, torch.float32: np.float32, torch.uint8: np.uint8, + torch.int64: np.int64, } # only used when byteswapping data. 
Only correct size is needed @@ -10285,6 +10396,10 @@ def parse_args() -> argparse.Namespace: "--no-tensor-first-split", action="store_true", help="do not add tensors to the first split (disabled by default)" ) + parser.add_argument( + "--target-model-dir", type=str, default=None, + help="directory containing target model tokenizer (for EAGLE-3 draft models that don't have their own tokenizer)", + ) parser.add_argument( "--metadata", type=Path, help="Specify the path for an authorship metadata override file" @@ -10457,7 +10572,8 @@ def main() -> None: split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, - sentence_transformers_dense_modules=args.sentence_transformers_dense_modules + sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, + target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None ) if args.vocab_only: diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 8141052a227..3b65f3c5b10 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -4,6 +4,7 @@ #include "speculative.h" #include "log.h" #include "llama.h" +#include "chat.h" #include #include @@ -34,16 +35,42 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); llama_model * model_tgt = NULL; - //llama_model * model_dft = NULL; + llama_model * model_dft = NULL; llama_context * ctx_tgt = NULL; llama_context * ctx_dft = NULL; - // load the target model - auto llama_init_tgt = common_init_from_params(params); + // EAGLE3 specific contexts + llama_context * ctx_encoder = NULL; + llama_context * ctx_decoder = NULL; + + // For EAGLE3: load both draft model and target model + if (params.speculative.eagle3) { + llama_model_params dft_mp = llama_model_default_params(); + dft_mp.n_gpu_layers = params.speculative.n_gpu_layers; + model_dft = llama_model_load_from_file(params.speculative.model.path.c_str(), dft_mp); + if (!model_dft) { + LOG_ERR("failed to load EAGLE3 draft model\n"); + return 1; + } - model_tgt = llama_init_tgt->model(); - ctx_tgt = llama_init_tgt->context(); + llama_model_params tgt_mp = llama_model_default_params(); + tgt_mp.n_gpu_layers = params.n_gpu_layers; + model_tgt = llama_model_load_from_file(params.model.path.c_str(), tgt_mp); + if (!model_tgt) { + LOG_ERR("failed to load target model\n"); + return 1; + } + + llama_context_params tcp = common_context_params_to_llama(params); + tcp.eagle3_model = model_dft; // Enable feature extraction + ctx_tgt = llama_init_from_model(model_tgt, tcp); + } else { + // Standard load the target model + auto llama_init_tgt = common_init_from_params(params); + model_tgt = llama_init_tgt->model(); + ctx_tgt = llama_init_tgt->context(); + } const llama_vocab * vocab = llama_model_get_vocab(model_tgt); @@ -61,18 +88,57 @@ int main(int argc, char ** argv) { params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; params.tensor_buft_overrides = params.speculative.tensor_buft_overrides; - auto llama_init_dft = common_init_from_params(params); + if (params.speculative.eagle3) { + // EAGLE3: create encoder and decoder contexts + llama_context_params enc_params = common_context_params_to_llama(params); + enc_params.embeddings = true; + ctx_encoder = llama_init_from_model(model_dft, 
enc_params); + if (!ctx_encoder) { + LOG_ERR("failed to create EAGLE3 encoder context\n"); + return 1; + } - //model_dft = llama_init_dft->model(); - ctx_dft = llama_init_dft->context(); + llama_context_params dec_params = common_context_params_to_llama(params); + dec_params.target_model = model_tgt; + dec_params.embeddings = true; + ctx_decoder = llama_init_from_model(model_dft, dec_params); + if (!ctx_decoder) { + LOG_ERR("failed to create EAGLE3 decoder context\n"); + return 1; + } + } else { + // Standard: load draft model context + auto llama_init_dft = common_init_from_params(params); + model_dft = llama_init_dft->model(); + ctx_dft = llama_init_dft->context(); + + if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) { + LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str()); + } + } - if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) { - LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str()); + // Apply chat template for EAGLE3 if available which can increase the acceptance rate + std::string prompt = params.prompt; + if (params.speculative.eagle3) { + auto chat_templates = common_chat_templates_init(model_tgt, params.chat_template); + if (common_chat_templates_was_explicit(chat_templates.get())) { + std::vector chat_msgs; + common_chat_msg user_msg; + user_msg.role = "user"; + user_msg.content = params.prompt; + chat_msgs.push_back(user_msg); + + common_chat_templates_inputs inputs; + inputs.messages = chat_msgs; + inputs.add_generation_prompt = true; + prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt; + LOG_INF("%s: EAGLE3 chat template applied\n", __func__); + } } // Tokenize the prompt std::vector inp; - inp = common_tokenize(ctx_tgt, params.prompt, true, true); + inp = common_tokenize(ctx_tgt, prompt, true, true); if (llama_n_ctx(ctx_tgt) < (uint32_t) inp.size()) { LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt)); @@ -115,26 +181,52 @@ int main(int argc, char ** argv) { struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling); // eval the prompt - llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1)); + llama_token id_last; + llama_tokens prompt_tgt; + int n_past; - // note: keep the last token separate! - llama_token id_last = inp.back(); + if (params.speculative.eagle3) { + // Target model decodes full prompt and sample first token and intermediate features are extracted + llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size())); - // all tokens currently in the target context - llama_tokens prompt_tgt(inp.begin(), inp.end() - 1); - prompt_tgt.reserve(llama_n_ctx(ctx_tgt)); + id_last = common_sampler_sample(smpl, ctx_tgt, -1); + common_sampler_accept(smpl, id_last, true); + LOG("%s", common_token_to_piece(ctx_tgt, id_last).c_str()); + n_predict++; - int n_past = inp.size() - 1; + // all tokens currently in the target context + prompt_tgt.assign(inp.begin(), inp.end()); + prompt_tgt.reserve(llama_n_ctx(ctx_tgt)); + + n_past = inp.size(); + } else { + llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1)); + + // note: keep the last token separate! 
+ id_last = inp.back(); + + // all tokens currently in the target context + prompt_tgt.assign(inp.begin(), inp.end() - 1); + prompt_tgt.reserve(llama_n_ctx(ctx_tgt)); + + n_past = inp.size() - 1; + } // init the speculator struct common_speculative_params params_spec; params_spec.n_draft = n_draft; - params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft; params_spec.p_min = p_min; - struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft); - for (auto &pair : params.speculative.replacements) { - common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str()); + struct common_speculative * spec = NULL; + + if (params.speculative.eagle3) { + spec = common_speculative_init_eagle3(ctx_tgt, ctx_encoder, ctx_decoder); + } else { + params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft; + spec = common_speculative_init(ctx_tgt, ctx_dft); + for (auto &pair : params.speculative.replacements) { + common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str()); + } } llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1); @@ -249,7 +341,14 @@ int main(int argc, char ** argv) { LOG_INF("\n"); LOG_INF("draft:\n\n"); - llama_perf_context_print(ctx_dft); + if (ctx_dft) { + llama_perf_context_print(ctx_dft); + } else if (ctx_encoder && ctx_decoder) { + LOG_INF(" Eagle3 Draft encoder:\n"); + llama_perf_context_print(ctx_encoder); + LOG_INF("\nEagle3 Draft decoder:\n"); + llama_perf_context_print(ctx_decoder); + } LOG_INF("\n"); LOG_INF("target:\n\n"); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 686da3dbd10..fa73e8216b8 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -629,6 +629,7 @@ extern "C" { GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) + GGML_TENSOR_FLAG_SYNC = 16, // ...forces a new split/sync point in the scheduler (e.g. for EAGLE3 decoder) }; enum ggml_tri_type { @@ -853,6 +854,7 @@ extern "C" { GGML_API void ggml_set_output(struct ggml_tensor * tensor); GGML_API void ggml_set_param(struct ggml_tensor * tensor); GGML_API void ggml_set_loss(struct ggml_tensor * tensor); + GGML_API void ggml_set_sync(struct ggml_tensor * tensor); // force sync point in scheduler // // operations on tensors with backpropagation diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 08681f35e3f..8e30d48ccc0 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1202,6 +1202,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra } } + // check if this node requires a sync point (e.g. for EAGLE3 parallel path fix) + if (node->flags & GGML_TENSOR_FLAG_SYNC) { + need_new_split = true; + } + if (node_backend_id != cur_backend_id || need_new_split) { split->i_end = i; i_split++; @@ -1576,6 +1581,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s if (ec != GGML_STATUS_SUCCESS) { return ec; } + + // If any node in this split has SYNC flag, synchronize after compute + // This ensures the sync node is complete before next split (e.g. 
for EAGLE3 parallel path sync fix) + for (int j = 0; j < split->graph.n_nodes; j++) { + if (split->graph.nodes[j]->flags & GGML_TENSOR_FLAG_SYNC) { + ggml_backend_synchronize(split_backend); + break; + } + } } else { // similar to ggml_backend_compare_graph_backend for (int j0 = 0; j0 < split->graph.n_nodes; j0++) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f0913cd3596..4625c3bd770 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7451,6 +7451,10 @@ void ggml_set_loss(struct ggml_tensor * tensor) { tensor->flags |= GGML_TENSOR_FLAG_LOSS; } +void ggml_set_sync(struct ggml_tensor * tensor) { + tensor->flags |= GGML_TENSOR_FLAG_SYNC; +} + //////////////////////////////////////////////////////////////////////////////// void ggml_quantize_init(enum ggml_type type) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2b8489c591b..7d9d9b103b6 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -147,6 +147,8 @@ class LLM: EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input" DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" + EAGLE3_EXTRACT_LAYERS = "{arch}.extract_layers" + EAGLE3_TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -446,6 +448,7 @@ class MODEL_ARCH(IntEnum): RND1 = auto() PANGU_EMBED = auto() MISTRAL3 = auto() + EAGLE3 = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -710,6 +713,10 @@ class MODEL_TENSOR(IntEnum): NEXTN_HNORM = auto() NEXTN_SHARED_HEAD_HEAD = auto() NEXTN_SHARED_HEAD_NORM = auto() + # EAGLE3 specific tensors + EAGLE3_FC = auto() # feature fusion layer + EAGLE3_HIDDEN_NORM = auto() # hidden normalization + EAGLE3_D2T = auto() # draft to target vocabulary mapping MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -820,6 +827,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.RND1: "rnd1", MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.MISTRAL3: "mistral3", + MODEL_ARCH.EAGLE3: "eagle3", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -1082,6 +1090,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm", MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head", MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm", + MODEL_TENSOR.EAGLE3_FC: "fc", + MODEL_TENSOR.EAGLE3_HIDDEN_NORM: "blk.{bid}.hidden_norm", + MODEL_TENSOR.EAGLE3_D2T: "d2t", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -3094,6 +3105,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.EAGLE3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.EAGLE3_FC, + MODEL_TENSOR.EAGLE3_HIDDEN_NORM, + MODEL_TENSOR.EAGLE3_D2T, + ], # TODO } diff --git a/include/llama.h b/include/llama.h index b52eaacfa7e..c502b9ad0ea 100644 --- a/include/llama.h +++ b/include/llama.h @@ -363,6 +363,13 @@ extern "C" { bool kv_unified; // use a unified buffer across the input sequences when computing the attention // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + + // EAGLE3 extraction configuration + // When 
eagle3_model is set, layer extraction is automatically enabled + const struct llama_model * eagle3_model; // EAGLE3 model to read extract_layers configuration from + // If non-NULL, enables automatic feature extraction + const struct llama_model * target_model; // reference to target model + // only used to share embedding layer with eagle3 model }; // model quantization parameters @@ -846,6 +853,23 @@ extern "C" { llama_seq_id dest_seq_id, llama_state_seq_flags flags); + // + // EAGLE3 draft model support + // + + // Get pointer to target model features extracted for EAGLE3 encoder + // Returns NULL if no features are available + // Format: [3*n_embd, n_tokens] - use model.hparams.n_embd and batch.n_tokens for dimensions + LLAMA_API const float * llama_get_eagle3_target_features(struct llama_context * ctx); + + // Set g_embeddings from EAGLE3 encoder output for decoder input + // g_embd: pointer to encoder output embeddings + LLAMA_API void llama_set_eagle3_g_embeddings( + struct llama_context * ctx, + const float * g_embd, + int32_t n_embd, + int32_t n_tokens); + // // Decoding // diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4192af7c0c3..4ffbc49a802 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -58,6 +58,7 @@ add_library(llama models/deepseek2.cpp models/dots1.cpp models/dream.cpp + models/eagle3.cpp models/ernie4-5-moe.cpp models/ernie4-5.cpp models/exaone.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 64ad1b77690..b8370c29553 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -112,6 +112,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, + { LLM_ARCH_EAGLE3, "eagle3" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -245,6 +246,9 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, + { LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" }, + { LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, // sentence-transformers dense modules feature dims { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" }, @@ -2540,6 +2544,30 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, }, }, + { + LLM_ARCH_EAGLE3, + { + // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own, Llama 3.1 8B EAGLE3 uses target model's) + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, // Optional - only if EAGLE3 config has rope_scaling + // Single decoder layer (blk.0) + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + // EAGLE-3 specific layers + { LLM_TENSOR_EAGLE3_HIDDEN_NORM, "blk.%d.hidden_norm" }, + { LLM_TENSOR_EAGLE3_FC, "fc" }, + { LLM_TENSOR_EAGLE3_D2T, "d2t" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2742,6 +2770,10 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, 
{LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + // EAGLE-3 tensors + {LLM_TENSOR_EAGLE3_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_EAGLE3_HIDDEN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_EAGLE3_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index e113180024d..0aa7dd80d75 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -117,6 +117,7 @@ enum llm_arch { LLM_ARCH_PANGU_EMBED, LLM_ARCH_MISTRAL3, LLM_ARCH_UNKNOWN, + LLM_ARCH_EAGLE3, }; enum llm_kv { @@ -287,6 +288,9 @@ enum llm_kv { LLM_KV_CLASSIFIER_OUTPUT_LABELS, + LLM_KV_EAGLE3_EXTRACT_LAYERS, + LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, + LLM_KV_SHORTCONV_L_CACHE, LLM_KV_XIELU_ALPHA_N, @@ -492,6 +496,9 @@ enum llm_tensor { LLM_TENSOR_NEXTN_HNORM, LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + LLM_TENSOR_EAGLE3_FC, // eagle3: feature fusion layer + LLM_TENSOR_EAGLE3_HIDDEN_NORM, // eagle3: additional normalization layer + LLM_TENSOR_EAGLE3_D2T, // eagle3: draft to target vocabulary mapping }; enum llm_tensor_layer { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2a17e44ecdf..ea6dfaea3c9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -135,6 +135,7 @@ llama_context::llama_context( cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified; + cparams.eagle3_extract_enabled = (params.eagle3_model != nullptr); // auto-enable if eagle3_model is provided { const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE"); @@ -333,6 +334,30 @@ llama_context::llama_context( cross.v_embd.clear(); + // Initialize EAGLE3 feature extraction configuration + if (cparams.eagle3_extract_enabled) { + // Feature extraction layers configuration must come from EAGLE3 model + if (!params.eagle3_model) { + LLAMA_LOG_ERROR("%s: EAGLE3 extraction enabled but eagle3_model not provided\n", __func__); + throw std::runtime_error("EAGLE3 extraction requires eagle3_model parameter"); + } + + const auto & eagle3_hparams = params.eagle3_model->hparams; + // Copy feature extraction layer indices from EAGLE3 model's hparams + eagle3.extract_layer_indices.assign( + eagle3_hparams.eagle3_extract_layers.begin(), + eagle3_hparams.eagle3_extract_layers.end() + ); + + // Allocate tensors array for extraction + eagle3.extract_tensors.resize(eagle3.extract_layer_indices.size(), nullptr); + + LLAMA_LOG_INFO("%s: EAGLE3 extraction enabled for layers [%d, %d, %d]\n", __func__, + eagle3.extract_layer_indices[0], + eagle3.extract_layer_indices[1], + eagle3.extract_layer_indices[2]); + } + // avoid reserving graphs with zero outputs - assume one output per sequence n_outputs = n_seqs; @@ -832,6 +857,14 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll //const auto t_start_us = ggml_time_us(); res->set_inputs(&ubatch); + + // EAGLE3: Fill g_embeddings for decoder input + if (model.arch == LLM_ARCH_EAGLE3 && gtype == LLM_GRAPH_TYPE_DECODER && !eagle3.g_embeddings.empty()) { + ggml_tensor * g_embd = ggml_graph_get_tensor(gf, "inp_g_embeddings"); + if (g_embd) { + ggml_backend_tensor_set(g_embd, eagle3.g_embeddings.data(), 0, ggml_nbytes(g_embd)); + } + } //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); } @@ -843,6 +876,11 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll return nullptr; } + // EAGLE3: Extract intermediate 
layer features after graph execution + if (cparams.eagle3_extract_enabled && !eagle3.extract_tensors.empty()) { + extract_eagle3_features(ubatch); + } + ret = GGML_STATUS_SUCCESS; return res; @@ -858,7 +896,8 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd_inp(); + // EAGLE3: use 3*target_hidden_size for concatenated features input + const int64_t n_embd = (model.arch == LLM_ARCH_EAGLE3 && batch_inp.embd) ? 3 * hparams.eagle3_target_hidden_size : hparams.n_embd; const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 @@ -941,8 +980,15 @@ int llama_context::encode(const llama_batch & batch_inp) { // extract token embeddings GGML_ASSERT(embd != nullptr); - GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float)); + if (model.arch == LLM_ARCH_EAGLE3) { + // g_embeddings are stored temporarily in embd buffer + const int64_t out_embd = hparams.n_embd; + GGML_ASSERT(n_tokens * out_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens * out_embd * sizeof(float)); + } else { + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float)); + } } break; case LLAMA_POOLING_TYPE_MEAN: case LLAMA_POOLING_TYPE_CLS: @@ -1181,7 +1227,8 @@ int llama_context::decode(const llama_batch & batch_inp) { auto * t_logits = res->get_logits(); auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; - if (t_embd && res->get_embd_pooled()) { + // For EAGLE3, don't override t_embd with t_embd_pooled - we need the prenorm value during eagle3 decoder autoregressive generation + if (t_embd && res->get_embd_pooled() && model.arch != LLM_ARCH_EAGLE3) { t_embd = res->get_embd_pooled(); } @@ -1196,7 +1243,39 @@ int llama_context::decode(const llama_batch & batch_inp) { if (n_outputs) { GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size); - ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + + // EAGLE3: Map draft vocab to target vocab + if (model.arch == LLM_ARCH_EAGLE3 && model.d2t) { + static thread_local std::vector eagle3_d2t_map; + static thread_local std::vector eagle3_draft_logits; + + const int64_t draft_vocab_size = t_logits->ne[0]; + const uint32_t last_idx = n_outputs - 1; + + // Load d2t mapping once (on first call) + if (eagle3_d2t_map.empty()) { + eagle3_d2t_map.resize(model.d2t->ne[0]); + ggml_backend_tensor_get(model.d2t, eagle3_d2t_map.data(), 0, eagle3_d2t_map.size() * sizeof(int64_t)); + } + + // Read only the last token's draft logits + eagle3_draft_logits.resize(draft_vocab_size); + const size_t last_offset = last_idx * draft_vocab_size * sizeof(float); + ggml_backend_tensor_get(t_logits, eagle3_draft_logits.data(), last_offset, draft_vocab_size * sizeof(float)); + + + // Map only the last token's draft logits to target vocab + float * last_logits_out = logits_out + last_idx * n_vocab; + std::fill(last_logits_out, last_logits_out + n_vocab, -std::numeric_limits::infinity()); + + for (int64_t j = 0; j < draft_vocab_size; j++) { + const int64_t target_id = j + eagle3_d2t_map[j]; + GGML_ASSERT(target_id >= 0 && target_id < n_vocab); + last_logits_out[target_id] = eagle3_draft_logits[j]; + } 
+ } else { + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + } } } @@ -1455,7 +1534,16 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u auto * res = gf_res_reserve.get(); - const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT); + // EAGLE3: auto-detect encoder (embeddings+no target_model) or decoder (has target_model) + llm_graph_type gtype = LLM_GRAPH_TYPE_DEFAULT; + if (model.arch == LLM_ARCH_EAGLE3) { + if (cparams.embeddings && model.target_tok_embd == nullptr) { + gtype = LLM_GRAPH_TYPE_ENCODER; + } else if (model.target_tok_embd != nullptr) { + gtype = LLM_GRAPH_TYPE_DECODER; + } + } + const auto gparams = graph_params(res, ubatch, mctx, gtype); res->reset(); @@ -1491,6 +1579,7 @@ llm_graph_params llama_context::graph_params( /*.loras =*/ &loras, /*.mctx =*/ mctx, /*.cross =*/ &cross, + /*.eagle3 =*/ &eagle3, /*.n_outputs =*/ n_outputs, /*.cb =*/ graph_get_cb(), /*.res =*/ res, @@ -1534,6 +1623,27 @@ llm_graph_cb llama_context::graph_get_cb() const { ggml_set_name(cur, name); } + // EAGLE3: Extract intermediate layer features if this is an extraction point + if (cparams.eagle3_extract_enabled) { + static constexpr const char * prefix = "eagle3_extract_"; + static constexpr size_t prefix_len = 15; // strlen("eagle3_extract_") + + if (strncmp(name, prefix, prefix_len) == 0) { + // Parse the extraction index from the name (e.g., "eagle3_extract_0" -> 0) + size_t extract_idx = 0; + if (sscanf(name + prefix_len, "%zu", &extract_idx) == 1 && extract_idx < eagle3.extract_tensors.size()) { + // Mark as output tensor to ensure proper backend assignment + ggml_set_output(cur); + // Store this tensor reference for post-execution extraction + eagle3.extract_tensors[extract_idx] = cur; + LLAMA_LOG_DEBUG("%s: EAGLE3 stored tensor reference for extraction: " + "index=%zu, layer=%d, target_layer=%d, tensor=%s\n", + __func__, extract_idx, il, + eagle3.extract_layer_indices[extract_idx], name); + } + } + } + if (!cparams.offload_kqv) { if (strcmp(name, "kqv_merged_cont") == 0) { // all nodes between the KV store and the attention output are run on the CPU @@ -1559,6 +1669,54 @@ llm_graph_cb llama_context::graph_get_cb() const { }; } +void llama_context::extract_eagle3_features(const llama_ubatch & ubatch) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_embd = model.hparams.n_embd; + const size_t n_layers = eagle3.extract_tensors.size(); + + // Allocate storage for concatenated features + const int64_t n_embd_concat = n_embd * n_layers; + eagle3.target_features.resize(n_embd_concat * n_tokens); + + // Temporary buffer to hold layer features before transposing + static thread_local std::vector temp_layer_features; + temp_layer_features.resize(n_embd * n_tokens); + + LLAMA_LOG_DEBUG("%s: Start to extract EAGLE3 features: %zu layers, %lld tokens, %lld embd\n", + __func__, n_layers, (long long)n_tokens, (long long)n_embd); + + // Extract each layer's features and interleave into token-major layout + for (size_t layer_idx = 0; layer_idx < n_layers; ++layer_idx) { + ggml_tensor * tensor = eagle3.extract_tensors[layer_idx]; + GGML_ASSERT(tensor != nullptr && "EAGLE3 extraction tensor is null"); + + // Get the backend where this tensor is stored + ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), tensor); + GGML_ASSERT(backend != nullptr && "EAGLE3 tensor has no backend"); + + // Verify tensor shape: should be [n_embd, n_tokens] + GGML_ASSERT(tensor->ne[0] 
== n_embd && tensor->ne[1] == n_tokens && + "EAGLE3 extraction tensor has unexpected shape"); + + // Get layer features to temp buffer + const size_t size_bytes = n_embd * n_tokens * sizeof(float); + ggml_backend_tensor_get_async(backend, tensor, temp_layer_features.data(), 0, size_bytes); + ggml_backend_sched_synchronize(sched.get()); + + // Then copy to correct position in target_features + // target_features layout: [token_0_all_layers, token_1_all_layers, ...] + // Each token has [layer_0_embd, layer_1_embd, layer_2_embd] + for (int64_t token_idx = 0; token_idx < n_tokens; ++token_idx) { + // Source: temp_layer_features[token_idx * n_embd ... (token_idx + 1) * n_embd - 1] + const float * src = temp_layer_features.data() + token_idx * n_embd; + // Dest: target_features[token_idx * n_embd_concat + layer_idx * n_embd] + float * dest = eagle3.target_features.data() + token_idx * n_embd_concat + layer_idx * n_embd; + std::memcpy(dest, src, n_embd * sizeof(float)); + } + } + +} + // // state save/load // @@ -2354,6 +2512,8 @@ llama_context_params llama_context_default_params() { /*.op_offload =*/ true, /*.swa_full =*/ true, /*.kv_unified =*/ false, + /*.eagle3_model =*/ nullptr, + /*.target_model =*/ nullptr, }; return result; @@ -2367,6 +2527,12 @@ llama_context * llama_init_from_model( return nullptr; } + // Auto-setup for EAGLE3: set target embedding if target_model is provided + if (model->arch == LLM_ARCH_EAGLE3 && params.target_model) { + model->target_tok_embd = params.target_model->tok_embd; + LLAMA_LOG_INFO("%s: EAGLE3 auto-setup: using target model's embedding layer\n", __func__); + } + if (params.n_batch == 0 && params.n_ubatch == 0) { LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__); return nullptr; @@ -3016,3 +3182,33 @@ void llama_opt_epoch( callback_train, callback_eval); } + +// +// EAGLE3 member functions +// + +const float * llama_context::get_eagle3_target_features() const { + GGML_ASSERT(!eagle3.target_features.empty() && "EAGLE3 target features not extracted - call llama_encode() on target model first"); + return eagle3.target_features.data(); +} + +void llama_context::set_eagle3_g_embeddings(const float * g_embd, int32_t n_embd, int32_t n_tokens) { + GGML_ASSERT(g_embd != nullptr && "g_embeddings cannot be null"); + GGML_ASSERT(n_embd > 0 && n_tokens > 0 && "invalid dimensions"); + + const size_t size = n_embd * n_tokens; + eagle3.g_embeddings.resize(size); + std::memcpy(eagle3.g_embeddings.data(), g_embd, size * sizeof(float)); +} + +// +// C API wrappers +// + +const float * llama_get_eagle3_target_features(llama_context * ctx) { + return ctx->get_eagle3_target_features(); +} + +void llama_set_eagle3_g_embeddings(llama_context * ctx, const float * g_embd, int32_t n_embd, int32_t n_tokens) { + ctx->set_eagle3_g_embeddings(g_embd, n_embd, n_tokens); +} diff --git a/src/llama-context.h b/src/llama-context.h index cd26eafe189..1528d3f03e7 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -208,6 +208,12 @@ struct llama_context { // reserve a graph with a dummy ubatch of the specified size ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false); + // EAGLE3: Get pointer to target model features extracted for EAGLE3 encoder + const float * get_eagle3_target_features() const; + + // EAGLE3: Set g_embeddings from encoder output for decoder input + void set_eagle3_g_embeddings(const float * g_embd, int32_t n_embd, int32_t n_tokens); + private: 
llm_graph_params graph_params( llm_graph_result * res, @@ -217,6 +223,9 @@ struct llama_context { llm_graph_cb graph_get_cb() const; + // EAGLE3: Extract intermediate layer features from target model + void extract_eagle3_features(const llama_ubatch & ubatch); + // TODO: read/write lora adapters and cvec size_t state_write_data(llama_io_write_i & io); size_t state_read_data (llama_io_read_i & io); @@ -235,6 +244,9 @@ struct llama_context { llama_adapter_loras loras; llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably + + mutable llama_eagle3 eagle3; // EAGLE3 draft model support - stores features from target model + // mutable because it's modified during graph building (const function) std::unique_ptr memory; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index fcef8fa9760..456c06e9a91 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -34,6 +34,7 @@ struct llama_cparams { bool warmup; bool op_offload; bool kv_unified; + bool eagle3_extract_enabled; // enable layer extraction for EAGLE3 speculative decoding enum llama_pooling_type pooling_type; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 8909bbfb95e..2b21a1d6590 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -590,6 +590,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : loras (params.loras), mctx (params.mctx), cross (params.cross), + eagle3 (params.eagle3), cb_func (params.cb), res (params.res), ctx0 (res->get_ctx()), diff --git a/src/llama-graph.h b/src/llama-graph.h index e9d387bd7c5..f93a8584400 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -70,6 +70,30 @@ struct llama_cross { std::vector> seq_ids_enc; }; +// EAGLE3 support - stores intermediate features from target model +struct llama_eagle3 { + // Configuration: which layers to extract from target model + std::vector extract_layer_indices; + + // Extracted features from target model (for encoder input) + // Concatenated [layer_l, layer_m, layer_h] embeddings + // Shape: [n_layers * n_embd, n_tokens] where n_layers = extract_layer_indices.size() + std::vector target_features; + + // Encoder output (for decoder input) + std::vector g_embeddings; + + // Tensor references for feature extraction from target model + std::vector extract_tensors; + + // Clear all stored data + void clear() { + target_features.clear(); + g_embeddings.clear(); + extract_tensors.clear(); + } +}; + struct llm_graph_params; // @@ -416,6 +440,7 @@ struct llm_graph_params { const llama_adapter_loras * loras; const llama_memory_context_i * mctx; const llama_cross * cross; + llama_eagle3 * eagle3; // non-const: we write extracted features here uint32_t n_outputs; @@ -579,6 +604,7 @@ struct llm_graph_context { const llama_adapter_loras * loras; const llama_memory_context_i * mctx; const llama_cross * cross; + llama_eagle3 * eagle3; // non-const: we write extracted features here const llm_graph_cb & cb_func; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index a467c64a14e..d4337aea376 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -188,6 +188,13 @@ struct llama_hparams { // qwen3vl deepstack uint32_t n_deepstack_layers = 0; + // EAGLE3 draft model - layer indices to extract from target model + // e.g., for 32-layer target: [2, 16, 29] (low, middle, high) + std::array eagle3_extract_layers = {0, 0, 0}; + + // EAGLE3 draft model - target model hidden size + uint32_t eagle3_target_hidden_size = 0; + // needed by encoder-decoder models (e.g. 
T5, FLAN-T5) // ref: https://github.com/ggerganov/llama.cpp/pull/8141 llama_token dec_start_token_id = LLAMA_TOKEN_NULL; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 28f06b4e615..acbdb5d9612 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2230,6 +2230,28 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_EAGLE3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // EAGLE3 layer extraction configuration + // Use array (has template instantiation), then copy first 3 elements + std::array extract_layers_tmp = {}; + if (!ml.get_key_or_arr(LLM_KV_EAGLE3_EXTRACT_LAYERS, extract_layers_tmp, 3, false)) { + throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata"); + } + std::copy_n(extract_layers_tmp.begin(), 3, hparams.eagle3_extract_layers.begin()); + LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__, + hparams.eagle3_extract_layers[0], + hparams.eagle3_extract_layers[1], + hparams.eagle3_extract_layers[2]); + + // EAGLE3 target model hidden size + ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.eagle3_target_hidden_size); + LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__, + hparams.eagle3_target_hidden_size, hparams.n_embd); + + type = LLM_TYPE_UNKNOWN; + } break; case LLM_ARCH_COGVLM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -6408,6 +6430,62 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0); } } break; + case LLM_ARCH_EAGLE3: + { + const int64_t n_embd_target_features = 3 * hparams.eagle3_target_hidden_size; + const int64_t n_embd_attn_input = 2 * n_embd; + + // Get vocab size from the d2t tensor in the GGUF file + // d2t: draft to target mapping (size = draft_vocab_size) + const struct ggml_tensor * d2t_meta = ml.get_tensor_meta("d2t"); + if (!d2t_meta) { + throw std::runtime_error("EAGLE3 model requires 'd2t' tensor but it was not found in the model file"); + } + const int64_t n_draft_vocab = d2t_meta->ne[0]; + + // Feature fusion layer: projects 3 target layers to draft hidden size + fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_target_features, n_embd}, 0); + + // Draft to target vocabulary mapping tensor + d2t = create_tensor(tn(LLM_TENSOR_EAGLE3_D2T), {n_draft_vocab}, 0); + + // Output layer (uses draft vocab size) + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, 0); + + // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own) + const struct ggml_tensor * tok_embd_meta = ml.get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str()); + if (tok_embd_meta) { + const int64_t n_target_vocab = tok_embd_meta->ne[1]; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab); + } + + // Single decoder layer + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + // input_layernorm: applied to token embeddings + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // Attention takes input_embeds_normed + fused_target_normed as input + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), 
{n_embd_attn_input, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + // EAGLE-3 specific: hidden_norm applied to fused target features + layer.eagle3_hidden_norm = create_tensor(tn(LLM_TENSOR_EAGLE3_HIDDEN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // rope_freqs for llama3 rope scaling (optional - only if EAGLE3 config has rope_scaling) + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED); + } + } break; case LLM_ARCH_COGVLM: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -7564,6 +7642,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_EAGLE3: + { + if (params.gtype == LLM_GRAPH_TYPE_ENCODER) { + llm = std::make_unique(*this, params); + } else { + llm = std::make_unique(*this, params); + } + } break; case LLM_ARCH_COGVLM: { llm = std::make_unique(*this, params); @@ -7749,6 +7835,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_EAGLE3: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 diff --git a/src/llama-model.h b/src/llama-model.h index f8342cf2cb1..2d72fc78f5c 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -404,6 +404,9 @@ struct llama_layer { struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_eps = nullptr; + // eagle3 + struct ggml_tensor * eagle3_hidden_norm = nullptr; + struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; @@ -453,6 +456,13 @@ struct llama_model { struct ggml_tensor * per_layer_model_proj = nullptr; struct ggml_tensor * per_layer_proj_norm = nullptr; + // eagle3 + struct ggml_tensor * fc = nullptr; // feature fusion layer + struct ggml_tensor * d2t = nullptr; // draft to target vocabulary mapping + // Reference to target model's embedding layer + // This allows EAGLE3 to use target model's embeddings without copying + struct ggml_tensor * target_tok_embd = nullptr; + std::vector layers; //Dense linear projections for SentenceTransformers models like embeddinggemma diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp new file mode 100644 index 00000000000..8987a0c5816 --- /dev/null +++ b/src/models/eagle3.cpp @@ -0,0 +1,187 @@ +#include "models.h" + +// EAGLE3 Encoder: processes target model features through feature fusion layer +// Input: target_features e.g. [12288, n_tokens] from target model layers low, middle, high +// Output: g_embeddings e.g. 
[4096, n_tokens] stored in context +llm_build_eagle3_encode::llm_build_eagle3_encode(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + + const int64_t n_embd_target_features = 3 * hparams.eagle3_target_hidden_size; + + ggml_tensor * cur; + + // Input: Target model features (3 layers concatenated: low, mid, high) + // Data will be provided via ubatch->embd in encode_eagle3_features() + auto inp_target = std::make_unique(); + inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_target_features, n_tokens); + ggml_set_input(inp_target->embd); + ggml_tensor * target_features = inp_target->embd; + res->add_input(std::move(inp_target)); + cb(target_features, "inp_target_features", -1); + + // Feature fusion layer + ggml_tensor * fused_target = build_lora_mm(model.fc, target_features); + cb(fused_target, "fc_out", -1); + + // Output: g_embeddings e.g. [4096, n_tokens] + cur = fused_target; + res->t_embd = cur; + + ggml_build_forward_expand(gf, cur); +} + +// EAGLE3 Decoder: processes draft tokens using g_embeddings from encoder +// Input: draft tokens + g_embeddings from encoder +// Output: draft logits +llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_layer == 1); // EAGLE-3 has only one decoder layer + + ggml_tensor * cur; + ggml_tensor * inpL; + + // EAGLE3 Decoder receives: + // 1. Token embeddings (e.g.from EAGLE3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B) + // 2. g_embeddings from encoder + // Choose token_embd_eagle3: prefer EAGLE3's own if available (Llama 3.3 70B), else use target's (Llama 3.1 8B) + ggml_tensor * token_embd_eagle3 = (model.tok_embd != nullptr) ? 
model.tok_embd : model.target_tok_embd; + GGML_ASSERT(token_embd_eagle3 != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)"); + ggml_tensor * input_embeds = build_inp_embd(token_embd_eagle3); + cb(input_embeds, "token_embd_eagle3", -1); + ggml_tensor * g_embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_input(g_embeddings); + ggml_set_name(g_embeddings, "inp_g_embeddings"); + cb(g_embeddings, "inp_g_embeddings", -1); + + // Store raw g_embeddings as residual + ggml_tensor * residual = g_embeddings; + + // Apply input_layernorm to the token embeddings + ggml_tensor * input_embeds_normed = build_norm(input_embeds, + model.layers[0].attn_norm, NULL, + LLM_NORM_RMS, 0); + cb(input_embeds_normed, "input_layernorm", -1); + + // Force a sync point between the two parallel RMS_NORM paths + // This prevents buffer reuse issues on GPU (EAGLE3 GPU fix) + ggml_set_sync(input_embeds_normed); + + // Apply hidden_norm to g_embeddings + ggml_tensor * g_embeddings_normed = build_norm(g_embeddings, + model.layers[0].eagle3_hidden_norm, NULL, + LLM_NORM_RMS, -1); + cb(g_embeddings_normed, "g_embeddings_normed", -1); + + // Concatenate normalized input_embeds and normalized g_embeddings + cur = ggml_concat(ctx0, input_embeds_normed, g_embeddings_normed, 0); + cb(cur, "concat_embeds_g", -1); + + inpL = cur; + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); + + // Single decoder layer (il = 0) + const int il = 0; + { + // inpL is the concatenated input (normalized input_embeds + normalized g_embeddings) + ggml_tensor * inpSA = inpL; + + // Self-attention with concatenated input + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, inpL); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, inpL); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, inpL); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // rope freq factors, returns nullptr if not available + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + // RoPE + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + + if (inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + } + + // Add residual and update it + ggml_tensor * attn_with_residual = ggml_add(ctx0, cur, residual); + cb(attn_with_residual, "attn_with_residual", il); + + // Update residual + residual = attn_with_residual; + + // Apply FFN norm to the sum + ggml_tensor * ffn_inp = build_norm(attn_with_residual, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(ffn_inp, "post_attn_norm", 
il); + + cur = ffn_inp; + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + inpL = cur; + } + + cur = inpL; + + // Output norm with residual + ggml_tensor * final_with_residual = ggml_add(ctx0, cur, residual); + cb(final_with_residual, "eagle3_prenorm", -1); + + // Output prenorm state (for next token's g_embeddings in autoregressive generation) + ggml_set_output(final_with_residual); + res->t_embd = final_with_residual; + + cur = build_norm(final_with_residual, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head - projects to draft vocabulary + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} \ No newline at end of file diff --git a/src/models/llama.cpp b/src/models/llama.cpp index ab7fd5d0508..e695ae2c633 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -23,6 +23,16 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; + // EAGLE3: Extract intermediate layer features from target model at layer INPUT + if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) { + static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"}; + for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) { + if (eagle3->extract_layer_indices[i] == il) { + cb(inpL, eagle3_extract_names[i], il); + break; + } + } + } // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, diff --git a/src/models/models.h b/src/models/models.h index 6494f545018..419b88002bc 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -150,6 +150,14 @@ struct llm_build_dream : public llm_graph_context { llm_build_dream(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_eagle3_encode : public llm_graph_context { + llm_build_eagle3_encode(const llama_model & model, const llm_graph_params & params); +}; + +struct llm_build_eagle3_decode : public llm_graph_context { + llm_build_eagle3_decode(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_ernie4_5 : public llm_graph_context { llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params); }; From ac5667dcc6ea7d820c468e83a6e52bf646e63f71 Mon Sep 17 00:00:00 2001 From: ruixiangw Date: Tue, 16 Dec 2025 16:53:28 +0000 Subject: [PATCH 2/9] fix eagle3 logits sync bug & remove ggml_set_sync() --- ggml/include/ggml.h | 2 -- ggml/src/ggml-backend.cpp | 14 -------------- ggml/src/ggml.c | 4 ---- src/llama-context.cpp | 3 ++- src/models/eagle3.cpp | 4 ---- 5 files changed, 2 insertions(+), 25 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index fa73e8216b8..686da3dbd10 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -629,7 +629,6 @@ extern "C" { GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) - GGML_TENSOR_FLAG_SYNC = 16, // ...forces a new split/sync point in the scheduler (e.g. 
for EAGLE3 decoder) }; enum ggml_tri_type { @@ -854,7 +853,6 @@ extern "C" { GGML_API void ggml_set_output(struct ggml_tensor * tensor); GGML_API void ggml_set_param(struct ggml_tensor * tensor); GGML_API void ggml_set_loss(struct ggml_tensor * tensor); - GGML_API void ggml_set_sync(struct ggml_tensor * tensor); // force sync point in scheduler // // operations on tensors with backpropagation diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 8e30d48ccc0..08681f35e3f 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1202,11 +1202,6 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra } } - // check if this node requires a sync point (e.g. for EAGLE3 parallel path fix) - if (node->flags & GGML_TENSOR_FLAG_SYNC) { - need_new_split = true; - } - if (node_backend_id != cur_backend_id || need_new_split) { split->i_end = i; i_split++; @@ -1581,15 +1576,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s if (ec != GGML_STATUS_SUCCESS) { return ec; } - - // If any node in this split has SYNC flag, synchronize after compute - // This ensures the sync node is complete before next split (e.g. for EAGLE3 parallel path sync fix) - for (int j = 0; j < split->graph.n_nodes; j++) { - if (split->graph.nodes[j]->flags & GGML_TENSOR_FLAG_SYNC) { - ggml_backend_synchronize(split_backend); - break; - } - } } else { // similar to ggml_backend_compare_graph_backend for (int j0 = 0; j0 < split->graph.n_nodes; j0++) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 4625c3bd770..f0913cd3596 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7451,10 +7451,6 @@ void ggml_set_loss(struct ggml_tensor * tensor) { tensor->flags |= GGML_TENSOR_FLAG_LOSS; } -void ggml_set_sync(struct ggml_tensor * tensor) { - tensor->flags |= GGML_TENSOR_FLAG_SYNC; -} - //////////////////////////////////////////////////////////////////////////////// void ggml_quantize_init(enum ggml_type type) { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index ea6dfaea3c9..3506edd92bc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1261,7 +1261,8 @@ int llama_context::decode(const llama_batch & batch_inp) { // Read only the last token's draft logits eagle3_draft_logits.resize(draft_vocab_size); const size_t last_offset = last_idx * draft_vocab_size * sizeof(float); - ggml_backend_tensor_get(t_logits, eagle3_draft_logits.data(), last_offset, draft_vocab_size * sizeof(float)); + ggml_backend_tensor_get_async(backend_res, t_logits, eagle3_draft_logits.data(), last_offset, draft_vocab_size * sizeof(float)); + synchronize(); // Map only the last token's draft logits to target vocab diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 8987a0c5816..dea887bdd39 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -63,10 +63,6 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons LLM_NORM_RMS, 0); cb(input_embeds_normed, "input_layernorm", -1); - // Force a sync point between the two parallel RMS_NORM paths - // This prevents buffer reuse issues on GPU (EAGLE3 GPU fix) - ggml_set_sync(input_embeds_normed); - // Apply hidden_norm to g_embeddings ggml_tensor * g_embeddings_normed = build_norm(g_embeddings, model.layers[0].eagle3_hidden_norm, NULL, From 5a79c1900f9ed31be400b827424890b774be5dfb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 17 Dec 2025 15:49:03 +0200 Subject: [PATCH 3/9] eagle3 : improve naming --- common/speculative.cpp | 2 +- 
include/llama.h | 4 +- src/llama-context.cpp | 42 ++++----- src/llama-graph.h | 8 +- src/models/eagle3.cpp | 208 ++++++++++++++++++++--------------------- src/models/models.h | 2 + 6 files changed, 134 insertions(+), 132 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 058e75b7961..4f97d464ddb 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -307,7 +307,7 @@ static llama_tokens gen_eagle3_draft( /*.n_tokens =*/ n_new, /*.token =*/ nullptr, /*.embd =*/ const_cast(features), - /*.pos =*/ nullptr, + /*.pos =*/ nullptr, /*.n_seq_id =*/ nullptr, /*.seq_id =*/ nullptr, /*.logits =*/ nullptr, diff --git a/include/llama.h b/include/llama.h index caded2c83e4..7d26eebcbd8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -364,7 +364,7 @@ extern "C" { bool kv_unified; // use a unified buffer across the input sequences when computing the attention // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix // ref: https://github.com/ggml-org/llama.cpp/pull/14363 - + // EAGLE3 extraction configuration // When eagle3_model is set, layer extraction is automatically enabled const struct llama_model * eagle3_model; // EAGLE3 model to read extract_layers configuration from @@ -876,7 +876,7 @@ extern "C" { // Returns NULL if no features are available // Format: [3*n_embd, n_tokens] - use model.hparams.n_embd and batch.n_tokens for dimensions LLAMA_API const float * llama_get_eagle3_target_features(struct llama_context * ctx); - + // Set g_embeddings from EAGLE3 encoder output for decoder input // g_embd: pointer to encoder output embeddings LLAMA_API void llama_set_eagle3_g_embeddings( diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a921112df2d..b4f6bb5b997 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -353,7 +353,7 @@ llama_context::llama_context( // Allocate tensors array for extraction eagle3.extract_tensors.resize(eagle3.extract_layer_indices.size(), nullptr); - + LLAMA_LOG_INFO("%s: EAGLE3 extraction enabled for layers [%d, %d, %d]\n", __func__, eagle3.extract_layer_indices[0], eagle3.extract_layer_indices[1], @@ -879,7 +879,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll //const auto t_start_us = ggml_time_us(); res->set_inputs(&ubatch); - + // EAGLE3: Fill g_embeddings for decoder input if (model.arch == LLM_ARCH_EAGLE3 && gtype == LLM_GRAPH_TYPE_DECODER && !eagle3.g_embeddings.empty()) { ggml_tensor * g_embd = ggml_graph_get_tensor(gf, "inp_g_embeddings"); @@ -1265,32 +1265,32 @@ int llama_context::decode(const llama_batch & batch_inp) { if (n_outputs) { GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size); - + // EAGLE3: Map draft vocab to target vocab if (model.arch == LLM_ARCH_EAGLE3 && model.d2t) { static thread_local std::vector eagle3_d2t_map; static thread_local std::vector eagle3_draft_logits; - + const int64_t draft_vocab_size = t_logits->ne[0]; const uint32_t last_idx = n_outputs - 1; - + // Load d2t mapping once (on first call) if (eagle3_d2t_map.empty()) { eagle3_d2t_map.resize(model.d2t->ne[0]); ggml_backend_tensor_get(model.d2t, eagle3_d2t_map.data(), 0, eagle3_d2t_map.size() * sizeof(int64_t)); } - + // Read only the last token's draft logits eagle3_draft_logits.resize(draft_vocab_size); const size_t last_offset = last_idx * draft_vocab_size * sizeof(float); ggml_backend_tensor_get_async(backend_res, t_logits, 
eagle3_draft_logits.data(), last_offset, draft_vocab_size * sizeof(float)); synchronize(); - - + + // Map only the last token's draft logits to target vocab float * last_logits_out = logits_out + last_idx * n_vocab; std::fill(last_logits_out, last_logits_out + n_vocab, -std::numeric_limits::infinity()); - + for (int64_t j = 0; j < draft_vocab_size; j++) { const int64_t target_id = j + eagle3_d2t_map[j]; GGML_ASSERT(target_id >= 0 && target_id < n_vocab); @@ -1656,7 +1656,7 @@ llm_graph_cb llama_context::graph_get_cb() const { if (cparams.eagle3_extract_enabled) { static constexpr const char * prefix = "eagle3_extract_"; static constexpr size_t prefix_len = 15; // strlen("eagle3_extract_") - + if (strncmp(name, prefix, prefix_len) == 0) { // Parse the extraction index from the name (e.g., "eagle3_extract_0" -> 0) size_t extract_idx = 0; @@ -1667,7 +1667,7 @@ llm_graph_cb llama_context::graph_get_cb() const { eagle3.extract_tensors[extract_idx] = cur; LLAMA_LOG_DEBUG("%s: EAGLE3 stored tensor reference for extraction: " "index=%zu, layer=%d, target_layer=%d, tensor=%s\n", - __func__, extract_idx, il, + __func__, extract_idx, il, eagle3.extract_layer_indices[extract_idx], name); } } @@ -1702,36 +1702,36 @@ void llama_context::extract_eagle3_features(const llama_ubatch & ubatch) { const int64_t n_tokens = ubatch.n_tokens; const int64_t n_embd = model.hparams.n_embd; const size_t n_layers = eagle3.extract_tensors.size(); - + // Allocate storage for concatenated features const int64_t n_embd_concat = n_embd * n_layers; eagle3.target_features.resize(n_embd_concat * n_tokens); - + // Temporary buffer to hold layer features before transposing static thread_local std::vector temp_layer_features; temp_layer_features.resize(n_embd * n_tokens); - + LLAMA_LOG_DEBUG("%s: Start to extract EAGLE3 features: %zu layers, %lld tokens, %lld embd\n", __func__, n_layers, (long long)n_tokens, (long long)n_embd); - + // Extract each layer's features and interleave into token-major layout for (size_t layer_idx = 0; layer_idx < n_layers; ++layer_idx) { ggml_tensor * tensor = eagle3.extract_tensors[layer_idx]; GGML_ASSERT(tensor != nullptr && "EAGLE3 extraction tensor is null"); - + // Get the backend where this tensor is stored ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), tensor); GGML_ASSERT(backend != nullptr && "EAGLE3 tensor has no backend"); - + // Verify tensor shape: should be [n_embd, n_tokens] GGML_ASSERT(tensor->ne[0] == n_embd && tensor->ne[1] == n_tokens && "EAGLE3 extraction tensor has unexpected shape"); - + // Get layer features to temp buffer const size_t size_bytes = n_embd * n_tokens * sizeof(float); ggml_backend_tensor_get_async(backend, tensor, temp_layer_features.data(), 0, size_bytes); ggml_backend_sched_synchronize(sched.get()); - + // Then copy to correct position in target_features // target_features layout: [token_0_all_layers, token_1_all_layers, ...] 
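         // e.g. with n_layers = 3, token t's block starts at offset t * n_embd_concat,
         // and the slice copied from extract layer l begins at t * n_embd_concat + l * n_embd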
// Each token has [layer_0_embd, layer_1_embd, layer_2_embd] @@ -1743,7 +1743,7 @@ void llama_context::extract_eagle3_features(const llama_ubatch & ubatch) { std::memcpy(dest, src, n_embd * sizeof(float)); } } - + } // @@ -3235,7 +3235,7 @@ const float * llama_context::get_eagle3_target_features() const { void llama_context::set_eagle3_g_embeddings(const float * g_embd, int32_t n_embd, int32_t n_tokens) { GGML_ASSERT(g_embd != nullptr && "g_embeddings cannot be null"); GGML_ASSERT(n_embd > 0 && n_tokens > 0 && "invalid dimensions"); - + const size_t size = n_embd * n_tokens; eagle3.g_embeddings.resize(size); std::memcpy(eagle3.g_embeddings.data(), g_embd, size * sizeof(float)); diff --git a/src/llama-graph.h b/src/llama-graph.h index 617ea154c34..69df6b1f4e3 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -74,18 +74,18 @@ struct llama_cross { struct llama_eagle3 { // Configuration: which layers to extract from target model std::vector extract_layer_indices; - + // Extracted features from target model (for encoder input) // Concatenated [layer_l, layer_m, layer_h] embeddings // Shape: [n_layers * n_embd, n_tokens] where n_layers = extract_layer_indices.size() std::vector target_features; - + // Encoder output (for decoder input) std::vector g_embeddings; - + // Tensor references for feature extraction from target model std::vector extract_tensors; - + // Clear all stored data void clear() { target_features.clear(); diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index dea887bdd39..629d89d3270 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -1,103 +1,109 @@ #include "models.h" +ggml_tensor * llm_build_eagle3_encode::build_inp_embd() const { + const int64_t n_embd_target_features = 3 * hparams.eagle3_target_hidden_size; + + ggml_tensor * cur = nullptr; + + // Input: Target model features (3 layers concatenated: low, mid, high) + // Data will be provided via ubatch->embd in encode_eagle3_features() + auto inp_target = std::make_unique(); + inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_target_features, n_tokens); + ggml_set_input(inp_target->embd); + + cur = inp_target->embd; + cb(cur, "inp_embd", -1); + + res->add_input(std::move(inp_target)); + + return cur; +} + // EAGLE3 Encoder: processes target model features through feature fusion layer // Input: target_features e.g. [12288, n_tokens] from target model layers low, middle, high // Output: g_embeddings e.g. [4096, n_tokens] stored in context llm_build_eagle3_encode::llm_build_eagle3_encode(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + ggml_tensor * cur = nullptr; - const int64_t n_embd_target_features = 3 * hparams.eagle3_target_hidden_size; - - ggml_tensor * cur; + cur = build_inp_embd(); - // Input: Target model features (3 layers concatenated: low, mid, high) - // Data will be provided via ubatch->embd in encode_eagle3_features() - auto inp_target = std::make_unique(); - inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_target_features, n_tokens); - ggml_set_input(inp_target->embd); - ggml_tensor * target_features = inp_target->embd; - res->add_input(std::move(inp_target)); - cb(target_features, "inp_target_features", -1); + // Feature fusion layer + cur = build_lora_mm(model.fc, cur); + cb(cur, "fc_out", -1); - // Feature fusion layer - ggml_tensor * fused_target = build_lora_mm(model.fc, target_features); - cb(fused_target, "fc_out", -1); + // Output: g_embeddings e.g. 
[4096, n_tokens] + res->t_embd = cur; - // Output: g_embeddings e.g. [4096, n_tokens] - cur = fused_target; - res->t_embd = cur; - - ggml_build_forward_expand(gf, cur); + ggml_build_forward_expand(gf, cur); } // EAGLE3 Decoder: processes draft tokens using g_embeddings from encoder // Input: draft tokens + g_embeddings from encoder // Output: draft logits llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; - - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_layer == 1); // EAGLE-3 has only one decoder layer - - ggml_tensor * cur; - ggml_tensor * inpL; - - // EAGLE3 Decoder receives: - // 1. Token embeddings (e.g.from EAGLE3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B) - // 2. g_embeddings from encoder - // Choose token_embd_eagle3: prefer EAGLE3's own if available (Llama 3.3 70B), else use target's (Llama 3.1 8B) - ggml_tensor * token_embd_eagle3 = (model.tok_embd != nullptr) ? model.tok_embd : model.target_tok_embd; - GGML_ASSERT(token_embd_eagle3 != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)"); - ggml_tensor * input_embeds = build_inp_embd(token_embd_eagle3); - cb(input_embeds, "token_embd_eagle3", -1); - ggml_tensor * g_embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - ggml_set_input(g_embeddings); - ggml_set_name(g_embeddings, "inp_g_embeddings"); - cb(g_embeddings, "inp_g_embeddings", -1); - - // Store raw g_embeddings as residual - ggml_tensor * residual = g_embeddings; + const int64_t n_embd_head = hparams.n_embd_head_v; - // Apply input_layernorm to the token embeddings - ggml_tensor * input_embeds_normed = build_norm(input_embeds, - model.layers[0].attn_norm, NULL, - LLM_NORM_RMS, 0); - cb(input_embeds_normed, "input_layernorm", -1); - - // Apply hidden_norm to g_embeddings - ggml_tensor * g_embeddings_normed = build_norm(g_embeddings, - model.layers[0].eagle3_hidden_norm, NULL, - LLM_NORM_RMS, -1); - cb(g_embeddings_normed, "g_embeddings_normed", -1); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_layer == 1); // EAGLE-3 has only one decoder layer - // Concatenate normalized input_embeds and normalized g_embeddings - cur = ggml_concat(ctx0, input_embeds_normed, g_embeddings_normed, 0); - cb(cur, "concat_embeds_g", -1); - - inpL = cur; + ggml_tensor * cur; + ggml_tensor * inpL; - // inp_pos - contains the positions - ggml_tensor * inp_pos = build_inp_pos(); + // EAGLE3 Decoder receives: + // 1. Token embeddings (e.g.from EAGLE3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B) + // 2. g_embeddings from encoder + // Choose token_embd_eagle3: prefer EAGLE3's own if available (Llama 3.3 70B), else use target's (Llama 3.1 8B) + ggml_tensor * token_embd_eagle3 = (model.tok_embd != nullptr) ? model.tok_embd : model.target_tok_embd; + GGML_ASSERT(token_embd_eagle3 != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)"); + ggml_tensor * inp_embd = build_inp_embd(token_embd_eagle3); + cb(inp_embd, "inp_embd", -1); - auto * inp_attn = build_attn_inp_kv(); + // TODO: refactor into llm_graph_input + ggml_tensor * inp_g = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_input(inp_g); + cb(inp_g, "inp_g_embeddings", -1); // TODO: do not change the name! 
refactor into llm_graph_input - ggml_tensor * inp_out_ids = build_inp_out_ids(); + inpL = inp_g; - const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); - // Single decoder layer (il = 0) - const int il = 0; - { - // inpL is the concatenated input (normalized input_embeds + normalized g_embeddings) + auto * inp_attn = build_attn_inp_kv(); + + const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + // Single decoder layer (il = 0) + const int il = 0; + { + // inpL is the concatenated input (normalized inp_embd + normalized inp_g) ggml_tensor * inpSA = inpL; + // Apply input_layernorm to the token embeddings + ggml_tensor * embd_norm = build_norm(inp_embd, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(embd_norm, "embd_norm", il); + + // Apply hidden_norm to inp_g + ggml_tensor * g_norm = build_norm(inp_g, + model.layers[il].eagle3_hidden_norm, NULL, + LLM_NORM_RMS, -1); + cb(g_norm, "g_norm", il); + + // Concatenate normalized inp_embd and normalized inp_g + cur = ggml_concat(ctx0, embd_norm, g_norm, il); + cb(cur, "concat_embd", il); + // Self-attention with concatenated input - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, inpL); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, inpL); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, inpL); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -127,25 +133,19 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); if (inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - residual = ggml_get_rows(ctx0, residual, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } // Add residual and update it - ggml_tensor * attn_with_residual = ggml_add(ctx0, cur, residual); - cb(attn_with_residual, "attn_with_residual", il); - - // Update residual - residual = attn_with_residual; - + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + // Apply FFN norm to the sum - ggml_tensor * ffn_inp = build_norm(attn_with_residual, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); - cb(ffn_inp, "post_attn_norm", il); - - cur = ffn_inp; + cb(cur, "post_attn_norm", il); cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -154,30 +154,30 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); - + + // Output norm with residual + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "eagle3_prenorm", il); + inpL = cur; - } + } - cur = inpL; + cur = inpL; - // Output norm with residual - ggml_tensor * final_with_residual = ggml_add(ctx0, cur, residual); - cb(final_with_residual, "eagle3_prenorm", -1); - - // Output prenorm state (for next token's g_embeddings in autoregressive generation) - ggml_set_output(final_with_residual); - res->t_embd = final_with_residual; - - cur = build_norm(final_with_residual, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, 
"result_norm", -1); + // Output prenorm state (for next token's g_embeddings in autoregressive generation) + ggml_set_output(cur); + res->t_embd = cur; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); - // lm_head - projects to draft vocabulary - cur = build_lora_mm(model.output, cur); + // lm_head - projects to draft vocabulary + cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - res->t_logits = cur; + cb(cur, "result_output", -1); + res->t_logits = cur; - ggml_build_forward_expand(gf, cur); -} \ No newline at end of file + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/models.h b/src/models/models.h index 653c962d191..a6d1a2fccf2 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -152,6 +152,8 @@ struct llm_build_dream : public llm_graph_context { struct llm_build_eagle3_encode : public llm_graph_context { llm_build_eagle3_encode(const llama_model & model, const llm_graph_params & params); +private: + ggml_tensor * build_inp_embd() const; }; struct llm_build_eagle3_decode : public llm_graph_context { From c0d99e65d2d27f44df7f16e98dc7f28b6fe832cb Mon Sep 17 00:00:00 2001 From: ruixiangw Date: Thu, 8 Jan 2026 23:49:06 +0000 Subject: [PATCH 4/9] add eagle3 support for Qwen3 series models --- convert_hf_to_gguf.py | 9 +++++---- src/models/qwen3.cpp | 11 +++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a9e17ee1faf..7ef9ffb27b0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2395,6 +2395,7 @@ def prepare_tensors(self): "VLlama3ForCausalLM", "LlavaForConditionalGeneration", "VoxtralForConditionalGeneration", + "LlamaForCausalLMEagle3", "LlamaModel") class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA @@ -2477,10 +2478,6 @@ def set_vocab(self): # Llama 3 self._set_vocab_gpt2() - # Restore original dir_model for EAGLE-3 - if hasattr(self, 'is_eagle3') and self.is_eagle3: - self.dir_model = original_dir_model - # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) if self.hparams.get("vocab_size", 32000) == 32016: special_vocab = gguf.SpecialVocab( @@ -2504,6 +2501,10 @@ def set_vocab(self): if self.hparams.get("vocab_size", 32000) == 49152: self.gguf_writer.add_add_bos_token(False) + # Restore original dir_model for EAGLE-3 + if hasattr(self, 'is_eagle3') and self.is_eagle3: + self.dir_model = original_dir_model + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index a5cfffa5314..c1f34624c03 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -21,6 +21,17 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; + // EAGLE3: Extract intermediate layer features from target model at layer INPUT + if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) { + static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"}; + for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) { + if (eagle3->extract_layer_indices[i] == il) { + cb(inpL, eagle3_extract_names[i], il); + break; + } + } + } + // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, From 71ba283a6573b3735fa07c39d6e5f8cdeb9a34ab Mon Sep 17 00:00:00 2001 From: ruixiangw Date: Fri, 9 Jan 2026 11:54:28 +0000 
Subject: [PATCH 5/9] add eagle3 support for Qwen3 MoE models --- src/models/qwen3moe.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index 888534fb347..c0b6ff5df97 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -21,6 +21,17 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; + // EAGLE3: Extract intermediate layer features from target model at layer INPUT + if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) { + static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"}; + for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) { + if (eagle3->extract_layer_indices[i] == il) { + cb(inpL, eagle3_extract_names[i], il); + break; + } + } + } + // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, From 3da288d78dc68005502481c50cb8bb3d482a6127 Mon Sep 17 00:00:00 2001 From: ruixiangw Date: Sat, 10 Jan 2026 14:09:50 +0000 Subject: [PATCH 6/9] eagle3: load lm_head from target model if not in draft model when convert GGUF --- convert_hf_to_gguf.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7ef9ffb27b0..52140107fb5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2638,6 +2638,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # EAGLE3: If no lm_head in draft model, load from target model + if hasattr(self, 'is_eagle3') and self.is_eagle3 and "lm_head.weight" not in self.model_tensors: + from safetensors import safe_open + for sf_file in self.target_model_dir.glob("*.safetensors"): + with safe_open(sf_file, framework="pt") as f: + if "lm_head.weight" in f.keys(): + lm_head = f.get_tensor("lm_head.weight") + logger.info(f"EAGLE3: No lm_head in draft model, loaded lm_head from {sf_file.name}, shape = {lm_head.shape}") + yield ("output.weight", lm_head) + break + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): if rope_params.get("rope_type", '').lower() == "llama3": base = rope_params.get("rope_theta", 10000.0) From 13a9f31de3c4112c65693db3ed3e08223a069365 Mon Sep 17 00:00:00 2001 From: ruixiangw Date: Sat, 10 Jan 2026 18:30:19 +0000 Subject: [PATCH 7/9] eagle3: make d2t mapping optional --- src/llama-model.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f4e22bdda8d..287bfe7f142 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6464,20 +6464,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_target_features = 3 * hparams.eagle3_target_hidden_size; const int64_t n_embd_attn_input = 2 * n_embd; - // Get vocab size from the d2t tensor in the GGUF file - // d2t: draft to target mapping (size = draft_vocab_size) + // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if EAGLE3 has different vocab_size than target) + // d2t: draft to target vocabulary mapping + int64_t n_draft_vocab = n_vocab; // Default: same as target vocab const struct ggml_tensor * d2t_meta = ml.get_tensor_meta("d2t"); - if (!d2t_meta) { - throw std::runtime_error("EAGLE3 model requires 'd2t' tensor but it 
was not found in the model file"); + if (d2t_meta) { + n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size + d2t = create_tensor(tn(LLM_TENSOR_EAGLE3_D2T), {n_draft_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); + } else { + d2t = nullptr; // no d2t, use default vocab size + LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); } - const int64_t n_draft_vocab = d2t_meta->ne[0]; // Feature fusion layer: projects 3 target layers to draft hidden size fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_target_features, n_embd}, 0); - // Draft to target vocabulary mapping tensor - d2t = create_tensor(tn(LLM_TENSOR_EAGLE3_D2T), {n_draft_vocab}, 0); - // Output layer (uses draft vocab size) output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, 0); From 75883cde73fbbd0792cd578cb572ab4382d7b8c3 Mon Sep 17 00:00:00 2001 From: ruixiangw Date: Sat, 10 Jan 2026 18:33:41 +0000 Subject: [PATCH 8/9] eagle3: add support for gpt-oss-120B eagle3 --- src/models/openai-moe-iswa.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/models/openai-moe-iswa.cpp b/src/models/openai-moe-iswa.cpp index 96596709eec..08cc41f3c11 100644 --- a/src/models/openai-moe-iswa.cpp +++ b/src/models/openai-moe-iswa.cpp @@ -16,6 +16,17 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; + // EAGLE3: Extract intermediate layer features from target model at layer INPUT + if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) { + static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"}; + for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) { + if (eagle3->extract_layer_indices[i] == il) { + cb(inpL, eagle3_extract_names[i], il); + break; + } + } + } + // norm cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, From 7b78bfa9845f3de31e634809a4fdbaf10000bc29 Mon Sep 17 00:00:00 2001 From: ruixiangw Date: Fri, 16 Jan 2026 00:54:14 +0000 Subject: [PATCH 9/9] eagle3: add support for RedHtAI eagle3 speculator series models --- convert_hf_to_gguf.py | 17 ++++++++++++++++- gguf-py/gguf/constants.py | 1 + src/llama-arch.cpp | 5 +++-- src/llama-arch.h | 1 + src/llama-hparams.h | 3 +++ src/llama-model.cpp | 9 ++++++++- src/models/eagle3.cpp | 9 ++++++--- 7 files changed, 38 insertions(+), 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 52140107fb5..2babd7f9f08 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2396,6 +2396,8 @@ def prepare_tensors(self): "LlavaForConditionalGeneration", "VoxtralForConditionalGeneration", "LlamaForCausalLMEagle3", + "Eagle3Speculator", + "Eagle3DraftModel", "LlamaModel") class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA @@ -2445,6 +2447,11 @@ def __init__(self, *args, **kwargs): logger.info(f"EAGLE3: target_hidden_size = {target_hidden_size} (from target model config)") self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size) + # Eagle3Speculator norm_before_residual specific handling + norm_before_residual = eagle3_raw_config.get("norm_before_residual", False) + logger.info(f"EAGLE3: norm_before_residual = 
{norm_before_residual} (from EAGLE3 config)") + self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual) + def set_vocab(self): # For EAGLE-3 models, use tokenizer from target model if provided if hasattr(self, 'is_eagle3') and self.is_eagle3: @@ -2528,15 +2535,23 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None): def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: tensors = super().index_tensors(remote_hf_model_id) + + # Handle Eagle3Speculator nested config + if "transformer_layer_config" in self.hparams: + self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]} + # EAGLE-3 detection: check hparams directly (before self.is_eagle3 is set) if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: - logger.info("EAGLE-3: Renaming midlayer.* to model.layers.0.*") + logger.info("EAGLE-3: Renaming midlayer.* or layers.0.* to model.layers.0.*") new_tensors = {} # EAGLE-3: rename midlayer.* to model.layers.0.* for compatibility with llama model for name, gen in tensors.items(): if name.startswith("midlayer."): new_name = "model.layers.0." + name[len("midlayer."):] new_tensors[new_name] = gen + elif name.startswith("layers.0."): # layers.0.* -> model.layers.0.* (Eagle3Speculator format) + new_name = "model." + name + new_tensors[new_name] = gen else: new_tensors[name] = gen return new_tensors diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b1160ca26d8..2ae5094619d 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -149,6 +149,7 @@ class LLM: DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" EAGLE3_EXTRACT_LAYERS = "{arch}.extract_layers" EAGLE3_TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size" + EAGLE3_NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual" class Attention: HEAD_COUNT = "{arch}.attention.head_count" diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 4caa5f77aee..8304c636155 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -248,8 +248,9 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, - { LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" }, - { LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" }, + { LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" }, { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, // sentence-transformers dense modules feature dims diff --git a/src/llama-arch.h b/src/llama-arch.h index 3e731b5005b..36cad138a86 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -292,6 +292,7 @@ enum llm_kv { LLM_KV_EAGLE3_EXTRACT_LAYERS, LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, + LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, LLM_KV_SHORTCONV_L_CACHE, diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 9272c728e31..f8ed7f364c1 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -196,6 +196,9 @@ struct llama_hparams { // EAGLE3 draft model - target model hidden size uint32_t eagle3_target_hidden_size = 0; + // EAGLE3 draft model - apply hidden_norm before storing residual + bool eagle3_norm_before_residual = false; + // needed by encoder-decoder models (e.g. 
T5, FLAN-T5) // ref: https://github.com/ggerganov/llama.cpp/pull/8141 llama_token dec_start_token_id = LLAMA_TOKEN_NULL; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 287bfe7f142..4879376aefa 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2260,7 +2260,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.eagle3_target_hidden_size); LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__, hparams.eagle3_target_hidden_size, hparams.n_embd); - + + // EAGLE3 norm_before_residual (optional, default false) + // compatible with Readhat eagle3 speculator model + ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.eagle3_norm_before_residual, false); + if (hparams.eagle3_norm_before_residual) { + LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__); + } + type = LLM_TYPE_UNKNOWN; } break; case LLM_ARCH_COGVLM: diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 629d89d3270..4f9410b3602 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -77,9 +77,6 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons // Single decoder layer (il = 0) const int il = 0; { - // inpL is the concatenated input (normalized inp_embd + normalized inp_g) - ggml_tensor * inpSA = inpL; - // Apply input_layernorm to the token embeddings ggml_tensor * embd_norm = build_norm(inp_embd, model.layers[il].attn_norm, NULL, @@ -92,6 +89,12 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons LLM_NORM_RMS, -1); cb(g_norm, "g_norm", il); + // norm_before_residual: determines what goes into the residual connection (compatible with Readhat eagle3 speculator model) + // - false (default): use raw inp_g for residual + // - true: use normalized g_norm for residual + // inpL is the concatenated input (normalized inp_embd + normalized inp_g) + ggml_tensor * inpSA = hparams.eagle3_norm_before_residual ? g_norm : inpL; + // Concatenate normalized inp_embd and normalized inp_g cur = ggml_concat(ctx0, embd_norm, g_norm, il); cb(cur, "concat_embd", il);