diff --git a/common/arg.cpp b/common/arg.cpp index 9c85696ebdb..ed320a4563b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3004,7 +3004,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, bool value) { params.use_jinja = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" @@ -3035,7 +3035,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.chat_template = value; } - ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"--chat-template-file"}, "JINJA_TEMPLATE_FILE", string_format( @@ -3346,6 +3346,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.p_min = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN")); + add_opt(common_arg( + {"--eagle3"}, + "use EAGLE3 speculative decoding with the draft model", + [](common_params & params) { + params.speculative.eagle3 = true; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-cd", "--ctx-size-draft"}, "N", string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), diff --git a/common/common.h b/common/common.h index b284244530a..612a0a062e5 100644 --- a/common/common.h +++ b/common/common.h @@ -280,10 +280,13 @@ struct common_params_speculative { struct common_params_model mparams_dft; + llama_model * model_tgt = nullptr; // the target model llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts llama_context_params cparams_dft; // these are the parameters for the draft llama_context + bool eagle3 = false; // use EAGLE3 speculative decoding + int32_t n_ctx = 0; // draft context size int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) diff --git a/common/speculative.cpp b/common/speculative.cpp index 3e68c38e49c..53ea52e7400 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -47,6 +47,7 @@ struct common_speculative_config { const common_params_speculative & p = common_params_speculative{}) : type(t), params(p) {} }; + static bool common_speculative_are_compatible( const llama_model * model_tgt, const llama_model * model_dft) { @@ -210,7 +211,9 @@ struct common_speculative_state_draft : public common_speculative_state { ~common_speculative_state_draft() override { llama_perf_context_print(ctx_dft); - llama_free(ctx_dft); + if (ctx_dft) { + llama_free(ctx_dft); + } common_sampler_free(smpl); @@ -228,11 +231,11 @@ struct common_speculative_state_draft : public common_speculative_state { llama_tokens & result) override { auto * spec = this; - auto 
& batch = spec->batch;
-        auto & ctx_tgt = spec->ctx_tgt;
-        auto & ctx_dft = spec->ctx_dft;
-        auto & smpl = spec->smpl;
-        auto & prompt_dft = spec->prompt_dft;
+        auto & batch      = spec->batch;
+        auto & ctx_tgt    = spec->ctx_tgt;
+        auto & ctx_dft    = spec->ctx_dft;
+        auto & smpl       = spec->smpl;
+        auto & prompt_dft = spec->prompt_dft;
 
         auto * mem_dft = llama_get_memory(ctx_dft);
 
@@ -438,7 +441,52 @@ struct common_speculative_state_draft : public common_speculative_state {
 };
 
 struct common_speculative_state_eagle3 : public common_speculative_state {
-    common_speculative_state_eagle3(enum common_speculative_type type) : common_speculative_state(type) {}
+    llama_context * ctx_tgt;
+
+    common_sampler * smpl;
+
+    llama_batch batch;
+
+    struct llama_context * ctx_dft_enc = nullptr;
+    struct llama_context * ctx_dft_dec = nullptr;
+
+    int32_t eagle3_n_past = 0; // number of verified positions in the decoder KV cache
+
+    common_speculative_state_eagle3(
+            enum common_speculative_type type,
+            llama_context * ctx_tgt,
+            llama_context * ctx_dft_enc,
+            llama_context * ctx_dft_dec)
+        : common_speculative_state(type)
+        , ctx_tgt(ctx_tgt)
+        , ctx_dft_enc(ctx_dft_enc)
+        , ctx_dft_dec(ctx_dft_dec)
+    {
+        batch = llama_batch_init(llama_n_batch(ctx_dft_dec), 0, 1);
+
+        // Initialize sampler for EAGLE3 decoder
+        common_params_sampling params;
+        params.no_perf = false;
+        params.top_k = 10; // set to 1 for greedy sampling (argmax) to match vLLM's default behavior, but >1 generally gives a higher acceptance rate for EAGLE3
+        params.samplers = { COMMON_SAMPLER_TYPE_TOP_K };
+        smpl = common_sampler_init(llama_get_model(ctx_dft_dec), params);
+    }
+
+    ~common_speculative_state_eagle3() override {
+        llama_perf_context_print(ctx_dft_dec);
+
+        if (ctx_dft_dec) {
+            llama_free(ctx_dft_dec);
+        }
+
+        if (ctx_dft_enc) {
+            llama_free(ctx_dft_enc);
+        }
+
+        common_sampler_free(smpl);
+
+        llama_batch_free(batch);
+    }
 
     void begin(const llama_tokens & prompt) override {
         GGML_UNUSED(prompt);
@@ -448,12 +496,97 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
             const common_params_speculative & params,
             const llama_tokens & prompt_tgt,
             llama_token id_last,
-            llama_tokens & draft_tokens) override {
-        // TODO: implement
-        GGML_UNUSED(params);
-        GGML_UNUSED(prompt_tgt);
-        GGML_UNUSED(id_last);
-        GGML_UNUSED(draft_tokens);
+            llama_tokens & result) override {
+        auto * spec = this;
+
+        auto & batch       = spec->batch;
+        auto & ctx_tgt     = spec->ctx_tgt;
+        auto & ctx_dft_enc = spec->ctx_dft_enc;
+        auto & ctx_dft_dec = spec->ctx_dft_dec;
+        auto & smpl        = spec->smpl;
+
+        //result = gen_eagle3_draft(spec, params, prompt_tgt, id_last);
+        const int n_embd = llama_model_n_embd(llama_get_model(ctx_dft_enc));
+        const int n = (int)prompt_tgt.size();
+        const int n_new = n - spec->eagle3_n_past;
+
+        GGML_ASSERT(n >= 1 && "prompt_tgt is empty");
+        GGML_ASSERT(n_new >= 1 && "must have at least 1 new token");
+
+        // Clear draft positions from decoder KV cache [n_past, inf)
+        llama_memory_seq_rm(llama_get_memory(ctx_dft_dec), 0, spec->eagle3_n_past, -1);
+
+        // Encoder: features → g_embeddings
+        const float * features = llama_get_eagle3_target_features(ctx_tgt);
+        GGML_ASSERT(features && "no target features");
+
+        llama_batch enc_batch = {
+            /*.n_tokens =*/ n_new,
+            /*.token    =*/ nullptr,
+            /*.embd     =*/ const_cast<float *>(features),
+            /*.pos      =*/ nullptr,
+            /*.n_seq_id =*/ nullptr,
+            /*.seq_id   =*/ nullptr,
+            /*.logits   =*/ nullptr,
+        };
+        GGML_ASSERT(llama_encode(ctx_dft_enc, enc_batch) == 0);
+
+        const float * g_embd = llama_get_embeddings(ctx_dft_enc);
+        GGML_ASSERT(g_embd && "encoder output failed");
+
+        // Decoder batch: process new tokens with KV cache reuse
+        llama_set_eagle3_g_embeddings(ctx_dft_dec, g_embd, n_embd, n_new);
+
+        common_batch_clear(batch);
+        for (int i = 0; i < n_new; i++) {
+            const int pos = spec->eagle3_n_past + i;
+            const llama_token tok = (pos < n - 1) ? prompt_tgt[pos + 1] : id_last;
+            common_batch_add(batch, tok, pos, {0}, true);
+        }
+
+        GGML_ASSERT(llama_decode(ctx_dft_dec, batch) == 0);
+
+        spec->eagle3_n_past = n; // update verified positions
+
+        // Sample draft tokens
+        result.clear();
+        common_sampler_reset(smpl);
+
+        // Sample and check probability (consistent with standard speculative decoding)
+        auto sample_and_check = [&](int idx) -> bool {
+            common_sampler_sample(smpl, ctx_dft_dec, idx);
+
+            const auto * cur_p = common_sampler_get_candidates(smpl, true);
+            const llama_token id = cur_p->data[0].id;
+
+            common_sampler_accept(smpl, id, true);
+            result.push_back(id);
+
+            return cur_p->data[0].p >= params.p_min;
+        };
+
+        // First draft token from batch decode
+        if (!sample_and_check(n_new - 1)) {
+            return;
+        }
+
+        // Autoregressive: use prenorm as g_embd (-1 = last output)
+        const float * prenorm = llama_get_embeddings_ith(ctx_dft_dec, -1);
+
+        for (int i = 1; i < params.n_max; i++) {
+            GGML_ASSERT(prenorm && "prenorm failed");
+            llama_set_eagle3_g_embeddings(ctx_dft_dec, prenorm, n_embd, 1);
+
+            common_batch_clear(batch);
+            common_batch_add(batch, result.back(), n - 1 + i, {0}, true);
+            GGML_ASSERT(llama_decode(ctx_dft_dec, batch) == 0);
+
+            prenorm = llama_get_embeddings_ith(ctx_dft_dec, -1);
+
+            if (!sample_and_check(0)) {
+                break;
+            }
+        }
     }
 
     void accept(uint16_t n_accepted) override {
@@ -840,11 +973,35 @@ common_speculative * common_speculative_init(
         common_params_speculative & params,
         llama_context * ctx_tgt) {
     llama_context * ctx_dft = nullptr;
+
+    llama_context * ctx_dft_enc = nullptr;
+    llama_context * ctx_dft_dec = nullptr;
+
     if (params.model_dft) {
-        ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
-        if (ctx_dft == nullptr) {
-            LOG_ERR("%s", "failed to create draft context\n");
-            return nullptr;
+        if (params.eagle3) {
+            llama_context_params params_enc = params.cparams_dft;
+            params_enc.target_model = nullptr;
+            params_enc.embeddings = true;
+            ctx_dft_enc = llama_init_from_model(params.model_dft, params_enc);
+            if (!ctx_dft_enc) {
+                LOG_ERR("failed to create EAGLE3 encoder context\n");
+                return nullptr;
+            }
+
+            llama_context_params params_dec = params.cparams_dft;
+            params_dec.target_model = params.model_tgt;
+            params_dec.embeddings = true;
+            ctx_dft_dec = llama_init_from_model(params.model_dft, params_dec);
+            if (!ctx_dft_dec) {
+                LOG_ERR("failed to create EAGLE3 decoder context\n");
+                return nullptr;
+            }
+        } else {
+            ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
+            if (ctx_dft == nullptr) {
+                LOG_ERR("%s", "failed to create draft context\n");
+                return nullptr;
+            }
         }
     }
 
@@ -852,7 +1009,7 @@ common_speculative * common_speculative_init(
     std::vector<common_speculative_config> configs = {}; // list of speculative configs to try
     {
        bool has_draft        = !params.mparams_dft.path.empty();
-        bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
+        bool has_draft_eagle3 = params.eagle3;
 
        bool has_ngram_cache  = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE);
        bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE);
 
@@ -893,10 +1050,11 @@ common_speculative * common_speculative_init(
             configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, 
params)); } if (has_draft) { - configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT, params)); - } - if (has_draft_eagle3) { - configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3, params)); + if (has_draft_eagle3) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3, params)); + } else { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT, params)); + } } } @@ -916,7 +1074,11 @@ common_speculative * common_speculative_init( break; } case COMMON_SPECULATIVE_TYPE_EAGLE3: { - impls.push_back(std::make_unique(config.type)); + impls.push_back(std::make_unique(config.type, + /* .ctx_tgt = */ ctx_tgt, + /* .ctx_dft_enc = */ ctx_dft_enc, + /* .ctx_dft_dec = */ ctx_dft_dec + )); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 843c00a8969..3dcaa0e7978 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -97,6 +97,7 @@ class ModelBase: metadata_override: Path | None dir_model_card: Path remote_hf_model_id: str | None + target_model_dir: Path | None # subclasses should define this! model_arch: gguf.MODEL_ARCH @@ -116,7 +117,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, disable_mistral_community_chat_template: bool = False, - sentence_transformers_dense_modules: bool = False): + sentence_transformers_dense_modules: bool = False, target_model_dir: Path | None = None): if type(self) is ModelBase or \ type(self) is TextModel or \ type(self) is MmprojModel: @@ -135,6 +136,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.dry_run = dry_run self.remote_hf_model_id = remote_hf_model_id self.sentence_transformers_dense_modules = sentence_transformers_dense_modules + self.target_model_dir = target_model_dir self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) self.metadata_override = metadata_override @@ -2492,6 +2494,9 @@ def prepare_tensors(self): "VLlama3ForCausalLM", "LlavaForConditionalGeneration", "VoxtralForConditionalGeneration", + "LlamaForCausalLMEagle3", + "Eagle3Speculator", + "Eagle3DraftModel", "IQuestCoderForCausalLM", "LlamaModel") class LlamaModel(TextModel): @@ -2506,7 +2511,60 @@ def __init__(self, *args, **kwargs): hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) self.origin_hf_arch = hparams.get('architectures', [None])[0] + # detect EAGLE-3 llama checkpoint + if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: + self.is_eagle3 = True + self.model_arch = gguf.MODEL_ARCH.EAGLE3 + logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture") + # Re-initialize tensor_map with EAGLE3 architecture + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + # Update gguf_writer architecture + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] + self.gguf_writer.add_architecture() + if not hasattr(self, 'target_model_dir') or not self.target_model_dir: + raise ValueError( + "EAGLE3 model requires --target-model-dir to be specified. 
" + "Please provide the path to the target model directory to read config.json" + ) + # Read both EAGLE3 raw config and target model config + with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f: + eagle3_raw_config = json.load(f) + with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f: + target_config = json.load(f) + + # EAGLE3 extract_layers + target_num_layers = target_config["num_hidden_layers"] + extract_layers = [2, target_num_layers // 2, target_num_layers - 3] + logger.info(f"EAGLE3: extract_layers = {extract_layers} (target model has {target_num_layers} layers)") + self.gguf_writer.add_array(f"{self.gguf_writer.arch}.extract_layers", extract_layers) + + # EAGLE3 target_hidden_size: prefer EAGLE3 config, fallback to target config + if "target_hidden_size" in eagle3_raw_config and eagle3_raw_config["target_hidden_size"] is not None: + target_hidden_size = eagle3_raw_config["target_hidden_size"] + logger.info(f"EAGLE3: target_hidden_size = {target_hidden_size} (from EAGLE3 config)") + else: + target_hidden_size = target_config["hidden_size"] + logger.info(f"EAGLE3: target_hidden_size = {target_hidden_size} (from target model config)") + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size) + + # Eagle3Speculator norm_before_residual specific handling + norm_before_residual = eagle3_raw_config.get("norm_before_residual", False) + logger.info(f"EAGLE3: norm_before_residual = {norm_before_residual} (from EAGLE3 config)") + self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual) + def set_vocab(self): + # For EAGLE-3 models, use tokenizer from target model if provided + if hasattr(self, 'is_eagle3') and self.is_eagle3: + if self.target_model_dir is None: + raise ValueError( + "EAGLE-3 draft model requires --target-model-dir to be specified. " + "Please provide the path to the target model directory containing the tokenizer." 
+ ) + logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}") + # Temporarily swap dir_model to load tokenizer from target model + original_dir_model = self.dir_model + self.dir_model = self.target_model_dir + if self.origin_hf_arch == "GlmasrModel": return self._set_vocab_glmedge() @@ -2550,6 +2608,10 @@ def set_vocab(self): if self.hparams.get("vocab_size", 32000) == 49152: self.gguf_writer.add_add_bos_token(False) + # Restore original dir_model for EAGLE-3 + if hasattr(self, 'is_eagle3') and self.is_eagle3: + self.dir_model = original_dir_model + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -2571,7 +2633,53 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None): _experts: list[dict[str, Tensor]] | None = None + def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: + tensors = super().index_tensors(remote_hf_model_id) + + # Handle Eagle3Speculator nested config + if "transformer_layer_config" in self.hparams: + self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]} + + # EAGLE-3 detection: check hparams directly (before self.is_eagle3 is set) + if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: + logger.info("EAGLE-3: Renaming midlayer.* or layers.0.* to model.layers.0.*") + new_tensors = {} + # EAGLE-3: rename midlayer.* to model.layers.0.* for compatibility with llama model + for name, gen in tensors.items(): + if name.startswith("midlayer."): + new_name = "model.layers.0." + name[len("midlayer."):] + new_tensors[new_name] = gen + elif name.startswith("layers.0."): # layers.0.* -> model.layers.0.* (Eagle3Speculator format) + new_name = "model." + name + new_tensors[new_name] = gen + else: + new_tensors[name] = gen + return new_tensors + else: + return tensors + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # Eagle-3 llama checkpoint special handling + if hasattr(self, 'is_eagle3') and self.is_eagle3: + # Eagle-3 llama checkpoint special weights handling + # fc.weight: feature fusion layer + if name == "fc.weight": + return [(name, data_torch)] + # d2t: draft to target vocabulary mapping + elif name == "d2t": + # Skip parent class processing (store for manual handling in prepare_tensors) + if not hasattr(self, '_eagle3_int_tensors'): + self._eagle3_int_tensors = {} + self._eagle3_int_tensors[name] = data_torch + return [] + # t2d: target to draft vocabulary mapping (not used, skip completely) + elif name == "t2d": + return [] + # hidden_norm: EAGLE-3 specific layer normalization + elif name == "model.layers.0.hidden_norm.weight": + return [("blk.0.hidden_norm.weight", data_torch)] + n_head = self.find_hparam(["n_heads", "num_attention_heads"]) n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"]) @@ -2641,6 +2749,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # EAGLE3: If no lm_head in draft model, load from target model + if hasattr(self, 'is_eagle3') and self.is_eagle3 and "lm_head.weight" not in self.model_tensors: + from safetensors import safe_open + for sf_file in self.target_model_dir.glob("*.safetensors"): + with safe_open(sf_file, framework="pt") as f: + if "lm_head.weight" in f.keys(): + lm_head = f.get_tensor("lm_head.weight") + logger.info(f"EAGLE3: No lm_head 
in draft model, loaded lm_head from {sf_file.name}, shape = {lm_head.shape}") + yield ("output.weight", lm_head) + break + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): if rope_params.get("rope_type", '').lower() == "llama3": base = rope_params.get("rope_theta", 10000.0) @@ -2671,8 +2790,26 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) def prepare_tensors(self): + # EAGLE-3: collect original dtypes BEFORE parent class converts them to F32 + eagle3_original_dtypes = {} + if hasattr(self, 'is_eagle3') and self.is_eagle3: + for name, data_torch in self.get_tensors(): + if name == "d2t": + eagle3_original_dtypes[name] = data_torch.dtype + super().prepare_tensors() + if hasattr(self, 'is_eagle3') and self.is_eagle3 and hasattr(self, '_eagle3_int_tensors'): + for name, data_torch in self._eagle3_int_tensors.items(): + old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype) + # Keep as int64 to match original torch tensor dtype + data = data_torch.to(torch.int64).numpy() + data_qtype = gguf.GGMLQuantizationType.I64 + + shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}" + logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) + if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] @@ -11278,6 +11415,7 @@ class LazyTorchTensor(gguf.LazyBase): torch.float16: np.float16, torch.float32: np.float32, torch.uint8: np.uint8, + torch.int64: np.int64, } # only used when byteswapping data. Only correct size is needed @@ -11438,6 +11576,10 @@ def parse_args() -> argparse.Namespace: "--no-tensor-first-split", action="store_true", help="do not add tensors to the first split (disabled by default)" ) + parser.add_argument( + "--target-model-dir", type=str, default=None, + help="directory containing target model tokenizer (for EAGLE-3 draft models that don't have their own tokenizer)", + ) parser.add_argument( "--metadata", type=Path, help="Specify the path for an authorship metadata override file" @@ -11610,7 +11752,8 @@ def main() -> None: split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, - sentence_transformers_dense_modules=args.sentence_transformers_dense_modules + sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, + target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None ) if args.vocab_only: diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index d8b1f5a480c..336144d3b57 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -4,6 +4,7 @@ #include "speculative.h" #include "log.h" #include "llama.h" +#include "chat.h" #include #include @@ -76,13 +77,53 @@ int main(int argc, char ** argv) { return 1; } + params.speculative.model_tgt = model_tgt; params.speculative.model_dft = model_dft.get(); params.speculative.cparams_dft = common_context_params_to_llama(params_dft); + + if (params.speculative.eagle3) { + llama_set_eagle3(ctx_tgt, model_dft.get()); + } + } + + // Apply 
chat template for EAGLE3 if available, which can increase the acceptance rate
+    std::string prompt = params.prompt;
+    if (params.speculative.eagle3) {
+        auto chat_templates = common_chat_templates_init(model_tgt, params.chat_template);
+        if (common_chat_templates_was_explicit(chat_templates.get())) {
+            std::vector<common_chat_msg> chat_msgs;
+            common_chat_msg user_msg;
+            user_msg.role = "user";
+            user_msg.content = params.prompt;
+            chat_msgs.push_back(user_msg);
+
+            common_chat_templates_inputs inputs;
+            inputs.messages = chat_msgs;
+            inputs.add_generation_prompt = true;
+            prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt;
+            LOG_INF("%s: EAGLE3 chat template applied\n", __func__);
+        }
     }
 
+    int n_predict = 0;
+    int n_drafted = 0;
+    int n_accept  = 0;
+
+    // used to determine end of generation
+    bool has_eos = false;
+
+    // ================================================
+    // everything until here is standard initialization
+    // the relevant stuff for speculative decoding starts here
+
+    const auto t_enc_start = ggml_time_us();
+
+    // target model sampling context
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
+
     // Tokenize the prompt
     std::vector<llama_token> inp;
-    inp = common_tokenize(ctx_tgt, params.prompt, true, true);
+    inp = common_tokenize(ctx_tgt, prompt, true, true);
 
     if (llama_n_ctx(ctx_tgt) < (uint32_t) inp.size()) {
         LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
@@ -102,33 +143,38 @@ int main(int argc, char ** argv) {
         LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
     }
 
-    int n_predict = 0;
-    int n_drafted = 0;
-    int n_accept  = 0;
-
-    // used to determine end of generation
-    bool has_eos = false;
+    // eval the prompt
+    llama_token id_last;
+    llama_tokens prompt_tgt;
+    int n_past;
 
-    // ================================================
-    // everything until here is standard initialization
-    // the relevant stuff for speculative decoding starts here
+    // TODO: simplify
+    if (params.speculative.eagle3) {
+        // Target model decodes the full prompt and samples the first token; intermediate features are extracted
+        llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size()));
 
-    const auto t_enc_start = ggml_time_us();
+        id_last = common_sampler_sample(smpl, ctx_tgt, -1);
+        common_sampler_accept(smpl, id_last, true);
+        LOG("%s", common_token_to_piece(ctx_tgt, id_last).c_str());
+        n_predict++;
 
-    // target model sampling context
-    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
+        // all tokens currently in the target context
+        prompt_tgt.assign(inp.begin(), inp.end());
+        prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
 
-    // eval the prompt
-    llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
+        n_past = inp.size();
+    } else {
+        llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
 
-    // note: keep the last token separate! 
+ id_last = inp.back(); - // all tokens currently in the target context - llama_tokens prompt_tgt(inp.begin(), inp.end() - 1); - prompt_tgt.reserve(llama_n_ctx(ctx_tgt)); + // all tokens currently in the target context + prompt_tgt.assign(inp.begin(), inp.end() - 1); + prompt_tgt.reserve(llama_n_ctx(ctx_tgt)); - int n_past = inp.size() - 1; + n_past = inp.size() - 1; + } // init the speculator const auto & params_spec = params.speculative; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3af4fffe957..7f4bfd6236d 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -150,6 +150,9 @@ class LLM: SWIGLU_CLAMP_SHEXP = "{arch}.swiglu_clamp_shexp" DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" + EAGLE3_EXTRACT_LAYERS = "{arch}.extract_layers" + EAGLE3_TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size" + EAGLE3_NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -463,6 +466,7 @@ class MODEL_ARCH(IntEnum): RND1 = auto() PANGU_EMBED = auto() MISTRAL3 = auto() + EAGLE3 = auto() MIMO2 = auto() STEP35 = auto() LLAMA_EMBED = auto() @@ -770,6 +774,10 @@ class MODEL_TENSOR(IntEnum): NEXTN_HNORM = auto() NEXTN_SHARED_HEAD_HEAD = auto() NEXTN_SHARED_HEAD_NORM = auto() + # EAGLE3 specific tensors + EAGLE3_FC = auto() # feature fusion layer + EAGLE3_HIDDEN_NORM = auto() # hidden normalization + EAGLE3_D2T = auto() # draft to target vocabulary mapping # lfm2 audio A_ENC_NORM_CONV = auto() A_ENC_LINEAR_POS = auto() @@ -894,6 +902,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.RND1: "rnd1", MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.MISTRAL3: "mistral3", + MODEL_ARCH.EAGLE3: "eagle3", MODEL_ARCH.MIMO2: "mimo2", MODEL_ARCH.STEP35: "step35", MODEL_ARCH.LLAMA_EMBED: "llama-embed", @@ -1209,6 +1218,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm", MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head", MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm", + MODEL_TENSOR.EAGLE3_FC: "fc", + MODEL_TENSOR.EAGLE3_HIDDEN_NORM: "blk.{bid}.hidden_norm", + MODEL_TENSOR.EAGLE3_D2T: "d2t", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -3348,6 +3360,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.EAGLE3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.EAGLE3_FC, + MODEL_TENSOR.EAGLE3_HIDDEN_NORM, + MODEL_TENSOR.EAGLE3_D2T, + ], MODEL_ARCH.MIMO2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/include/llama.h b/include/llama.h index bf4e28a8be1..b04809229cb 100644 --- a/include/llama.h +++ b/include/llama.h @@ -371,6 +371,10 @@ extern "C" { // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + // EAGLE3 extraction configuration + const struct llama_model * target_model; // reference to target model + // only used to share embedding layer with eagle3 model + // [EXPERIMENTAL] // backend sampler chain configuration (make sure the caller keeps the sampler chains alive) // note: the samplers must be sampler chains (i.e. 
use llama_sampler_chain_init) @@ -686,6 +690,14 @@ extern "C" { int32_t il_start, int32_t il_end); + // + // eagle3 (tmp) + // + + LLAMA_API void llama_set_eagle3( + struct llama_context * ctx, + const struct llama_model * model); + // // Memory // @@ -885,6 +897,23 @@ extern "C" { llama_seq_id dest_seq_id, llama_state_seq_flags flags); + // + // EAGLE3 draft model support + // + + // Get pointer to target model features extracted for EAGLE3 encoder + // Returns NULL if no features are available + // Format: [3*n_embd, n_tokens] - use model.hparams.n_embd and batch.n_tokens for dimensions + LLAMA_API const float * llama_get_eagle3_target_features(struct llama_context * ctx); + + // Set g_embeddings from EAGLE3 encoder output for decoder input + // g_embd: pointer to encoder output embeddings + LLAMA_API void llama_set_eagle3_g_embeddings( + struct llama_context * ctx, + const float * g_embd, + int32_t n_embd, + int32_t n_tokens); + // // Decoding // diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2115fc4255f..0641d6d97ae 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -59,6 +59,7 @@ add_library(llama models/deepseek2.cpp models/dots1.cpp models/dream.cpp + models/eagle3.cpp models/ernie4-5-moe.cpp models/ernie4-5.cpp models/exaone.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index bd78f1e5562..7f5460817d3 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -117,6 +117,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, + { LLM_ARCH_EAGLE3, "eagle3" }, { LLM_ARCH_MIMO2, "mimo2" }, { LLM_ARCH_STEP35, "step35" }, { LLM_ARCH_LLAMA_EMBED, "llama-embed" }, @@ -262,6 +263,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, + { LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" }, + { LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" }, + { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, // sentence-transformers dense modules feature dims { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" }, @@ -512,6 +517,10 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" }, { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" }, { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" }, + // EAGLE-3 specific layers + { LLM_TENSOR_EAGLE3_HIDDEN_NORM, "blk.%d.hidden_norm" }, + { LLM_TENSOR_EAGLE3_FC, "fc" }, + { LLM_TENSOR_EAGLE3_D2T, "d2t" }, }; static std::set llm_get_tensor_names(llm_arch arch) { @@ -2261,6 +2270,28 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_VISEXP_FFN_DOWN, LLM_TENSOR_VISEXP_FFN_UP, }; + case LLM_ARCH_EAGLE3: + return { + // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own, Llama 3.1 8B EAGLE3 uses target model's) + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + // Single decoder layer (blk.0) + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + // EAGLE-3 specific layers + LLM_TENSOR_EAGLE3_HIDDEN_NORM, + LLM_TENSOR_EAGLE3_FC, + LLM_TENSOR_EAGLE3_D2T, + }; case LLM_ARCH_MIMO2: return { LLM_TENSOR_TOKEN_EMBD, @@ -2590,6 +2621,10 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, 
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + // EAGLE-3 tensors + {LLM_TENSOR_EAGLE3_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_EAGLE3_HIDDEN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_EAGLE3_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index e8263369b80..b57274d59b4 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -127,6 +127,7 @@ enum llm_arch { LLM_ARCH_MAINCODER, LLM_ARCH_KIMI_LINEAR, LLM_ARCH_UNKNOWN, + LLM_ARCH_EAGLE3, }; enum llm_kv { @@ -304,6 +305,10 @@ enum llm_kv { LLM_KV_CLASSIFIER_OUTPUT_LABELS, + LLM_KV_EAGLE3_EXTRACT_LAYERS, + LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, + LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, + LLM_KV_SHORTCONV_L_CACHE, LLM_KV_XIELU_ALPHA_N, @@ -519,6 +524,9 @@ enum llm_tensor { LLM_TENSOR_NEXTN_HNORM, LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + LLM_TENSOR_EAGLE3_FC, // eagle3: feature fusion layer + LLM_TENSOR_EAGLE3_HIDDEN_NORM, // eagle3: additional normalization layer + LLM_TENSOR_EAGLE3_D2T, // eagle3: draft to target vocabulary mapping }; enum llm_tensor_layer { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a6df893a311..692cf8fc29b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -156,6 +156,8 @@ llama_context::llama_context( cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified; + cparams.eagle3_extract_enabled = false; + // intialized later cparams.pipeline_parallel = false; @@ -1114,6 +1116,32 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } +void llama_context::set_eagle3(const llama_model * model) { + // Initialize EAGLE3 feature extraction configuration + cparams.eagle3_extract_enabled = !!model; + if (!cparams.eagle3_extract_enabled) { + return; + } + + sched_need_reserve = true; + + const auto & eagle3_hparams = model->hparams; + + // Copy feature extraction layer indices from EAGLE3 model's hparams + eagle3.extract_layer_indices.assign( + eagle3_hparams.eagle3_extract_layers.begin(), + eagle3_hparams.eagle3_extract_layers.end() + ); + + // Allocate tensors array for extraction + eagle3.extract_tensors.resize(eagle3.extract_layer_indices.size(), nullptr); + + LLAMA_LOG_INFO("%s: EAGLE3 extraction enabled for layers [%d, %d, %d]\n", __func__, + eagle3.extract_layer_indices[0], + eagle3.extract_layer_indices[1], + eagle3.extract_layer_indices[2]); +} + llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { if (mctx && !mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); @@ -1163,6 +1191,14 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll res->set_inputs(&ubatch); + // EAGLE3: Fill g_embeddings for decoder input + if (model.arch == LLM_ARCH_EAGLE3 && gtype == LLM_GRAPH_TYPE_DECODER && !eagle3.g_embeddings.empty()) { + ggml_tensor * g_embd = ggml_graph_get_tensor(gf, "inp_g_embeddings"); + if (g_embd) { + ggml_backend_tensor_set(g_embd, eagle3.g_embeddings.data(), 0, ggml_nbytes(g_embd)); + } + } + //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); } @@ -1173,6 +1209,11 @@ llm_graph_result * 
llama_context::process_ubatch(const llama_ubatch & ubatch, ll return nullptr; } + // EAGLE3: Extract intermediate layer features after graph execution + if (cparams.eagle3_extract_enabled && !eagle3.extract_tensors.empty()) { + extract_eagle3_features(ubatch); + } + ret = GGML_STATUS_SUCCESS; return res; @@ -1188,7 +1229,8 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd_inp(); + // EAGLE3: use 3*target_hidden_size for concatenated features input + const int64_t n_embd = (model.arch == LLM_ARCH_EAGLE3 && batch_inp.embd) ? 3 * hparams.eagle3_target_hidden_size : hparams.n_embd; const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 @@ -1274,8 +1316,15 @@ int llama_context::encode(const llama_batch & batch_inp) { GGML_ASSERT(embd != nullptr); const uint32_t n_embd_out = hparams.n_embd_out(); - GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float)); + if (model.arch == LLM_ARCH_EAGLE3) { + // g_embeddings are stored temporarily in embd buffer + const int64_t out_embd = hparams.n_embd; + GGML_ASSERT(n_tokens * out_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens * out_embd * sizeof(float)); + } else { + GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float)); + } } break; case LLAMA_POOLING_TYPE_MEAN: case LLAMA_POOLING_TYPE_CLS: @@ -1666,7 +1715,8 @@ int llama_context::decode(const llama_batch & batch_inp) { auto * t_logits = res->get_logits(); auto * t_embd = cparams.embeddings ? 
res->get_embd() : nullptr;
 
-    if (t_embd && res->get_embd_pooled()) {
+    // For EAGLE3, don't override t_embd with t_embd_pooled - we need the prenorm value during eagle3 decoder autoregressive generation
+    if (t_embd && res->get_embd_pooled() && model.arch != LLM_ARCH_EAGLE3) {
         t_embd = res->get_embd_pooled();
     }
 
@@ -1681,7 +1731,40 @@ int llama_context::decode(const llama_batch & batch_inp) {
             if (n_outputs) {
                 GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
                 GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size);
-                ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
+
+                // EAGLE3: Map draft vocab to target vocab
+                if (model.arch == LLM_ARCH_EAGLE3 && model.d2t) {
+                    static thread_local std::vector<int64_t> eagle3_d2t_map;
+                    static thread_local std::vector<float>   eagle3_draft_logits;
+
+                    const int64_t draft_vocab_size = t_logits->ne[0];
+                    const uint32_t last_idx = n_outputs - 1;
+
+                    // Load d2t mapping once (on first call)
+                    if (eagle3_d2t_map.empty()) {
+                        eagle3_d2t_map.resize(model.d2t->ne[0]);
+                        ggml_backend_tensor_get(model.d2t, eagle3_d2t_map.data(), 0, eagle3_d2t_map.size() * sizeof(int64_t));
+                    }
+
+                    // Read only the last token's draft logits
+                    eagle3_draft_logits.resize(draft_vocab_size);
+                    const size_t last_offset = last_idx * draft_vocab_size * sizeof(float);
+                    ggml_backend_tensor_get_async(backend_res, t_logits, eagle3_draft_logits.data(), last_offset, draft_vocab_size * sizeof(float));
+                    synchronize();
+
+                    // Map only the last token's draft logits to target vocab
+                    float * last_logits_out = logits_out + last_idx * n_vocab;
+                    std::fill(last_logits_out, last_logits_out + n_vocab, -std::numeric_limits<float>::infinity());
+
+                    for (int64_t j = 0; j < draft_vocab_size; j++) {
+                        const int64_t target_id = j + eagle3_d2t_map[j];
+                        GGML_ASSERT(target_id >= 0 && target_id < n_vocab);
+                        last_logits_out[target_id] = eagle3_draft_logits[j];
+                    }
+                } else {
+                    ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
+                }
             }
         }
 
@@ -2064,7 +2147,16 @@ ggml_cgraph * llama_context::graph_reserve(
 
     auto * res = gf_res_reserve.get();
 
-    const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
+    // EAGLE3: auto-detect encoder (embeddings enabled, no target_model) or decoder (has target_model)
+    llm_graph_type gtype = LLM_GRAPH_TYPE_DEFAULT;
+    if (model.arch == LLM_ARCH_EAGLE3) {
+        if (cparams.embeddings && model.target_tok_embd == nullptr) {
+            gtype = LLM_GRAPH_TYPE_ENCODER;
+        } else if (model.target_tok_embd != nullptr) {
+            gtype = LLM_GRAPH_TYPE_DECODER;
+        }
+    }
+    const auto gparams = graph_params(res, ubatch, mctx, gtype);
 
     res->reset();
 
@@ -2105,6 +2197,7 @@ llm_graph_params llama_context::graph_params(
         /*.loras      =*/ &loras,
         /*.mctx       =*/ mctx,
         /*.cross      =*/ &cross,
+        /*.eagle3     =*/ &eagle3,
         /*.samplers   =*/ sampling.samplers,
         /*.n_outputs  =*/ n_outputs,
         /*.cb         =*/ graph_get_cb(),
@@ -2149,6 +2242,27 @@ llm_graph_cb llama_context::graph_get_cb() const {
             ggml_set_name(cur, name);
         }
 
+        // EAGLE3: Extract intermediate layer features if this is an extraction point
+        if (cparams.eagle3_extract_enabled) {
+            static constexpr const char * prefix = "eagle3_extract_";
+            static constexpr size_t prefix_len = 15; // strlen("eagle3_extract_")
+
+            if (strncmp(name, prefix, prefix_len) == 0) {
+                // Parse the extraction index from the name (e.g., "eagle3_extract_0" -> 0)
+                size_t extract_idx = 0;
+                if (sscanf(name + prefix_len, "%zu", &extract_idx) == 1 && extract_idx < eagle3.extract_tensors.size()) {
+                    // Mark as 
output tensor to ensure proper backend assignment + ggml_set_output(cur); + // Store this tensor reference for post-execution extraction + eagle3.extract_tensors[extract_idx] = cur; + LLAMA_LOG_DEBUG("%s: EAGLE3 stored tensor reference for extraction: " + "index=%zu, layer=%d, target_layer=%d, tensor=%s\n", + __func__, extract_idx, il, + eagle3.extract_layer_indices[extract_idx], name); + } + } + } + // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer; @@ -2167,6 +2281,54 @@ llm_graph_cb llama_context::graph_get_cb() const { }; } +void llama_context::extract_eagle3_features(const llama_ubatch & ubatch) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_embd = model.hparams.n_embd; + const size_t n_layers = eagle3.extract_tensors.size(); + + // Allocate storage for concatenated features + const int64_t n_embd_concat = n_embd * n_layers; + eagle3.target_features.resize(n_embd_concat * n_tokens); + + // Temporary buffer to hold layer features before transposing + static thread_local std::vector temp_layer_features; + temp_layer_features.resize(n_embd * n_tokens); + + LLAMA_LOG_DEBUG("%s: Start to extract EAGLE3 features: %zu layers, %lld tokens, %lld embd\n", + __func__, n_layers, (long long)n_tokens, (long long)n_embd); + + // Extract each layer's features and interleave into token-major layout + for (size_t layer_idx = 0; layer_idx < n_layers; ++layer_idx) { + ggml_tensor * tensor = eagle3.extract_tensors[layer_idx]; + GGML_ASSERT(tensor != nullptr && "EAGLE3 extraction tensor is null"); + + // Get the backend where this tensor is stored + ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), tensor); + GGML_ASSERT(backend != nullptr && "EAGLE3 tensor has no backend"); + + // Verify tensor shape: should be [n_embd, n_tokens] + GGML_ASSERT(tensor->ne[0] == n_embd && tensor->ne[1] == n_tokens && + "EAGLE3 extraction tensor has unexpected shape"); + + // Get layer features to temp buffer + const size_t size_bytes = n_embd * n_tokens * sizeof(float); + ggml_backend_tensor_get_async(backend, tensor, temp_layer_features.data(), 0, size_bytes); + ggml_backend_sched_synchronize(sched.get()); + + // Then copy to correct position in target_features + // target_features layout: [token_0_all_layers, token_1_all_layers, ...] + // Each token has [layer_0_embd, layer_1_embd, layer_2_embd] + for (int64_t token_idx = 0; token_idx < n_tokens; ++token_idx) { + // Source: temp_layer_features[token_idx * n_embd ... 
(token_idx + 1) * n_embd - 1] + const float * src = temp_layer_features.data() + token_idx * n_embd; + // Dest: target_features[token_idx * n_embd_concat + layer_idx * n_embd] + float * dest = eagle3.target_features.data() + token_idx * n_embd_concat + layer_idx * n_embd; + std::memcpy(dest, src, n_embd * sizeof(float)); + } + } + +} + // // state save/load // @@ -2980,6 +3142,7 @@ llama_context_params llama_context_default_params() { /*.op_offload =*/ true, /*.swa_full =*/ true, /*.kv_unified =*/ false, + /*.target_model =*/ nullptr, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, }; @@ -2995,6 +3158,12 @@ llama_context * llama_init_from_model( return nullptr; } + // Auto-setup for EAGLE3: set target embedding if target_model is provided + if (model->arch == LLM_ARCH_EAGLE3 && params.target_model) { + model->target_tok_embd = params.target_model->tok_embd; + LLAMA_LOG_INFO("%s: EAGLE3 auto-setup: using target model's embedding layer\n", __func__); + } + if (params.n_batch == 0 && params.n_ubatch == 0) { LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__); return nullptr; @@ -3251,6 +3420,16 @@ int32_t llama_apply_adapter_cvec( return res ? 0 : -1; } +// +// eagle3 (tmp) +// + +void llama_set_eagle3( + llama_context * ctx, + const llama_model * model) { + ctx->set_eagle3(model); +} + // // memory // @@ -3698,3 +3877,33 @@ void llama_opt_epoch( callback_train, callback_eval); } + +// +// EAGLE3 member functions +// + +const float * llama_context::get_eagle3_target_features() const { + GGML_ASSERT(!eagle3.target_features.empty() && "EAGLE3 target features not extracted - call llama_encode() on target model first"); + return eagle3.target_features.data(); +} + +void llama_context::set_eagle3_g_embeddings(const float * g_embd, int32_t n_embd, int32_t n_tokens) { + GGML_ASSERT(g_embd != nullptr && "g_embeddings cannot be null"); + GGML_ASSERT(n_embd > 0 && n_tokens > 0 && "invalid dimensions"); + + const size_t size = n_embd * n_tokens; + eagle3.g_embeddings.resize(size); + std::memcpy(eagle3.g_embeddings.data(), g_embd, size * sizeof(float)); +} + +// +// C API wrappers +// + +const float * llama_get_eagle3_target_features(llama_context * ctx) { + return ctx->get_eagle3_target_features(); +} + +void llama_set_eagle3_g_embeddings(llama_context * ctx, const float * g_embd, int32_t n_embd, int32_t n_tokens) { + ctx->set_eagle3_g_embeddings(g_embd, n_embd, n_tokens); +} diff --git a/src/llama-context.h b/src/llama-context.h index 8e71cdd1dc5..90f1e1e4848 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -120,6 +120,9 @@ struct llama_context { int32_t il_start, int32_t il_end); + // TODO: tmp + void set_eagle3(const llama_model * model); + // process a single ubatch with a specific graph type // if memory_context is provided, it will be applied first to the context's memory // ret contains the status of the graph computation @@ -236,6 +239,12 @@ struct llama_context { ggml_cgraph * graph_reserve( uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr); + // EAGLE3: Get pointer to target model features extracted for EAGLE3 encoder + const float * get_eagle3_target_features() const; + + // EAGLE3: Set g_embeddings from encoder output for decoder input + void set_eagle3_g_embeddings(const float * g_embd, int32_t n_embd, int32_t n_tokens); + bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler); private: @@ -247,6 +256,9 @@ struct llama_context { llm_graph_cb graph_get_cb() 
const; + // EAGLE3: Extract intermediate layer features from target model + void extract_eagle3_features(const llama_ubatch & ubatch); + // TODO: read/write lora adapters and cvec size_t state_write_data(llama_io_write_i & io); size_t state_read_data (llama_io_read_i & io); @@ -266,6 +278,9 @@ struct llama_context { llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably + mutable llama_eagle3 eagle3; // EAGLE3 draft model support - stores features from target model + // mutable because it's modified during graph building (const function) + std::unique_ptr memory; // decode output (2-dimensional array: [n_outputs][n_vocab]) diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 2da3bbd6f94..4c9f528b245 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -35,6 +35,7 @@ struct llama_cparams { bool warmup; bool op_offload; bool kv_unified; + bool eagle3_extract_enabled; // enable layer extraction for EAGLE3 speculative decoding bool pipeline_parallel; enum llama_pooling_type pooling_type; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index bba747d37b5..d95216e4d0b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -848,6 +848,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : loras (params.loras), mctx (params.mctx), cross (params.cross), + eagle3 (params.eagle3), samplers (params.samplers), cb_func (params.cb), res (params.res), diff --git a/src/llama-graph.h b/src/llama-graph.h index 1d69ff1a6fc..07fa46c7df3 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -72,6 +72,30 @@ struct llama_cross { std::vector> seq_ids_enc; }; +// EAGLE3 support - stores intermediate features from target model +struct llama_eagle3 { + // Configuration: which layers to extract from target model + std::vector extract_layer_indices; + + // Extracted features from target model (for encoder input) + // Concatenated [layer_l, layer_m, layer_h] embeddings + // Shape: [n_layers * n_embd, n_tokens] where n_layers = extract_layer_indices.size() + std::vector target_features; + + // Encoder output (for decoder input) + std::vector g_embeddings; + + // Tensor references for feature extraction from target model + std::vector extract_tensors; + + // Clear all stored data + void clear() { + target_features.clear(); + g_embeddings.clear(); + extract_tensors.clear(); + } +}; + struct llm_graph_params; // @@ -533,6 +557,7 @@ struct llm_graph_params { const llama_adapter_loras * loras; const llama_memory_context_i * mctx; const llama_cross * cross; + llama_eagle3 * eagle3; // non-const: we write extracted features here std::map samplers; @@ -741,6 +766,7 @@ struct llm_graph_context { const llama_adapter_loras * loras; const llama_memory_context_i * mctx; const llama_cross * cross; + llama_eagle3 * eagle3; // non-const: we write extracted features here std::map samplers; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6c695bdbf66..2d3dc1fbf58 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -197,6 +197,16 @@ struct llama_hparams { // qwen3vl deepstack uint32_t n_deepstack_layers = 0; + // EAGLE3 draft model - layer indices to extract from target model + // e.g., for 32-layer target: [2, 16, 29] (low, middle, high) + std::array eagle3_extract_layers = {0, 0, 0}; + + // EAGLE3 draft model - target model hidden size + uint32_t eagle3_target_hidden_size = 0; + + // EAGLE3 draft model - apply hidden_norm before storing residual + bool eagle3_norm_before_residual = false; + // needed by encoder-decoder 
models (e.g. T5, FLAN-T5) // ref: https://github.com/ggml-org/llama.cpp/pull/8141 llama_token dec_start_token_id = LLAMA_TOKEN_NULL; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 674d06c8910..b8659276918 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2372,6 +2372,35 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_EAGLE3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // EAGLE3 layer extraction configuration + // Use array (has template instantiation), then copy first 3 elements + std::array extract_layers_tmp = {}; + if (!ml.get_key_or_arr(LLM_KV_EAGLE3_EXTRACT_LAYERS, extract_layers_tmp, 3, false)) { + throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata"); + } + std::copy_n(extract_layers_tmp.begin(), 3, hparams.eagle3_extract_layers.begin()); + LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__, + hparams.eagle3_extract_layers[0], + hparams.eagle3_extract_layers[1], + hparams.eagle3_extract_layers[2]); + + // EAGLE3 target model hidden size + ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.eagle3_target_hidden_size); + LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__, + hparams.eagle3_target_hidden_size, hparams.n_embd); + + // EAGLE3 norm_before_residual (optional, default false) + // compatible with Readhat eagle3 speculator model + ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.eagle3_norm_before_residual, false); + if (hparams.eagle3_norm_before_residual) { + LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__); + } + + type = LLM_TYPE_UNKNOWN; + } break; case LLM_ARCH_COGVLM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -6816,6 +6845,64 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0); } } break; + case LLM_ARCH_EAGLE3: + { + const int64_t n_embd_target_features = 3 * hparams.eagle3_target_hidden_size; + const int64_t n_embd_attn_input = 2 * n_embd; + + // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if EAGLE3 has different vocab_size than target) + // d2t: draft to target vocabulary mapping + int64_t n_draft_vocab = n_vocab; // Default: same as target vocab + const struct ggml_tensor * d2t_meta = ml.get_tensor_meta("d2t"); + if (d2t_meta) { + n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size + d2t = create_tensor(tn(LLM_TENSOR_EAGLE3_D2T), {n_draft_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); + } else { + d2t = nullptr; // no d2t, use default vocab size + LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); + } + + // Feature fusion layer: projects 3 target layers to draft hidden size + fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_target_features, n_embd}, 0); + + // Output layer (uses draft vocab size) + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, 0); + + // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own) + const struct ggml_tensor * tok_embd_meta = ml.get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str()); + if (tok_embd_meta) { + 
const int64_t n_target_vocab = tok_embd_meta->ne[1]; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab); + } + + // Single decoder layer + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + // input_layernorm: applied to token embeddings + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // Attention takes input_embeds_normed + fused_target_normed as input + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + // EAGLE-3 specific: hidden_norm applied to fused target features + layer.eagle3_hidden_norm = create_tensor(tn(LLM_TENSOR_EAGLE3_HIDDEN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // rope_freqs for llama3 rope scaling (optional - only if EAGLE3 config has rope_scaling) + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED); + } + } break; case LLM_ARCH_KIMI_LINEAR: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -8331,6 +8418,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_EAGLE3: + { + if (params.gtype == LLM_GRAPH_TYPE_ENCODER) { + llm = std::make_unique(*this, params); + } else { + llm = std::make_unique(*this, params); + } + } break; case LLM_ARCH_COGVLM: { llm = std::make_unique(*this, params); @@ -8540,6 +8635,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_EAGLE3: case LLM_ARCH_LLAMA_EMBED: case LLM_ARCH_MAINCODER: return LLAMA_ROPE_TYPE_NORM; diff --git a/src/llama-model.h b/src/llama-model.h index 7b580043b33..674ba228f52 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -413,6 +413,9 @@ struct llama_layer { struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_eps = nullptr; + // eagle3 + struct ggml_tensor * eagle3_hidden_norm = nullptr; + // Kimi Linear KDA (using ssm_ prefix for consistency) // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias struct ggml_tensor * ssm_q_conv = nullptr; @@ -474,6 +477,13 @@ struct llama_model { struct ggml_tensor * per_layer_model_proj = nullptr; struct ggml_tensor * per_layer_proj_norm = nullptr; + // eagle3 + struct ggml_tensor * fc = nullptr; // feature fusion layer + struct ggml_tensor * d2t = nullptr; // draft to target vocabulary mapping + // Reference to target model's embedding layer + // This allows EAGLE3 to use target model's embeddings without copying + struct ggml_tensor * target_tok_embd = nullptr; + std::vector layers; //Dense linear projections for 
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
new file mode 100644
index 00000000000..c3ef2ed4bfe
--- /dev/null
+++ b/src/models/eagle3.cpp
@@ -0,0 +1,186 @@
+#include "models.h"
+
+ggml_tensor * llm_build_eagle3_encode::build_inp_embd() const {
+    const int64_t n_embd_target_features = 3 * hparams.eagle3_target_hidden_size;
+
+    ggml_tensor * cur = nullptr;
+
+    // Input: target model features (3 layers concatenated: low, mid, high)
+    // Data will be provided via ubatch->embd in encode_eagle3_features()
+    auto inp_target = std::make_unique(n_embd_target_features);
+    inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_target_features, n_tokens);
+    ggml_set_input(inp_target->embd);
+
+    cur = inp_target->embd;
+    cb(cur, "inp_embd", -1);
+
+    res->add_input(std::move(inp_target));
+
+    return cur;
+}
+
+// EAGLE3 encoder: processes target model features through the feature fusion layer
+// Input:  target_features, e.g. [12288, n_tokens] from the target model layers low, middle, high
+// Output: g_embeddings,    e.g. [4096, n_tokens], stored in the context
+llm_build_eagle3_encode::llm_build_eagle3_encode(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    ggml_tensor * cur = nullptr;
+
+    cur = build_inp_embd();
+
+    // Feature fusion layer
+    cur = build_lora_mm(model.fc, cur);
+    cb(cur, "fc_out", -1);
+
+    // Output: g_embeddings, e.g. [4096, n_tokens]
+    res->t_embd = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+// EAGLE3 decoder: processes draft tokens using the g_embeddings from the encoder
+// Input:  draft tokens + g_embeddings from the encoder
+// Output: draft logits
+llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_layer == 1); // EAGLE-3 has only one decoder layer
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    // The EAGLE3 decoder receives:
+    //   1. token embeddings (e.g. from EAGLE3's own tok_embd for Llama 3.3 70B, or from the target model for Llama 3.1 8B)
+    //   2. g_embeddings from the encoder
+    // Choose token_embd_eagle3: prefer EAGLE3's own if available (Llama 3.3 70B), else use the target's (Llama 3.1 8B)
+    ggml_tensor * token_embd_eagle3 = (model.tok_embd != nullptr) ? model.tok_embd : model.target_tok_embd;
+    GGML_ASSERT(token_embd_eagle3 != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
+    ggml_tensor * inp_embd = build_inp_embd(token_embd_eagle3);
+    cb(inp_embd, "inp_embd", -1);
+
+    // TODO: refactor into llm_graph_input
+    ggml_tensor * inp_g = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+    ggml_set_input(inp_g);
+    cb(inp_g, "inp_g_embeddings", -1); // TODO: do not change the name! refactor into llm_graph_input
+
+    inpL = inp_g;
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    // Single decoder layer (il = 0)
+    const int il = 0;
+    {
+        // Apply input_layernorm to the token embeddings
+        ggml_tensor * embd_norm = build_norm(inp_embd,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(embd_norm, "embd_norm", il);
+
+        // Apply hidden_norm to inp_g
+        ggml_tensor * g_norm = build_norm(inp_g,
+                model.layers[il].eagle3_hidden_norm, NULL,
+                LLM_NORM_RMS, -1);
+        cb(g_norm, "g_norm", il);
+
+        // norm_before_residual: determines what goes into the residual connection (compatible with Red Hat EAGLE3 speculator models)
+        // - false (default): use the raw inp_g for the residual
+        // - true:            use the normalized g_norm for the residual
+        // at this point inpL still holds the raw inp_g (the fused target features)
+        ggml_tensor * inpSA = hparams.eagle3_norm_before_residual ? g_norm : inpL;
+
+        // Concatenate the normalized inp_embd and the normalized inp_g
+        cur = ggml_concat(ctx0, embd_norm, g_norm, il);
+        cb(cur, "concat_embd", il);
+
+        // Self-attention with the concatenated input
+        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+        cb(Qcur, "Qcur", il);
+
+        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+        cb(Kcur, "Kcur", il);
+
+        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+        cb(Vcur, "Vcur", il);
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+        // rope freq factors, returns nullptr if not available
+        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+        // RoPE
+        Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+                );
+        Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+        cb(Qcur, "Qcur_rope", il);
+        cb(Kcur, "Kcur_rope", il);
+
+        cur = build_attn(inp_attn,
+                model.layers[il].wo, NULL,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+
+        if (inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        // Add the residual
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // Apply the FFN norm to the sum
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "post_attn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,   NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        // Output norm with residual
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "eagle3_prenorm", il);
+
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // Output the pre-norm state (used as the next token's g_embeddings in autoregressive generation)
+    ggml_set_output(cur);
+    res->t_embd = cur;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+
+    // lm_head - projects to the draft vocabulary
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
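To make the effect of norm_before_residual concrete: the attention/FFN path always sees the normalized tensors, and the flag only selects which tensor is added back as the residual. A toy, self-contained illustration with plain scalars (purely illustrative, not ggml code):

    // standalone illustration, not part of the patch
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // toy RMSNorm over a small vector (eps plays the role of f_norm_rms_eps)
    static std::vector<float> rms_norm(const std::vector<float> & x, float eps = 1e-5f) {
        float ss = 0.0f;
        for (float v : x) {
            ss += v*v;
        }
        const float scale = 1.0f/std::sqrt(ss/x.size() + eps);
        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            y[i] = x[i]*scale;
        }
        return y;
    }

    int main() {
        const std::vector<float> g = {0.5f, -2.0f, 3.0f}; // stands in for inp_g (fused target features)
        const std::vector<float> g_norm = rms_norm(g);

        const bool norm_before_residual = false; // default, as loaded in llama-model.cpp above
        const std::vector<float> & residual = norm_before_residual ? g_norm // Red Hat style speculators
                                                                   : g;     // default: raw inp_g
        printf("residual[0] = %f\n", residual[0]);
        return 0;
    }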
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index 42b5fcdf42e..3bccb2f902f 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -31,6 +31,16 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_gra
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;

+        // EAGLE3: Extract intermediate layer features from target model at layer INPUT
+        if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) {
+            static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"};
+            for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) {
+                if (eagle3->extract_layer_indices[i] == il) {
+                    cb(inpL, eagle3_extract_names[i], il);
+                    break;
+                }
+            }
+        }
         // norm
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, NULL,
diff --git a/src/models/models.h b/src/models/models.h
index cfcbb9aaa5b..120b5b22cc7 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -150,6 +150,16 @@ struct llm_build_dream : public llm_graph_context {
     llm_build_dream(const llama_model & model, const llm_graph_params & params);
 };

+struct llm_build_eagle3_encode : public llm_graph_context {
+    llm_build_eagle3_encode(const llama_model & model, const llm_graph_params & params);
+private:
+    ggml_tensor * build_inp_embd() const;
+};
+
+struct llm_build_eagle3_decode : public llm_graph_context {
+    llm_build_eagle3_decode(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_ernie4_5 : public llm_graph_context {
     llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params);
 };
diff --git a/src/models/openai-moe-iswa.cpp b/src/models/openai-moe-iswa.cpp
index dbe3ca1851f..527e8967c51 100644
--- a/src/models/openai-moe-iswa.cpp
+++ b/src/models/openai-moe-iswa.cpp
@@ -19,6 +19,17 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,

         ggml_tensor * inpSA = inpL;

+        // EAGLE3: Extract intermediate layer features from target model at layer INPUT
+        if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) {
+            static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"};
+            for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) {
+                if (eagle3->extract_layer_indices[i] == il) {
+                    cb(inpL, eagle3_extract_names[i], il);
+                    break;
+                }
+            }
+        }
+
         // norm
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, nullptr,
diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp
index a5cfffa5314..c1f34624c03 100644
--- a/src/models/qwen3.cpp
+++ b/src/models/qwen3.cpp
@@ -21,6 +21,17 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;

+        // EAGLE3: Extract intermediate layer features from target model at layer INPUT
+        if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) {
+            static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"};
+            for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) {
+                if (eagle3->extract_layer_indices[i] == il) {
+                    cb(inpL, eagle3_extract_names[i], il);
+                    break;
+                }
+            }
+        }
+
         // norm
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, NULL,
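These per-architecture hooks only tag the per-layer inputs with the names eagle3_extract_0/1/2 through the graph callback; how the speculative loop collects those activations (and how cparams.eagle3_extract_enabled is wired up) is not shown in these hunks. One way a consumer could pick them up is via the existing ggml backend-scheduler eval callback; the buffer type below and its use are assumptions of this sketch, not part of the patch:

    // standalone sketch, not part of the patch
    #include <cstring>
    #include <vector>

    #include "ggml.h"
    #include "ggml-backend.h"

    // one slot per extracted layer (low, mid, high); the sketch assumes F32 activations
    struct eagle3_feature_buf {
        std::vector<float> data[3];
    };

    static bool eagle3_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
        const char * name = ggml_get_name(t);
        const bool is_extract = strncmp(name, "eagle3_extract_", 15) == 0; // prefix match also covers "-il" suffixes
        if (ask) {
            return is_extract; // only observe the tagged tensors
        }
        if (is_extract && t->type == GGML_TYPE_F32) {
            auto * buf = (eagle3_feature_buf *) user_data;
            const int slot = name[15] - '0';
            if (slot >= 0 && slot < 3) {
                buf->data[slot].resize(ggml_nelements(t));
                ggml_backend_tensor_get(t, buf->data[slot].data(), 0, ggml_nbytes(t));
            }
        }
        return true; // continue graph evaluation
    }

    // usage sketch: ggml_backend_sched_set_eval_callback(sched, eagle3_eval_cb, &buf);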
diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp
index 888534fb347..c0b6ff5df97 100644
--- a/src/models/qwen3moe.cpp
+++ b/src/models/qwen3moe.cpp
@@ -21,6 +21,17 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;

+        // EAGLE3: Extract intermediate layer features from target model at layer INPUT
+        if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) {
+            static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"};
+            for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) {
+                if (eagle3->extract_layer_indices[i] == il) {
+                    cb(inpL, eagle3_extract_names[i], il);
+                    break;
+                }
+            }
+        }
+
         // norm
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, NULL,