diff --git a/common/arg.cpp b/common/arg.cpp index 9c85696ebdb..ed320a4563b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3004,7 +3004,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, bool value) { params.use_jinja = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" @@ -3035,7 +3035,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.chat_template = value; } - ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"--chat-template-file"}, "JINJA_TEMPLATE_FILE", string_format( @@ -3346,6 +3346,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.p_min = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN")); + add_opt(common_arg( + {"--eagle3"}, + "use EAGLE3 speculative decoding with the draft model", + [](common_params & params) { + params.speculative.eagle3 = true; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-cd", "--ctx-size-draft"}, "N", string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), diff --git a/common/common.h b/common/common.h index b284244530a..612a0a062e5 100644 --- a/common/common.h +++ b/common/common.h @@ -280,10 +280,13 @@ struct common_params_speculative { struct common_params_model mparams_dft; + llama_model * model_tgt = nullptr; // the target model llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts llama_context_params cparams_dft; // these are the parameters for the draft llama_context + bool eagle3 = false; // use EAGLE3 speculative decoding + int32_t n_ctx = 0; // draft context size int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) diff --git a/common/speculative.cpp b/common/speculative.cpp index 3e68c38e49c..53ea52e7400 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -47,6 +47,7 @@ struct common_speculative_config { const common_params_speculative & p = common_params_speculative{}) : type(t), params(p) {} }; + static bool common_speculative_are_compatible( const llama_model * model_tgt, const llama_model * model_dft) { @@ -210,7 +211,9 @@ struct common_speculative_state_draft : public common_speculative_state { ~common_speculative_state_draft() override { llama_perf_context_print(ctx_dft); - llama_free(ctx_dft); + if (ctx_dft) { + llama_free(ctx_dft); + } common_sampler_free(smpl); @@ -228,11 +231,11 @@ struct common_speculative_state_draft : public common_speculative_state { llama_tokens & result) override { auto * spec = this; - auto 
& batch = spec->batch;
-        auto & ctx_tgt = spec->ctx_tgt;
-        auto & ctx_dft = spec->ctx_dft;
-        auto & smpl = spec->smpl;
-        auto & prompt_dft = spec->prompt_dft;
+        auto & batch      = spec->batch;
+        auto & ctx_tgt    = spec->ctx_tgt;
+        auto & ctx_dft    = spec->ctx_dft;
+        auto & smpl       = spec->smpl;
+        auto & prompt_dft = spec->prompt_dft;
 
         auto * mem_dft = llama_get_memory(ctx_dft);
 
@@ -438,7 +441,52 @@ struct common_speculative_state_draft : public common_speculative_state {
 };
 
 struct common_speculative_state_eagle3 : public common_speculative_state {
-    common_speculative_state_eagle3(enum common_speculative_type type) : common_speculative_state(type) {}
+    llama_context * ctx_tgt;
+
+    common_sampler * smpl;
+
+    llama_batch batch;
+
+    struct llama_context * ctx_dft_enc = nullptr;
+    struct llama_context * ctx_dft_dec = nullptr;
+
+    int32_t eagle3_n_past = 0; // number of verified positions in the decoder KV cache
+
+    common_speculative_state_eagle3(
+            enum common_speculative_type type,
+            llama_context * ctx_tgt,
+            llama_context * ctx_dft_enc,
+            llama_context * ctx_dft_dec)
+        : common_speculative_state(type)
+        , ctx_tgt(ctx_tgt)
+        , ctx_dft_enc(ctx_dft_enc)
+        , ctx_dft_dec(ctx_dft_dec)
+    {
+        batch = llama_batch_init(llama_n_batch(ctx_dft_dec), 0, 1);
+
+        // Initialize sampler for EAGLE3 decoder
+        common_params_sampling params;
+        params.no_perf = false;
+        params.top_k = 10; // set to 1 for greedy sampling (argmax) to match vLLM's default behavior, but >1 generally gives a higher acceptance rate for EAGLE3
+        params.samplers = { COMMON_SAMPLER_TYPE_TOP_K };
+        smpl = common_sampler_init(llama_get_model(ctx_dft_dec), params);
+    }
+
+    ~common_speculative_state_eagle3() override {
+        llama_perf_context_print(ctx_dft_dec);
+
+        if (ctx_dft_dec) {
+            llama_free(ctx_dft_dec);
+        }
+
+        if (ctx_dft_enc) {
+            llama_free(ctx_dft_enc);
+        }
+
+        common_sampler_free(smpl);
+
+        llama_batch_free(batch);
+    }
 
     void begin(const llama_tokens & prompt) override {
         GGML_UNUSED(prompt);
@@ -448,12 +496,97 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
             const common_params_speculative & params,
             const llama_tokens & prompt_tgt,
             llama_token id_last,
-            llama_tokens & draft_tokens) override {
-        // TODO: implement
-        GGML_UNUSED(params);
-        GGML_UNUSED(prompt_tgt);
-        GGML_UNUSED(id_last);
-        GGML_UNUSED(draft_tokens);
+            llama_tokens & result) override {
+        auto * spec = this;
+
+        auto & batch       = spec->batch;
+        auto & ctx_tgt     = spec->ctx_tgt;
+        auto & ctx_dft_enc = spec->ctx_dft_enc;
+        auto & ctx_dft_dec = spec->ctx_dft_dec;
+        auto & smpl        = spec->smpl;
+
+        //result = gen_eagle3_draft(spec, params, prompt_tgt, id_last);
+        const int n_embd = llama_model_n_embd(llama_get_model(ctx_dft_enc));
+        const int n = (int)prompt_tgt.size();
+        const int n_new = n - spec->eagle3_n_past;
+
+        GGML_ASSERT(n >= 1 && "prompt_tgt is empty");
+        GGML_ASSERT(n_new >= 1 && "must have at least 1 new token");
+
+        // Clear draft positions from decoder KV cache [n_past, inf)
+        llama_memory_seq_rm(llama_get_memory(ctx_dft_dec), 0, spec->eagle3_n_past, -1);
+
+        // Encoder: features → g_embeddings
+        const float * features = llama_get_eagle3_target_features(ctx_tgt);
+        GGML_ASSERT(features && "no target features");
+
+        llama_batch enc_batch = {
+            /*.n_tokens =*/ n_new,
+            /*.token    =*/ nullptr,
+            /*.embd     =*/ const_cast<float *>(features),
+            /*.pos      =*/ nullptr,
+            /*.n_seq_id =*/ nullptr,
+            /*.seq_id   =*/ nullptr,
+            /*.logits   =*/ nullptr,
+        };
+        GGML_ASSERT(llama_encode(ctx_dft_enc, enc_batch) == 0);
+
+        const float * g_embd = llama_get_embeddings(ctx_dft_enc);
+        GGML_ASSERT(g_embd && "encoder output failed");
+
+        // Decoder batch: process new tokens with KV cache reuse
+        llama_set_eagle3_g_embeddings(ctx_dft_dec, g_embd, n_embd, n_new);
+
+        common_batch_clear(batch);
+        for (int i = 0; i < n_new; i++) {
+            const int pos = spec->eagle3_n_past + i;
+            const llama_token tok = (pos < n - 1) ? prompt_tgt[pos + 1] : id_last;
+            common_batch_add(batch, tok, pos, {0}, true);
+        }
+
+        GGML_ASSERT(llama_decode(ctx_dft_dec, batch) == 0);
+
+        spec->eagle3_n_past = n; // update verified positions
+
+        // Sample draft tokens
+        result.clear();
+        common_sampler_reset(smpl);
+
+        // Sample and check probability (consistent with standard speculative decoding)
+        auto sample_and_check = [&](int idx) -> bool {
+            common_sampler_sample(smpl, ctx_dft_dec, idx);
+
+            const auto * cur_p = common_sampler_get_candidates(smpl, true);
+            const llama_token id = cur_p->data[0].id;
+
+            common_sampler_accept(smpl, id, true);
+            result.push_back(id);
+
+            return cur_p->data[0].p >= params.p_min;
+        };
+
+        // First draft token from batch decode
+        if (!sample_and_check(n_new - 1)) {
+            return;
+        }
+
+        // Autoregressive: use prenorm as g_embd (-1 = last output)
+        const float * prenorm = llama_get_embeddings_ith(ctx_dft_dec, -1);
+
+        for (int i = 1; i < params.n_max; i++) {
+            GGML_ASSERT(prenorm && "prenorm failed");
+            llama_set_eagle3_g_embeddings(ctx_dft_dec, prenorm, n_embd, 1);
+
+            common_batch_clear(batch);
+            common_batch_add(batch, result.back(), n - 1 + i, {0}, true);
+            GGML_ASSERT(llama_decode(ctx_dft_dec, batch) == 0);
+
+            prenorm = llama_get_embeddings_ith(ctx_dft_dec, -1);
+
+            if (!sample_and_check(0)) {
+                break;
+            }
+        }
     }
 
     void accept(uint16_t n_accepted) override {
@@ -840,11 +973,35 @@ common_speculative * common_speculative_init(
         common_params_speculative & params,
         llama_context * ctx_tgt) {
     llama_context * ctx_dft = nullptr;
+
+    llama_context * ctx_dft_enc = nullptr;
+    llama_context * ctx_dft_dec = nullptr;
+
     if (params.model_dft) {
-        ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
-        if (ctx_dft == nullptr) {
-            LOG_ERR("%s", "failed to create draft context\n");
-            return nullptr;
+        if (params.eagle3) {
+            llama_context_params params_enc = params.cparams_dft;
+            params_enc.target_model = nullptr;
+            params_enc.embeddings = true;
+            ctx_dft_enc = llama_init_from_model(params.model_dft, params_enc);
+            if (!ctx_dft_enc) {
+                LOG_ERR("failed to create EAGLE3 encoder context\n");
+                return nullptr;
+            }
+
+            llama_context_params params_dec = params.cparams_dft;
+            params_dec.target_model = params.model_tgt;
+            params_dec.embeddings = true;
+            ctx_dft_dec = llama_init_from_model(params.model_dft, params_dec);
+            if (!ctx_dft_dec) {
+                LOG_ERR("failed to create EAGLE3 decoder context\n");
+                return nullptr;
+            }
+        } else {
+            ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
+            if (ctx_dft == nullptr) {
+                LOG_ERR("%s", "failed to create draft context\n");
+                return nullptr;
+            }
         }
     }
 
@@ -852,7 +1009,7 @@ common_speculative * common_speculative_init(
     std::vector<common_speculative_config> configs = {}; // list of speculative configs to try
     {
        bool has_draft        = !params.mparams_dft.path.empty();
-        bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
+        bool has_draft_eagle3 = params.eagle3;
 
        bool has_ngram_cache  = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE);
        bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE);
 
@@ -893,10 +1050,11 @@ common_speculative * common_speculative_init(
             configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, 
params)); } if (has_draft) { - configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT, params)); - } - if (has_draft_eagle3) { - configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3, params)); + if (has_draft_eagle3) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3, params)); + } else { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT, params)); + } } } @@ -916,7 +1074,11 @@ common_speculative * common_speculative_init( break; } case COMMON_SPECULATIVE_TYPE_EAGLE3: { - impls.push_back(std::make_unique(config.type)); + impls.push_back(std::make_unique(config.type, + /* .ctx_tgt = */ ctx_tgt, + /* .ctx_dft_enc = */ ctx_dft_enc, + /* .ctx_dft_dec = */ ctx_dft_dec + )); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 843c00a8969..3dcaa0e7978 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -97,6 +97,7 @@ class ModelBase: metadata_override: Path | None dir_model_card: Path remote_hf_model_id: str | None + target_model_dir: Path | None # subclasses should define this! model_arch: gguf.MODEL_ARCH @@ -116,7 +117,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, disable_mistral_community_chat_template: bool = False, - sentence_transformers_dense_modules: bool = False): + sentence_transformers_dense_modules: bool = False, target_model_dir: Path | None = None): if type(self) is ModelBase or \ type(self) is TextModel or \ type(self) is MmprojModel: @@ -135,6 +136,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.dry_run = dry_run self.remote_hf_model_id = remote_hf_model_id self.sentence_transformers_dense_modules = sentence_transformers_dense_modules + self.target_model_dir = target_model_dir self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) self.metadata_override = metadata_override @@ -2492,6 +2494,9 @@ def prepare_tensors(self): "VLlama3ForCausalLM", "LlavaForConditionalGeneration", "VoxtralForConditionalGeneration", + "LlamaForCausalLMEagle3", + "Eagle3Speculator", + "Eagle3DraftModel", "IQuestCoderForCausalLM", "LlamaModel") class LlamaModel(TextModel): @@ -2506,7 +2511,60 @@ def __init__(self, *args, **kwargs): hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) self.origin_hf_arch = hparams.get('architectures', [None])[0] + # detect EAGLE-3 llama checkpoint + if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: + self.is_eagle3 = True + self.model_arch = gguf.MODEL_ARCH.EAGLE3 + logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture") + # Re-initialize tensor_map with EAGLE3 architecture + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + # Update gguf_writer architecture + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] + self.gguf_writer.add_architecture() + if not hasattr(self, 'target_model_dir') or not self.target_model_dir: + raise ValueError( + "EAGLE3 model requires --target-model-dir to be specified. 
" + "Please provide the path to the target model directory to read config.json" + ) + # Read both EAGLE3 raw config and target model config + with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f: + eagle3_raw_config = json.load(f) + with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f: + target_config = json.load(f) + + # EAGLE3 extract_layers + target_num_layers = target_config["num_hidden_layers"] + extract_layers = [2, target_num_layers // 2, target_num_layers - 3] + logger.info(f"EAGLE3: extract_layers = {extract_layers} (target model has {target_num_layers} layers)") + self.gguf_writer.add_array(f"{self.gguf_writer.arch}.extract_layers", extract_layers) + + # EAGLE3 target_hidden_size: prefer EAGLE3 config, fallback to target config + if "target_hidden_size" in eagle3_raw_config and eagle3_raw_config["target_hidden_size"] is not None: + target_hidden_size = eagle3_raw_config["target_hidden_size"] + logger.info(f"EAGLE3: target_hidden_size = {target_hidden_size} (from EAGLE3 config)") + else: + target_hidden_size = target_config["hidden_size"] + logger.info(f"EAGLE3: target_hidden_size = {target_hidden_size} (from target model config)") + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size) + + # Eagle3Speculator norm_before_residual specific handling + norm_before_residual = eagle3_raw_config.get("norm_before_residual", False) + logger.info(f"EAGLE3: norm_before_residual = {norm_before_residual} (from EAGLE3 config)") + self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual) + def set_vocab(self): + # For EAGLE-3 models, use tokenizer from target model if provided + if hasattr(self, 'is_eagle3') and self.is_eagle3: + if self.target_model_dir is None: + raise ValueError( + "EAGLE-3 draft model requires --target-model-dir to be specified. " + "Please provide the path to the target model directory containing the tokenizer." 
+ ) + logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}") + # Temporarily swap dir_model to load tokenizer from target model + original_dir_model = self.dir_model + self.dir_model = self.target_model_dir + if self.origin_hf_arch == "GlmasrModel": return self._set_vocab_glmedge() @@ -2550,6 +2608,10 @@ def set_vocab(self): if self.hparams.get("vocab_size", 32000) == 49152: self.gguf_writer.add_add_bos_token(False) + # Restore original dir_model for EAGLE-3 + if hasattr(self, 'is_eagle3') and self.is_eagle3: + self.dir_model = original_dir_model + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -2571,7 +2633,53 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None): _experts: list[dict[str, Tensor]] | None = None + def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: + tensors = super().index_tensors(remote_hf_model_id) + + # Handle Eagle3Speculator nested config + if "transformer_layer_config" in self.hparams: + self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]} + + # EAGLE-3 detection: check hparams directly (before self.is_eagle3 is set) + if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: + logger.info("EAGLE-3: Renaming midlayer.* or layers.0.* to model.layers.0.*") + new_tensors = {} + # EAGLE-3: rename midlayer.* to model.layers.0.* for compatibility with llama model + for name, gen in tensors.items(): + if name.startswith("midlayer."): + new_name = "model.layers.0." + name[len("midlayer."):] + new_tensors[new_name] = gen + elif name.startswith("layers.0."): # layers.0.* -> model.layers.0.* (Eagle3Speculator format) + new_name = "model." + name + new_tensors[new_name] = gen + else: + new_tensors[name] = gen + return new_tensors + else: + return tensors + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # Eagle-3 llama checkpoint special handling + if hasattr(self, 'is_eagle3') and self.is_eagle3: + # Eagle-3 llama checkpoint special weights handling + # fc.weight: feature fusion layer + if name == "fc.weight": + return [(name, data_torch)] + # d2t: draft to target vocabulary mapping + elif name == "d2t": + # Skip parent class processing (store for manual handling in prepare_tensors) + if not hasattr(self, '_eagle3_int_tensors'): + self._eagle3_int_tensors = {} + self._eagle3_int_tensors[name] = data_torch + return [] + # t2d: target to draft vocabulary mapping (not used, skip completely) + elif name == "t2d": + return [] + # hidden_norm: EAGLE-3 specific layer normalization + elif name == "model.layers.0.hidden_norm.weight": + return [("blk.0.hidden_norm.weight", data_torch)] + n_head = self.find_hparam(["n_heads", "num_attention_heads"]) n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"]) @@ -2641,6 +2749,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # EAGLE3: If no lm_head in draft model, load from target model + if hasattr(self, 'is_eagle3') and self.is_eagle3 and "lm_head.weight" not in self.model_tensors: + from safetensors import safe_open + for sf_file in self.target_model_dir.glob("*.safetensors"): + with safe_open(sf_file, framework="pt") as f: + if "lm_head.weight" in f.keys(): + lm_head = f.get_tensor("lm_head.weight") + logger.info(f"EAGLE3: No lm_head 
in draft model, loaded lm_head from {sf_file.name}, shape = {lm_head.shape}") + yield ("output.weight", lm_head) + break + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): if rope_params.get("rope_type", '').lower() == "llama3": base = rope_params.get("rope_theta", 10000.0) @@ -2671,8 +2790,26 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) def prepare_tensors(self): + # EAGLE-3: collect original dtypes BEFORE parent class converts them to F32 + eagle3_original_dtypes = {} + if hasattr(self, 'is_eagle3') and self.is_eagle3: + for name, data_torch in self.get_tensors(): + if name == "d2t": + eagle3_original_dtypes[name] = data_torch.dtype + super().prepare_tensors() + if hasattr(self, 'is_eagle3') and self.is_eagle3 and hasattr(self, '_eagle3_int_tensors'): + for name, data_torch in self._eagle3_int_tensors.items(): + old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype) + # Keep as int64 to match original torch tensor dtype + data = data_torch.to(torch.int64).numpy() + data_qtype = gguf.GGMLQuantizationType.I64 + + shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}" + logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) + if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] @@ -11278,6 +11415,7 @@ class LazyTorchTensor(gguf.LazyBase): torch.float16: np.float16, torch.float32: np.float32, torch.uint8: np.uint8, + torch.int64: np.int64, } # only used when byteswapping data. Only correct size is needed @@ -11438,6 +11576,10 @@ def parse_args() -> argparse.Namespace: "--no-tensor-first-split", action="store_true", help="do not add tensors to the first split (disabled by default)" ) + parser.add_argument( + "--target-model-dir", type=str, default=None, + help="directory containing target model tokenizer (for EAGLE-3 draft models that don't have their own tokenizer)", + ) parser.add_argument( "--metadata", type=Path, help="Specify the path for an authorship metadata override file" @@ -11610,7 +11752,8 @@ def main() -> None: split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, - sentence_transformers_dense_modules=args.sentence_transformers_dense_modules + sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, + target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None ) if args.vocab_only: diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index d8b1f5a480c..336144d3b57 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -4,6 +4,7 @@ #include "speculative.h" #include "log.h" #include "llama.h" +#include "chat.h" #include #include @@ -76,13 +77,53 @@ int main(int argc, char ** argv) { return 1; } + params.speculative.model_tgt = model_tgt; params.speculative.model_dft = model_dft.get(); params.speculative.cparams_dft = common_context_params_to_llama(params_dft); + + if (params.speculative.eagle3) { + llama_set_eagle3(ctx_tgt, model_dft.get()); + } + } + + // Apply 
chat template for EAGLE3 if available, which can increase the acceptance rate
+    std::string prompt = params.prompt;
+    if (params.speculative.eagle3) {
+        auto chat_templates = common_chat_templates_init(model_tgt, params.chat_template);
+        if (common_chat_templates_was_explicit(chat_templates.get())) {
+            std::vector<common_chat_msg> chat_msgs;
+            common_chat_msg user_msg;
+            user_msg.role = "user";
+            user_msg.content = params.prompt;
+            chat_msgs.push_back(user_msg);
+
+            common_chat_templates_inputs inputs;
+            inputs.messages = chat_msgs;
+            inputs.add_generation_prompt = true;
+            prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt;
+            LOG_INF("%s: EAGLE3 chat template applied\n", __func__);
+        }
     }
 
+    int n_predict = 0;
+    int n_drafted = 0;
+    int n_accept  = 0;
+
+    // used to determine end of generation
+    bool has_eos = false;
+
+    // ================================================
+    // everything until here is standard initialization
+    // the relevant stuff for speculative decoding starts here
+
+    const auto t_enc_start = ggml_time_us();
+
+    // target model sampling context
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
+
     // Tokenize the prompt
     std::vector<llama_token> inp;
-    inp = common_tokenize(ctx_tgt, params.prompt, true, true);
+    inp = common_tokenize(ctx_tgt, prompt, true, true);
 
     if (llama_n_ctx(ctx_tgt) < (uint32_t) inp.size()) {
         LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
@@ -102,33 +143,38 @@ int main(int argc, char ** argv) {
         LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
     }
 
-    int n_predict = 0;
-    int n_drafted = 0;
-    int n_accept  = 0;
-
-    // used to determine end of generation
-    bool has_eos = false;
+    // eval the prompt
+    llama_token id_last;
+    llama_tokens prompt_tgt;
+    int n_past;
 
-    // ================================================
-    // everything until here is standard initialization
-    // the relevant stuff for speculative decoding starts here
+    // TODO: simplify
+    if (params.speculative.eagle3) {
+        // Target model decodes the full prompt and samples the first token; intermediate features are extracted
+        llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size()));
 
-    const auto t_enc_start = ggml_time_us();
+        id_last = common_sampler_sample(smpl, ctx_tgt, -1);
+        common_sampler_accept(smpl, id_last, true);
+        LOG("%s", common_token_to_piece(ctx_tgt, id_last).c_str());
+        n_predict++;
 
-    // target model sampling context
-    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
+        // all tokens currently in the target context
+        prompt_tgt.assign(inp.begin(), inp.end());
+        prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
 
-    // eval the prompt
-    llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
+        n_past = inp.size();
+    } else {
+        llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
 
-    // note: keep the last token separate! 
+ id_last = inp.back(); - // all tokens currently in the target context - llama_tokens prompt_tgt(inp.begin(), inp.end() - 1); - prompt_tgt.reserve(llama_n_ctx(ctx_tgt)); + // all tokens currently in the target context + prompt_tgt.assign(inp.begin(), inp.end() - 1); + prompt_tgt.reserve(llama_n_ctx(ctx_tgt)); - int n_past = inp.size() - 1; + n_past = inp.size() - 1; + } // init the speculator const auto & params_spec = params.speculative; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3af4fffe957..7f4bfd6236d 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -150,6 +150,9 @@ class LLM: SWIGLU_CLAMP_SHEXP = "{arch}.swiglu_clamp_shexp" DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" + EAGLE3_EXTRACT_LAYERS = "{arch}.extract_layers" + EAGLE3_TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size" + EAGLE3_NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -463,6 +466,7 @@ class MODEL_ARCH(IntEnum): RND1 = auto() PANGU_EMBED = auto() MISTRAL3 = auto() + EAGLE3 = auto() MIMO2 = auto() STEP35 = auto() LLAMA_EMBED = auto() @@ -770,6 +774,10 @@ class MODEL_TENSOR(IntEnum): NEXTN_HNORM = auto() NEXTN_SHARED_HEAD_HEAD = auto() NEXTN_SHARED_HEAD_NORM = auto() + # EAGLE3 specific tensors + EAGLE3_FC = auto() # feature fusion layer + EAGLE3_HIDDEN_NORM = auto() # hidden normalization + EAGLE3_D2T = auto() # draft to target vocabulary mapping # lfm2 audio A_ENC_NORM_CONV = auto() A_ENC_LINEAR_POS = auto() @@ -894,6 +902,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.RND1: "rnd1", MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.MISTRAL3: "mistral3", + MODEL_ARCH.EAGLE3: "eagle3", MODEL_ARCH.MIMO2: "mimo2", MODEL_ARCH.STEP35: "step35", MODEL_ARCH.LLAMA_EMBED: "llama-embed", @@ -1209,6 +1218,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm", MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head", MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm", + MODEL_TENSOR.EAGLE3_FC: "fc", + MODEL_TENSOR.EAGLE3_HIDDEN_NORM: "blk.{bid}.hidden_norm", + MODEL_TENSOR.EAGLE3_D2T: "d2t", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -3348,6 +3360,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.EAGLE3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.EAGLE3_FC, + MODEL_TENSOR.EAGLE3_HIDDEN_NORM, + MODEL_TENSOR.EAGLE3_D2T, + ], MODEL_ARCH.MIMO2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/include/llama.h b/include/llama.h index bf4e28a8be1..b04809229cb 100644 --- a/include/llama.h +++ b/include/llama.h @@ -371,6 +371,10 @@ extern "C" { // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + // EAGLE3 extraction configuration + const struct llama_model * target_model; // reference to target model + // only used to share embedding layer with eagle3 model + // [EXPERIMENTAL] // backend sampler chain configuration (make sure the caller keeps the sampler chains alive) // note: the samplers must be sampler chains (i.e. 
use llama_sampler_chain_init) @@ -686,6 +690,14 @@ extern "C" { int32_t il_start, int32_t il_end); + // + // eagle3 (tmp) + // + + LLAMA_API void llama_set_eagle3( + struct llama_context * ctx, + const struct llama_model * model); + // // Memory // @@ -885,6 +897,23 @@ extern "C" { llama_seq_id dest_seq_id, llama_state_seq_flags flags); + // + // EAGLE3 draft model support + // + + // Get pointer to target model features extracted for EAGLE3 encoder + // Returns NULL if no features are available + // Format: [3*n_embd, n_tokens] - use model.hparams.n_embd and batch.n_tokens for dimensions + LLAMA_API const float * llama_get_eagle3_target_features(struct llama_context * ctx); + + // Set g_embeddings from EAGLE3 encoder output for decoder input + // g_embd: pointer to encoder output embeddings + LLAMA_API void llama_set_eagle3_g_embeddings( + struct llama_context * ctx, + const float * g_embd, + int32_t n_embd, + int32_t n_tokens); + // // Decoding // diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2115fc4255f..0641d6d97ae 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -59,6 +59,7 @@ add_library(llama models/deepseek2.cpp models/dots1.cpp models/dream.cpp + models/eagle3.cpp models/ernie4-5-moe.cpp models/ernie4-5.cpp models/exaone.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index bd78f1e5562..7f5460817d3 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -117,6 +117,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, + { LLM_ARCH_EAGLE3, "eagle3" }, { LLM_ARCH_MIMO2, "mimo2" }, { LLM_ARCH_STEP35, "step35" }, { LLM_ARCH_LLAMA_EMBED, "llama-embed" }, @@ -262,6 +263,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, + { LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" }, + { LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" }, + { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, // sentence-transformers dense modules feature dims { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" }, @@ -512,6 +517,10 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" }, { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" }, { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" }, + // EAGLE-3 specific layers + { LLM_TENSOR_EAGLE3_HIDDEN_NORM, "blk.%d.hidden_norm" }, + { LLM_TENSOR_EAGLE3_FC, "fc" }, + { LLM_TENSOR_EAGLE3_D2T, "d2t" }, }; static std::set llm_get_tensor_names(llm_arch arch) { @@ -2261,6 +2270,28 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_VISEXP_FFN_DOWN, LLM_TENSOR_VISEXP_FFN_UP, }; + case LLM_ARCH_EAGLE3: + return { + // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own, Llama 3.1 8B EAGLE3 uses target model's) + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + // Single decoder layer (blk.0) + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + // EAGLE-3 specific layers + LLM_TENSOR_EAGLE3_HIDDEN_NORM, + LLM_TENSOR_EAGLE3_FC, + LLM_TENSOR_EAGLE3_D2T, + }; case LLM_ARCH_MIMO2: return { LLM_TENSOR_TOKEN_EMBD, @@ -2590,6 +2621,10 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, 
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + // EAGLE-3 tensors + {LLM_TENSOR_EAGLE3_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_EAGLE3_HIDDEN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_EAGLE3_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index e8263369b80..b57274d59b4 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -127,6 +127,7 @@ enum llm_arch { LLM_ARCH_MAINCODER, LLM_ARCH_KIMI_LINEAR, LLM_ARCH_UNKNOWN, + LLM_ARCH_EAGLE3, }; enum llm_kv { @@ -304,6 +305,10 @@ enum llm_kv { LLM_KV_CLASSIFIER_OUTPUT_LABELS, + LLM_KV_EAGLE3_EXTRACT_LAYERS, + LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, + LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, + LLM_KV_SHORTCONV_L_CACHE, LLM_KV_XIELU_ALPHA_N, @@ -519,6 +524,9 @@ enum llm_tensor { LLM_TENSOR_NEXTN_HNORM, LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + LLM_TENSOR_EAGLE3_FC, // eagle3: feature fusion layer + LLM_TENSOR_EAGLE3_HIDDEN_NORM, // eagle3: additional normalization layer + LLM_TENSOR_EAGLE3_D2T, // eagle3: draft to target vocabulary mapping }; enum llm_tensor_layer { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a6df893a311..692cf8fc29b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -156,6 +156,8 @@ llama_context::llama_context( cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified; + cparams.eagle3_extract_enabled = false; + // intialized later cparams.pipeline_parallel = false; @@ -1114,6 +1116,32 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } +void llama_context::set_eagle3(const llama_model * model) { + // Initialize EAGLE3 feature extraction configuration + cparams.eagle3_extract_enabled = !!model; + if (!cparams.eagle3_extract_enabled) { + return; + } + + sched_need_reserve = true; + + const auto & eagle3_hparams = model->hparams; + + // Copy feature extraction layer indices from EAGLE3 model's hparams + eagle3.extract_layer_indices.assign( + eagle3_hparams.eagle3_extract_layers.begin(), + eagle3_hparams.eagle3_extract_layers.end() + ); + + // Allocate tensors array for extraction + eagle3.extract_tensors.resize(eagle3.extract_layer_indices.size(), nullptr); + + LLAMA_LOG_INFO("%s: EAGLE3 extraction enabled for layers [%d, %d, %d]\n", __func__, + eagle3.extract_layer_indices[0], + eagle3.extract_layer_indices[1], + eagle3.extract_layer_indices[2]); +} + llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { if (mctx && !mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); @@ -1163,6 +1191,14 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll res->set_inputs(&ubatch); + // EAGLE3: Fill g_embeddings for decoder input + if (model.arch == LLM_ARCH_EAGLE3 && gtype == LLM_GRAPH_TYPE_DECODER && !eagle3.g_embeddings.empty()) { + ggml_tensor * g_embd = ggml_graph_get_tensor(gf, "inp_g_embeddings"); + if (g_embd) { + ggml_backend_tensor_set(g_embd, eagle3.g_embeddings.data(), 0, ggml_nbytes(g_embd)); + } + } + //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); } @@ -1173,6 +1209,11 @@ llm_graph_result * 
llama_context::process_ubatch(const llama_ubatch & ubatch, ll return nullptr; } + // EAGLE3: Extract intermediate layer features after graph execution + if (cparams.eagle3_extract_enabled && !eagle3.extract_tensors.empty()) { + extract_eagle3_features(ubatch); + } + ret = GGML_STATUS_SUCCESS; return res; @@ -1188,7 +1229,8 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd_inp(); + // EAGLE3: use 3*target_hidden_size for concatenated features input + const int64_t n_embd = (model.arch == LLM_ARCH_EAGLE3 && batch_inp.embd) ? 3 * hparams.eagle3_target_hidden_size : hparams.n_embd; const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 @@ -1274,8 +1316,15 @@ int llama_context::encode(const llama_batch & batch_inp) { GGML_ASSERT(embd != nullptr); const uint32_t n_embd_out = hparams.n_embd_out(); - GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float)); + if (model.arch == LLM_ARCH_EAGLE3) { + // g_embeddings are stored temporarily in embd buffer + const int64_t out_embd = hparams.n_embd; + GGML_ASSERT(n_tokens * out_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens * out_embd * sizeof(float)); + } else { + GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float)); + } } break; case LLAMA_POOLING_TYPE_MEAN: case LLAMA_POOLING_TYPE_CLS: @@ -1666,7 +1715,8 @@ int llama_context::decode(const llama_batch & batch_inp) { auto * t_logits = res->get_logits(); auto * t_embd = cparams.embeddings ? 
res->get_embd() : nullptr;
 
-    if (t_embd && res->get_embd_pooled()) {
+    // For EAGLE3, don't override t_embd with t_embd_pooled - we need the prenorm value during eagle3 decoder autoregressive generation
+    if (t_embd && res->get_embd_pooled() && model.arch != LLM_ARCH_EAGLE3) {
         t_embd = res->get_embd_pooled();
     }
 
@@ -1681,7 +1731,40 @@ int llama_context::decode(const llama_batch & batch_inp) {
             if (n_outputs) {
                 GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
                 GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size);
-                ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
+
+                // EAGLE3: Map draft vocab to target vocab
+                if (model.arch == LLM_ARCH_EAGLE3 && model.d2t) {
+                    static thread_local std::vector<int64_t> eagle3_d2t_map;
+                    static thread_local std::vector<float>   eagle3_draft_logits;
+
+                    const int64_t draft_vocab_size = t_logits->ne[0];
+                    const uint32_t last_idx = n_outputs - 1;
+
+                    // Load d2t mapping once (on first call)
+                    if (eagle3_d2t_map.empty()) {
+                        eagle3_d2t_map.resize(model.d2t->ne[0]);
+                        ggml_backend_tensor_get(model.d2t, eagle3_d2t_map.data(), 0, eagle3_d2t_map.size() * sizeof(int64_t));
+                    }
+
+                    // Read only the last token's draft logits
+                    eagle3_draft_logits.resize(draft_vocab_size);
+                    const size_t last_offset = last_idx * draft_vocab_size * sizeof(float);
+                    ggml_backend_tensor_get_async(backend_res, t_logits, eagle3_draft_logits.data(), last_offset, draft_vocab_size * sizeof(float));
+                    synchronize();
+
+                    // Map only the last token's draft logits to target vocab
+                    float * last_logits_out = logits_out + last_idx * n_vocab;
+                    std::fill(last_logits_out, last_logits_out + n_vocab, -std::numeric_limits<float>::infinity());
+
+                    for (int64_t j = 0; j < draft_vocab_size; j++) {
+                        const int64_t target_id = j + eagle3_d2t_map[j];
+                        GGML_ASSERT(target_id >= 0 && target_id < n_vocab);
+                        last_logits_out[target_id] = eagle3_draft_logits[j];
+                    }
+                } else {
+                    ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
+                }
             }
         }
 
@@ -2064,7 +2147,16 @@ ggml_cgraph * llama_context::graph_reserve(
 
     auto * res = gf_res_reserve.get();
 
-    const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
+    // EAGLE3: auto-detect encoder (embeddings enabled, no target_model) or decoder (has target_model)
+    llm_graph_type gtype = LLM_GRAPH_TYPE_DEFAULT;
+    if (model.arch == LLM_ARCH_EAGLE3) {
+        if (cparams.embeddings && model.target_tok_embd == nullptr) {
+            gtype = LLM_GRAPH_TYPE_ENCODER;
+        } else if (model.target_tok_embd != nullptr) {
+            gtype = LLM_GRAPH_TYPE_DECODER;
+        }
+    }
+    const auto gparams = graph_params(res, ubatch, mctx, gtype);
 
     res->reset();
 
@@ -2105,6 +2197,7 @@ llm_graph_params llama_context::graph_params(
         /*.loras      =*/ &loras,
         /*.mctx       =*/ mctx,
         /*.cross      =*/ &cross,
+        /*.eagle3     =*/ &eagle3,
         /*.samplers   =*/ sampling.samplers,
         /*.n_outputs  =*/ n_outputs,
         /*.cb         =*/ graph_get_cb(),
@@ -2149,6 +2242,27 @@ llm_graph_cb llama_context::graph_get_cb() const {
             ggml_set_name(cur, name);
         }
 
+        // EAGLE3: Extract intermediate layer features if this is an extraction point
+        if (cparams.eagle3_extract_enabled) {
+            static constexpr const char * prefix = "eagle3_extract_";
+            static constexpr size_t prefix_len = 15; // strlen("eagle3_extract_")
+
+            if (strncmp(name, prefix, prefix_len) == 0) {
+                // Parse the extraction index from the name (e.g., "eagle3_extract_0" -> 0)
+                size_t extract_idx = 0;
+                if (sscanf(name + prefix_len, "%zu", &extract_idx) == 1 && extract_idx < eagle3.extract_tensors.size()) {
+                    // Mark as 
output tensor to ensure proper backend assignment + ggml_set_output(cur); + // Store this tensor reference for post-execution extraction + eagle3.extract_tensors[extract_idx] = cur; + LLAMA_LOG_DEBUG("%s: EAGLE3 stored tensor reference for extraction: " + "index=%zu, layer=%d, target_layer=%d, tensor=%s\n", + __func__, extract_idx, il, + eagle3.extract_layer_indices[extract_idx], name); + } + } + } + // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer; @@ -2167,6 +2281,54 @@ llm_graph_cb llama_context::graph_get_cb() const { }; } +void llama_context::extract_eagle3_features(const llama_ubatch & ubatch) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_embd = model.hparams.n_embd; + const size_t n_layers = eagle3.extract_tensors.size(); + + // Allocate storage for concatenated features + const int64_t n_embd_concat = n_embd * n_layers; + eagle3.target_features.resize(n_embd_concat * n_tokens); + + // Temporary buffer to hold layer features before transposing + static thread_local std::vector temp_layer_features; + temp_layer_features.resize(n_embd * n_tokens); + + LLAMA_LOG_DEBUG("%s: Start to extract EAGLE3 features: %zu layers, %lld tokens, %lld embd\n", + __func__, n_layers, (long long)n_tokens, (long long)n_embd); + + // Extract each layer's features and interleave into token-major layout + for (size_t layer_idx = 0; layer_idx < n_layers; ++layer_idx) { + ggml_tensor * tensor = eagle3.extract_tensors[layer_idx]; + GGML_ASSERT(tensor != nullptr && "EAGLE3 extraction tensor is null"); + + // Get the backend where this tensor is stored + ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), tensor); + GGML_ASSERT(backend != nullptr && "EAGLE3 tensor has no backend"); + + // Verify tensor shape: should be [n_embd, n_tokens] + GGML_ASSERT(tensor->ne[0] == n_embd && tensor->ne[1] == n_tokens && + "EAGLE3 extraction tensor has unexpected shape"); + + // Get layer features to temp buffer + const size_t size_bytes = n_embd * n_tokens * sizeof(float); + ggml_backend_tensor_get_async(backend, tensor, temp_layer_features.data(), 0, size_bytes); + ggml_backend_sched_synchronize(sched.get()); + + // Then copy to correct position in target_features + // target_features layout: [token_0_all_layers, token_1_all_layers, ...] + // Each token has [layer_0_embd, layer_1_embd, layer_2_embd] + for (int64_t token_idx = 0; token_idx < n_tokens; ++token_idx) { + // Source: temp_layer_features[token_idx * n_embd ... 
(token_idx + 1) * n_embd - 1] + const float * src = temp_layer_features.data() + token_idx * n_embd; + // Dest: target_features[token_idx * n_embd_concat + layer_idx * n_embd] + float * dest = eagle3.target_features.data() + token_idx * n_embd_concat + layer_idx * n_embd; + std::memcpy(dest, src, n_embd * sizeof(float)); + } + } + +} + // // state save/load // @@ -2980,6 +3142,7 @@ llama_context_params llama_context_default_params() { /*.op_offload =*/ true, /*.swa_full =*/ true, /*.kv_unified =*/ false, + /*.target_model =*/ nullptr, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, }; @@ -2995,6 +3158,12 @@ llama_context * llama_init_from_model( return nullptr; } + // Auto-setup for EAGLE3: set target embedding if target_model is provided + if (model->arch == LLM_ARCH_EAGLE3 && params.target_model) { + model->target_tok_embd = params.target_model->tok_embd; + LLAMA_LOG_INFO("%s: EAGLE3 auto-setup: using target model's embedding layer\n", __func__); + } + if (params.n_batch == 0 && params.n_ubatch == 0) { LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__); return nullptr; @@ -3251,6 +3420,16 @@ int32_t llama_apply_adapter_cvec( return res ? 0 : -1; } +// +// eagle3 (tmp) +// + +void llama_set_eagle3( + llama_context * ctx, + const llama_model * model) { + ctx->set_eagle3(model); +} + // // memory // @@ -3698,3 +3877,33 @@ void llama_opt_epoch( callback_train, callback_eval); } + +// +// EAGLE3 member functions +// + +const float * llama_context::get_eagle3_target_features() const { + GGML_ASSERT(!eagle3.target_features.empty() && "EAGLE3 target features not extracted - call llama_encode() on target model first"); + return eagle3.target_features.data(); +} + +void llama_context::set_eagle3_g_embeddings(const float * g_embd, int32_t n_embd, int32_t n_tokens) { + GGML_ASSERT(g_embd != nullptr && "g_embeddings cannot be null"); + GGML_ASSERT(n_embd > 0 && n_tokens > 0 && "invalid dimensions"); + + const size_t size = n_embd * n_tokens; + eagle3.g_embeddings.resize(size); + std::memcpy(eagle3.g_embeddings.data(), g_embd, size * sizeof(float)); +} + +// +// C API wrappers +// + +const float * llama_get_eagle3_target_features(llama_context * ctx) { + return ctx->get_eagle3_target_features(); +} + +void llama_set_eagle3_g_embeddings(llama_context * ctx, const float * g_embd, int32_t n_embd, int32_t n_tokens) { + ctx->set_eagle3_g_embeddings(g_embd, n_embd, n_tokens); +} diff --git a/src/llama-context.h b/src/llama-context.h index 8e71cdd1dc5..90f1e1e4848 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -120,6 +120,9 @@ struct llama_context { int32_t il_start, int32_t il_end); + // TODO: tmp + void set_eagle3(const llama_model * model); + // process a single ubatch with a specific graph type // if memory_context is provided, it will be applied first to the context's memory // ret contains the status of the graph computation @@ -236,6 +239,12 @@ struct llama_context { ggml_cgraph * graph_reserve( uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr); + // EAGLE3: Get pointer to target model features extracted for EAGLE3 encoder + const float * get_eagle3_target_features() const; + + // EAGLE3: Set g_embeddings from encoder output for decoder input + void set_eagle3_g_embeddings(const float * g_embd, int32_t n_embd, int32_t n_tokens); + bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler); private: @@ -247,6 +256,9 @@ struct llama_context { llm_graph_cb graph_get_cb() 
const; + // EAGLE3: Extract intermediate layer features from target model + void extract_eagle3_features(const llama_ubatch & ubatch); + // TODO: read/write lora adapters and cvec size_t state_write_data(llama_io_write_i & io); size_t state_read_data (llama_io_read_i & io); @@ -266,6 +278,9 @@ struct llama_context { llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably + mutable llama_eagle3 eagle3; // EAGLE3 draft model support - stores features from target model + // mutable because it's modified during graph building (const function) + std::unique_ptr memory; // decode output (2-dimensional array: [n_outputs][n_vocab]) diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 2da3bbd6f94..4c9f528b245 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -35,6 +35,7 @@ struct llama_cparams { bool warmup; bool op_offload; bool kv_unified; + bool eagle3_extract_enabled; // enable layer extraction for EAGLE3 speculative decoding bool pipeline_parallel; enum llama_pooling_type pooling_type; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index bba747d37b5..d95216e4d0b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -848,6 +848,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : loras (params.loras), mctx (params.mctx), cross (params.cross), + eagle3 (params.eagle3), samplers (params.samplers), cb_func (params.cb), res (params.res), diff --git a/src/llama-graph.h b/src/llama-graph.h index 1d69ff1a6fc..07fa46c7df3 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -72,6 +72,30 @@ struct llama_cross { std::vector> seq_ids_enc; }; +// EAGLE3 support - stores intermediate features from target model +struct llama_eagle3 { + // Configuration: which layers to extract from target model + std::vector extract_layer_indices; + + // Extracted features from target model (for encoder input) + // Concatenated [layer_l, layer_m, layer_h] embeddings + // Shape: [n_layers * n_embd, n_tokens] where n_layers = extract_layer_indices.size() + std::vector target_features; + + // Encoder output (for decoder input) + std::vector g_embeddings; + + // Tensor references for feature extraction from target model + std::vector extract_tensors; + + // Clear all stored data + void clear() { + target_features.clear(); + g_embeddings.clear(); + extract_tensors.clear(); + } +}; + struct llm_graph_params; // @@ -533,6 +557,7 @@ struct llm_graph_params { const llama_adapter_loras * loras; const llama_memory_context_i * mctx; const llama_cross * cross; + llama_eagle3 * eagle3; // non-const: we write extracted features here std::map samplers; @@ -741,6 +766,7 @@ struct llm_graph_context { const llama_adapter_loras * loras; const llama_memory_context_i * mctx; const llama_cross * cross; + llama_eagle3 * eagle3; // non-const: we write extracted features here std::map samplers; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6c695bdbf66..2d3dc1fbf58 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -197,6 +197,16 @@ struct llama_hparams { // qwen3vl deepstack uint32_t n_deepstack_layers = 0; + // EAGLE3 draft model - layer indices to extract from target model + // e.g., for 32-layer target: [2, 16, 29] (low, middle, high) + std::array eagle3_extract_layers = {0, 0, 0}; + + // EAGLE3 draft model - target model hidden size + uint32_t eagle3_target_hidden_size = 0; + + // EAGLE3 draft model - apply hidden_norm before storing residual + bool eagle3_norm_before_residual = false; + // needed by encoder-decoder 
models (e.g. T5, FLAN-T5) // ref: https://github.com/ggml-org/llama.cpp/pull/8141 llama_token dec_start_token_id = LLAMA_TOKEN_NULL; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 674d06c8910..b8659276918 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2372,6 +2372,35 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_EAGLE3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // EAGLE3 layer extraction configuration + // Use array (has template instantiation), then copy first 3 elements + std::array extract_layers_tmp = {}; + if (!ml.get_key_or_arr(LLM_KV_EAGLE3_EXTRACT_LAYERS, extract_layers_tmp, 3, false)) { + throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata"); + } + std::copy_n(extract_layers_tmp.begin(), 3, hparams.eagle3_extract_layers.begin()); + LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__, + hparams.eagle3_extract_layers[0], + hparams.eagle3_extract_layers[1], + hparams.eagle3_extract_layers[2]); + + // EAGLE3 target model hidden size + ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.eagle3_target_hidden_size); + LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__, + hparams.eagle3_target_hidden_size, hparams.n_embd); + + // EAGLE3 norm_before_residual (optional, default false) + // compatible with Readhat eagle3 speculator model + ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.eagle3_norm_before_residual, false); + if (hparams.eagle3_norm_before_residual) { + LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__); + } + + type = LLM_TYPE_UNKNOWN; + } break; case LLM_ARCH_COGVLM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -6816,6 +6845,64 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0); } } break; + case LLM_ARCH_EAGLE3: + { + const int64_t n_embd_target_features = 3 * hparams.eagle3_target_hidden_size; + const int64_t n_embd_attn_input = 2 * n_embd; + + // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if EAGLE3 has different vocab_size than target) + // d2t: draft to target vocabulary mapping + int64_t n_draft_vocab = n_vocab; // Default: same as target vocab + const struct ggml_tensor * d2t_meta = ml.get_tensor_meta("d2t"); + if (d2t_meta) { + n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size + d2t = create_tensor(tn(LLM_TENSOR_EAGLE3_D2T), {n_draft_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); + } else { + d2t = nullptr; // no d2t, use default vocab size + LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); + } + + // Feature fusion layer: projects 3 target layers to draft hidden size + fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_target_features, n_embd}, 0); + + // Output layer (uses draft vocab size) + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, 0); + + // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own) + const struct ggml_tensor * tok_embd_meta = ml.get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str()); + if (tok_embd_meta) { + 
const int64_t n_target_vocab = tok_embd_meta->ne[1]; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab); + } + + // Single decoder layer + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + // input_layernorm: applied to token embeddings + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // Attention takes input_embeds_normed + fused_target_normed as input + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + // EAGLE-3 specific: hidden_norm applied to fused target features + layer.eagle3_hidden_norm = create_tensor(tn(LLM_TENSOR_EAGLE3_HIDDEN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // rope_freqs for llama3 rope scaling (optional - only if EAGLE3 config has rope_scaling) + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED); + } + } break; case LLM_ARCH_KIMI_LINEAR: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -8331,6 +8418,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_EAGLE3: + { + if (params.gtype == LLM_GRAPH_TYPE_ENCODER) { + llm = std::make_unique(*this, params); + } else { + llm = std::make_unique(*this, params); + } + } break; case LLM_ARCH_COGVLM: { llm = std::make_unique(*this, params); @@ -8540,6 +8635,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_EAGLE3: case LLM_ARCH_LLAMA_EMBED: case LLM_ARCH_MAINCODER: return LLAMA_ROPE_TYPE_NORM; diff --git a/src/llama-model.h b/src/llama-model.h index 7b580043b33..674ba228f52 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -413,6 +413,9 @@ struct llama_layer { struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_eps = nullptr; + // eagle3 + struct ggml_tensor * eagle3_hidden_norm = nullptr; + // Kimi Linear KDA (using ssm_ prefix for consistency) // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias struct ggml_tensor * ssm_q_conv = nullptr; @@ -474,6 +477,13 @@ struct llama_model { struct ggml_tensor * per_layer_model_proj = nullptr; struct ggml_tensor * per_layer_proj_norm = nullptr; + // eagle3 + struct ggml_tensor * fc = nullptr; // feature fusion layer + struct ggml_tensor * d2t = nullptr; // draft to target vocabulary mapping + // Reference to target model's embedding layer + // This allows EAGLE3 to use target model's embeddings without copying + struct ggml_tensor * target_tok_embd = nullptr; + std::vector layers; //Dense linear projections for 
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
new file mode 100644
index 00000000000..c3ef2ed4bfe
--- /dev/null
+++ b/src/models/eagle3.cpp
@@ -0,0 +1,186 @@
+#include "models.h"
+
+ggml_tensor * llm_build_eagle3_encode::build_inp_embd() const {
+    const int64_t n_embd_target_features = 3 * hparams.eagle3_target_hidden_size;
+
+    ggml_tensor * cur = nullptr;
+
+    // Input: target model features (3 layers concatenated: low, mid, high)
+    // Data will be provided via ubatch->embd in encode_eagle3_features()
+    auto inp_target = std::make_unique(n_embd_target_features);
+    inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_target_features, n_tokens);
+    ggml_set_input(inp_target->embd);
+
+    cur = inp_target->embd;
+    cb(cur, "inp_embd", -1);
+
+    res->add_input(std::move(inp_target));
+
+    return cur;
+}
+
+// EAGLE3 encoder: processes target model features through the feature fusion layer
+// Input:  target_features, e.g. [12288, n_tokens] from the target model layers low, middle, high
+// Output: g_embeddings,    e.g. [4096, n_tokens], stored in the context
+llm_build_eagle3_encode::llm_build_eagle3_encode(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    ggml_tensor * cur = nullptr;
+
+    cur = build_inp_embd();
+
+    // Feature fusion layer
+    cur = build_lora_mm(model.fc, cur);
+    cb(cur, "fc_out", -1);
+
+    // Output: g_embeddings, e.g. [4096, n_tokens]
+    res->t_embd = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+// EAGLE3 decoder: processes draft tokens using the g_embeddings from the encoder
+// Input:  draft tokens + g_embeddings from the encoder
+// Output: draft logits
+llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_layer == 1); // EAGLE-3 has only one decoder layer
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    // The EAGLE3 decoder receives:
+    //   1. token embeddings (e.g. from EAGLE3's own tok_embd for Llama 3.3 70B, or from the target model for Llama 3.1 8B)
+    //   2. g_embeddings from the encoder
+    // Choose token_embd_eagle3: prefer EAGLE3's own if available (Llama 3.3 70B), else use the target's (Llama 3.1 8B)
+    ggml_tensor * token_embd_eagle3 = (model.tok_embd != nullptr) ? model.tok_embd : model.target_tok_embd;
+    GGML_ASSERT(token_embd_eagle3 != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
+    ggml_tensor * inp_embd = build_inp_embd(token_embd_eagle3);
+    cb(inp_embd, "inp_embd", -1);
+
+    // TODO: refactor into llm_graph_input
+    ggml_tensor * inp_g = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+    ggml_set_input(inp_g);
+    cb(inp_g, "inp_g_embeddings", -1); // TODO: do not change the name! refactor into llm_graph_input
+
+    inpL = inp_g;
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    // Single decoder layer (il = 0)
+    const int il = 0;
+    {
+        // Apply input_layernorm to the token embeddings
+        ggml_tensor * embd_norm = build_norm(inp_embd,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(embd_norm, "embd_norm", il);
+
+        // Apply hidden_norm to inp_g
+        ggml_tensor * g_norm = build_norm(inp_g,
+                model.layers[il].eagle3_hidden_norm, NULL,
+                LLM_NORM_RMS, -1);
+        cb(g_norm, "g_norm", il);
+
+        // norm_before_residual: determines what goes into the residual connection (compatible with Red Hat EAGLE3 speculator models)
+        // - false (default): use the raw inp_g for the residual
+        // - true:            use the normalized g_norm for the residual
+        // at this point inpL still holds the raw inp_g (the fused target features)
+        ggml_tensor * inpSA = hparams.eagle3_norm_before_residual ? g_norm : inpL;
+
+        // Concatenate the normalized inp_embd and the normalized inp_g
+        cur = ggml_concat(ctx0, embd_norm, g_norm, il);
+        cb(cur, "concat_embd", il);
+
+        // Self-attention with the concatenated input
+        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+        cb(Qcur, "Qcur", il);
+
+        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+        cb(Kcur, "Kcur", il);
+
+        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+        cb(Vcur, "Vcur", il);
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+        // rope freq factors, returns nullptr if not available
+        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+        // RoPE
+        Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+                );
+        Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+        cb(Qcur, "Qcur_rope", il);
+        cb(Kcur, "Kcur_rope", il);
+
+        cur = build_attn(inp_attn,
+                model.layers[il].wo, NULL,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+
+        if (inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        // Add the residual
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // Apply the FFN norm to the sum
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "post_attn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,   NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        // Output norm with residual
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "eagle3_prenorm", il);
+
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // Output the pre-norm state (used as the next token's g_embeddings in autoregressive generation)
+    ggml_set_output(cur);
+    res->t_embd = cur;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+
+    // lm_head - projects to the draft vocabulary
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
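To make the effect of norm_before_residual concrete: the attention/FFN path always sees the normalized tensors, and the flag only selects which tensor is added back as the residual. A toy, self-contained illustration with plain scalars (purely illustrative, not ggml code):

    // standalone illustration, not part of the patch
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // toy RMSNorm over a small vector (eps plays the role of f_norm_rms_eps)
    static std::vector<float> rms_norm(const std::vector<float> & x, float eps = 1e-5f) {
        float ss = 0.0f;
        for (float v : x) {
            ss += v*v;
        }
        const float scale = 1.0f/std::sqrt(ss/x.size() + eps);
        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            y[i] = x[i]*scale;
        }
        return y;
    }

    int main() {
        const std::vector<float> g = {0.5f, -2.0f, 3.0f}; // stands in for inp_g (fused target features)
        const std::vector<float> g_norm = rms_norm(g);

        const bool norm_before_residual = false; // default, as loaded in llama-model.cpp above
        const std::vector<float> & residual = norm_before_residual ? g_norm // Red Hat style speculators
                                                                   : g;     // default: raw inp_g
        printf("residual[0] = %f\n", residual[0]);
        return 0;
    }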
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index 42b5fcdf42e..3bccb2f902f 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -31,6 +31,16 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_gra
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;

+        // EAGLE3: Extract intermediate layer features from target model at layer INPUT
+        if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) {
+            static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"};
+            for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) {
+                if (eagle3->extract_layer_indices[i] == il) {
+                    cb(inpL, eagle3_extract_names[i], il);
+                    break;
+                }
+            }
+        }
         // norm
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, NULL,
diff --git a/src/models/models.h b/src/models/models.h
index cfcbb9aaa5b..120b5b22cc7 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -150,6 +150,16 @@ struct llm_build_dream : public llm_graph_context {
     llm_build_dream(const llama_model & model, const llm_graph_params & params);
 };

+struct llm_build_eagle3_encode : public llm_graph_context {
+    llm_build_eagle3_encode(const llama_model & model, const llm_graph_params & params);
+private:
+    ggml_tensor * build_inp_embd() const;
+};
+
+struct llm_build_eagle3_decode : public llm_graph_context {
+    llm_build_eagle3_decode(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_ernie4_5 : public llm_graph_context {
     llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params);
 };
diff --git a/src/models/openai-moe-iswa.cpp b/src/models/openai-moe-iswa.cpp
index dbe3ca1851f..527e8967c51 100644
--- a/src/models/openai-moe-iswa.cpp
+++ b/src/models/openai-moe-iswa.cpp
@@ -19,6 +19,17 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,

         ggml_tensor * inpSA = inpL;

+        // EAGLE3: Extract intermediate layer features from target model at layer INPUT
+        if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) {
+            static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"};
+            for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) {
+                if (eagle3->extract_layer_indices[i] == il) {
+                    cb(inpL, eagle3_extract_names[i], il);
+                    break;
+                }
+            }
+        }
+
         // norm
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, nullptr,
diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp
index a5cfffa5314..c1f34624c03 100644
--- a/src/models/qwen3.cpp
+++ b/src/models/qwen3.cpp
@@ -21,6 +21,17 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;

+        // EAGLE3: Extract intermediate layer features from target model at layer INPUT
+        if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) {
+            static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"};
+            for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) {
+                if (eagle3->extract_layer_indices[i] == il) {
+                    cb(inpL, eagle3_extract_names[i], il);
+                    break;
+                }
+            }
+        }
+
         // norm
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, NULL,
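These per-architecture hooks only tag the per-layer inputs with the names eagle3_extract_0/1/2 through the graph callback; how the speculative loop collects those activations (and how cparams.eagle3_extract_enabled is wired up) is not shown in these hunks. One way a consumer could pick them up is via the existing ggml backend-scheduler eval callback; the buffer type below and its use are assumptions of this sketch, not part of the patch:

    // standalone sketch, not part of the patch
    #include <cstring>
    #include <vector>

    #include "ggml.h"
    #include "ggml-backend.h"

    // one slot per extracted layer (low, mid, high); the sketch assumes F32 activations
    struct eagle3_feature_buf {
        std::vector<float> data[3];
    };

    static bool eagle3_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
        const char * name = ggml_get_name(t);
        const bool is_extract = strncmp(name, "eagle3_extract_", 15) == 0; // prefix match also covers "-il" suffixes
        if (ask) {
            return is_extract; // only observe the tagged tensors
        }
        if (is_extract && t->type == GGML_TYPE_F32) {
            auto * buf = (eagle3_feature_buf *) user_data;
            const int slot = name[15] - '0';
            if (slot >= 0 && slot < 3) {
                buf->data[slot].resize(ggml_nelements(t));
                ggml_backend_tensor_get(t, buf->data[slot].data(), 0, ggml_nbytes(t));
            }
        }
        return true; // continue graph evaluation
    }

    // usage sketch: ggml_backend_sched_set_eval_callback(sched, eagle3_eval_cb, &buf);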
diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp
index 888534fb347..c0b6ff5df97 100644
--- a/src/models/qwen3moe.cpp
+++ b/src/models/qwen3moe.cpp
@@ -21,6 +21,17 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;

+        // EAGLE3: Extract intermediate layer features from target model at layer INPUT
+        if (eagle3 && cparams.eagle3_extract_enabled && !eagle3->extract_layer_indices.empty()) {
+            static const char * eagle3_extract_names[] = {"eagle3_extract_0", "eagle3_extract_1", "eagle3_extract_2"};
+            for (size_t i = 0; i < eagle3->extract_layer_indices.size() && i < 3; ++i) {
+                if (eagle3->extract_layer_indices[i] == il) {
+                    cb(inpL, eagle3_extract_names[i], il);
+                    break;
+                }
+            }
+        }
+
         // norm
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, NULL,