diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md
index 06d97ae78ee..197173faed8 100644
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -16,12 +16,12 @@ Pull requests (PRs):
 - New branch names are prefixed with "gg/"
 - Before opening a pull request, ask the user to confirm the description
 - When creating a pull request, look for the repository's PR template and follow it
-- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
+- For the AI usage disclosure section, write "YES. pi:llama.cpp/[MODEL]"
 - Ask the user to tell you what model was used and write it in place of [MODEL]
 - Always create the pull requests in draft mode
 
 Commits:
-- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
+- On every commit that you make, include an "Assisted-by: pi:llama.cpp/[MODEL]" tag, with [MODEL] replaced by the model that was used
 - Do not explicitly set the git author in commits - rely on the default git config
 - Always use `--no-gpg-sign` when committing
 - Never `git push` without explicit confirmation from the user
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 4a1aaa955a8..3e0fe66afff 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -41,7 +41,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             ggml_init_params params = {
-                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
+                /*.mem_size   =*/ hparams.n_layer()*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
             };
@@ -61,9 +61,9 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     };
 
     // make tensors
-    tensors.reserve(hparams.n_layer);
+    tensors.reserve(hparams.n_layer());
     tensors.push_back(nullptr); // there's never a tensor for layer 0
-    for (size_t il = 1; il < hparams.n_layer; il++) {
+    for (size_t il = 1; il < hparams.n_layer(); il++) {
         ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
@@ -121,7 +121,7 @@ bool llama_adapter_cvec::apply(
     layer_start = il_start;
     layer_end   = il_end;
 
-    for (size_t il = 1; il < hparams.n_layer; il++) {
+    for (size_t il = 1; il < hparams.n_layer(); il++) {
         assert(tensors[il] != nullptr);
 
         const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index f59381a4d75..eff1d8f89f2 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -341,7 +341,7 @@ llama_context::llama_context(
         // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
         bool pipeline_parallel =
             model.n_devices() > 1 &&
-            model.n_gpu_layers() > model.hparams.n_layer &&
+            model.n_gpu_layers() > model.hparams.n_layer() &&
             model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
             cparams.offload_kqv &&
             !model.has_tensor_overrides();
@@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
 
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
-        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
+        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
         if (ubatch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 const auto & dev_layer = model.dev_layer(il);
@@ -3416,7 +3416,7 @@ llama_context * llama_init_from_model(
 
     if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
         const uint32_t blck_size = ggml_blck_size(params.type_k);
-        for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+        for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
             if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
                 LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
                     __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
@@ -3427,7 +3427,7 @@ llama_context * llama_init_from_model(
 
     if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) {
         const uint32_t blck_size = ggml_blck_size(params.type_v);
-        for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+        for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
             if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
                 LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
                     __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
@@ -3449,7 +3449,7 @@ llama_context * llama_init_from_model(
     }
 
     if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
-        model->hparams.nextn_predict_layers == 0) {
+        model->hparams.n_layer_nextn == 0) {
         LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__);
         return nullptr;
     }
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index f910528d21b..172edf24cb1 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1005,7 +1005,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     cparams          (params.cparams),
     ubatch           (params.ubatch),
     n_embd           (hparams.n_embd),
-    n_layer          (hparams.n_layer),
+    n_layer          (hparams.n_layer()),
     n_rot            (hparams.n_rot()),
     n_ctx            (cparams.n_ctx),
     n_head           (hparams.n_head()),
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 087afec55c6..e1e49d1cc1f 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -7,31 +7,38 @@
 
 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
     if (dense_first) {
-        for (uint32_t il = 0; il < n_layer; ++il) {
+        for (uint32_t il = 0; il < n_layer(); ++il) {
             is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
         }
     } else {
-        for (uint32_t il = 0; il < n_layer; ++il) {
+        for (uint32_t il = 0; il < n_layer(); ++il) {
             is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
         }
     }
+
+    for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
+        is_swa_impl[il] = false;
+    }
 }
 
-// TODO: implement
-//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
-//    if (dense_first) {
-//        for (uint32_t il = 0; il < n_layer; ++il) {
-//            is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
-//        }
-//    } else {
-//        for (uint32_t il = 0; il < n_layer; ++il) {
-//            is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
-//        }
-//    }
-//}
+void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
+    if (dense_first) {
+        for (uint32_t il = 0; il < n_layer(); ++il) {
+            is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
+        }
+    } else {
+        for (uint32_t il = 0; il < n_layer(); ++il) {
+            is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+        }
+    }
+
+    for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
+        is_recr_impl[il] = false;
+    }
+}
 
 bool llama_hparams::is_swa_any() const {
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (uint32_t il = 0; il < n_layer_all; ++il) {
         if (is_swa_impl[il]) {
             return true;
         }
@@ -41,7 +48,7 @@ bool llama_hparams::is_swa_any() const {
 }
 
 uint32_t llama_hparams::n_head(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
         return n_head_arr[il];
     }
 
@@ -49,7 +56,7 @@ uint32_t llama_hparams::n_head(uint32_t il) const {
 }
 
 uint32_t llama_hparams::n_head_kv(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
         return n_head_kv_arr[il];
     }
 
@@ -57,7 +64,7 @@ uint32_t llama_hparams::n_head_kv(uint32_t il) const {
 }
 
 uint32_t llama_hparams::n_ff(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
         return n_ff_arr[il];
     }
 
@@ -76,7 +83,7 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
 }
 
 uint32_t llama_hparams::n_rot(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
         return is_swa(il) ? n_rot_swa : n_rot_full;
     }
 
@@ -98,7 +105,7 @@ uint32_t llama_hparams::n_embd_out() const {
 }
 
 uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
         return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
     }
 
@@ -106,7 +113,7 @@ uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
 }
 
 uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
         return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
     }
 
@@ -127,7 +134,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
 
 bool llama_hparams::is_n_embd_k_gqa_variable() const {
     const uint32_t val = n_embd_k_gqa();
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (uint32_t il = 0; il < n_layer_all; ++il) {
         if (val != n_embd_k_gqa(il)) {
             return true;
         }
@@ -138,7 +145,7 @@ bool llama_hparams::is_n_embd_k_gqa_variable() const {
 
 bool llama_hparams::is_n_embd_v_gqa_variable() const {
     const uint32_t val = n_embd_v_gqa();
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (uint32_t il = 0; il < n_layer_all; ++il) {
         if (val != n_embd_v_gqa(il)) {
             return true;
         }
@@ -149,7 +156,7 @@ bool llama_hparams::is_n_embd_v_gqa_variable() const {
 
 uint32_t llama_hparams::n_embd_k_gqa_max() const {
     uint32_t val = n_embd_k_gqa();
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (uint32_t il = 0; il < n_layer_all; ++il) {
         val = std::max(val, n_embd_k_gqa(il));
     }
 
@@ -158,7 +165,7 @@ uint32_t llama_hparams::n_embd_k_gqa_max() const {
 
 uint32_t llama_hparams::n_embd_v_gqa_max() const {
     uint32_t val = n_embd_v_gqa();
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (uint32_t il = 0; il < n_layer_all; ++il) {
         val = std::max(val, n_embd_v_gqa(il));
     }
 
@@ -207,11 +214,11 @@ uint32_t llama_hparams::n_embd_s() const {
 }
 
 bool llama_hparams::is_recr(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
         return is_recr_impl[il];
     }
 
-    GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
+    GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
 }
 
 uint32_t llama_hparams::n_pos_per_embd() const {
@@ -219,11 +226,11 @@ uint32_t llama_hparams::n_pos_per_embd() const {
 }
 
 bool llama_hparams::is_swa(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
         return is_swa_impl[il];
     }
 
-    GGML_ABORT("fatal error");
+    GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
 }
 
 bool llama_hparams::is_mla() const {
@@ -242,12 +249,6 @@ uint32_t llama_hparams::n_embd_head_v_mla() const {
 }
 
 bool llama_hparams::has_kv(uint32_t il) const {
-    if (kv_only_nextn) {
-        // MTP head: only the trailing nextn_predict_layers blocks own a KV cache;
-        // the leading trunk blocks are not executed in this graph.
-        return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers);
-    }
-
     if (n_layer_kv_from_start >= 0) {
         if (il < (uint32_t) n_layer_kv_from_start) {
             return true;
@@ -260,16 +261,13 @@ bool llama_hparams::has_kv(uint32_t il) const {
     return true;
 }
 
-uint32_t llama_hparams::n_layer_kv() const {
-    uint32_t res = 0;
-
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        if (has_kv(il)) {
-            res++;
-        }
-    }
-
-    return res;
+// number of effective layers: all blocks minus the nextn blocks
+uint32_t llama_hparams::n_layer() const {
+    // both counts are unsigned - a nextn count above the total would wrap around to a
+    // huge value and send every `il < n_layer()` loop past the per-layer arrays
+    GGML_ASSERT(n_layer_nextn <= n_layer_all);
+
+    return n_layer_all - n_layer_nextn;
 }
 
 bool llama_hparams::use_mrope() const {
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index e8ed4dd74de..fde6183e878 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -48,12 +48,15 @@ struct llama_hparams {
 
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
-    uint32_t n_layer;
-    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
+    uint32_t n_layer_all;
+    uint32_t n_layer_nextn = 0;
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
     uint32_t n_rel_attn_bkts = 0;
 
+    // TODO: this needs to be reworked
+    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
+
     // different head size for full_attention and SWA layers
     uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
@@ -96,9 +99,6 @@ struct llama_hparams {
     uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
     uint32_t moe_every_n_layers   = 0;
     uint32_t moe_latent_size      = 0;
-    uint32_t nextn_predict_layers = 0;
-
-    bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)
 
     float f_norm_eps;
     float f_norm_rms_eps;
@@ -272,8 +272,7 @@ struct llama_hparams {
 
     bool is_swa(uint32_t il) const;
 
-    // TODO: implement
-    //void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);
+    void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);
 
     // whether or not the given layer is recurrent (for hybrid models)
     bool is_recr(uint32_t il) const;
@@ -329,8 +328,8 @@ struct llama_hparams {
 
     bool has_kv(uint32_t il) const;
 
-    // number of layers for which has_kv() returns true
-    uint32_t n_layer_kv() const;
+    // number of effective layers (excludes nextn layers)
+    uint32_t n_layer() const;
 
     // note that this function uses different SWA parameters from those in the hparams
     // note: inlined on purpose for performance reasons
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 82da38e0b61..60ae42e3786 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -97,7 +97,7 @@ llama_kv_cache::llama_kv_cache(
 
     GGML_ASSERT(kv_size % n_pad == 0);
 
-    const uint32_t n_layer_kv = hparams.n_layer_kv();
+    const uint32_t n_layer = hparams.n_layer_all;
 
     // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
     struct ggml_backend_buft_comparator {
@@ -112,7 +112,7 @@ llama_kv_cache::llama_kv_cache(
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             ggml_init_params params = {
-                /*.mem_size   =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
+                /*.mem_size   =*/ size_t(2u*(1 + n_stream)*n_layer*ggml_tensor_overhead()),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
             };
@@ -160,7 +160,7 @@ llama_kv_cache::llama_kv_cache(
 
     const bool is_mla = hparams.is_mla();
 
-    for (uint32_t il = 0; il < hparams.n_layer; il++) {
+    for (uint32_t il = 0; il < n_layer; il++) {
         if (!hparams.has_kv(il)) {
             LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
             continue;
@@ -230,7 +230,7 @@ llama_kv_cache::llama_kv_cache(
     if (reuse) {
         LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);
 
-        for (uint32_t il = 0; il < hparams.n_layer; il++) {
+        for (uint32_t il = 0; il < n_layer; il++) {
             const int32_t il_reuse = reuse(il);
 
             if (il_reuse < 0) {
diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp
index ec5dc5835dd..6a4892fb471 100644
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -26,7 +26,7 @@ llama_memory_recurrent::llama_memory_recurrent(
                  uint32_t   n_seq_max,
                  uint32_t   n_rs_seq,
     const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
-    const int32_t n_layer = hparams.n_layer;
+    const int32_t n_layer = hparams.n_layer();
 
     head = 0;
     size = mem_size;
@@ -863,7 +863,7 @@ void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std::
 
 void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
     const uint32_t s_trans = 0;
-    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_layer = hparams.n_layer();
 
     io.write(&s_trans, sizeof(s_trans));
     io.write(&n_layer, sizeof(n_layer));
@@ -1047,8 +1047,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
     io.read(&s_trans, sizeof(s_trans));
     io.read(&n_layer, sizeof(n_layer));
 
-    if (n_layer != hparams.n_layer) {
-        LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+    if (n_layer != hparams.n_layer()) {
+        LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer());
         return false;
     }
     if (cell_count > size) {
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 4d7b11067c9..ba08a19ac76 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -1050,10 +1050,10 @@ struct ggml_tensor * llama_model_loader::create_tensor(
         if (it == ctx_map.end()) {
             // one ggml context per buffer type
             int max_n_tensors = n_tensors;
-            max_n_tensors += 1;                 // duplicated output tensor
-            max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors
+            max_n_tensors += 1;                     // duplicated output tensor
+            max_n_tensors += hparams.n_layer_all*2; // duplicated rope freq tensors (upper bound over every block, nextn included)
             if (files.empty()) {
-                max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses
+                max_n_tensors += hparams.n_layer_all*256; // this should be well above what any model actually uses
             }
             const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
 
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
index 26fda1abfae..b0522878090 100644
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -77,7 +77,7 @@ void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
 template <typename Container>
 void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
     GGML_ASSERT(model != nullptr || !per_layer);
-    const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size();
+    const size_t n_values = per_layer ? size_t(model->hparams.n_layer_all) : value.size(); // per-layer arrays cover every block (nextn included), matching the saved block count
     GGML_ASSERT(n_values <= value.size());
 
     if (n_values == 0) {
@@ -206,7 +206,7 @@ void llama_model_saver::add_kv_from_model() {
     if (hparams.n_embd_out_impl > 0) {
         add_kv(LLM_KV_EMBEDDING_LENGTH_OUT,          hparams.n_embd_out_impl);
     }
-    add_kv(LLM_KV_BLOCK_COUNT,                       hparams.n_layer);
+    add_kv(LLM_KV_BLOCK_COUNT,                       hparams.n_layer_all);
     add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
     add_kv(LLM_KV_FEED_FORWARD_LENGTH,               hparams.n_ff_arr, true);
     add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
@@ -227,7 +227,7 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_EXPERT_GROUP_SCALE,                hparams.expert_group_scale);
     add_kv(LLM_KV_EXPERTS_PER_GROUP,                 hparams.n_group_experts);
     add_kv(LLM_KV_MOE_EVERY_N_LAYERS,                hparams.moe_every_n_layers);
-    add_kv(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers);
+    add_kv(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.n_layer_nextn);
     add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS,              hparams.n_deepstack_layers);
     add_kv(LLM_KV_POOLING_TYPE,                      uint32_t(hparams.pooling_type));
     add_kv(LLM_KV_LOGIT_SCALE,                       hparams.f_logit_scale);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index bc7a83b15f5..c98cb27e4d4 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -398,7 +398,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
             rotation = get_il_eff(il) % ud->n_devices;
         } else {
             il = 0;
-            rotation = hparams.n_layer % ud->n_devices;
+            rotation = hparams.n_layer() % ud->n_devices;
         }
         const ggml_tensor * tensor_axis_0 = suffix.empty() ? tensor : ud->model->get_tensor((prefix + suffix).c_str());
         if (tensor_axis_0 == nullptr) {
@@ -1034,7 +1034,7 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT,    hparams.n_embd_out_impl, false);
     ml.get_key(LLM_KV_ATTENTION_CAUSAL,        hparams.causal_attn,     false);
     ml.get_key(LLM_KV_POOLING_TYPE,            hparams.pooling_type,    false);
-    ml.get_key(LLM_KV_BLOCK_COUNT,             hparams.n_layer);
+    ml.get_key(LLM_KV_BLOCK_COUNT,             hparams.n_layer_all);
     ml.get_key(LLM_KV_EXPERT_COUNT,            hparams.n_expert,        false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT,       hparams.n_expert_used,   false);
     ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
@@ -1089,13 +1089,13 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
     std::fill(hparams.swiglu_clamp_exp.begin(),   hparams.swiglu_clamp_exp.end(),   0.0f);
     std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);
 
-    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
-    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer_all, false); // the per-layer accessors index these arrays up to n_layer_all
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer_all, false);
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
 
-    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer_all, false);
 
     bool rope_finetuned = false;
     ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
@@ -1194,7 +1194,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
     const auto & use_mlock    = params.use_mlock;
     const auto & tensor_split = params.tensor_split;
 
-    const int n_layer      = hparams.n_layer;
+    const int n_layer = hparams.n_layer_all;
     const int n_gpu_layers = this->n_gpu_layers();
 
     const bool use_mmap_buffer = true;
@@ -1251,10 +1251,10 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
         splits[i] /= split_sum;
     }
 
-    const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
-    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
+    const int i_gpu_start = std::max(n_layer + 1 - n_gpu_layers, 0);
+    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
-        const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
+        const bool is_swa = il < n_layer && hparams.is_swa(il);
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
             LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
             return {cpu_dev, &pimpl->cpu_buft_list};
@@ -1557,7 +1557,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
     }
 
     if (llama_supports_gpu_offload()) {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+        const int n_gpu = std::min(n_gpu_layers, n_layer);
 
         int n_repeating = n_gpu;
         if (n_repeating > 0) {
@@ -1566,8 +1566,8 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
         }
         LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
 
-        const int max_backend_supported_layers = hparams.n_layer + 1;
-        const int max_offloadable_layers       = hparams.n_layer + 1;
+        const int max_backend_supported_layers = n_layer + 1;
+        const int max_offloadable_layers       = n_layer + 1;
 
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     }
@@ -1636,7 +1636,7 @@ const float * llama_model::tensor_split() const {
 }
 
 uint32_t llama_model::n_gpu_layers() const {
-    return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+    return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer_all + 1; // default = offload everything: load_tensors() counts n_layer_all + 1 offloadable layers
 }
 
 llama_split_mode llama_model::split_mode() const {
@@ -1707,17 +1707,17 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ctx_train           = %u\n",     __func__, hparams.n_ctx_train);
         LLAMA_LOG_INFO("%s: n_embd                = %u\n",     __func__, hparams.n_embd);
         LLAMA_LOG_INFO("%s: n_embd_inp            = %u\n",     __func__, hparams.n_embd_inp());
-        LLAMA_LOG_INFO("%s: n_layer               = %u\n",     __func__, hparams.n_layer);
-        LLAMA_LOG_INFO("%s: n_head                = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_head_kv             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_layer               = %u\n",     __func__, hparams.n_layer());
+        LLAMA_LOG_INFO("%s: n_head                = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer()).c_str());
+        LLAMA_LOG_INFO("%s: n_head_kv             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer()).c_str());
         LLAMA_LOG_INFO("%s: n_rot                 = %u\n",     __func__, hparams.n_rot_full);
         LLAMA_LOG_INFO("%s: n_swa                 = %u\n",     __func__, hparams.n_swa);
         LLAMA_LOG_INFO("%s: is_swa_any            = %u\n",     __func__, hparams.is_swa_any());
         LLAMA_LOG_INFO("%s: n_embd_head_k         = %u\n",     __func__, hparams.n_embd_head_k_full);
         LLAMA_LOG_INFO("%s: n_embd_head_v         = %u\n",     __func__, hparams.n_embd_head_v_full);
-        LLAMA_LOG_INFO("%s: n_gqa                 = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_embd_k_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_embd_v_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_gqa                 = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer()).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_k_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer()).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_v_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer()).c_str());
         LLAMA_LOG_INFO("%s: f_norm_eps            = %.1e\n",   __func__, hparams.f_norm_eps);
         LLAMA_LOG_INFO("%s: f_norm_rms_eps        = %.1e\n",   __func__, hparams.f_norm_rms_eps);
         LLAMA_LOG_INFO("%s: f_clamp_kqv           = %.1e\n",   __func__, hparams.f_clamp_kqv);
@@ -1725,7 +1725,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: f_logit_scale         = %.1e\n",   __func__, hparams.f_logit_scale);
         LLAMA_LOG_INFO("%s: f_attn_scale          = %.1e\n",   __func__, hparams.f_attention_scale);
         LLAMA_LOG_INFO("%s: f_attn_value_scale    = %.4f\n",   __func__, hparams.f_attn_value_scale);
-        LLAMA_LOG_INFO("%s: n_ff                  = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_ff                  = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer()).c_str());
         LLAMA_LOG_INFO("%s: n_expert              = %u\n",     __func__, hparams.n_expert);
         LLAMA_LOG_INFO("%s: n_expert_used         = %u\n",     __func__, hparams.n_expert_used);
         LLAMA_LOG_INFO("%s: n_expert_groups       = %d\n",     __func__, hparams.n_expert_groups);
@@ -1852,7 +1852,7 @@ void llama_model::print_info() const {
             LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
             LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
             LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-            LLAMA_LOG_INFO("%s: nextn_predict_layers  = %d\n",     __func__, hparams.nextn_predict_layers);
+            LLAMA_LOG_INFO("%s: n_layer_nextn         = %u\n",     __func__, hparams.n_layer_nextn);
         }
 
         if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
@@ -2034,22 +2034,21 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
                     llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
                     if (arch == LLM_ARCH_FALCON_H1) {
-                        filter_attn = [&](int32_t) { return true; };
-                        filter_recr = [&](int32_t) { return true; };
+                        filter_attn = [&](uint32_t) { return true; };
+                        filter_recr = [&](uint32_t) { return true; };
                     } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
-                        filter_attn = [&](int32_t il) {
+                        filter_attn = [&](uint32_t il) {
                             return !hparams.is_recr(il) && hparams.n_ff(il) == 0;
                         };
-                        filter_recr = [&](int32_t il) {
+                        filter_recr = [&](uint32_t il) {
                             return hparams.is_recr(il) && hparams.n_ff(il) == 0;
                         };
                     } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) {
-                        const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
-                        filter_attn = [&, n_main](int32_t il) {
-                            return (uint32_t)il < n_main && !hparams.is_recr(il);
+                        filter_attn = [&](uint32_t il) {
+                            return il < hparams.n_layer() && !hparams.is_recr(il);
                         };
-                        filter_recr = [&, n_main](int32_t il) {
-                            return (uint32_t)il < n_main && hparams.is_recr(il);
+                        filter_recr = [&](uint32_t il) {
+                            return il < hparams.n_layer() && hparams.is_recr(il);
                         };
                     }
 
@@ -2098,9 +2097,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     llama_kv_cache::layer_filter_cb filter = nullptr;
 
                     if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) {
-                        reuse = [&](int32_t il) {
-                            if (il >= (int32_t) hparams.n_layer_kv_from_start) {
-                                return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
+                        reuse = [&](uint32_t il) {
+                            GGML_ASSERT(hparams.n_layer_kv_from_start >= 2);
+
+                            if (il >= (uint32_t)hparams.n_layer_kv_from_start) {
+                                return hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
                             }
 
                             return -1;
@@ -2108,16 +2109,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     }
 
                     if (mtp_on_hybrid_qwen35) {
-                        const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
-                        filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; };
+                        filter = [&](uint32_t il) { return il >= hparams.n_layer(); };
                     }
 
-                    if (arch == LLM_ARCH_STEP35 && hparams.nextn_predict_layers > 0) {
-                        const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
+                    if (arch == LLM_ARCH_STEP35 && hparams.n_layer_nextn > 0) {
                         if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP) {
-                            filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; };
+                            filter = [&](uint32_t il) { return il >= hparams.n_layer(); };
                         } else {
-                            filter = [n_main](int32_t il) { return (uint32_t)il <  n_main; };
+                            filter = [&](uint32_t il) { return il <  hparams.n_layer(); };
                         }
                     }
 
@@ -2242,7 +2241,7 @@ int32_t llama_model_n_embd_out(const llama_model * model) {
 }
 
 int32_t llama_model_n_layer(const llama_model * model) {
-    return model->hparams.n_layer;
+    return model->hparams.n_layer();
 }
 
 int32_t llama_model_n_head(const llama_model * model) {
diff --git a/src/llama-model.h b/src/llama-model.h
index a561374ed95..884cfdf5c3a 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -700,7 +700,8 @@ const char * llm_type_name(llm_type type);
 // convenience macro for loading local variables for load_tensors() in llama_model_base
 // note: cast to int64_t since we will use these for the tensor dimensions
 #define LLAMA_LOAD_LOCALS \
-    const int     n_layer        = hparams.n_layer;          GGML_UNUSED(n_layer); \
+    const int     n_layer        = hparams.n_layer();        GGML_UNUSED(n_layer); \
+    const int     n_layer_all    = hparams.n_layer_all;      GGML_UNUSED(n_layer_all); \
     const int64_t n_head         = hparams.n_head();         GGML_UNUSED(n_head); \
     const int64_t n_head_kv      = hparams.n_head_kv();      GGML_UNUSED(n_head_kv); \
     const int64_t n_embd         = hparams.n_embd;           GGML_UNUSED(n_embd); \
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 43e05c3d56f..cf92ce4bb8b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -847,7 +847,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<t
             qs.has_tied_embeddings = false;
         }
     }
-    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
+    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer();
 }
 
 //
@@ -1348,7 +1348,7 @@ llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * des
     model->hparams.n_embd             = desc->n_embd;
     model->hparams.n_embd_head_k_full = desc->n_embd_head_k;
     model->hparams.n_embd_head_v_full = desc->n_embd_head_v;
-    model->hparams.n_layer            = desc->n_layer;
+    model->hparams.n_layer_all        = desc->n_layer;
     model->hparams.n_expert           = desc->n_expert;
 
     for (uint32_t i = 0; i < desc->n_layer; i++) {
diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp
index a7c77ee5d28..063b214256e 100644
--- a/src/models/afmoe.cpp
+++ b/src/models/afmoe.cpp
@@ -30,7 +30,7 @@ void llama_model_afmoe::load_arch_hparams(llama_model_loader & ml) {
         hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 56: type = LLM_TYPE_6B; break;
         case 32: type = LLM_TYPE_26B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/apertus.cpp b/src/models/apertus.cpp
index bec7136521c..6dfb8905fbe 100644
--- a/src/models/apertus.cpp
+++ b/src/models/apertus.cpp
@@ -2,12 +2,13 @@
 
 void llama_model_apertus::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N,        hparams.xielu_alpha_n, hparams.n_layer);
-    ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P,        hparams.xielu_alpha_p, hparams.n_layer);
-    ml.get_key_or_arr(LLM_KV_XIELU_BETA,           hparams.xielu_beta,    hparams.n_layer);
-    ml.get_key_or_arr(LLM_KV_XIELU_EPS,            hparams.xielu_eps,     hparams.n_layer);
 
-    switch (hparams.n_layer) {
+    ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer());
+    ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer());
+    ml.get_key_or_arr(LLM_KV_XIELU_BETA,    hparams.xielu_beta,    hparams.n_layer());
+    ml.get_key_or_arr(LLM_KV_XIELU_EPS,     hparams.xielu_eps,     hparams.n_layer());
+
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_8B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/arcee.cpp b/src/models/arcee.cpp
index d086c4717ff..9536e7c5d42 100644
--- a/src/models/arcee.cpp
+++ b/src/models/arcee.cpp
@@ -4,7 +4,7 @@ void llama_model_arcee::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
     // Arcee uses the same structure as Llama
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 36: type = LLM_TYPE_4B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp
index 27deadffeb7..09ee0f752f0 100644
--- a/src/models/arctic.cpp
+++ b/src/models/arctic.cpp
@@ -4,7 +4,7 @@ void llama_model_arctic::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
     if (hparams.n_expert == 128) {
-        switch (hparams.n_layer) {
+        switch (hparams.n_layer()) {
             case 35: type = LLM_TYPE_10B_128x3_66B; break;
             default: type = LLM_TYPE_UNKNOWN;
         }
diff --git a/src/models/arwkv7.cpp b/src/models/arwkv7.cpp
index 9bd04127b25..b38b2064785 100644
--- a/src/models/arwkv7.cpp
+++ b/src/models/arwkv7.cpp
@@ -10,7 +10,7 @@ void llama_model_arwkv7::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK,               hparams.n_lora_gate, false);
     ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,                      hparams.token_shift_count, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 12:
             switch (hparams.n_embd) {
                 case 768: type = LLM_TYPE_190M; break;
diff --git a/src/models/baichuan.cpp b/src/models/baichuan.cpp
index 4d26081cd5d..585f3614174 100644
--- a/src/models/baichuan.cpp
+++ b/src/models/baichuan.cpp
@@ -2,7 +2,7 @@
 
 void llama_model_baichuan::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_7B; break;
         case 40: type = LLM_TYPE_13B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/bailingmoe.cpp b/src/models/bailingmoe.cpp
index fe1ae10864b..7faf73c835b 100644
--- a/src/models/bailingmoe.cpp
+++ b/src/models/bailingmoe.cpp
@@ -8,7 +8,7 @@ void llama_model_bailingmoe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
     ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 28: type = LLM_TYPE_16B; break;
         case 88: type = LLM_TYPE_290B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp
index 2f0d44a6259..5000e9c6db8 100644
--- a/src/models/bailingmoe2.cpp
+++ b/src/models/bailingmoe2.cpp
@@ -9,17 +9,13 @@ void llama_model_bailingmoe2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
     ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
     ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.n_layer_nextn, false);
 
-    // TODO: when MTP is implemented, this should probably be updated if needed
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 20: type = LLM_TYPE_16B_A1B; break;
-        case 21: type = LLM_TYPE_16B_A1B; break;
         case 32: type = LLM_TYPE_100B_A6B; break;
-        case 33: type = LLM_TYPE_100B_A6B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
 }
@@ -39,9 +35,9 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) {
     GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
     GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
 
-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
         int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
             // skip all tensors in the NextN layers
             flags |= TENSOR_SKIP;
         }
@@ -78,7 +74,7 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) {
         }
 
         // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
             layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
             layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
             layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
@@ -112,8 +108,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
 
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
-    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         // norm
@@ -146,7 +141,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
         }
 
-        if (il == n_transformer_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
diff --git a/src/models/bert.cpp b/src/models/bert.cpp
index 3c28f419ccf..53ce29f23ca 100644
--- a/src/models/bert.cpp
+++ b/src/models/bert.cpp
@@ -1,9 +1,9 @@
 #include "models.h"
 
 void llama_model_bert::load_arch_hparams(llama_model_loader & ml) {
-    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 3:
             type = LLM_TYPE_17M; break; // bge-micro
         case 6:
diff --git a/src/models/bitnet.cpp b/src/models/bitnet.cpp
index 7e8125deec4..c8330274580 100644
--- a/src/models/bitnet.cpp
+++ b/src/models/bitnet.cpp
@@ -3,7 +3,7 @@
 void llama_model_bitnet::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 26: type = LLM_TYPE_3B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/bloom.cpp b/src/models/bloom.cpp
index 30b0f3d07d0..609d2ddf998 100644
--- a/src/models/bloom.cpp
+++ b/src/models/bloom.cpp
@@ -3,7 +3,7 @@
 void llama_model_bloom::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_1B; break;
         case 30:
             switch (hparams.n_embd) {
diff --git a/src/models/chameleon.cpp b/src/models/chameleon.cpp
index 4bceaefd63b..4f45acecf84 100644
--- a/src/models/chameleon.cpp
+++ b/src/models/chameleon.cpp
@@ -6,7 +6,7 @@ void llama_model_chameleon::load_arch_hparams(llama_model_loader & ml) {
     hparams.f_norm_eps = 1e-5;  // eps for qk-norm, torch default
     ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_7B; break;
         case 48: type = LLM_TYPE_34B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/chatglm.cpp b/src/models/chatglm.cpp
index 6766fa71c15..7ae5b938fde 100644
--- a/src/models/chatglm.cpp
+++ b/src/models/chatglm.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_chatglm::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 28: {
             if (hparams.n_head(0) == 16) {
                 type = LLM_TYPE_1_5B;
diff --git a/src/models/codeshell.cpp b/src/models/codeshell.cpp
index 274dd3342a7..de53bb98184 100644
--- a/src/models/codeshell.cpp
+++ b/src/models/codeshell.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_codeshell::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 42: type = LLM_TYPE_7B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp
index 2e231bb3f93..750f57a394e 100644
--- a/src/models/cogvlm.cpp
+++ b/src/models/cogvlm.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_cogvlm::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_13B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/cohere2.cpp b/src/models/cohere2.cpp
index a514cf88fc6..61a5945a194 100644
--- a/src/models/cohere2.cpp
+++ b/src/models/cohere2.cpp
@@ -5,6 +5,7 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) {
     uint32_t swa_period = 4;
     ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
     hparams.set_swa_pattern(swa_period);
+
     hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
     hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
@@ -12,7 +13,8 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
     ml.get_key(LLM_KV_LOGIT_SCALE,              hparams.f_logit_scale);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_8B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/command-r.cpp b/src/models/command-r.cpp
index adf7fcaa20f..94a46188bb8 100644
--- a/src/models/command-r.cpp
+++ b/src/models/command-r.cpp
@@ -3,7 +3,8 @@
 void llama_model_command_r::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_LOGIT_SCALE,             hparams.f_logit_scale, false);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 40: type = LLM_TYPE_35B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/dbrx.cpp b/src/models/dbrx.cpp
index af71c775365..4f5ac4d06a4 100644
--- a/src/models/dbrx.cpp
+++ b/src/models/dbrx.cpp
@@ -1,14 +1,14 @@
 #include "models.h"
 
 void llama_model_dbrx::load_arch_hparams(llama_model_loader & ml) {
-ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);
 
-switch (hparams.n_layer) {
-    case 40: type = LLM_TYPE_16x12B; break;
-    default: type = LLM_TYPE_UNKNOWN;
+    switch (hparams.n_layer()) {
+        case 40: type = LLM_TYPE_16x12B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
 }
-        }
 
 void llama_model_dbrx::load_arch_tensors(llama_model_loader &) {
     LLAMA_LOAD_LOCALS;
diff --git a/src/models/deci.cpp b/src/models/deci.cpp
index 567e3535276..cdfcf29e02f 100644
--- a/src/models/deci.cpp
+++ b/src/models/deci.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_deci::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_7B; break;
         case 80: type = LLM_TYPE_70B; break;
         case 162: type = LLM_TYPE_405B; break;
diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp
index 1fe54adc13e..a9e8bc51403 100644
--- a/src/models/deepseek2.cpp
+++ b/src/models/deepseek2.cpp
@@ -5,7 +5,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
 
     // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
-    const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
+    const bool is_lite = (hparams.n_layer() == 27 || hparams.n_layer() == 26 || (hparams.n_layer() == 48 && n_vocab == 128256));
 
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
@@ -23,7 +23,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
     if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
         // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
         // that have no expert_gating_func model parameter set
-        if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
+        if ((hparams.n_layer() == 47 || hparams.n_layer() == 48) && n_vocab == 154880) {
             // GLM 4.7 Lite
             hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
         } else {
@@ -43,7 +43,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
 
     hparams.f_attn_temp_offset = 0.0f;
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 27: type = LLM_TYPE_16B; break;
         case 47: type = LLM_TYPE_30B_A3B; break;
         case 60: type = LLM_TYPE_236B; break;
@@ -191,8 +191,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p
 
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
-    int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < effective_n_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         // norm
@@ -366,7 +365,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p
                             Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             }
         }
-        if (il == effective_n_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
diff --git a/src/models/deepseek2ocr.cpp b/src/models/deepseek2ocr.cpp
index f9e4c98785c..65d31c31b93 100644
--- a/src/models/deepseek2ocr.cpp
+++ b/src/models/deepseek2ocr.cpp
@@ -14,7 +14,7 @@ void llama_model_deepseek2ocr::load_arch_hparams(llama_model_loader & ml) {
         hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 12: type = LLM_TYPE_3B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/deepseek32.cpp b/src/models/deepseek32.cpp
index c92ab60d166..9a20e2ce907 100644
--- a/src/models/deepseek32.cpp
+++ b/src/models/deepseek32.cpp
@@ -31,7 +31,7 @@ void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K,      hparams.indexer_top_k);
 
     // Expert gating function
-    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
 
     if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
         // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
@@ -40,13 +40,10 @@ void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) {
     }
 
     // NextN/MTP parameters
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
 
-    // TODO: when MTP is implemented, this should probably be updated if needed
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
-
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 62: type = LLM_TYPE_685B_A37B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
@@ -82,9 +79,9 @@ void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) {
         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
     }
 
-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
         int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
             // skip all tensors in the NextN layers
             // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
             flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
@@ -142,7 +139,7 @@ void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) {
         }
 
         // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
             layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
             layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
             layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
@@ -205,8 +202,7 @@ llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_
 
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
-    int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < effective_n_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         // norm
@@ -427,7 +423,7 @@ llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_
                         Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, top_k, kq_scale, il);
             }
         }
-        if (il == effective_n_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
diff --git a/src/models/dots1.cpp b/src/models/dots1.cpp
index 435d27281c6..07d6ab1b7cd 100644
--- a/src/models/dots1.cpp
+++ b/src/models/dots1.cpp
@@ -8,7 +8,8 @@ void llama_model_dots1::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
     ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
     ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 62: type = LLM_TYPE_142B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/dream.cpp b/src/models/dream.cpp
index 12ac6f1ce88..abe737c335a 100644
--- a/src/models/dream.cpp
+++ b/src/models/dream.cpp
@@ -2,8 +2,9 @@
 
 void llama_model_dream::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
     // Dream models are primarily 7B with 28 layers
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 28:
             type = LLM_TYPE_7B;
             break;
diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp
index 9b39c605e35..895cf690bd2 100644
--- a/src/models/ernie4-5.cpp
+++ b/src/models/ernie4-5.cpp
@@ -12,7 +12,7 @@ void llama_model_ernie4_5::load_arch_hparams(llama_model_loader & ml) {
         ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 18: type = LLM_TYPE_0_3B; break;
         case 28: type = LLM_TYPE_21B_A3B; break;
         case 54: type = LLM_TYPE_300B_A47B; break;
diff --git a/src/models/eurobert.cpp b/src/models/eurobert.cpp
index ddf13c3028f..0948d7de656 100644
--- a/src/models/eurobert.cpp
+++ b/src/models/eurobert.cpp
@@ -3,7 +3,7 @@
 void llama_model_eurobert::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    if (hparams.n_layer == 12) {
+    if (hparams.n_layer() == 12) {
         type = LLM_TYPE_SMALL;  // 0.2B
     }
 }
diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp
index 76d91982fc5..bccf169f8c0 100644
--- a/src/models/exaone-moe.cpp
+++ b/src/models/exaone-moe.cpp
@@ -20,13 +20,12 @@ void llama_model_exaone_moe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
     ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
 
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_30B_A3B; break;
-        case 48:
-        case 49: type = LLM_TYPE_235B_A22B; break;
+        case 48: type = LLM_TYPE_235B_A22B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
 }
@@ -50,9 +49,9 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) {
         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
     }
 
-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
         int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
             // skip all tensors in the NextN layers
             flags |= TENSOR_SKIP;
         }
@@ -70,7 +69,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) {
         layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,    "weight", i), {n_embd}, flags);
 
         // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
-        if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
+        if (i < (int) hparams.n_layer_dense_lead || (i >= n_layer)) {
             layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
             layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
             layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, flags);
@@ -95,7 +94,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) {
         }
 
         // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
             layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
             layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), {n_embd}, flags);
             layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,   "weight", i), {n_embd}, flags);
@@ -130,8 +129,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_
 
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
-    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         // use RoPE for SWA layers
@@ -170,7 +168,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_
                 Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
             cb(cur, "attn_out", il);
         }
-        if (il == n_transformer_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
diff --git a/src/models/exaone.cpp b/src/models/exaone.cpp
index c7e9960d718..676fb37b5a6 100644
--- a/src/models/exaone.cpp
+++ b/src/models/exaone.cpp
@@ -3,7 +3,7 @@
 void llama_model_exaone::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_8B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/exaone4.cpp b/src/models/exaone4.cpp
index b5030eb0545..863268abcef 100644
--- a/src/models/exaone4.cpp
+++ b/src/models/exaone4.cpp
@@ -1,7 +1,7 @@
 #include "models.h"
 
 void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) {
-    if (hparams.n_layer == 64) {    // 32B
+    if (hparams.n_layer() == 64) {    // 32B
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.n_swa = 4096;
         uint32_t swa_period = 4;
@@ -15,11 +15,11 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) {
 
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.n_layer_nextn, false);
 
-    switch (hparams.n_layer) {
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
+
+    switch (hparams.n_layer()) {
         case 30: type = LLM_TYPE_1_2B; break;
         case 64: type = LLM_TYPE_32B; break;
         default: type = LLM_TYPE_UNKNOWN;
@@ -40,8 +40,8 @@ void llama_model_exaone4::load_arch_tensors(llama_model_loader &) {
         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
     }
 
-    for (int i = 0; i < n_layer; ++i) {
-        const bool is_nextn = hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers;
+    for (int i = 0; i < n_layer_all; ++i) {
+        const bool is_nextn = i >= n_layer;
         int flags = 0;
         if (is_nextn) {
             // NextN/MTP layers are preserved in GGUF but are not executed yet.
@@ -109,11 +109,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
     }
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
-    // MTP / NextN tail blocks are loaded for compatibility but not executed (same as exaone-moe).
-    const int n_layer_main = int(n_layer) - int(hparams.nextn_predict_layers);
-    GGML_ASSERT(n_layer_main > 0);
-
-    for (int il = 0; il < n_layer_main; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         // use RoPE for SWA layers or non-SWA models
@@ -149,7 +145,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
             cb(cur, "attn_out", il);
         }
-        if (il == n_layer_main - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp
index c130ccdd49e..d6ef2d51986 100644
--- a/src/models/falcon-h1.cpp
+++ b/src/models/falcon-h1.cpp
@@ -13,7 +13,7 @@ void llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) {
 
     std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), true);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 36:
             type = LLM_TYPE_0_5B; break;
         case 24:
diff --git a/src/models/falcon.cpp b/src/models/falcon.cpp
index ad546ef2db5..b2ad90b3272 100644
--- a/src/models/falcon.cpp
+++ b/src/models/falcon.cpp
@@ -3,7 +3,7 @@
 void llama_model_falcon::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_7B; break;
         case 60: type = LLM_TYPE_40B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/gemma-embedding.cpp b/src/models/gemma-embedding.cpp
index 4e07f5f2bda..80ed3b1a460 100644
--- a/src/models/gemma-embedding.cpp
+++ b/src/models/gemma-embedding.cpp
@@ -21,7 +21,7 @@ void llama_model_gemma_embedding::load_arch_hparams(llama_model_loader & ml) {
     GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
     GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_0_3B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/gemma.cpp b/src/models/gemma.cpp
index 1519682fdf6..651cd7e64de 100644
--- a/src/models/gemma.cpp
+++ b/src/models/gemma.cpp
@@ -3,7 +3,7 @@
 void llama_model_gemma::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 18: type = LLM_TYPE_2B; break;
         case 28: type = LLM_TYPE_7B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/gemma2.cpp b/src/models/gemma2.cpp
index ae3f9ffb530..2fbfb15a94a 100644
--- a/src/models/gemma2.cpp
+++ b/src/models/gemma2.cpp
@@ -16,7 +16,7 @@ void llama_model_gemma2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,      hparams.f_attn_logit_softcapping, false);
     ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 26: type = LLM_TYPE_2B; break;
         case 42: type = LLM_TYPE_9B; break;
         case 46: type = LLM_TYPE_27B; break;
diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp
index 63a2b380e71..690194529e3 100644
--- a/src/models/gemma3.cpp
+++ b/src/models/gemma3.cpp
@@ -17,7 +17,7 @@ void llama_model_gemma3::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 18: type = LLM_TYPE_270M; break;
         case 26: type = LLM_TYPE_1B; break;
         case 32: type = LLM_TYPE_8B; break; // Rnj-1
diff --git a/src/models/gemma3n.cpp b/src/models/gemma3n.cpp
index 6ec3a006081..83eb8250aa9 100644
--- a/src/models/gemma3n.cpp
+++ b/src/models/gemma3n.cpp
@@ -6,14 +6,14 @@ void llama_model_gemma3n::load_arch_hparams(llama_model_loader & ml) {
     hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
     hparams.set_swa_pattern(swa_period);
 
-    hparams.n_layer_kv_from_start     = 20;
-    hparams.f_attention_scale         = 1.0f;
+    hparams.n_layer_kv_from_start = 20;
+    hparams.f_attention_scale     = 1.0f;
 
     ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 30: type = LLM_TYPE_E2B; break;
         case 35: type = LLM_TYPE_E4B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp
index 31906de33d9..7198e541116 100644
--- a/src/models/gemma4.cpp
+++ b/src/models/gemma4.cpp
@@ -2,12 +2,12 @@
 
 void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) {
     hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer());
 
     uint32_t n_kv_shared_layers = 0;
     ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false);
 
-    hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers;
+    hparams.n_layer_kv_from_start = hparams.n_layer_all - (int32_t)n_kv_shared_layers;
     hparams.f_attention_scale     = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling)
 
     ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
@@ -19,7 +19,7 @@ void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA,  hparams.n_embd_head_v_swa);
     ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 30: type = LLM_TYPE_26B_A4B; break;
         case 35: type = LLM_TYPE_E2B; break;
         case 42: type = LLM_TYPE_E4B; break;
diff --git a/src/models/glm-dsa.cpp b/src/models/glm-dsa.cpp
index af2b55ef563..11d91312def 100644
--- a/src/models/glm-dsa.cpp
+++ b/src/models/glm-dsa.cpp
@@ -33,13 +33,10 @@ void llama_model_glm_dsa::load_arch_hparams(llama_model_loader & ml) {
     }
 
     // NextN/MTP parameters
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
 
-    // TODO: when MTP is implemented, this should probably be updated if needed
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
-
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 79: type = LLM_TYPE_744B_A40B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
@@ -76,9 +73,9 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
     }
 
-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
         int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
             // skip all tensors in the NextN layers
             // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
             flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
@@ -135,8 +132,8 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
             layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
         }
 
-        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        // NextN/MTP tensors (preserved but unused) - conditionally load for last n_layer_nextn
+        if (i >= n_layer) {
             layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
             layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
             layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp
index 27654b8cba3..3105c56b530 100644
--- a/src/models/glm4-moe.cpp
+++ b/src/models/glm4-moe.cpp
@@ -20,16 +20,13 @@ void llama_model_glm4_moe::load_arch_hparams(llama_model_loader & ml) {
     }
 
     // NextN/MTP parameters
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
 
-    // TODO: when MTP is implemented, this should probably be updated if needed
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
-
-    switch (hparams.n_layer) {
-        case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
-        case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
-        case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
+    switch (hparams.n_layer()) {
+        case 46: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air
+        case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
+        case 92: type = LLM_TYPE_355B_A32B; break; // GLM-4.5
         default: type = LLM_TYPE_UNKNOWN;
     }
 }
@@ -54,9 +51,9 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) {
 
     // Load ALL tensors including NextN layer to satisfy total tensor count
     // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
         int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
             // skip all tensors in the NextN layers
             flags |= TENSOR_SKIP;
         }
@@ -116,7 +113,7 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) {
         }
 
-        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+        // NextN/MTP tensors (preserved but unused) - conditionally load for last n_layer_nextn
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
             layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
             layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
             layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
@@ -161,8 +158,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa
 
     // Only process up to last layer (skip final NextN layer)
     // Final layer tensors are loaded but not processed in forward pass
-    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         // Pre-attention norm
@@ -211,7 +207,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa
                     model.layers[il].wo, NULL, model.layers[il].wo_s,
                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
         }
-        if (il == n_transformer_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp
index 7c242fed298..b4326c5f210 100644
--- a/src/models/glm4.cpp
+++ b/src/models/glm4.cpp
@@ -5,13 +5,10 @@ void llama_model_glm4::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
 
     // NextN/MTP parameters (GLM-OCR)
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
 
-    // TODO: when MTP is implemented, this should probably be updated if needed
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
-
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 17: type = LLM_TYPE_1B; break; // GLM-OCR
         case 40: type = LLM_TYPE_9B; break;
         case 61: type = LLM_TYPE_32B; break;
@@ -32,9 +29,9 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) {
         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
     }
 
-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
         int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
             // skip all tensors in the NextN layers
             flags |= TENSOR_SKIP;
         }
@@ -55,7 +52,7 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) {
         layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
 
-        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+        // NextN/MTP tensors (preserved but unused) - conditionally load for last n_layer_nextn
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
             layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
             layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
             layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
@@ -100,8 +97,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params
 
     // Only process up to last layer (skip final NextN layer)
     // Final layer tensors are loaded but not processed in forward pass
-    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         // Pre-attention norm
@@ -140,7 +136,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params
                     model.layers[il].wo, NULL, model.layers[il].wo_s,
                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
         }
-        if (il == n_transformer_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
diff --git a/src/models/gpt2.cpp b/src/models/gpt2.cpp
index e2dcc8b1521..45afbccc121 100644
--- a/src/models/gpt2.cpp
+++ b/src/models/gpt2.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_gpt2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 12: type = LLM_TYPE_SMALL; break;
         case 24: type = LLM_TYPE_MEDIUM; break;
         case 36: type = LLM_TYPE_LARGE; break;
diff --git a/src/models/gptneox.cpp b/src/models/gptneox.cpp
index 443e35addf2..ed5e8c50da2 100644
--- a/src/models/gptneox.cpp
+++ b/src/models/gptneox.cpp
@@ -3,7 +3,8 @@
 void llama_model_gptneox::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
     ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL,   hparams.use_par_res);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 6:
             switch (hparams.n_ff()) {
                 case 512:  type = LLM_TYPE_14M; break;
diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp
index 8740d9fc7d9..eb23095aece 100644
--- a/src/models/granite-hybrid.cpp
+++ b/src/models/granite-hybrid.cpp
@@ -19,7 +19,7 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) {
     hparams.rope_finetuned = rope_finetuned;
 
     // A layer is recurrent IFF the n_head_kv value is set to 0
-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+    for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
         hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
     }
 
diff --git a/src/models/granite-moe.cpp b/src/models/granite-moe.cpp
index 0d89bc1f340..115263c418f 100644
--- a/src/models/granite-moe.cpp
+++ b/src/models/granite-moe.cpp
@@ -12,7 +12,7 @@ void llama_model_granite_moe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
     hparams.rope_finetuned = rope_finetuned;
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_3B; break;
         case 40: type = LLM_TYPE_3B; break;
         // Add additional layer/vocab/etc checks here for other model sizes
diff --git a/src/models/granite.cpp b/src/models/granite.cpp
index cda4aa231fa..7aff942da01 100644
--- a/src/models/granite.cpp
+++ b/src/models/granite.cpp
@@ -12,7 +12,7 @@ void llama_model_granite::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
     hparams.rope_finetuned = rope_finetuned;
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_3B; break;
         case 40: type = LLM_TYPE_3B; break;
         // Add additional layer/vocab/etc checks here for other model sizes
diff --git a/src/models/grok.cpp b/src/models/grok.cpp
index 7c46ec1c0f2..42f38af6724 100644
--- a/src/models/grok.cpp
+++ b/src/models/grok.cpp
@@ -26,7 +26,7 @@ void llama_model_grok::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
     ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 64: type = LLM_TYPE_314B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/grovemoe.cpp b/src/models/grovemoe.cpp
index 1cab75adc7f..643a448e59a 100644
--- a/src/models/grovemoe.cpp
+++ b/src/models/grovemoe.cpp
@@ -7,7 +7,7 @@ void llama_model_grovemoe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERTS_PER_GROUP,                 hparams.n_group_experts);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 48: type = LLM_TYPE_30B_A3B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/hunyuan-moe.cpp b/src/models/hunyuan-moe.cpp
index deb3c9671f3..4d55f5e7f31 100644
--- a/src/models/hunyuan-moe.cpp
+++ b/src/models/hunyuan-moe.cpp
@@ -5,7 +5,7 @@ void llama_model_hunyuan_moe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
     ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_A13B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/internlm2.cpp b/src/models/internlm2.cpp
index f9ee37a24b6..f6cfdfb9458 100644
--- a/src/models/internlm2.cpp
+++ b/src/models/internlm2.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_internlm2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_7B; break;
         case 48: type = LLM_TYPE_20B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/jais.cpp b/src/models/jais.cpp
index 2ba162605f1..415103ce23a 100644
--- a/src/models/jais.cpp
+++ b/src/models/jais.cpp
@@ -4,7 +4,7 @@ void llama_model_jais::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
     ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_1_3B; break;
         case 40: type = LLM_TYPE_13B; break;
         /* TODO: add variants */
diff --git a/src/models/jais2.cpp b/src/models/jais2.cpp
index 8966131441c..8610fcc9f82 100644
--- a/src/models/jais2.cpp
+++ b/src/models/jais2.cpp
@@ -3,7 +3,7 @@
 void llama_model_jais2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_8B; break;
         case 68: type = LLM_TYPE_70B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp
index a62b121b3ee..dba160b014f 100644
--- a/src/models/jamba.cpp
+++ b/src/models/jamba.cpp
@@ -8,11 +8,11 @@ void llama_model_jamba::load_arch_hparams(llama_model_loader & ml) {
 
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+    for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
         hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
         case 12: // 900M  8x???M
         case 32: // 51B  16x?B
diff --git a/src/models/jina-bert-v2.cpp b/src/models/jina-bert-v2.cpp
index 4f8866ece4d..86ff1c84d1a 100644
--- a/src/models/jina-bert-v2.cpp
+++ b/src/models/jina-bert-v2.cpp
@@ -4,7 +4,7 @@ void llama_model_jina_bert_v2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
     hparams.f_max_alibi_bias = 8.0f;
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 4:  type = LLM_TYPE_33M;  break; // jina-embeddings-small
         case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/jina-bert-v3.cpp b/src/models/jina-bert-v3.cpp
index e0527529f56..1c974a6f16c 100644
--- a/src/models/jina-bert-v3.cpp
+++ b/src/models/jina-bert-v3.cpp
@@ -3,7 +3,7 @@
 void llama_model_jina_bert_v3::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24:
             type = LLM_TYPE_558M; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp
index c13f71b5bcb..367f6990d1f 100644
--- a/src/models/kimi-linear.cpp
+++ b/src/models/kimi-linear.cpp
@@ -14,7 +14,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) {
 
     // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
     // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+    for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
         hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;  // KDA layers are recurrent
     }
 
@@ -25,7 +25,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
     ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp
index 3898b56bb12..97da8a6abb8 100644
--- a/src/models/lfm2.cpp
+++ b/src/models/lfm2.cpp
@@ -5,10 +5,13 @@
 void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+
+    for (uint32_t il = 0; il < hparams.n_layer(); ++il) {
         hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0;
     }
-    hparams.n_layer_dense_lead = hparams.n_layer;
+
+    hparams.n_layer_dense_lead = hparams.n_layer();
+
     switch (hparams.n_ff()) {
         case  4608: type = LLM_TYPE_350M; break;
         case  6912: type = LLM_TYPE_700M; break;
@@ -16,9 +19,10 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
         case 10752: type = LLM_TYPE_2_6B; break;
         default:    type = LLM_TYPE_UNKNOWN;
     }
+
     if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-        for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+        for (uint32_t il = 0; il < hparams.n_layer(); ++il) {
             hparams.is_swa_impl[il] = !hparams.is_recr_impl[il];
         }
     }
diff --git a/src/models/lfm2moe.cpp b/src/models/lfm2moe.cpp
index 81ced2eaba2..490f5c223eb 100644
--- a/src/models/lfm2moe.cpp
+++ b/src/models/lfm2moe.cpp
@@ -9,11 +9,11 @@ void llama_model_lfm2moe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
     ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);
 
-    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+    for (uint32_t il = 0; il < hparams.n_layer(); ++il) {
         hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0;
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_8B_A1B;  break;
         case 40: type = LLM_TYPE_24B_A2B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/llada-moe.cpp b/src/models/llada-moe.cpp
index 9722dde9f17..2ae89386447 100644
--- a/src/models/llada-moe.cpp
+++ b/src/models/llada-moe.cpp
@@ -2,11 +2,12 @@
 
 void llama_model_llada_moe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
-
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
     // diffusion language model uses non-causal attention
     hparams.causal_attn = false;
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 16: type = LLM_TYPE_A1_7B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/llada.cpp b/src/models/llada.cpp
index 58b2c466e17..87d4259f9a7 100644
--- a/src/models/llada.cpp
+++ b/src/models/llada.cpp
@@ -2,14 +2,16 @@
 
 void llama_model_llada::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
     // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32:
             type = LLM_TYPE_8B;
             break;
         default:
             type = LLM_TYPE_UNKNOWN;
     }
+
     // Set non-causal attention for diffusion models
     hparams.causal_attn = false;
 }
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index cef66d054b0..c0ec7e0a9ad 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -7,13 +7,13 @@ void llama_model_llama::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
     if (hparams.n_expert == 8) {
-        switch (hparams.n_layer) {
+        switch (hparams.n_layer()) {
             case 32: type = LLM_TYPE_8x7B; break;
             case 56: type = LLM_TYPE_8x22B; break;
             default: type = LLM_TYPE_UNKNOWN;
         }
     } else {
-        switch (hparams.n_layer) {
+        switch (hparams.n_layer()) {
             case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
             case 22: type = LLM_TYPE_1B; break;
             case 26: type = LLM_TYPE_3B; break;
diff --git a/src/models/llama4.cpp b/src/models/llama4.cpp
index 8f39b3f59a5..7194c72a585 100644
--- a/src/models/llama4.cpp
+++ b/src/models/llama4.cpp
@@ -8,7 +8,7 @@ void llama_model_llama4::load_arch_hparams(llama_model_loader & ml) {
     const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
     if (found_swa && hparams.n_swa == 0) {
         hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
-        hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+        hparams.n_no_rope_layer_step = hparams.n_layer(); // always use rope
     } else {
         hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
         hparams.n_swa                   = 8192;
diff --git a/src/models/maincoder.cpp b/src/models/maincoder.cpp
index 84cfe399027..ae56a26a1f6 100644
--- a/src/models/maincoder.cpp
+++ b/src/models/maincoder.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_maincoder::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_1B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/mamba.cpp b/src/models/mamba.cpp
index 887a1fa509a..0d94e98281c 100644
--- a/src/models/mamba.cpp
+++ b/src/models/mamba.cpp
@@ -9,7 +9,7 @@ void llama_model_mamba::load_arch_hparams(llama_model_loader & ml) {
 
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24:
             switch (hparams.n_embd) {
                 case 768: type = LLM_TYPE_SMALL; break;
diff --git a/src/models/mamba2.cpp b/src/models/mamba2.cpp
index 3277ca53ec4..c5951cf0f7f 100644
--- a/src/models/mamba2.cpp
+++ b/src/models/mamba2.cpp
@@ -9,7 +9,7 @@ void llama_model_mamba2::load_arch_hparams(llama_model_loader & ml) {
 
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24:
             switch (hparams.n_embd) {
                 case 768: type = LLM_TYPE_SMALL; break;
diff --git a/src/models/mellum.cpp b/src/models/mellum.cpp
index 1e1e97e9fa0..28823018bc0 100644
--- a/src/models/mellum.cpp
+++ b/src/models/mellum.cpp
@@ -13,7 +13,7 @@ void llama_model_mellum::load_arch_hparams(llama_model_loader & ml) {
         if (res) {
             hparams.set_swa_pattern(swa_period);
         } else {
-            ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
+            ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer());
         }
 
         hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
@@ -24,7 +24,7 @@ void llama_model_mellum::load_arch_hparams(llama_model_loader & ml) {
         hparams.swa_type = LLAMA_SWA_TYPE_NONE;
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 28: type = LLM_TYPE_12B_A2_5B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/mimo2.cpp b/src/models/mimo2.cpp
index 1bcdf696f2e..88989160570 100644
--- a/src/models/mimo2.cpp
+++ b/src/models/mimo2.cpp
@@ -9,18 +9,17 @@ void llama_model_mimo2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,   hparams.n_swa);
     ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,         hparams.rope_freq_base_train_swa, false);
 
-    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer());
 
     float value_scale = 0.0f;
     if (ml.get_key(LLM_KV_ATTENTION_VALUE_SCALE, value_scale, false) && value_scale != 1.0f) {
         hparams.f_attn_value_scale = value_scale;
     }
 
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
 
-    switch (hparams.n_layer - hparams.nextn_predict_layers) {
+    switch (hparams.n_layer()) {
         case 48: type = LLM_TYPE_310B_A15B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
@@ -35,16 +34,14 @@ void llama_model_mimo2::load_arch_tensors(llama_model_loader &) {
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
     output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
-    const uint32_t n_nextn = hparams.nextn_predict_layers;
-
-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
         auto & layer = layers[i];
         uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
         uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
         uint32_t n_head = hparams.n_head(i);
 
         // NextN/MTP layers (the last n_nextn blocks) are preserved but disabled pending support
-        const bool is_nextn = (n_nextn > 0) && (static_cast<uint32_t>(i) >= n_layer - n_nextn);
+        const bool is_nextn = i >= n_layer;
         const int  skip     = is_nextn ? TENSOR_SKIP : 0;
 
         create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, skip);
@@ -93,10 +90,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param
 
     const float v_scale = hparams.f_attn_value_scale;
 
-    // The last hparams.nextn_predict_layers blocks are MTP heads, currently inactive
-    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
-
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         uint32_t n_head_l    = hparams.n_head(il);
@@ -174,7 +168,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param
             }
         }
 
-        if (il == n_transformer_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
             cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
diff --git a/src/models/minicpm.cpp b/src/models/minicpm.cpp
index 966d3af615c..fc3e5b171d5 100644
--- a/src/models/minicpm.cpp
+++ b/src/models/minicpm.cpp
@@ -3,7 +3,7 @@
 void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) {
     // Backward-compatible defaults for older MiniCPM GGUFs
     hparams.f_embedding_scale = 12.0f;
-    hparams.f_residual_scale  = 1.4f / sqrtf(float(hparams.n_layer));
+    hparams.f_residual_scale  = 1.4f / sqrtf(float(hparams.n_layer()));
     hparams.f_logit_scale     = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
 
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -16,7 +16,7 @@ void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) {
     // MiniCPM uses rope by default, unlike Granite which uses it as a switch
     hparams.rope_finetuned = true;
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 52: type = LLM_TYPE_1B; break;
         case 40: type = LLM_TYPE_2B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/minicpm3.cpp b/src/models/minicpm3.cpp
index 1ffc54fa7c6..e011b1ff0a8 100644
--- a/src/models/minicpm3.cpp
+++ b/src/models/minicpm3.cpp
@@ -5,7 +5,7 @@ void llama_model_minicpm3::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,       hparams.n_lora_q);
     ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 62: type = LLM_TYPE_4B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/minimax-m2.cpp b/src/models/minimax-m2.cpp
index 22e291d73a3..b25435e4d97 100644
--- a/src/models/minimax-m2.cpp
+++ b/src/models/minimax-m2.cpp
@@ -5,7 +5,7 @@ void llama_model_minimax_m2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp);
     ml.get_key(LLM_KV_EXPERT_GATING_FUNC,           hparams.expert_gating_func, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 62: type = LLM_TYPE_230B_A10B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp
index 1ac5a95ccdc..9a8e3f9a50b 100644
--- a/src/models/mistral3.cpp
+++ b/src/models/mistral3.cpp
@@ -18,7 +18,7 @@ void llama_model_mistral3::load_arch_hparams(llama_model_loader & ml) {
         }
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 26: type = LLM_TYPE_3B; break;
         case 34: type = LLM_TYPE_8B; break;
         case 40: type = LLM_TYPE_14B; break;
diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp
index 5ab51867cc0..f3e9407e012 100644
--- a/src/models/modern-bert.cpp
+++ b/src/models/modern-bert.cpp
@@ -22,7 +22,7 @@ void llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) {
         hparams.llm_ffn_op = llm_ffn_op_type_from_string(hidden_act, LLM_FFN_GEGLU);
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 12:
             type = LLM_TYPE_47M; break; // granite-embedding-small
         case 22:
diff --git a/src/models/mpt.cpp b/src/models/mpt.cpp
index 0229d20ed36..d094fd9f80b 100644
--- a/src/models/mpt.cpp
+++ b/src/models/mpt.cpp
@@ -5,7 +5,7 @@ void llama_model_mpt::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
     ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_7B; break;
         case 48: type = LLM_TYPE_30B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp
index d2c811d2497..a456269347b 100644
--- a/src/models/nemotron-h.cpp
+++ b/src/models/nemotron-h.cpp
@@ -9,7 +9,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) {
 
     // A layer is recurrent IFF the n_head_kv value is set to 0 and
     // the n_ff value is set to 0
-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+    for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
         hparams.is_recr_impl[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
     }
 
@@ -22,7 +22,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
     ml.get_key(LLM_KV_MOE_LATENT_SIZE,                   hparams.moe_latent_size, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
         case 56: type = LLM_TYPE_9B; break;
         case 88: type = LLM_TYPE_120B_A12B; break;
diff --git a/src/models/nemotron.cpp b/src/models/nemotron.cpp
index 5d4a3b5c69e..6e2bd9a33ca 100644
--- a/src/models/nemotron.cpp
+++ b/src/models/nemotron.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_nemotron::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_4B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/neo-bert.cpp b/src/models/neo-bert.cpp
index f00d6eddfc9..4a08d7abd40 100644
--- a/src/models/neo-bert.cpp
+++ b/src/models/neo-bert.cpp
@@ -3,7 +3,7 @@
 void llama_model_neo_bert::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    if (hparams.n_layer == 28) {
+    if (hparams.n_layer() == 28) {
         type = LLM_TYPE_250M;
     }
 }
diff --git a/src/models/nomic-bert-moe.cpp b/src/models/nomic-bert-moe.cpp
index a17abe2c269..da4b62919bb 100644
--- a/src/models/nomic-bert-moe.cpp
+++ b/src/models/nomic-bert-moe.cpp
@@ -4,7 +4,7 @@ void llama_model_nomic_bert_moe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
     ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);
 
-    if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+    if (hparams.n_layer() == 12 && hparams.n_embd == 768) {
         if (arch == LLM_ARCH_NOMIC_BERT) {
             type = LLM_TYPE_137M;
         } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
diff --git a/src/models/nomic-bert.cpp b/src/models/nomic-bert.cpp
index 5a8a5584457..e7fc72286a6 100644
--- a/src/models/nomic-bert.cpp
+++ b/src/models/nomic-bert.cpp
@@ -4,7 +4,7 @@ void llama_model_nomic_bert::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
     ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);
 
-    if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+    if (hparams.n_layer() == 12 && hparams.n_embd == 768) {
         if (arch == LLM_ARCH_NOMIC_BERT) {
             type = LLM_TYPE_137M;
         } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
diff --git a/src/models/olmo.cpp b/src/models/olmo.cpp
index cfcf17bcb03..9f7a2ba60ef 100644
--- a/src/models/olmo.cpp
+++ b/src/models/olmo.cpp
@@ -4,7 +4,7 @@ void llama_model_olmo::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
     ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 22: type = LLM_TYPE_1B; break;
         case 32: type = LLM_TYPE_7B; break;
         case 80: type = LLM_TYPE_70B; break;
diff --git a/src/models/olmo2.cpp b/src/models/olmo2.cpp
index 7cc262f5504..cb52cdef720 100644
--- a/src/models/olmo2.cpp
+++ b/src/models/olmo2.cpp
@@ -17,7 +17,7 @@ void llama_model_olmo2::load_arch_hparams(llama_model_loader & ml) {
         hparams.swa_type = LLAMA_SWA_TYPE_NONE;
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 16: type = LLM_TYPE_1B; break;
         case 32: type = LLM_TYPE_7B; break;
         case 40: type = LLM_TYPE_13B; break;
diff --git a/src/models/olmoe.cpp b/src/models/olmoe.cpp
index 7976ae44a51..1e2baeb207f 100644
--- a/src/models/olmoe.cpp
+++ b/src/models/olmoe.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_olmoe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 16: type = LLM_TYPE_A1_7B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/openai-moe.cpp b/src/models/openai-moe.cpp
index 15b6c8c1205..3ab15d61f08 100644
--- a/src/models/openai-moe.cpp
+++ b/src/models/openai-moe.cpp
@@ -14,7 +14,7 @@ void llama_model_openai_moe::load_arch_hparams(llama_model_loader & ml) {
     hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
     ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_20B; break;
         case 36: type = LLM_TYPE_120B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/openelm.cpp b/src/models/openelm.cpp
index 9f76350fd4d..13120bd3236 100644
--- a/src/models/openelm.cpp
+++ b/src/models/openelm.cpp
@@ -3,12 +3,12 @@
 void llama_model_openelm::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
-    case 16: type = LLM_TYPE_270M; break;
-    case 20: type = LLM_TYPE_450M; break;
-    case 28: type = LLM_TYPE_1B; break;
-    case 36: type = LLM_TYPE_3B; break;
-    default: type = LLM_TYPE_UNKNOWN;
+    switch (hparams.n_layer()) {
+        case 16: type = LLM_TYPE_270M; break;
+        case 20: type = LLM_TYPE_450M; break;
+        case 28: type = LLM_TYPE_1B; break;
+        case 36: type = LLM_TYPE_3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
     }
 }
 
diff --git a/src/models/orion.cpp b/src/models/orion.cpp
index bcb4bbba4b1..863a2822269 100644
--- a/src/models/orion.cpp
+++ b/src/models/orion.cpp
@@ -3,7 +3,7 @@
 void llama_model_orion::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 40: type = LLM_TYPE_14B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/pangu-embed.cpp b/src/models/pangu-embed.cpp
index 7593f879b24..90f05c088c1 100644
--- a/src/models/pangu-embed.cpp
+++ b/src/models/pangu-embed.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_pangu_embed::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
         case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/phi2.cpp b/src/models/phi2.cpp
index 8f3ed5f7b7d..81b1ad12cc0 100644
--- a/src/models/phi2.cpp
+++ b/src/models/phi2.cpp
@@ -3,7 +3,7 @@
 void llama_model_phi2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_1B; break;
         case 32: type = LLM_TYPE_3B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/phi3.cpp b/src/models/phi3.cpp
index f8a4a4d5aa5..716ff814cc1 100644
--- a/src/models/phi3.cpp
+++ b/src/models/phi3.cpp
@@ -3,7 +3,7 @@
 void llama_model_phi3::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_1B; break;
         case 32: type = LLM_TYPE_3B; break;
         case 40: type = LLM_TYPE_14B; break;
diff --git a/src/models/phimoe.cpp b/src/models/phimoe.cpp
index 4575d6139cf..c332553bc7d 100644
--- a/src/models/phimoe.cpp
+++ b/src/models/phimoe.cpp
@@ -3,7 +3,7 @@
 void llama_model_phimoe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_16x3_8B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/plamo.cpp b/src/models/plamo.cpp
index c7ed1211c31..246144519e4 100644
--- a/src/models/plamo.cpp
+++ b/src/models/plamo.cpp
@@ -3,7 +3,7 @@
 void llama_model_plamo::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 40: type = LLM_TYPE_13B; break;
         default: type = LLM_TYPE_UNKNOWN;
    }
diff --git a/src/models/plamo2.cpp b/src/models/plamo2.cpp
index 2ffa0898f71..b93cf48bc5c 100644
--- a/src/models/plamo2.cpp
+++ b/src/models/plamo2.cpp
@@ -11,11 +11,11 @@ void llama_model_plamo2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
     ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
 
-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+    for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
         hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 16: type = LLM_TYPE_1B; break;
         case 32:
             if (hparams.n_embd == 2048) {
diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp
index 29f3e803d68..16d0b1dcef7 100644
--- a/src/models/plamo3.cpp
+++ b/src/models/plamo3.cpp
@@ -13,7 +13,7 @@ void llama_model_plamo3::load_arch_hparams(llama_model_loader & ml) {
         hparams.swa_type = LLAMA_SWA_TYPE_NONE;
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_2B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/plm.cpp b/src/models/plm.cpp
index ce050919e6a..8ca325f5e2c 100644
--- a/src/models/plm.cpp
+++ b/src/models/plm.cpp
@@ -3,7 +3,8 @@
 void llama_model_plm::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_1_8B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/qwen.cpp b/src/models/qwen.cpp
index 00467dbad7d..1f5dff3843c 100644
--- a/src/models/qwen.cpp
+++ b/src/models/qwen.cpp
@@ -3,7 +3,7 @@
 void llama_model_qwen::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_7B; break;
         case 40: type = LLM_TYPE_13B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp
index a5147460bae..e9c2ea80a6b 100644
--- a/src/models/qwen2.cpp
+++ b/src/models/qwen2.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_qwen2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
         case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
         case 32: type = LLM_TYPE_7B; break;
diff --git a/src/models/qwen2moe.cpp b/src/models/qwen2moe.cpp
index 7cb03859deb..e831ed11aad 100644
--- a/src/models/qwen2moe.cpp
+++ b/src/models/qwen2moe.cpp
@@ -5,7 +5,8 @@ void llama_model_qwen2moe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
 
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_A2_7B; break;
         case 28: type = LLM_TYPE_57B_A14B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp
index 41b97fed956..1d0d2fab362 100644
--- a/src/models/qwen3.cpp
+++ b/src/models/qwen3.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_qwen3::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
         case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
         case 40: type = LLM_TYPE_14B; break;
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
index 348650b3796..4b642cff467 100644
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@@ -13,22 +13,20 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
 
     // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
 
     // Mark recurrent layers (linear attention layers). MTP layers are dense
     // attention-only and must be flagged non-recurrent.
-    if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) {
-        const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
-
+    if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) {
         uint32_t full_attn_interval = 4;
         ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
-        for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-            hparams.is_recr_impl[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0);
+        for (uint32_t i = 0; i < hparams.n_layer_all; ++i) {
+            hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0);
         }
     }
 
-    switch (hparams.n_layer - hparams.nextn_predict_layers) {
+    switch (hparams.n_layer()) {
         case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break;
         case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break;
         case 64: type = LLM_TYPE_27B; break;
@@ -39,9 +37,7 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) {
 void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) {
     LLAMA_LOAD_LOCALS;
 
-    const uint32_t n_main = n_layer - hparams.nextn_predict_layers;
-    const bool mtp_only   = (hparams.nextn_predict_layers > 0) &&
-                            (ml.get_weight("blk.0.attn_norm.weight") == nullptr);
+    const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr);
     const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0;
 
     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
@@ -122,10 +118,10 @@ void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) {
         layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd },              TENSOR_NOT_REQUIRED);
     };
 
-    for (int i = 0; i < (int) n_main; ++i) {
+    for (int i = 0; i < n_layer; ++i) {
         load_block_trunk(i, trunk_flags);
     }
-    for (int i = (int) n_main; i < n_layer; ++i) {
+    for (int i = n_layer; i < n_layer_all; ++i) {
         load_block_mtp(i);
     }
 }
@@ -159,8 +155,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
-    const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
@@ -177,7 +172,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
             cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il);
         }
 
-        if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
+        if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
@@ -490,15 +485,15 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_ffn(ggml_tensor * cur, cons
 // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 dense series
 llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params)
     : llm_graph_context(params) {
-    GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35 MTP requires nextn_predict_layers > 0");
-    GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35 MTP currently only supports a single MTP block");
+    GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35 MTP requires n_layer_nextn > 0");
+    GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35 MTP currently only supports a single MTP block");
 
     const int64_t n_embd_head = hparams.n_embd_head_v();
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
 
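-    // hparams.n_layer includes both main model layers and MTP layers. The MTP
-    // layer is stored immediately after the main layers in model.layers[].
+    // hparams.n_layer() counts only the main model layers, so it is also the index
+    // of the MTP layer, which is stored right after the main layers in model.layers[].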
-    const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
+    const int il = hparams.n_layer();
     const auto & layer = model.layers[il];
 
     GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
index 7d906191cbb..eb5e9a406a1 100644
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@@ -16,22 +16,20 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
 
     // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
 
     // Mark recurrent layers (linear attention layers). MTP layers are dense
     // attention-only and must be flagged non-recurrent.
-    if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) {
-        const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
-
+    if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) {
         uint32_t full_attn_interval = 4;
         ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
-        for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-            hparams.is_recr_impl[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0);
+        for (uint32_t i = 0; i < hparams.n_layer_all; ++i) {
+            hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0);
         }
     }
 
-    switch (hparams.n_layer - hparams.nextn_predict_layers) {
+    switch (hparams.n_layer()) {
         case 40: type = LLM_TYPE_35B_A3B; break;
         case 48: type = LLM_TYPE_122B_A10B; break;
         case 60: type = LLM_TYPE_397B_A17B; break;
@@ -42,9 +40,7 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) {
 void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) {
     LLAMA_LOAD_LOCALS;
 
-    const uint32_t n_main = n_layer - hparams.nextn_predict_layers;
-    const bool mtp_only   = (hparams.nextn_predict_layers > 0) &&
-                            (ml.get_weight("blk.0.attn_norm.weight") == nullptr);
+    const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr);
     const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0;
 
     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
@@ -145,10 +141,10 @@ void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) {
         layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd },              TENSOR_NOT_REQUIRED);
     };
 
-    for (int i = 0; i < (int) n_main; ++i) {
+    for (int i = 0; i < n_layer; ++i) {
         load_block_trunk(i, trunk_flags);
     }
-    for (int i = (int) n_main; i < n_layer; ++i) {
+    for (int i = n_layer; i < n_layer_all; ++i) {
         load_block_mtp(i);
     }
 }
@@ -182,8 +178,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
-    const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
@@ -200,7 +195,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
             cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il);
         }
 
-        if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
+        if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
@@ -555,13 +550,13 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, c
 // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 MoE
 llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params)
     : llm_graph_context(params) {
-    GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE MTP requires nextn_predict_layers > 0");
-    GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE MTP currently only supports a single MTP block");
+    GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35MOE MTP requires n_layer_nextn > 0");
+    GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35MOE MTP currently only supports a single MTP block");
 
     const int64_t n_embd_head = hparams.n_embd_head_v();
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
 
-    const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
+    const int il = hparams.n_layer();
     const auto & layer = model.layers[il];
 
     GGML_ASSERT(layer.nextn.eh_proj    && "MTP block missing nextn.eh_proj");
diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp
index a4f8e1379c9..317e668bec7 100644
--- a/src/models/qwen3moe.cpp
+++ b/src/models/qwen3moe.cpp
@@ -1,10 +1,10 @@
 #include "models.h"
 
 void llama_model_qwen3moe::load_arch_hparams(llama_model_loader & ml) {
-    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
-
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp, false);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 48: type = LLM_TYPE_30B_A3B; break;
         case 94: type = LLM_TYPE_235B_A22B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp
index 9e09ae6f232..97200a44072 100644
--- a/src/models/qwen3next.cpp
+++ b/src/models/qwen3next.cpp
@@ -14,15 +14,15 @@ void llama_model_qwen3next::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
 
     // Mark recurrent layers (linear attention layers)
-    if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) {
+    if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) {
         uint32_t full_attn_interval = 4;
         ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
-        for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-            hparams.is_recr_impl[i] = ((i + 1) % full_attn_interval != 0);
+        for (uint32_t i = 0; i < hparams.n_layer_all; ++i) {
+            hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0);
         }
     }
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 48: type = LLM_TYPE_80B_A3B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp
index 5defd893944..724d6140d19 100644
--- a/src/models/qwen3vl.cpp
+++ b/src/models/qwen3vl.cpp
@@ -4,7 +4,8 @@ void llama_model_qwen3vl::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
     ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 28: type = LLM_TYPE_1_7B; break;
         case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
         case 64: type = LLM_TYPE_32B; break;
diff --git a/src/models/qwen3vlmoe.cpp b/src/models/qwen3vlmoe.cpp
index 5b77df57122..7c41592f772 100644
--- a/src/models/qwen3vlmoe.cpp
+++ b/src/models/qwen3vlmoe.cpp
@@ -5,7 +5,8 @@ void llama_model_qwen3vlmoe::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
     ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 48: type = LLM_TYPE_30B_A3B; break;
         case 94: type = LLM_TYPE_235B_A22B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/refact.cpp b/src/models/refact.cpp
index bf3949a9092..a46c358fa68 100644
--- a/src/models/refact.cpp
+++ b/src/models/refact.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_refact::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_1B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/rnd1.cpp b/src/models/rnd1.cpp
index ca8e009615e..fc276ce591b 100644
--- a/src/models/rnd1.cpp
+++ b/src/models/rnd1.cpp
@@ -2,12 +2,13 @@
 
 void llama_model_rnd1::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
-
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 48: type = LLM_TYPE_30B_A3B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
+
     // Set non-causal attention for diffusion models
     hparams.causal_attn = false;
 }
diff --git a/src/models/rwkv6.cpp b/src/models/rwkv6.cpp
index ba2a9dfa0db..0b5013dc758 100644
--- a/src/models/rwkv6.cpp
+++ b/src/models/rwkv6.cpp
@@ -9,7 +9,7 @@ void llama_model_rwkv6::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS,      hparams.rescale_every_n_layers, false);
     ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,           hparams.token_shift_count, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_1_6B; break;
         case 32:
             switch (hparams.n_embd) {
diff --git a/src/models/rwkv6qwen2.cpp b/src/models/rwkv6qwen2.cpp
index 566b8cdcb54..6c7db514435 100644
--- a/src/models/rwkv6qwen2.cpp
+++ b/src/models/rwkv6qwen2.cpp
@@ -9,7 +9,7 @@ void llama_model_rwkv6qwen2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS,      hparams.rescale_every_n_layers, false);
     ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,           hparams.token_shift_count, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_1_6B; break;
         case 32:
             switch (hparams.n_embd) {
diff --git a/src/models/rwkv7.cpp b/src/models/rwkv7.cpp
index 7574b252621..67c51f5b59c 100644
--- a/src/models/rwkv7.cpp
+++ b/src/models/rwkv7.cpp
@@ -10,7 +10,7 @@ void llama_model_rwkv7::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK,               hparams.n_lora_gate, false);
     ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,                      hparams.token_shift_count, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 12:
             switch (hparams.n_embd) {
                 case 768: type = LLM_TYPE_190M; break;
diff --git a/src/models/seed-oss.cpp b/src/models/seed-oss.cpp
index 806cba574be..57de881a091 100644
--- a/src/models/seed-oss.cpp
+++ b/src/models/seed-oss.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_seed_oss::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 64: type = LLM_TYPE_36B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp
index 4231cccc666..a8e3d957f1f 100644
--- a/src/models/smallthinker.cpp
+++ b/src/models/smallthinker.cpp
@@ -15,14 +15,14 @@ void llama_model_smallthinker::load_arch_hparams(llama_model_loader & ml) {
         ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     } else {
         hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
-        hparams.n_no_rope_layer_step = hparams.n_layer;
+        hparams.n_no_rope_layer_step = hparams.n_layer();
     }
 
     ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp, false);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_4B;  break;
         case 52: type = LLM_TYPE_20B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/models/smollm3.cpp b/src/models/smollm3.cpp
index 90e7d473eaf..c67d967b204 100644
--- a/src/models/smollm3.cpp
+++ b/src/models/smollm3.cpp
@@ -4,7 +4,7 @@ void llama_model_smollm3::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     hparams.n_no_rope_layer_step = 4;
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 36: type = LLM_TYPE_3B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/stablelm.cpp b/src/models/stablelm.cpp
index 4da7f7aefcf..bf6087b8796 100644
--- a/src/models/stablelm.cpp
+++ b/src/models/stablelm.cpp
@@ -3,7 +3,7 @@
 void llama_model_stablelm::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_1B; break;
         case 32: type = LLM_TYPE_3B; break;
         case 40: type = LLM_TYPE_12B; break;
diff --git a/src/models/starcoder.cpp b/src/models/starcoder.cpp
index e131af058bc..f73a88fd4e9 100644
--- a/src/models/starcoder.cpp
+++ b/src/models/starcoder.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_starcoder::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 24: type = LLM_TYPE_1B; break;
         case 36: type = LLM_TYPE_3B; break;
         case 42: type = LLM_TYPE_7B; break;
diff --git a/src/models/starcoder2.cpp b/src/models/starcoder2.cpp
index 9c207c02885..b81b469374a 100644
--- a/src/models/starcoder2.cpp
+++ b/src/models/starcoder2.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_starcoder2::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 30: type = LLM_TYPE_3B; break;
         case 32: type = LLM_TYPE_7B; break;
         case 40: type = LLM_TYPE_15B; break;
diff --git a/src/models/step35.cpp b/src/models/step35.cpp
index cf9942b200f..e2218c58704 100644
--- a/src/models/step35.cpp
+++ b/src/models/step35.cpp
@@ -23,16 +23,16 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,  hparams.n_swa);
     ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,        hparams.rope_freq_base_train_swa, false);
 
-    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer());
 
-    ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP,   hparams.swiglu_clamp_exp,   hparams.n_layer, false);
-    ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP,   hparams.swiglu_clamp_exp,   hparams.n_layer(), false);
+    ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer(), false);
 
     // NextN/MTP (Step3p5): extra decoder block appended beyond the main stack.
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
 
-    switch (hparams.n_layer - hparams.nextn_predict_layers) {
+    switch (hparams.n_layer()) {
         case 45: type = LLM_TYPE_196B_A11B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
@@ -41,15 +41,12 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) {
 void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
     LLAMA_LOAD_LOCALS;
 
-    const uint32_t n_main = n_layer - hparams.nextn_predict_layers;
-    const bool mtp_only   = (hparams.nextn_predict_layers > 0) &&
-                            (ml.get_weight("blk.0.attn_norm.weight") == nullptr);
+    const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr);
     // Trunk-only: the GGUF declares MTP layers in metadata but the actual MTP
     // tensors live in a separate file (e.g. user split target/draft). Mark
     // MTP tensors NOT_REQUIRED so the trunk loads cleanly.
-    const std::string mtp_probe = "blk." + std::to_string(n_main) + ".nextn.eh_proj.weight";
-    const bool trunk_only = (hparams.nextn_predict_layers > 0) &&
-                            (ml.get_weight(mtp_probe.c_str()) == nullptr);
+    const std::string mtp_probe = "blk." + std::to_string(n_layer) + ".nextn.eh_proj.weight";
+    const bool trunk_only = (hparams.n_layer_nextn > 0) && (ml.get_weight(mtp_probe.c_str()) == nullptr);
     const int trunk_flags = mtp_only  ? TENSOR_NOT_REQUIRED : 0;
     const int mtp_flags   = trunk_only ? TENSOR_NOT_REQUIRED : 0;
 
@@ -176,7 +173,7 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
         layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd },              TENSOR_NOT_REQUIRED);
     };
 
-    for (int i = 0; i < (int) n_main; ++i) {
+    for (int i = 0; i < n_layer; ++i) {
         load_block_trunk(i, trunk_flags);
     }
     // Only the first MTP block (i == n_main) is required at runtime — the
@@ -184,8 +181,8 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
     // Trailing MTP blocks are loaded if present (so an un-pruned GGUF with
     // all MTP layers still works) but tolerated when absent via the pruning
     // path. See scripts/prune_step35_extra_mtp.py for the pruner.
-    for (int i = (int) n_main; i < n_layer; ++i) {
-        load_block_mtp(i, /*is_first_mtp=*/ i == (int) n_main);
+    for (int i = n_layer; i < n_layer_all; ++i) {
+        load_block_mtp(i, /*is_first_mtp=*/ i == n_layer);
     }
 }
 
@@ -206,8 +203,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
-    const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         const uint32_t n_head_l    = hparams.n_head(il);
@@ -294,7 +290,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para
             cb(cur, "attn_proj", il);
         }
 
-        if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
+        if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
@@ -374,7 +370,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para
 // LLM_GRAPH_TYPE_DECODER_MTP draft head for Step3p5 (MoE)
 llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params)
     : llm_graph_context(params) {
-    GGML_ASSERT(hparams.nextn_predict_layers > 0 && "STEP35 MTP requires nextn_predict_layers > 0");
+    GGML_ASSERT(hparams.n_layer_nextn > 0 && "STEP35 MTP requires n_layer_nextn > 0");
 
     // Single-block MTP only: always run the first trained MTP block (Qwen
     // MTP / vLLM single-MTP-layer style). Multi-block round-robin proved to
@@ -382,7 +378,7 @@ llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     // blocks are loaded with TENSOR_NOT_REQUIRED so pruned GGUFs (with just
     // block 0) also work — see load_arch_tensors below and
     // scripts/prune_step35_extra_mtp.py.
-    const int il       = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
+    const int il = hparams.n_layer();
     const auto & layer = model.layers[il];
 
     GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
diff --git a/src/models/t5.cpp b/src/models/t5.cpp
index 73e32741406..b0e3f062572 100644
--- a/src/models/t5.cpp
+++ b/src/models/t5.cpp
@@ -9,10 +9,10 @@ void llama_model_t5::load_arch_hparams(llama_model_loader & ml) {
         hparams.dec_start_token_id = dec_start_token_id;
     }
 
-    hparams.dec_n_layer = hparams.n_layer;
+    hparams.dec_n_layer = hparams.n_layer();
     ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 6:  type = LLM_TYPE_60M;  break; // t5-small
         case 8:  type = LLM_TYPE_80M;  break; // flan-t5-small
         case 12:
diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp
index 1258eeb19b6..393e8f65bf4 100644
--- a/src/models/talkie.cpp
+++ b/src/models/talkie.cpp
@@ -4,7 +4,7 @@ void llama_model_talkie::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
 
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
         case 40: type = LLM_TYPE_13B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
diff --git a/src/models/xverse.cpp b/src/models/xverse.cpp
index d6d1c7a2e5d..3135001293a 100644
--- a/src/models/xverse.cpp
+++ b/src/models/xverse.cpp
@@ -2,7 +2,8 @@
 
 void llama_model_xverse::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
         case 32: type = LLM_TYPE_7B; break;
         case 40: type = LLM_TYPE_13B; break;
         case 80: type = LLM_TYPE_65B; break;