ggml-org · danbev · Feb 23, 2026 · Feb 25, 2026 · Feb 26, 2026 · Feb 25, 2026
@@ -193,6 +193,7 @@ static struct llama_sampler_i common_reasoning_budget_i = {
     /* .backend_accept    = */ nullptr,
     /* .backend_apply     = */ nullptr,
     /* .backend_set_input = */ nullptr,
+    /* .backend_n_nodes   = */ nullptr,
 };
 
 static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {

diff --git a/include/llama.h b/include/llama.h
@@ -387,6 +387,7 @@ extern "C" {
         // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
         struct llama_sampler_seq_config * samplers;
         size_t                            n_samplers;
+        uint32_t                          n_sampling_outputs_max;
     };
 
     struct llama_model_tensor_override {
@@ -1259,6 +1260,9 @@ extern "C" {
 
         // called before graph execution to set inputs for the current ubatch
         void (*backend_set_input)(struct llama_sampler * smpl);
+
+        // returns the number of ggml tensors that a backend sampler needs.
+        uint32_t (*backend_n_nodes)(const struct llama_sampler * smpl);
     };
 
     struct llama_sampler {

@@ -85,6 +85,8 @@ llama_context::llama_context(
 
     cparams.ctx_type          = params.ctx_type;
 
+    cparams.n_sampling_outputs_max = params.n_sampling_outputs_max;
+
     // Initialize backend samplers here so they are part of the sampling graph
     // before the reserve passes run later in this function. This avoids a later
     // re-reserve when graph nodes change.
@@ -1486,8 +1488,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
     return 0;
 }
 
-static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
-    std::map<llama_seq_id, uint32_t> seq_to_row;
+static std::map<llama_seq_id, std::vector<uint32_t>> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
+    std::map<llama_seq_id, std::vector<uint32_t>> seq_to_row;
     // how many output tokens we have seen so far for this ubatch.
     uint32_t local = 0;
     for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
@@ -1498,96 +1500,111 @@ static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubat
 
         const llama_seq_id seq_id = ubatch.seq_id[i][0];
         // row_offset is the number of output tokens before this ubatch.
-        seq_to_row[seq_id] = row_offset + local;
+        seq_to_row[seq_id].push_back(row_offset + local);
         ++local;
     }
     return seq_to_row;
 }
 
 static void copy_tensor_async_ints(
-    const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
+    const std::map<llama_seq_id, std::vector<ggml_tensor*>> & tensor_map,
     const buffer_view<llama_token> & sampled,
-    const std::map<llama_seq_id, uint32_t> & seq_to_row,
+    const std::map<llama_seq_id, std::vector<uint32_t>> & seq_to_row,
     ggml_backend_sched_t sched) {
     if (!sampled.has_data()) {
         return;
     }
 
-    for (const auto & [seq_id, tensor] : tensor_map) {
+    for (const auto & [seq_id, tensors] : tensor_map) {
         auto it = seq_to_row.find(seq_id);
         if (it == seq_to_row.end()) {
             continue;
         }
 
-        const uint32_t row = it->second;
-        GGML_ASSERT(row < sampled.size);
+        const std::vector<uint32_t> & rows = it->second;
+
+        for (size_t i = 0; i < tensors.size(); ++i) {
+            const uint32_t row = rows[i];
+            ggml_tensor * tensor = tensors[i];
 
-        GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
+            GGML_ASSERT(row < sampled.size);
+            GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
 
-        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
-        ggml_backend_tensor_get_async(backend, tensor, sampled.data + row, 0, sizeof(sampled.data[row]));
+            ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
+            ggml_backend_tensor_get_async(backend, tensor, sampled.data + row, 0, sizeof(sampled.data[row]));
+        }
     }
 }
 
 static void copy_tensor_async_floats(
-    const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
+    const std::map<llama_seq_id, std::vector<ggml_tensor*>> & tensor_map,
     const buffer_view<float> & dst,
     size_t stride,
     std::vector<uint32_t> & counts,
-    const std::map<llama_seq_id, uint32_t> & seq_to_row,
+    const std::map<llama_seq_id, std::vector<uint32_t>> & seq_to_row,
     ggml_backend_sched_t sched) {
     if (!dst.has_data()) {
         return;
     }
 
-    for (const auto & [seq_id, tensor] : tensor_map) {
+    for (const auto & [seq_id, tensors] : tensor_map) {
         auto it = seq_to_row.find(seq_id);
         if (it == seq_to_row.end()) {
             continue;
         }
 
-        const uint32_t row = it->second;
-        GGML_ASSERT(row < counts.size());
+        const std::vector<uint32_t> & rows = it->second;
+
+        for (size_t i = 0; i < tensors.size(); ++i) {
+            const uint32_t row = rows[i];
+            ggml_tensor * tensor = tensors[i];
 
-        GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
+            GGML_ASSERT(row < counts.size());
+            GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
 
-        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
-        float * row_ptr = dst.data + (size_t) row * stride;
-        ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
+            ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
+            float * row_ptr = dst.data + (size_t) row * stride;
+            ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
 
-        // Update the actual number of logits/probabilities that were written for this row.
-        counts[row] = ggml_nelements(tensor);
+            // Update the actual number of logits/probabilities that were written for this row.
+            counts[row] = ggml_nelements(tensor);
+        }
     }
 }
 
 static void copy_tensor_async_candidates(
-    const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
+    const std::map<llama_seq_id, std::vector<ggml_tensor*>> & tensor_map,
     const buffer_view<llama_token> & dst,
     size_t stride,
     std::vector<uint32_t> & counts,
-    const std::map<llama_seq_id, uint32_t> & seq_to_row,
+    const std::map<llama_seq_id, std::vector<uint32_t>> & seq_to_row,
     ggml_backend_sched_t sched) {
     if (!dst.has_data()) {
         return;
     }
 
-    for (const auto & [seq_id, tensor] : tensor_map) {
+    for (const auto & [seq_id, tensors] : tensor_map) {
         auto it = seq_to_row.find(seq_id);
         if (it == seq_to_row.end()) {
             continue;
         }
 
-        const uint32_t row = it->second;
-        GGML_ASSERT(row < counts.size());
+        const std::vector<uint32_t> & rows = it->second;
+
+        for (size_t i = 0; i < tensors.size(); ++i) {
+            const uint32_t row = rows[i];
+            ggml_tensor * tensor = tensors[i];
 
-        GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
+            GGML_ASSERT(row < counts.size());
+            GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
 
-        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
-        llama_token * row_ptr = dst.data + (size_t) row * stride;
-        ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
+            ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
+            llama_token * row_ptr = dst.data + (size_t) row * stride;
+            ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
 
-        // Update the actual number of candidates that were written.
-        counts[row] = ggml_nelements(tensor);
+            // Update the actual number of candidates that were written.
+            counts[row] = ggml_nelements(tensor);
+        }
     }
 }
 
@@ -1635,30 +1652,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;
 
-    // TODO: avoid this workaround in the future
-    if (has_samplers && batch_inp.logits) {
-        std::vector<int32_t> seq_output_count(n_seq_max, 0);
-
-        for (int32_t i = 0; i < batch_inp.n_tokens; ++i) {
-            if (batch_inp.logits[i] == 0) {
-                continue;
-            }
-
-            const int ns = batch_inp.n_seq_id ? batch_inp.n_seq_id[i] : 1;
-
-            for (int32_t s = 0; s < ns; ++s) {
-                const llama_seq_id seq_id = batch_inp.seq_id ? batch_inp.seq_id[i][s] : 0;
-
-                seq_output_count[seq_id]++;
-                if (seq_output_count[seq_id] > 1) {
-                    LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (seq_id %d had %d)\n",
-                            __func__, seq_id, seq_output_count[seq_id]);
-                    return -1;
-                }
-            }
-        }
-    }
-
     if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
@@ -2192,14 +2185,35 @@ void llama_context::output_reorder() {
 //
 
 uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
+    uint32_t res;
     if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
-        return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
+        res = std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
+    } else {
+        res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
+        for (const auto & lora : model.loras) {
+            res += lora->get_n_nodes();
+        }
     }
-    uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
-    for (const auto & lora : model.loras) {
-        res += lora->get_n_nodes();
+
+    // Account for backend sampling with multiple outputs per sequence.
+    uint32_t sampling_nodes = 0;
+    if (!sampling.samplers.empty()) {
+        uint32_t backend_n_nodes = 0;
+        for (const auto & [seq_id, sampler] : sampling.samplers) {
+            if (sampler->iface->backend_n_nodes) {
+                backend_n_nodes += sampler->iface->backend_n_nodes(sampler);
+            }
+        }
+
+        const uint32_t sampling_outputs = std::min<uint32_t>(n_tokens, cparams.n_sampling_outputs_max);
+        const uint32_t max_samplers = cparams.n_seq_max;
+        const uint32_t n_active_samplers = (uint32_t) sampling.samplers.size();
+        const uint32_t logits_t_pad = 1;
+        // each active sampler contributes one logits_seq view plus its backend tensors per output
+        sampling_nodes = (backend_n_nodes + n_active_samplers) * sampling_outputs * max_samplers + logits_t_pad;
     }
-    return res;
+
+    return res + sampling_nodes;
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
@@ -3353,6 +3367,7 @@ llama_context_params llama_context_default_params() {
         /*.kv_unified                  =*/ false,
         /*.sampler                     =*/ nullptr,
         /*.n_sampler                   =*/ 0,
+        /*.n_sampling_outputs_max      =*/ 32,
     };
 
     return result;

@@ -46,6 +46,8 @@ struct llama_cparams {
     enum llama_context_type ctx_type;
     enum llama_pooling_type pooling_type;
 
+    uint32_t n_sampling_outputs_max; // max outputs per sequence for backend sampling
+
     ggml_backend_sched_eval_callback cb_eval;
     void * cb_eval_user_data;
 };