Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common/reasoning-budget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ static struct llama_sampler_i common_reasoning_budget_i = {
/* .backend_accept = */ nullptr,
/* .backend_apply = */ nullptr,
/* .backend_set_input = */ nullptr,
/* .backend_n_nodes = */ nullptr,
};

static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
Expand Down
4 changes: 4 additions & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,7 @@ extern "C" {
// note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
struct llama_sampler_seq_config * samplers;
size_t n_samplers;
uint32_t n_sampling_outputs_max;
};

struct llama_model_tensor_override {
Expand Down Expand Up @@ -1259,6 +1260,9 @@ extern "C" {

// called before graph execution to set inputs for the current ubatch
void (*backend_set_input)(struct llama_sampler * smpl);

// returns the number of ggml tensors that a backend sampler needs.
uint32_t (*backend_n_nodes)(const struct llama_sampler * smpl);
};

struct llama_sampler {
Expand Down
139 changes: 77 additions & 62 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ llama_context::llama_context(

cparams.ctx_type = params.ctx_type;

cparams.n_sampling_outputs_max = params.n_sampling_outputs_max;

// Initialize backend samplers here so they are part of the sampling graph
// before the reserve passes run later in this function. This avoids a later
// re-reserve when graph nodes change.
Expand Down Expand Up @@ -1486,8 +1488,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
return 0;
}

static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
std::map<llama_seq_id, uint32_t> seq_to_row;
static std::map<llama_seq_id, std::vector<uint32_t>> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
std::map<llama_seq_id, std::vector<uint32_t>> seq_to_row;
// how many output tokens we have seen so far for this ubatch.
uint32_t local = 0;
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
Expand All @@ -1498,96 +1500,111 @@ static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubat

const llama_seq_id seq_id = ubatch.seq_id[i][0];
// row_offset is the number of output tokens before this ubatch.
seq_to_row[seq_id] = row_offset + local;
seq_to_row[seq_id].push_back(row_offset + local);
++local;
}
return seq_to_row;
}

static void copy_tensor_async_ints(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
const std::map<llama_seq_id, std::vector<ggml_tensor*>> & tensor_map,
const buffer_view<llama_token> & sampled,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
const std::map<llama_seq_id, std::vector<uint32_t>> & seq_to_row,
ggml_backend_sched_t sched) {
if (!sampled.has_data()) {
return;
}

for (const auto & [seq_id, tensor] : tensor_map) {
for (const auto & [seq_id, tensors] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}

const uint32_t row = it->second;
GGML_ASSERT(row < sampled.size);
const std::vector<uint32_t> & rows = it->second;

for (size_t i = 0; i < tensors.size(); ++i) {
const uint32_t row = rows[i];
ggml_tensor * tensor = tensors[i];

GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
GGML_ASSERT(row < sampled.size);
GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");

ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
ggml_backend_tensor_get_async(backend, tensor, sampled.data + row, 0, sizeof(sampled.data[row]));
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
ggml_backend_tensor_get_async(backend, tensor, sampled.data + row, 0, sizeof(sampled.data[row]));
}
}
}

static void copy_tensor_async_floats(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
const std::map<llama_seq_id, std::vector<ggml_tensor*>> & tensor_map,
const buffer_view<float> & dst,
size_t stride,
std::vector<uint32_t> & counts,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
const std::map<llama_seq_id, std::vector<uint32_t>> & seq_to_row,
ggml_backend_sched_t sched) {
if (!dst.has_data()) {
return;
}

for (const auto & [seq_id, tensor] : tensor_map) {
for (const auto & [seq_id, tensors] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}

const uint32_t row = it->second;
GGML_ASSERT(row < counts.size());
const std::vector<uint32_t> & rows = it->second;

for (size_t i = 0; i < tensors.size(); ++i) {
const uint32_t row = rows[i];
ggml_tensor * tensor = tensors[i];

GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
GGML_ASSERT(row < counts.size());
GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");

ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
float * row_ptr = dst.data + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
float * row_ptr = dst.data + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));

// Update the actual number of logits/probabilities that were written for this row.
counts[row] = ggml_nelements(tensor);
// Update the actual number of logits/probabilities that were written for this row.
counts[row] = ggml_nelements(tensor);
}
}
}

static void copy_tensor_async_candidates(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
const std::map<llama_seq_id, std::vector<ggml_tensor*>> & tensor_map,
const buffer_view<llama_token> & dst,
size_t stride,
std::vector<uint32_t> & counts,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
const std::map<llama_seq_id, std::vector<uint32_t>> & seq_to_row,
ggml_backend_sched_t sched) {
if (!dst.has_data()) {
return;
}

for (const auto & [seq_id, tensor] : tensor_map) {
for (const auto & [seq_id, tensors] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}

const uint32_t row = it->second;
GGML_ASSERT(row < counts.size());
const std::vector<uint32_t> & rows = it->second;

for (size_t i = 0; i < tensors.size(); ++i) {
const uint32_t row = rows[i];
ggml_tensor * tensor = tensors[i];

GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
GGML_ASSERT(row < counts.size());
GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");

ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
llama_token * row_ptr = dst.data + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
llama_token * row_ptr = dst.data + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));

// Update the actual number of candidates that were written.
counts[row] = ggml_nelements(tensor);
// Update the actual number of candidates that were written.
counts[row] = ggml_nelements(tensor);
}
}
}

Expand Down Expand Up @@ -1635,30 +1652,6 @@ int llama_context::decode(const llama_batch & batch_inp) {

const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;

// TODO: avoid this workaround in the future
if (has_samplers && batch_inp.logits) {
std::vector<int32_t> seq_output_count(n_seq_max, 0);

for (int32_t i = 0; i < batch_inp.n_tokens; ++i) {
if (batch_inp.logits[i] == 0) {
continue;
}

const int ns = batch_inp.n_seq_id ? batch_inp.n_seq_id[i] : 1;

for (int32_t s = 0; s < ns; ++s) {
const llama_seq_id seq_id = batch_inp.seq_id ? batch_inp.seq_id[i][s] : 0;

seq_output_count[seq_id]++;
if (seq_output_count[seq_id] > 1) {
LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (seq_id %d had %d)\n",
__func__, seq_id, seq_output_count[seq_id]);
return -1;
}
}
}
}

if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
return -1;
Expand Down Expand Up @@ -2192,14 +2185,35 @@ void llama_context::output_reorder() {
//

uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
uint32_t res;
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
res = std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
} else {
res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
for (const auto & lora : model.loras) {
res += lora->get_n_nodes();
}
}
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
for (const auto & lora : model.loras) {
res += lora->get_n_nodes();

// Account for backend sampling with multiple outputs per sequence.
uint32_t sampling_nodes = 0;
if (!sampling.samplers.empty()) {
uint32_t backend_n_nodes = 0;
for (const auto & [seq_id, sampler] : sampling.samplers) {
if (sampler->iface->backend_n_nodes) {
backend_n_nodes += sampler->iface->backend_n_nodes(sampler);
}
}

const uint32_t sampling_outputs = std::min<uint32_t>(n_tokens, cparams.n_sampling_outputs_max);
const uint32_t max_samplers = cparams.n_seq_max;
const uint32_t n_active_samplers = (uint32_t) sampling.samplers.size();
const uint32_t logits_t_pad = 1;
// each active sampler contributes one logits_seq view plus its backend tensors per output
sampling_nodes = (backend_n_nodes + n_active_samplers) * sampling_outputs * max_samplers + logits_t_pad;
}
return res;

return res + sampling_nodes;
}

llm_graph_result * llama_context::get_gf_res_reserve() const {
Expand Down Expand Up @@ -3353,6 +3367,7 @@ llama_context_params llama_context_default_params() {
/*.kv_unified =*/ false,
/*.sampler =*/ nullptr,
/*.n_sampler =*/ 0,
/*.n_sampling_outputs_max =*/ 32,
};

return result;
Expand Down
2 changes: 2 additions & 0 deletions src/llama-cparams.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ struct llama_cparams {
enum llama_context_type ctx_type;
enum llama_pooling_type pooling_type;

uint32_t n_sampling_outputs_max; // max outputs per sequence for backend sampling

ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
};
Loading
Loading