Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/common/build-info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
#include <cstdio>
#include <string>

// Build identification globals. Each symbol is defined exactly once: a second
// definition of the same variable in this translation unit is a redefinition
// error, so only the most recent build number / commit pair is kept.
int LLAMA_BUILD_NUMBER = 9297;
char const * LLAMA_COMMIT = "b0df4c0";
// Compiler and build target are recorded as the literal "unknown" here.
char const * LLAMA_COMPILER = "unknown";
char const * LLAMA_BUILD_TARGET = "unknown";

Expand Down
1 change: 1 addition & 0 deletions cpp/common/chat.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ struct common_chat_parser_params {
bool reasoning_in_content = false;
std::string generation_prompt;
bool parse_tool_calls = true;
bool is_continuation = false;
bool echo = false; // Include assistant prefilled msg in output
bool debug = false; // Enable debug output for PEG parser
common_peg_arena parser = {};
Expand Down
2 changes: 2 additions & 0 deletions cpp/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,8 @@ struct common_params_speculative_draft {
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.0f; // minimum speculative decoding probability (greedy)

bool backend_sampling = true; // offload draft sampling to the backend (default: on)

common_params_model mparams;

llama_context * ctx_tgt = nullptr;
Expand Down
44 changes: 37 additions & 7 deletions cpp/common/speculative.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,15 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
};

static std::string common_speculative_get_devices_str(const std::vector<lm_ggml_backend_dev_t> & devices) {
if (devices.empty()) {
return "default";
}

std::string result;
for (size_t i = 0; i < devices.size(); i++) {
if (i > 0) result += ", ";
if (devices[i] == nullptr) {
continue;
}
if (!result.empty()) result += ", ";
result += lm_ggml_backend_dev_name(devices[i]);
}
return result;
return result.empty() ? "default" : result;
}

struct common_speculative_config {
Expand Down Expand Up @@ -414,6 +413,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {

std::vector<common_sampler_ptr> smpls;

// backend sampler chain per seq, attached to ctx_dft
std::vector<llama_sampler *> backend_chains;

int32_t n_embd = 0;

// Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
Expand Down Expand Up @@ -445,7 +447,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
n_embd = llama_model_n_embd(llama_get_model(ctx_dft));

LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd);
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
this->params.n_gpu_layers,
lm_ggml_type_name(this->params.cache_type_k),
Expand All @@ -469,6 +471,22 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
}

// offload draft sampling to the backend
backend_chains.assign(n_seq, nullptr);
if (this->params.backend_sampling) {
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));

if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
llama_sampler_free(chain);
chain = nullptr;
}
backend_chains[seq_id] = chain;
}
}

llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false);
llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);

Expand All @@ -484,6 +502,18 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
}

~common_speculative_impl_draft_mtp() override {
auto * ctx_dft = this->params.ctx_dft;
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) {
if (backend_chains[seq_id] == nullptr) {
continue;
}
if (ctx_dft) {
llama_set_sampler(ctx_dft, seq_id, nullptr);
}
llama_sampler_free(backend_chains[seq_id]);
}
backend_chains.clear();

if (batch.token != nullptr) {
free(batch.token);
batch.token = nullptr;
Expand Down
6 changes: 3 additions & 3 deletions cpp/ggml-backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ void lm_ggml_backend_tensor_get_2d_async(lm_ggml_backend_t backend, const struct
LM_GGML_ASSERT(tensor);
LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) {
if (n_copies <= 1 || backend->iface.get_tensor_2d_async == NULL) {
for (size_t i = 0; i < n_copies; i++) {
lm_ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
}
Expand All @@ -317,7 +317,7 @@ void lm_ggml_backend_tensor_get_2d_async(lm_ggml_backend_t backend, const struct
}

LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
LM_GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
LM_GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}

Expand Down Expand Up @@ -379,7 +379,7 @@ void lm_ggml_backend_tensor_get_2d(const struct lm_ggml_tensor * tensor, void *
lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");

if (n_copies <= 1 || buf->iface.set_tensor_2d == NULL) {
if (n_copies <= 1 || buf->iface.get_tensor_2d == NULL) {
for (size_t i = 0; i < n_copies; i++) {
lm_ggml_backend_tensor_get(tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
}
Expand Down
7 changes: 4 additions & 3 deletions cpp/ggml-hexagon/ggml-hexagon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2735,9 +2735,10 @@ static bool lm_ggml_hexagon_supported_ssm_conv(const struct lm_ggml_hexagon_sess
if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
return false;
}

// TODO: add support for non-contiguous tensors
if (!lm_ggml_is_contiguous(src0) || !lm_ggml_is_contiguous(src1) || !lm_ggml_is_contiguous(dst)) {
if (src0->nb[0] != sizeof(float) || src1->nb[0] != sizeof(float) || dst->nb[0] != sizeof(float)) {
return false;
}
if (src0->nb[1] != src0->ne[0] * sizeof(float) || src1->nb[1] != src1->ne[0] * sizeof(float)) {
return false;
}

Expand Down
Loading