Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/cpp/llama-cpp/Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

LLAMA_VERSION?=22d66b567eef11cf2e9832f04db64ee0323a0fd0
LLAMA_VERSION?=d6588daa800058dfa54f1d7ea695b1a810c8ae18
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

CMAKE_ARGS?=
Expand Down
15 changes: 13 additions & 2 deletions backend/cpp/llama-cpp/grpc-server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2204,7 +2204,15 @@ class BackendServiceImpl final : public backend::Backend::Service {
// content element — attaching to both would duplicate the first
// token since oaicompat_msg_diffs is the same for both.
json first_res_json = first_result->to_json();
if (first_res_json.is_array()) {
// Upstream llama.cpp (ggml-org/llama.cpp#23884) now emits an initial
// "begin" partial whose to_json() returns null, used only to signal the
// HTTP layer to flush 200 status headers before any token. gRPC has no
// such concept, so there is nothing to emit — the real tokens arrive in
// the loop below. Feeding this null into build_reply_from_json would
// throw (uncaught) and surface as a generic RPC error.
if (first_res_json.is_null()) {
// skip the begin-of-stream marker
} else if (first_res_json.is_array()) {
for (const auto & res : first_res_json) {
auto reply = build_reply_from_json(res, first_result.get());
// Skip chat deltas for role-init elements (have "role" in
Expand Down Expand Up @@ -2234,7 +2242,10 @@ class BackendServiceImpl final : public backend::Backend::Service {
}

json res_json = result->to_json();
if (res_json.is_array()) {
if (res_json.is_null()) {
// begin-of-stream marker (see note above) — nothing to emit
continue;
} else if (res_json.is_array()) {
for (const auto & res : res_json) {
auto reply = build_reply_from_json(res, result.get());
bool is_role_init = res.contains("choices") && !res["choices"].empty() &&
Expand Down
Loading