
Commit 09db3a4

Update webui to handle reasoning content and include usage stats in server only when requested (#791)

* handle reasoning content in webui
* server : include usage statistics only when the user requests them (#16052)
* server : only attempt to enable thinking if using jinja (#15967)
* config reasoning_content in webui and change the default reasoning format to auto

Co-authored-by: firecoperana <firecoperana>

1 parent 45afaf3 commit 09db3a4

File tree

7 files changed (+87, -58 lines)


common/common.h

Lines changed: 1 addition & 1 deletion
@@ -253,7 +253,7 @@ struct gpt_params {
     bool use_jinja = false; // NOLINT
     std::string system_prompt = "";
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
     int reasoning_budget = -1;
     bool prefill_assistant = true;

260 Bytes (binary file not shown)

examples/server/server.cpp

Lines changed: 22 additions & 17 deletions
@@ -173,6 +173,7 @@ struct server_task_result {
     std::vector<llama_token> tokens;

     bool stream;
+    bool include_usage;
     std::string prompt;
     //slot_params generation_params;

@@ -500,22 +501,22 @@ struct server_task_result {
                 {"model", oaicompat_model},
                 {"object", "chat.completion.chunk"},
             });
-
-            // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
-            // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
-            deltas.push_back({
-                {"choices", json::array()},
-                {"created", t},
-                {"id", oaicompat_cmpl_id},
-                {"model", oaicompat_model},
-                {"object", "chat.completion.chunk"},
-                {"usage", json {
-                    {"completion_tokens", n_decoded},
-                    {"prompt_tokens", n_prompt_tokens},
-                    {"total_tokens", n_decoded + n_prompt_tokens},
-                }},
-            });
-
+            if (include_usage) {
+                // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
+                // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
+                deltas.push_back({
+                    {"choices", json::array()},
+                    {"created", t},
+                    {"id", oaicompat_cmpl_id},
+                    {"model", oaicompat_model},
+                    {"object", "chat.completion.chunk"},
+                    {"usage", json {
+                        {"completion_tokens", n_decoded},
+                        {"prompt_tokens", n_prompt_tokens},
+                        {"total_tokens", n_decoded + n_prompt_tokens},
+                    }},
+                });
+            }
             if (timings.prompt_n >= 0) {
                 deltas.back().push_back({ "timings", timings.to_json() });
             }

@@ -547,6 +548,7 @@ struct server_task_multi {

 struct slot_params {
     bool stream = true;
+    bool include_usage = false;
     bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt

     int32_t n_keep = 0; // number of tokens to keep from initial prompt

@@ -1359,7 +1361,7 @@ struct server_context {
         // thinking is enabled if:
         // 1. It's not explicitly disabled (reasoning_budget == 0)
         // 2. The chat template supports it
-        const bool enable_thinking = params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
+        const bool enable_thinking = params.use_jinja && params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
         //LLAMA_LOG_INFO("Enable thinking? %d\n", enable_thinking);

         oai_parser_opt = {

@@ -1514,6 +1516,8 @@ struct server_context {
         }
         slot.params.timings_per_token = json_value(data, "timings_per_token", false);
         slot.params.stream = json_value(data, "stream", false);
+        auto stream_opt = json_value(data, "stream_options", json::object());
+        slot.params.include_usage = json_value(stream_opt, "include_usage", false);
         slot.params.cache_prompt = json_value(data, "cache_prompt", true);
         slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
         slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);

@@ -2206,6 +2210,7 @@ struct server_context {
         res.error = false;
         res.stop = true; // to do: set value
         res.stream = slot.params.stream;
+        res.include_usage = slot.params.include_usage;
         res.content = slot.generated_text;
         res.oaicompat = slot.params.oaicompat;
         res.oaicompat_model = slot.params.oaicompat_model;
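Net effect of the server changes above: the trailing `usage` chunk is emitted only when the request opts in via `stream_options.include_usage`, matching the OpenAI streaming API. Below is a minimal client-side sketch in TypeScript, assuming an OpenAI-compatible server at http://localhost:8080; the URL and the naive SSE line parsing are illustrative only.

```ts
// Sketch: stream a chat completion and opt in to the trailing usage chunk.
// Assumes an OpenAI-compatible server on localhost:8080 (illustrative).
async function streamWithUsage(prompt: string): Promise<void> {
  const res = await fetch('http://localhost:8080/v1/chat/completions', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      messages: [{ role: 'user', content: prompt }],
      stream: true,
      // Without this, the server now omits the usage chunk entirely.
      stream_options: { include_usage: true },
    }),
  });

  const reader = res.body!.getReader();
  const decoder = new TextDecoder();
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    // Naive SSE parsing: assumes each read() yields whole "data: ..." lines.
    for (const line of decoder.decode(value, { stream: true }).split('\n')) {
      if (!line.startsWith('data: ') || line.includes('[DONE]')) continue;
      const chunk = JSON.parse(line.slice('data: '.length));
      // The usage chunk arrives last, with an empty `choices` array.
      if (chunk.usage) {
        console.log('prompt_tokens:', chunk.usage.prompt_tokens,
                    'completion_tokens:', chunk.usage.completion_tokens,
                    'total_tokens:', chunk.usage.total_tokens);
      }
    }
  }
}
```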

examples/server/webui/dist/index.html

Lines changed: 37 additions & 37 deletions
Large diffs are not rendered by default.

examples/server/webui/src/Config.ts

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,7 @@ export const CONFIG_DEFAULT = {
   showTokensPerSecond: false,
   showThoughtInProgress: false,
   excludeThoughtOnReq: true,
+  reasoning_format: 'auto',
   // make sure these default values are in sync with `common.h`
   samplers: 'dkypmxnt',
   temperature: 0.8,

@@ -42,6 +43,7 @@ export const CONFIG_DEFAULT = {
   pyIntepreterEnabled: false,
 };
 export const CONFIG_INFO: Record<string, string> = {
+  reasoning_format: 'Specify how to parse reasoning content. none: reasoning content in content block. auto: reasoning content in reasoning_content. ',
   apiKey: 'Set the API Key if you are using --api-key option for the server.',
   systemMessage: 'The starting message that defines how model should behave.',
   samplers:
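For context on this setting: `reasoning_format` controls where the model's thinking ends up in the OpenAI-compatible response. A hedged sketch of the two streamed delta shapes follows; only the field names come from the API, the example strings are invented.

```ts
// Shape of a streamed delta as the webui sees it; example values are made up.
type StreamDelta = {
  content?: string;            // regular assistant text
  reasoning_content?: string;  // only present when the server parses reasoning
};

// reasoning_format: 'none' -> thinking stays inline in `content`:
//   { content: '<think>Let me check...</think>The answer is 4.' }
// reasoning_format: 'auto' -> thinking is split into `reasoning_content`:
//   { reasoning_content: 'Let me check...' }
//   { content: 'The answer is 4.' }
```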

examples/server/webui/src/components/SettingDialog.tsx

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ import toast from 'react-hot-toast'
 type SettKey = keyof typeof CONFIG_DEFAULT;

 const BASIC_KEYS: SettKey[] = [
+  'reasoning_format',
   'temperature',
   'top_k',
   'top_p',

examples/server/webui/src/utils/app.context.tsx

Lines changed: 24 additions & 3 deletions
@@ -215,7 +215,7 @@ export const AppContextProvider = ({
        messages,
        stream: true,
        cache_prompt: true,
-       reasoning_format: 'none',
+       reasoning_format: config.reasoning_format===''?'auto':config.reasoning_format,
        samplers: config.samplers,
        temperature: config.temperature,
        dynatemp_range: config.dynatemp_range,

@@ -226,7 +226,7 @@ export const AppContextProvider = ({
        typical_p: config.typical_p,
        xtc_probability: config.xtc_probability,
        xtc_threshold: config.xtc_threshold,
-       top_n_sigma: config.top_n_sigma,
+       top_n_sigma: config.top_n_sigma,
        repeat_last_n: config.repeat_last_n,
        repeat_penalty: config.repeat_penalty,
        presence_penalty: config.presence_penalty,

@@ -257,14 +257,35 @@ export const AppContextProvider = ({
        throw new Error(body?.error?.message || 'Unknown error');
      }
      const chunks = getSSEStreamAsync(fetchResponse);
+     let thinkingTagOpen = false;
      for await (const chunk of chunks) {
        // const stop = chunk.stop;
        if (chunk.error) {
          throw new Error(chunk.error?.message || 'Unknown error');
        }
+
+       const reasoningContent = chunk.choices?.[0]?.delta?.reasoning_content;
+       if (reasoningContent) {
+         if (pendingMsg.content === null || pendingMsg.content === '') {
+           thinkingTagOpen = true;
+           pendingMsg = {
+             ...pendingMsg,
+             content: '<think>' + reasoningContent,
+           };
+         } else {
+           pendingMsg = {
+             ...pendingMsg,
+             content: pendingMsg.content + reasoningContent,
+           };
+         }
+       }
        const addedContent = chunk.choices?.[0]?.delta?.content;
-       const lastContent = pendingMsg.content || '';
+       let lastContent = pendingMsg.content || '';
        if (addedContent) {
+         if (thinkingTagOpen) {
+           lastContent = lastContent + '</think>';
+           thinkingTagOpen = false;
+         }
          pendingMsg = {
            ...pendingMsg,
            content: lastContent + addedContent,
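In short, the hunk above re-wraps streamed `reasoning_content` deltas in `<think>...</think>` so the existing thought renderer keeps working. Here is a self-contained sketch of that accumulation logic; the `Delta` type and `accumulate` helper are illustrative and not part of the webui code.

```ts
// Illustrative helper mirroring the webui logic: open a <think> tag on the
// first reasoning delta, close it once normal content starts arriving.
type Delta = { content?: string; reasoning_content?: string };

function accumulate(deltas: Delta[]): string {
  let content = '';
  let thinkingTagOpen = false;
  for (const d of deltas) {
    if (d.reasoning_content) {
      if (content === '') {
        thinkingTagOpen = true;
        content = '<think>' + d.reasoning_content;
      } else {
        content += d.reasoning_content;
      }
    }
    if (d.content) {
      if (thinkingTagOpen) {
        content += '</think>';
        thinkingTagOpen = false;
      }
      content += d.content;
    }
  }
  // If the stream ends mid-thought the tag stays open, same as the webui,
  // which relies on the renderer tolerating an unclosed <think> block.
  return content;
}

// accumulate([{ reasoning_content: 'checking...' }, { content: '42' }])
//   -> '<think>checking...</think>42'
```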
