
Commit 09db3a4

Update webui to handle reasoning content and include usage stats in server only when requested (#791)

* handle reasoning content in webui
* server : include usage statistics only when the user requests them (#16052)
* server : only attempt to enable thinking if using jinja (#15967)
* config reasoning_content in webui and change the default reasoning format to auto

Co-authored-by: firecoperana <firecoperana>

1 parent 45afaf3 commit 09db3a4

File tree

7 files changed (+87, -58 lines)


common/common.h

Lines changed: 1 addition & 1 deletion
@@ -253,7 +253,7 @@ struct gpt_params {
     bool use_jinja = false; // NOLINT
     std::string system_prompt = "";
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
     int reasoning_budget = -1;
     bool prefill_assistant = true;

260 Bytes (binary file not shown)

examples/server/server.cpp

Lines changed: 22 additions & 17 deletions
@@ -173,6 +173,7 @@ struct server_task_result {
     std::vector<llama_token> tokens;

     bool stream;
+    bool include_usage;
     std::string prompt;
     //slot_params generation_params;

@@ -500,22 +501,22 @@ struct server_task_result {
                 {"model", oaicompat_model},
                 {"object", "chat.completion.chunk"},
             });
-
-            // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
-            // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
-            deltas.push_back({
-                {"choices", json::array()},
-                {"created", t},
-                {"id", oaicompat_cmpl_id},
-                {"model", oaicompat_model},
-                {"object", "chat.completion.chunk"},
-                {"usage", json {
-                    {"completion_tokens", n_decoded},
-                    {"prompt_tokens", n_prompt_tokens},
-                    {"total_tokens", n_decoded + n_prompt_tokens},
-                }},
-            });
-
+            if (include_usage) {
+                // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
+                // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
+                deltas.push_back({
+                    {"choices", json::array()},
+                    {"created", t},
+                    {"id", oaicompat_cmpl_id},
+                    {"model", oaicompat_model},
+                    {"object", "chat.completion.chunk"},
+                    {"usage", json {
+                        {"completion_tokens", n_decoded},
+                        {"prompt_tokens", n_prompt_tokens},
+                        {"total_tokens", n_decoded + n_prompt_tokens},
+                    }},
+                });
+            }
             if (timings.prompt_n >= 0) {
                 deltas.back().push_back({ "timings", timings.to_json() });
             }

@@ -547,6 +548,7 @@ struct server_task_multi {

 struct slot_params {
     bool stream = true;
+    bool include_usage = false;
     bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt

     int32_t n_keep = 0; // number of tokens to keep from initial prompt

@@ -1359,7 +1361,7 @@ struct server_context {
         // thinking is enabled if:
         // 1. It's not explicitly disabled (reasoning_budget == 0)
         // 2. The chat template supports it
-        const bool enable_thinking = params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
+        const bool enable_thinking = params.use_jinja && params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
         //LLAMA_LOG_INFO("Enable thinking? %d\n", enable_thinking);

         oai_parser_opt = {

@@ -1514,6 +1516,8 @@ struct server_context {
         }
         slot.params.timings_per_token = json_value(data, "timings_per_token", false);
         slot.params.stream = json_value(data, "stream", false);
+        auto stream_opt = json_value(data, "stream_options", json::object());
+        slot.params.include_usage = json_value(stream_opt, "include_usage", false);
         slot.params.cache_prompt = json_value(data, "cache_prompt", true);
         slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
         slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);

@@ -2206,6 +2210,7 @@ struct server_context {
         res.error = false;
         res.stop = true; // to do: set value
         res.stream = slot.params.stream;
+        res.include_usage = slot.params.include_usage;
         res.content = slot.generated_text;
         res.oaicompat = slot.params.oaicompat;
         res.oaicompat_model = slot.params.oaicompat_model;
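Net effect of the server changes above: the trailing `usage` chunk is emitted only when the request opts in via `stream_options.include_usage`, matching the OpenAI streaming API. Below is a minimal client-side sketch in TypeScript, assuming an OpenAI-compatible server at http://localhost:8080; the URL and the naive SSE line parsing are illustrative only.

```ts
// Sketch: stream a chat completion and opt in to the trailing usage chunk.
// Assumes an OpenAI-compatible server on localhost:8080 (illustrative).
async function streamWithUsage(prompt: string): Promise<void> {
  const res = await fetch('http://localhost:8080/v1/chat/completions', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      messages: [{ role: 'user', content: prompt }],
      stream: true,
      // Without this, the server now omits the usage chunk entirely.
      stream_options: { include_usage: true },
    }),
  });

  const reader = res.body!.getReader();
  const decoder = new TextDecoder();
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    // Naive SSE parsing: assumes each read() yields whole "data: ..." lines.
    for (const line of decoder.decode(value, { stream: true }).split('\n')) {
      if (!line.startsWith('data: ') || line.includes('[DONE]')) continue;
      const chunk = JSON.parse(line.slice('data: '.length));
      // The usage chunk arrives last, with an empty `choices` array.
      if (chunk.usage) {
        console.log('prompt_tokens:', chunk.usage.prompt_tokens,
                    'completion_tokens:', chunk.usage.completion_tokens,
                    'total_tokens:', chunk.usage.total_tokens);
      }
    }
  }
}
```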

examples/server/webui/dist/index.html

Lines changed: 37 additions & 37 deletions
Large diffs are not rendered by default.

examples/server/webui/src/Config.ts

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,7 @@ export const CONFIG_DEFAULT = {
   showTokensPerSecond: false,
   showThoughtInProgress: false,
   excludeThoughtOnReq: true,
+  reasoning_format: 'auto',
   // make sure these default values are in sync with `common.h`
   samplers: 'dkypmxnt',
   temperature: 0.8,

@@ -42,6 +43,7 @@ export const CONFIG_DEFAULT = {
   pyIntepreterEnabled: false,
 };
 export const CONFIG_INFO: Record<string, string> = {
+  reasoning_format: 'Specify how to parse reasoning content. none: reasoning content in content block. auto: reasoning content in reasoning_content. ',
   apiKey: 'Set the API Key if you are using --api-key option for the server.',
   systemMessage: 'The starting message that defines how model should behave.',
   samplers:
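For context on this setting: `reasoning_format` controls where the model's thinking ends up in the OpenAI-compatible response. A hedged sketch of the two streamed delta shapes follows; only the field names come from the API, the example strings are invented.

```ts
// Shape of a streamed delta as the webui sees it; example values are made up.
type StreamDelta = {
  content?: string;            // regular assistant text
  reasoning_content?: string;  // only present when the server parses reasoning
};

// reasoning_format: 'none' -> thinking stays inline in `content`:
//   { content: '<think>Let me check...</think>The answer is 4.' }
// reasoning_format: 'auto' -> thinking is split into `reasoning_content`:
//   { reasoning_content: 'Let me check...' }
//   { content: 'The answer is 4.' }
```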

examples/server/webui/src/components/SettingDialog.tsx

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ import toast from 'react-hot-toast'
 type SettKey = keyof typeof CONFIG_DEFAULT;

 const BASIC_KEYS: SettKey[] = [
+  'reasoning_format',
   'temperature',
   'top_k',
   'top_p',

examples/server/webui/src/utils/app.context.tsx

Lines changed: 24 additions & 3 deletions
@@ -215,7 +215,7 @@ export const AppContextProvider = ({
        messages,
        stream: true,
        cache_prompt: true,
-       reasoning_format: 'none',
+       reasoning_format: config.reasoning_format===''?'auto':config.reasoning_format,
        samplers: config.samplers,
        temperature: config.temperature,
        dynatemp_range: config.dynatemp_range,

@@ -226,7 +226,7 @@ export const AppContextProvider = ({
        typical_p: config.typical_p,
        xtc_probability: config.xtc_probability,
        xtc_threshold: config.xtc_threshold,
-       top_n_sigma: config.top_n_sigma,
+       top_n_sigma: config.top_n_sigma,
        repeat_last_n: config.repeat_last_n,
        repeat_penalty: config.repeat_penalty,
        presence_penalty: config.presence_penalty,

@@ -257,14 +257,35 @@ export const AppContextProvider = ({
        throw new Error(body?.error?.message || 'Unknown error');
      }
      const chunks = getSSEStreamAsync(fetchResponse);
+     let thinkingTagOpen = false;
      for await (const chunk of chunks) {
        // const stop = chunk.stop;
        if (chunk.error) {
          throw new Error(chunk.error?.message || 'Unknown error');
        }
+
+       const reasoningContent = chunk.choices?.[0]?.delta?.reasoning_content;
+       if (reasoningContent) {
+         if (pendingMsg.content === null || pendingMsg.content === '') {
+           thinkingTagOpen = true;
+           pendingMsg = {
+             ...pendingMsg,
+             content: '<think>' + reasoningContent,
+           };
+         } else {
+           pendingMsg = {
+             ...pendingMsg,
+             content: pendingMsg.content + reasoningContent,
+           };
+         }
+       }
        const addedContent = chunk.choices?.[0]?.delta?.content;
-       const lastContent = pendingMsg.content || '';
+       let lastContent = pendingMsg.content || '';
        if (addedContent) {
+         if (thinkingTagOpen) {
+           lastContent = lastContent + '</think>';
+           thinkingTagOpen = false;
+         }
          pendingMsg = {
            ...pendingMsg,
            content: lastContent + addedContent,
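In short, the hunk above re-wraps streamed `reasoning_content` deltas in `<think>...</think>` so the existing thought renderer keeps working. Here is a self-contained sketch of that accumulation logic; the `Delta` type and `accumulate` helper are illustrative and not part of the webui code.

```ts
// Illustrative helper mirroring the webui logic: open a <think> tag on the
// first reasoning delta, close it once normal content starts arriving.
type Delta = { content?: string; reasoning_content?: string };

function accumulate(deltas: Delta[]): string {
  let content = '';
  let thinkingTagOpen = false;
  for (const d of deltas) {
    if (d.reasoning_content) {
      if (content === '') {
        thinkingTagOpen = true;
        content = '<think>' + d.reasoning_content;
      } else {
        content += d.reasoning_content;
      }
    }
    if (d.content) {
      if (thinkingTagOpen) {
        content += '</think>';
        thinkingTagOpen = false;
      }
      content += d.content;
    }
  }
  // If the stream ends mid-thought the tag stays open, same as the webui,
  // which relies on the renderer tolerating an unclosed <think> block.
  return content;
}

// accumulate([{ reasoning_content: 'checking...' }, { content: '42' }])
//   -> '<think>checking...</think>42'
```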
