
Commit 0753757

Merge branch 'develop' into fix9

2 parents fd0af67 + fe5d09f; commit 0753757

22 files changed, +785 -39 lines

custom_ops/gpu_ops/cpp_extensions.cc

Lines changed: 2 additions & 0 deletions
@@ -986,6 +986,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
         py::arg("recv_expert_count"), py::arg("block_size"),
         "per token per block quant");
 
+#ifdef ENABLE_MACHETE
   /*machete/machete_mm.cu
    * machete_mm
    */
@@ -1004,6 +1005,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
    * machete_supported_schedules
    */
   m.def("machete_supported_schedules", &MacheteSupportedSchedules, "machete supported schedules function");
+#endif
 
   /**
    * moe/fused_moe/moe_topk_select.cu
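
Since the machete bindings are now registered only when the extension is built with ENABLE_MACHETE, downstream Python code cannot assume they exist. A minimal probe sketch, assuming the compiled module is importable under the pybind11 module name fastdeploy_ops and that an absent binding simply does not show up as an attribute:

```python
# Hedged sketch: detect whether the optional machete bindings were compiled in.
# The module name comes from PYBIND11_MODULE(fastdeploy_ops, m); how the built
# module is actually exposed on sys.path in a FastDeploy install is an assumption.
import importlib

ops = importlib.import_module("fastdeploy_ops")
has_machete = hasattr(ops, "machete_supported_schedules")

if has_machete:
    # The call signature is not shown in this hunk; calling with no arguments is an assumption.
    schedules = ops.machete_supported_schedules()
```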

custom_ops/setup_ops.py

Lines changed: 3 additions & 1 deletion
@@ -373,6 +373,7 @@ def find_end_files(directory, end_str):
     if not os.listdir(json_dir):
         raise ValueError("Git clone nlohmann_json failed!")
 
+    cc_compile_args = []
     nvcc_compile_args = get_gencode_flags(archs)
     nvcc_compile_args += ["-DPADDLE_DEV"]
     nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"]
@@ -519,12 +520,13 @@ def find_end_files(directory, end_str):
         sources += find_end_files("gpu_ops/wfp8afp8_sparse_gemm", ".cu")
         os.system("python gpu_ops/machete/generate.py")
         sources += find_end_files("gpu_ops/machete", ".cu")
+        cc_compile_args += ["-DENABLE_MACHETE"]
 
     setup(
         name="fastdeploy_ops",
         ext_modules=CUDAExtension(
             sources=sources,
-            extra_compile_args={"nvcc": nvcc_compile_args},
+            extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args},
             libraries=["cublasLt"],
             extra_link_args=["-lcuda"],
         ),
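
The new cc_compile_args list travels under the "cxx" key, so -DENABLE_MACHETE is seen by the host C++ compiler that builds cpp_extensions.cc, while the "nvcc" list keeps applying to the .cu sources. A minimal sketch of that split in isolation (file names and the extra flags are placeholders, not the project's full argument set):

```python
# Sketch of splitting compile flags between the C++ and CUDA compilers with
# paddle's CUDAExtension; sources and flags here are illustrative placeholders.
from paddle.utils.cpp_extension import CUDAExtension, setup

setup(
    name="example_ops",
    ext_modules=CUDAExtension(
        sources=["cpp_extensions.cc", "some_kernel.cu"],
        extra_compile_args={
            "cxx": ["-DENABLE_MACHETE"],  # applied when compiling .cc sources
            "nvcc": ["-DPADDLE_DEV"],     # applied when nvcc compiles .cu sources
        },
    ),
)
```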
New file: 74 additions & 0 deletions
@@ -0,0 +1,74 @@

"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import asyncio
import json
import time

from fastdeploy.input.tokenzier_client import (
    AsyncTokenizerClient,
    ImageDecodeRequest,
    ImageEncodeRequest,
    VideoEncodeRequest,
)


async def main():
    """
    Exercise the AsyncTokenizerClient class.
    """
    base_url = "http://example.com/"

    client = AsyncTokenizerClient(base_url=base_url)

    # Test the image encode request
    image_encode_request = ImageEncodeRequest(
        version="v1", req_id="req_image_001", is_gen=False, resolution=512, image_url="http://example.com/image.jpg"
    )

    image_encode_ret = await client.encode_image(image_encode_request)
    print(f"Image encode result: {image_encode_ret}")

    # Test the video encode request
    video_encode_req = VideoEncodeRequest(
        version="v1",
        req_id="req_video_001",
        video_url="http://example.com/video.mp4",
        is_gen=False,
        resolution=1024,
        start_ts=0,
        end_ts=5,
        frames=1,
    )
    video_encode_result = await client.encode_video(video_encode_req)
    print(f"Video encode result: {video_encode_result}")

    # Test the image decode request
    with open("./image_decode_demo.json", "r", encoding="utf-8") as file:
        start_time = time.time()
        start_process_time = time.process_time()  # record the start time
        json_data = json.load(file)
        image_decoding_request = ImageDecodeRequest(req_id="req_image_001", data=json_data.get("data"))
        image_decode_result = await client.decode_image(image_decoding_request)
        print(f"Image decode result: {image_decode_result}")
        elapsed_time = time.time() - start_time
        elapsed_process_time = time.process_time() - start_process_time
        print(f"decode elapsed_time: {elapsed_time:.6f}s, elapsed_process_time: {elapsed_process_time:.6f}s")


if __name__ == "__main__":
    asyncio.run(main())
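
The demo reads ./image_decode_demo.json and forwards its "data" field to ImageDecodeRequest. That file is not part of this commit; a hypothetical placeholder with made-up token ids could be written like this:

```python
# Create a placeholder ./image_decode_demo.json for the demo above.
# The token ids below are arbitrary stand-ins; a real file would hold the image
# token ids produced by the remote tokenizer service.
import json

with open("./image_decode_demo.json", "w", encoding="utf-8") as f:
    json.dump({"data": [101, 102, 103]}, f)
```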

fastdeploy/engine/args_utils.py

Lines changed: 10 additions & 0 deletions
@@ -71,6 +71,10 @@ class EngineArgs:
     """
     The name or path of the tokenizer (defaults to model path if not provided).
     """
+    tokenizer_base_url: str = None
+    """
+    The base URL of the remote tokenizer service (used instead of local tokenizer if provided).
+    """
     max_model_len: int = 2048
     """
     Maximum context length supported by the model.
@@ -426,6 +430,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.tokenizer,
             help="Tokenizer name or path (defaults to model path if not specified).",
         )
+        model_group.add_argument(
+            "--tokenizer-base-url",
+            type=nullable_str,
+            default=EngineArgs.tokenizer_base_url,
+            help="The base URL of the remote tokenizer service (used instead of local tokenizer if provided).",
+        )
         model_group.add_argument(
             "--max-model-len",
             type=int,
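
The same setting is also reachable programmatically through the new EngineArgs field. A hedged sketch (the model path and URL are placeholders, and constructing EngineArgs directly like this is an assumption about typical usage):

```python
# Sketch: pointing the engine at a remote tokenizer service.
# Field names come from the diff above; values are placeholders.
from fastdeploy.engine.args_utils import EngineArgs

args = EngineArgs(
    model="/path/to/model",
    tokenizer_base_url="http://tokenizer-service:8080",  # remote tokenizer endpoint
)
```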

fastdeploy/engine/engine.py

Lines changed: 1 addition & 1 deletion
@@ -377,7 +377,7 @@ def _setting_environ_variables(self):
             "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
             "FLAGS_use_append_attn": 1,
             "NCCL_ALGO": "Ring",
-            "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 32768)),
+            "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
             "FLAGS_hardamard_moe_block_size": int(os.getenv("FLAGS_hardamard_moe_block_size", 128)),
             "FLAGS_hardamard_use_diagonal_block_matrix": int(
                 os.getenv("FLAGS_hardamard_use_diagonal_block_matrix", 0)
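
The default for FLAGS_max_partition_size drops from 32768 to 1024, but the value is still read through os.getenv, so the previous behaviour can be restored from the environment before the engine process starts. For example:

```python
# Restore the old partition size by exporting the flag before the engine
# spawns its workers; _setting_environ_variables reads it via os.getenv.
import os

os.environ["FLAGS_max_partition_size"] = "32768"
```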

fastdeploy/entrypoints/chat_utils.py

Lines changed: 7 additions & 0 deletions
@@ -14,6 +14,7 @@
 # limitations under the License.
 """
 
+import os
 import uuid
 from copy import deepcopy
 from pathlib import Path
@@ -162,9 +163,15 @@ def parse_chat_messages(messages):
 
 
 def load_chat_template(
     chat_template: Union[Path, str],
+    model_path: Path = None,
     is_literal: bool = False,
 ) -> Optional[str]:
     if chat_template is None:
+        if model_path:
+            chat_template_file = os.path.join(model_path, "chat_template.jinja")
+            if os.path.exists(chat_template_file):
+                with open(chat_template_file) as f:
+                    return f.read()
         return None
     if is_literal:
         if isinstance(chat_template, Path):
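
With the extra model_path argument, load_chat_template now falls back to a chat_template.jinja file inside the model directory when no template is passed. A short sketch of the resolution order (paths are placeholders):

```python
from fastdeploy.entrypoints.chat_utils import load_chat_template

# Explicit template argument: the model directory is not consulted.
tpl = load_chat_template("/path/to/custom_template.jinja", "/path/to/model")

# No template: <model>/chat_template.jinja is read if it exists.
tpl = load_chat_template(None, "/path/to/model")

# No template and no chat_template.jinja in the model directory: returns None.
tpl = load_chat_template(None, "/path/to/model_without_template")
```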

fastdeploy/entrypoints/llm.py

Lines changed: 1 addition & 1 deletion
@@ -102,7 +102,7 @@ def __init__(
         self.master_node_ip = self.llm_engine.cfg.master_ip
         self._receive_output_thread = threading.Thread(target=self._receive_output, daemon=True)
         self._receive_output_thread.start()
-        self.chat_template = load_chat_template(chat_template)
+        self.chat_template = load_chat_template(chat_template, model)
 
     def _check_master(self):
         """

fastdeploy/entrypoints/openai/api_server.py

Lines changed: 12 additions & 2 deletions
@@ -77,10 +77,13 @@
     help="max waiting time for connection, if set value -1 means no waiting time limit",
 )
 parser.add_argument("--max-concurrency", default=512, type=int, help="max concurrency")
+parser.add_argument(
+    "--enable-mm-output", action="store_true", help="Enable 'multimodal_content' field in response output. "
+)
 parser = EngineArgs.add_cli_args(parser)
 args = parser.parse_args()
 args.model = retrive_model_from_server(args.model, args.revision)
-chat_template = load_chat_template(args.chat_template)
+chat_template = load_chat_template(args.chat_template, args.model)
 if args.tool_parser_plugin:
     ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 llm_engine = None
@@ -176,7 +179,14 @@ async def lifespan(app: FastAPI):
     )
     app.state.model_handler = model_handler
     chat_handler = OpenAIServingChat(
-        engine_client, app.state.model_handler, pid, args.ips, args.max_waiting_time, chat_template
+        engine_client,
+        app.state.model_handler,
+        pid,
+        args.ips,
+        args.max_waiting_time,
+        chat_template,
+        args.enable_mm_output,
+        args.tokenizer_base_url,
     )
     completion_handler = OpenAIServingCompletion(
         engine_client,

fastdeploy/entrypoints/openai/protocol.py

Lines changed: 4 additions & 2 deletions
@@ -163,8 +163,9 @@ class ChatMessage(BaseModel):
     Chat message.
     """
 
-    role: str
-    content: str
+    role: Optional[str] = None
+    content: Optional[str] = None
+    multimodal_content: Optional[List[Any]] = None
     reasoning_content: Optional[str] = None
     tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
     prompt_token_ids: Optional[List[int]] = None
@@ -226,6 +227,7 @@ class DeltaMessage(BaseModel):
 
     role: Optional[str] = None
     content: Optional[str] = None
+    multimodal_content: Optional[List[Any]] = None
     prompt_token_ids: Optional[List[int]] = None
     completion_token_ids: Optional[List[int]] = None
     reasoning_content: Optional[str] = None
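
Both ChatMessage and DeltaMessage gain an optional multimodal_content list. Mirroring the {"type": "text"} / {"type": "image"} parts assembled by the new response processor below, a populated message might look like this (values are illustrative only):

```python
# Illustrative only: a ChatMessage carrying mixed text/image output.
from fastdeploy.entrypoints.openai.protocol import ChatMessage

msg = ChatMessage(
    role="assistant",
    multimodal_content=[
        {"type": "text", "text": "Here is the generated image:"},
        {"type": "image", "url": "http://example.com/generated.png"},  # placeholder URL
    ],
)
```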
New file: 145 additions & 0 deletions
@@ -0,0 +1,145 @@

"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from typing import Any, List, Optional

from fastdeploy.input.tokenzier_client import AsyncTokenizerClient, ImageDecodeRequest


class ChatResponseProcessor:
    """
    A decoder class to build multimodal content (text/image) from token_ids.

    Attributes:
        eoi_token_id: Token ID indicating the end of an image (<eoi>).
    """

    def __init__(
        self,
        data_processor,
        enable_mm_output: Optional[bool] = False,
        eoi_token_id: Optional[int] = 101032,
        eos_token_id: Optional[int] = 2,
        decoder_base_url: Optional[str] = None,
    ):
        self.data_processor = data_processor
        self.enable_mm_output = enable_mm_output
        self.eoi_token_id = eoi_token_id
        self.eos_token_id = eos_token_id
        self.decoder_client = None
        if decoder_base_url is not None:
            self.decoder_client = AsyncTokenizerClient(base_url=decoder_base_url)
        self._mm_buffer: List[Any] = []  # Buffer for accumulating image token_ids
        self._end_image_code_request_output: Optional[Any] = None
        self._multipart_buffer = []

    def enable_multimodal_content(self):
        return self.enable_mm_output

    def accumulate_token_ids(self, request_output):
        decode_type = request_output["outputs"].get("decode_type", 0)

        if not self._multipart_buffer:
            self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output})
        else:
            last_part = self._multipart_buffer[-1]

            if last_part["decode_type"] == decode_type:
                last_token_ids = last_part["request_output"]["outputs"]["token_ids"]
                last_token_ids.extend(request_output["outputs"]["token_ids"])
                request_output["outputs"]["token_ids"] = last_token_ids
                last_part["request_output"] = request_output
            else:
                self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output})

    async def process_response_chat(self, request_outputs, stream, enable_thinking, include_stop_str_in_output):
        """
        Process a list of responses into a generator that yields each processed response as it's generated.

        Args:
            request_outputs: The list of outputs to be processed.
            stream: Whether or not to stream the output.
            enable_thinking: Whether or not to show thinking messages.
            include_stop_str_in_output: Whether or not to include stop strings in the output.
        """
        for request_output in request_outputs:
            if not self.enable_mm_output:
                yield self.data_processor.process_response_dict(
                    response_dict=request_output,
                    stream=stream,
                    enable_thinking=enable_thinking,
                    include_stop_str_in_output=include_stop_str_in_output,
                )
            elif stream:
                decode_type = request_output["outputs"].get("decode_type", 0)
                token_ids = request_output["outputs"]["token_ids"]
                if decode_type == 0:
                    if self.eoi_token_id and self.eoi_token_id in token_ids:
                        if self._mm_buffer:
                            all_tokens = self._mm_buffer
                            self._mm_buffer = []
                            image = {"type": "image"}
                            if self.decoder_client:
                                req_id = request_output["request_id"]
                                image_ret = await self.decoder_client.decode_image(
                                    request=ImageDecodeRequest(req_id=req_id, data=all_tokens)
                                )
                                image["url"] = image_ret["http_url"]
                            image_output = self._end_image_code_request_output
                            image_output["outputs"]["multipart"] = [image]
                            image_output["outputs"]["token_ids"] = all_tokens
                            yield image_output

                    self.data_processor.process_response_dict(
                        response_dict=request_output,
                        stream=stream,
                        enable_thinking=enable_thinking,
                        include_stop_str_in_output=include_stop_str_in_output,
                    )
                    text = {"type": "text", "text": request_output["outputs"]["text"]}
                    request_output["outputs"]["multipart"] = [text]
                    yield request_output

                elif decode_type == 1:
                    self._mm_buffer.extend(token_ids)
                    self._end_image_code_request_output = request_output
            else:
                self.accumulate_token_ids(request_output)
                token_ids = request_output["outputs"]["token_ids"]
                if token_ids[-1] == self.eos_token_id:
                    multipart = []
                    for part in self._multipart_buffer:
                        if part["decode_type"] == 0:
                            self.data_processor.process_response_dict(
                                response_dict=part["request_output"],
                                stream=False,
                                enable_thinking=enable_thinking,
                                include_stop_str_in_output=include_stop_str_in_output,
                            )
                            text = {"type": "text", "text": part["request_output"]["outputs"]["text"]}
                            multipart.append(text)
                        elif part["decode_type"] == 1:
                            image = {"type": "image"}
                            if self.decoder_client:
                                req_id = part["request_output"]["request_id"]
                                all_tokens = part["request_output"]["outputs"]["token_ids"]
                                image_ret = await self.decoder_client.decode_image(
                                    request=ImageDecodeRequest(req_id=req_id, data=all_tokens)
                                )
                                image["url"] = image_ret["http_url"]
                            multipart.append(image)

                    last_request_output = self._multipart_buffer[-1]["request_output"]
                    last_request_output["outputs"]["multipart"] = multipart
                    yield last_request_output
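
A hedged usage sketch for the non-streaming path: request_output dicts carrying the keys the processor reads ("request_id", "outputs", "token_ids", "decode_type") are fed in, and the multipart result is printed. The _EchoDataProcessor stub and the token ids are placeholders, and the import path is an assumption since the new file's location is not shown in this commit view.

```python
# Usage sketch only; data_processor stub, token ids, and module path are assumed.
import asyncio

from fastdeploy.entrypoints.openai.response_processors import ChatResponseProcessor  # path assumed


class _EchoDataProcessor:
    """Minimal stand-in that turns token ids into a short text string."""

    def process_response_dict(self, response_dict, **kwargs):
        response_dict["outputs"]["text"] = f"<{len(response_dict['outputs']['token_ids'])} tokens>"
        return response_dict


async def demo():
    processor = ChatResponseProcessor(data_processor=_EchoDataProcessor(), enable_mm_output=True, eos_token_id=2)
    outputs = [
        {"request_id": "req-1", "outputs": {"decode_type": 0, "token_ids": [11, 12]}},
        {"request_id": "req-1", "outputs": {"decode_type": 1, "token_ids": [900, 901]}},  # image token ids
        {"request_id": "req-1", "outputs": {"decode_type": 0, "token_ids": [13, 2]}},     # ends with eos
    ]
    async for out in processor.process_response_chat(
        outputs, stream=False, enable_thinking=False, include_stop_str_in_output=False
    ):
        print(out["outputs"]["multipart"])


asyncio.run(demo())
```

Without a decoder_base_url the image part stays a bare {"type": "image"}; with one configured, the processor asks the remote tokenizer service to decode the buffered image tokens and attaches the returned URL.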
