FlorianJoncour · AguirreNicolas · Jan 18, 2024 · Jan 18, 2024 · Jan 22, 2024 · Jan 22, 2024
diff --git a/examples/openai_tools_calls.py b/examples/openai_tools_calls.py
@@ -0,0 +1,155 @@
+"""
+Inspired by the OpenAI example found here:
+    https://platform.openai.com/docs/guides/function-calling/parallel-function-calling
+"""
+
+import datetime
+from openai import OpenAI
+import json
+
+client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")  # targets a server on localhost:8000
+models = client.models.list()
+model = models.data[0].id  # use the first model the server lists
+stream = True  # read by run_conversation() to choose streamed vs. plain handling of the first response
+
+
+def get_current_date_utc():
+    """Announce the client-side call and return a sentence giving the current UTC day, date and time."""
+    print("Calling get_current_date_utc client side.")
+    utc_now = datetime.datetime.now(datetime.timezone.utc)
+    return utc_now.strftime("The current UTC datetime is (day: %A, date (day/month/year): %d/%m/%Y, time: %H:%M).")
+
+
+# Example dummy function hard coded to return the same weather
+# In production, this could be your backend API or an external API
+def get_current_weather(location, unit="fahrenheit"):
+    """Return a JSON string with canned weather data for a few known cities.
+
+    This is a stand-in for a real backend: the temperatures are hard coded,
+    matching is a case-insensitive substring test on `location`, and `unit`
+    is echoed back unconverted. Any other location yields "unknown".
+    """
+    print("Calling get_current_weather client side.")
+    # Checked in order; the first city found inside `location` wins.
+    canned_temperatures = (
+        ("tokyo", "Tokyo", "10"),
+        ("san francisco", "San Francisco", "72"),
+        ("paris", "Paris", "22"),
+    )
+    lowered = location.lower()
+    for needle, city, temperature in canned_temperatures:
+        if needle in lowered:
+            return json.dumps({
+                "location": city,
+                "temperature": temperature,
+                "unit": unit
+            })
+    return json.dumps({"location": location, "temperature": "unknown"})
+
+
+def run_conversation():
+    """Run one tool-calling round trip against the server behind `client`.
+
+    Sends a question plus two tool definitions, runs each requested tool call
+    with the matching local function, then queries the model again. Returns
+    that second completion, or None if the model made no tool call.
+    """
+    # Step 1: send the conversation and available functions to the model
+    messages = [{
+        "role": "user",
+        "content": "What's the weather like in San Francisco, Tokyo, and Paris ? We also need to know the current date."
+    }]
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"]
+                    },
+                },
+                "required": ["location"],
+            },
+        },
+    }, {
+        "type": "function",
+        "function": {
+            "name": "get_current_date_utc",
+            "description": "Get the current UTC time",
+        },
+    }]
+    response = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        tools=tools,
+        stream=stream,
+        tool_choice="auto",  # auto is default, but we'll be explicit
+    )
+    tool_calls = None
+    if stream:
+        text_message = ""
+        for chunk in response:
+            if chunk.choices[0].finish_reason is not None:
+                # NOTE(review): assumes the full tool calls arrive in this chunk.
+                if chunk.choices[0].finish_reason == "tool_calls":
+                    tool_calls = chunk.choices[0].delta.tool_calls
+                break
+            if chunk.choices[0].delta.content is not None:
+                text_message += chunk.choices[0].delta.content
+        response_message = {"role": "assistant", "content": text_message}
+    else:
+        if not response.choices:
+            return None
+        response_message = response.choices[0].message
+        tool_calls = response_message.tool_calls
+
+    # Step 2: check if the model wanted to call a function
+    if tool_calls:
+        # Step 3: call the function
+        # Note: the JSON response may not always be valid; be sure to handle errors
+        available_functions = {
+            "get_current_weather": get_current_weather,
+            "get_current_date_utc": get_current_date_utc,
+        }
+        messages.append(response_message)  # add the assistant's reply
+        # Step 4: send the info for each function call and function response to the model
+        for tool_call in tool_calls:
+            function_name = tool_call.function.name
+            function_to_call = available_functions[function_name]
+            if function_name == "get_current_weather":
+                function_args = json.loads(tool_call.function.arguments)
+                function_response = function_to_call(
+                    location=function_args.get("location"),
+                    # "unit" is optional: never forward None over the default.
+                    unit=function_args.get("unit") or "fahrenheit",
+                )
+            else:
+                function_response = function_to_call()
+
+            messages.append({
+                "tool_call_id": tool_call.id,
+                "role": "tool",
+                "name": function_name,
+                "content": function_response,
+            })  # extend conversation with function response
+        second_response = client.chat.completions.create(
+            model=model,
+            messages=messages,
+        )  # get a new response from the model where it can see the function response
+
+        for it_msg, msg in enumerate(messages):
+            print("Message %i:\n    %s\n" % (it_msg, str(msg)))
+
+        return second_response
+
+
+if __name__ == "__main__":  # run the demo only when executed as a script
+    print("Final response:\n%s" % run_conversation())
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
@@ -1,6 +1,7 @@
 import argparse
 import asyncio
 import json
+import sys
 from contextlib import asynccontextmanager
 from aioprometheus import MetricsMiddleware
 from aioprometheus.asgi.starlette import metrics
@@ -19,9 +20,12 @@
 from vllm.logger import init_logger
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
+from vllm.entrypoints.openai.tools import OpenAIToolsPrompter
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
 
+vllm_engine = None
+vllm_engine_args = None
 openai_serving_chat: OpenAIServingChat = None
 openai_serving_completion: OpenAIServingCompletion = None
 logger = init_logger(__name__)
@@ -33,9 +37,9 @@ async def lifespan(app: fastapi.FastAPI):
     async def _force_log():
         while True:
             await asyncio.sleep(10)
-            await engine.do_log_stats()
+            await vllm_engine.do_log_stats()
 
-    if not engine_args.disable_log_stats:
+    if not vllm_engine_args.disable_log_stats:
         asyncio.create_task(_force_log())
 
     yield
@@ -76,6 +80,14 @@ def parse_args():
                         help="The file path to the chat template, "
                         "or the template in single-line form "
                         "for the specified model")
+    parser.add_argument("--tools-template",
+                        type=str,
+                        default=None,
+                        help="The file path to alternative tools template")
+    parser.add_argument("--enable-api-tools",
+                        action="store_true",
+                        help="Enable OpenAI-like tools API "
+                        "(only function calls are currently supported)")
     parser.add_argument("--response-role",
                         type=str,
                         default="assistant",
@@ -89,6 +101,12 @@ def parse_args():
                         type=str,
                         default=None,
                         help="The file path to the SSL cert file")
+    parser.add_argument(
+        "--dev-mode",
+        action="store_true",
+        help=
+        "Enable API internals and templates reloading but do not deallocate the engine. This should only be used for development purpose."
+    )
     parser.add_argument(
         "--root-path",
         type=str,
@@ -99,6 +117,30 @@ def parse_args():
     return parser.parse_args()
 
 
+def _loadServingServices():
+    """Load or reload the OpenAI serving objects (chat and completion).
+
+    Called once at startup; may be called again, for development only, to
+    rebuild the API internals around the already-created engine. Both services
+    are built before the globals are rebound, so a failed reload keeps the
+    previous services instead of leaving the global names unbound.
+    """
+    global openai_serving_chat
+    global openai_serving_completion
+    openai_tools_prompter = OpenAIToolsPrompter(
+        template_path=args.tools_template) if args.enable_api_tools else None
+    new_serving_chat = OpenAIServingChat(
+        engine=vllm_engine,
+        served_model=served_model,
+        response_role=args.response_role,
+        chat_template=args.chat_template,
+        openai_tools_prompter=openai_tools_prompter,
+        dev_mode=args.dev_mode)
+    new_serving_completion = OpenAIServingCompletion(vllm_engine, served_model)
+    openai_serving_chat = new_serving_chat
+    openai_serving_completion = new_serving_completion
+
+
 app.add_middleware(MetricsMiddleware)  # Trace HTTP server metrics
 app.add_route("/metrics", metrics)  # Exposes HTTP metrics
 
@@ -115,6 +157,16 @@ async def health() -> Response:
     return Response(status_code=200)
 
 
+if "--dev-mode" in sys.argv:  # raw argv test, evaluated when this module is loaded
+    # NOTE(review): GET route that triggers a reload; no access check is visible here.
+    @app.get("/privileged")
+    async def privileged() -> Response:
+        """Reload the API internals by calling _loadServingServices(). Danger !"""
+        logger.warning("privileged called.")
+        _loadServingServices()
+        return Response(status_code=200)
+
+
 @app.get("/v1/models")
 async def show_available_models():
     models = await openai_serving_chat.show_available_models()
@@ -156,21 +208,26 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     )
 
     logger.info(f"args: {args}")
+    if args.dev_mode:
+        logger.warning(
+            "\n"
+            "######################################################################\n"
+            "dev-mode enabled. This should only be used for development purpose.\n"
+            "If It's not the case, you should disable this !\n"
+            "######################################################################\n"
+        )
 
     if args.served_model_name is not None:
         served_model = args.served_model_name
     else:
         served_model = args.model
 
-    engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngine.from_engine_args(engine_args)
-    openai_serving_chat = OpenAIServingChat(engine, served_model,
-                                            args.response_role,
-                                            args.chat_template)
-    openai_serving_completion = OpenAIServingCompletion(engine, served_model)
+    vllm_engine_args = AsyncEngineArgs.from_cli_args(args)
+    vllm_engine = AsyncLLMEngine.from_engine_args(vllm_engine_args)
+    _loadServingServices()
 
     # Register labels for metrics
-    add_global_metrics_labels(model_name=engine_args.model)
+    add_global_metrics_labels(model_name=vllm_engine_args.model)
 
     app.root_path = args.root_path
     uvicorn.run(app,