43 changes: 33 additions & 10 deletions inference-platforms/agent.py
@@ -1,7 +1,7 @@
# run like this: uv run --exact -q --env-file .env agent.py
# /// script
# dependencies = [
# "openai-agents",
# "openai-agents @ git+https://github.com/openai/openai-agents-python.git@refs/pull/2034/head",
# "httpx",
# "mcp",
# "elastic-opentelemetry",
@@ -16,23 +16,25 @@
# This must precede any other imports you want to instrument!
auto_instrumentation.initialize()

import argparse
import asyncio
import os
from datetime import datetime, timedelta

from agents import (
    Agent,
    HostedMCPTool,
    OpenAIProvider,
    RunConfig,
    Runner,
    Tool,
)
from agents.mcp import MCPServerStreamableHttp, MCPUtil
from openai.types.responses.tool_param import Mcp


async def run_agent(tools: list[Tool]):
    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
    model = OpenAIProvider(use_responses=False).get_model(model_name)
async def run_agent(tools: list[Tool], model_name: str, use_responses: bool):
    model = OpenAIProvider(use_responses=use_responses).get_model(model_name)
    agent = Agent(
        name="flight-search-agent",
        model=model,
@@ -49,18 +51,39 @@ async def run_agent(tools: list[Tool]):


async def main():
    parser = argparse.ArgumentParser(description="MCP-enabled flight search agent")
    parser.add_argument("--use-responses-api", action="store_true", help="Use Responses API instead of Agents")
    args = parser.parse_args()

    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
    mcp_url = os.getenv("MCP_URL", "https://mcp.kiwi.com")
    mcp_headers = dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h)

    if args.use_responses_api:
        # Server-side MCP via Responses API
        tools = [
            HostedMCPTool(
                tool_config=Mcp(
                    type="mcp",
                    server_url=mcp_url,
                    server_label="kiwi-flights",
                    headers=mcp_headers,
                    require_approval="never",
                )
            )
        ]
        await run_agent(tools, model_name, use_responses=True)
        return

    # Client-side MCP orchestration
    async with MCPServerStreamableHttp(
        {
            "url": os.getenv("MCP_URL", "https://mcp.kiwi.com"),
            "headers": dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h),
            "timeout": 30.0,
        },
        {"url": mcp_url, "headers": mcp_headers, "timeout": 30.0},
        client_session_timeout_seconds=60.0,
    ) as server:
        tools = await server.list_tools()
        util = MCPUtil()
        tools = [util.to_function_tool(tool, server, False) for tool in tools]
        await run_agent(tools)
        await run_agent(tools, model_name, use_responses=False)


if __name__ == "__main__":
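For reference, the `MCP_HEADERS` environment variable read above is a comma-separated list of `NAME=VALUE` pairs; the comprehension splits each entry on the first `=` only. A minimal sketch of that parsing (the header names here are illustrative, not required by the agent):

```python
import os

# Illustrative value only; any comma-separated NAME=VALUE pairs work.
os.environ["MCP_HEADERS"] = "Authorization=Bearer abc123,X-Client=demo"

# Same expression as in agent.py: split on the first "=" so values may contain "=".
mcp_headers = dict(
    h.split("=", 1)
    for h in os.getenv("MCP_HEADERS", "").split(",")
    if h  # an unset or empty variable yields an empty dict
)
print(mcp_headers)  # {'Authorization': 'Bearer abc123', 'X-Client': 'demo'}
```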
27 changes: 20 additions & 7 deletions inference-platforms/llama-stack/README.md
@@ -1,10 +1,9 @@
# Llama Stack

This shows how to use [Llama Stack][docs] to proxy Ollama, accessible via an
OpenAI compatible API.
This shows how to use [Llama Stack][docs] to proxy Ollama via an OpenAI
compatible API.

This uses the [`otel` telemetry sink][otel-sink] to export OpenTelemetry traces
and metrics from signals recorded with Llama Stack's observability SDK.
**Note**: Telemetry is currently broken in v0.3.1, but not on main.

## Prerequisites

@@ -36,16 +35,30 @@ Or, for the OpenAI Responses API
uv run --exact -q --env-file env.local ../chat.py --use-responses-api
```

### MCP Agent

```bash
uv run --exact -q --env-file env.local ../agent.py --use-responses-api
```

## Notes

Here are some constraints about the LlamaStack implementation:
* Only supports llama models (so not Qwen)
* Bridges its tracing and metrics APIs to `otel_trace` and `otel_metric` sinks.
* Llama Stack's Responses API connects to MCP servers server-side (unlike aigw
which proxies MCP). The agent passes MCP configuration via `HostedMCPTool`.
* Until [this PR][openai-agents-pr] merges, the agent requires the fix branch
for handling providers that don't return token usage details.

* Uses the `starter` distribution with its built-in `remote::openai` provider,
pointing to Ollama via `OPENAI_BASE_URL` environment variable.
* Models require `provider_id/` prefix (e.g., `openai/qwen3:0.6b`) as of
[PR #3822][prefix-pr]; see the sketch after this list.
* Until [this issue][docker] resolves, running docker on Apple Silicon
requires emulation.
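
As a concrete illustration of the endpoint and prefix notes above, here is a minimal sketch (values taken from `env.local`; not part of the distribution itself):

```python
from openai import OpenAI

# Llama Stack's OpenAI-compatible endpoint, as configured in env.local.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="unused")

# Model ids carry the provider prefix, e.g. "openai/qwen3:0.6b".
resp = client.chat.completions.create(
    model="openai/qwen3:0.6b",
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(resp.choices[0].message.content)
```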

---
[docs]: https://llama-stack.readthedocs.io/en/latest/index.html
[otel-sink]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
[uv]: https://docs.astral.sh/uv/getting-started/installation/
[prefix-pr]: https://github.com/meta-llama/llama-stack/pull/3822
[docker]: https://github.com/llamastack/llama-stack/issues/406
[openai-agents-pr]: https://github.com/openai/openai-agents-python/pull/2034
10 changes: 6 additions & 4 deletions inference-platforms/llama-stack/docker-compose.yml
@@ -7,17 +7,19 @@ services:
    env_file:
      - env.local
    entrypoint: sh
    command: -c 'env | grep _MODEL | cut -d= -f2 | xargs -I{} ollama pull {}'
    # sed strips any provider prefix (e.g. "openai/") so ollama pulls the bare model name
    command: -c 'env | grep _MODEL | cut -d= -f2 | sed "s/^[^/]*\///" | xargs -I{} ollama pull {}'
    extra_hosts: # send localhost traffic to the docker host, e.g. your laptop
      - "localhost:host-gateway"

  llama-stack:
    depends_on:
      ollama-pull:
        condition: service_completed_successfully
    image: llamastack/distribution-starter:0.2.20
    # TODO: switch to 0.3.2 or 0.4.0
    image: llamastack/distribution-starter:local
    container_name: llama-stack
    platform: linux/amd64 # Force amd64 with emulation
    # TODO: put back as published images are amd64 only
    # platform: linux/amd64 # Force amd64 with emulation
    tty: true
    env_file:
      - env.local
@@ -26,7 +28,7 @@ services:
    # Ensure the container, which treats localhost specially, routes back to
    # the host machine, e.g. your laptop.
    environment:
      - OLLAMA_URL=http://host.docker.internal:11434
      - OPENAI_BASE_URL=http://host.docker.internal:11434/v1
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4318
    extra_hosts:
      - "host.docker.internal:host-gateway"
13 changes: 5 additions & 8 deletions inference-platforms/llama-stack/env.local
@@ -1,14 +1,11 @@
# Override default ENV variables for llama-stack
OPENAI_BASE_URL=http://localhost:8321/v1/openai/v1
# OpenAI-compatible endpoint configuration
OPENAI_BASE_URL=http://localhost:8321/v1
OPENAI_API_KEY=unused
CHAT_MODEL=llama3.2:1b

# Variables used by llama-stack
OLLAMA_URL=http://localhost:11434
INFERENCE_MODEL=llama3.2:1b
# Models require `provider_id/` prefix, in this case `openai`
CHAT_MODEL=openai/qwen3:0.6b
AGENT_MODEL=openai/qwen3:1.7b

# OpenTelemetry configuration
TELEMETRY_SINKS=otel_trace,otel_metric
OTEL_SERVICE_NAME=llama-stack
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf