43 changes: 33 additions & 10 deletions inference-platforms/agent.py
@@ -1,7 +1,7 @@
# run like this: uv run --exact -q --env-file .env agent.py
# /// script
# dependencies = [
# "openai-agents",
# "openai-agents @ git+https://github.com/openai/openai-agents-python.git@refs/pull/2034/head",
# "httpx",
# "mcp",
# "elastic-opentelemetry",
@@ -16,23 +16,25 @@
# This must precede any other imports you want to instrument!
auto_instrumentation.initialize()

import argparse
import asyncio
import os
from datetime import datetime, timedelta

from agents import (
    Agent,
    HostedMCPTool,
    OpenAIProvider,
    RunConfig,
    Runner,
    Tool,
)
from agents.mcp import MCPServerStreamableHttp, MCPUtil
from openai.types.responses.tool_param import Mcp


async def run_agent(tools: list[Tool]):
    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
    model = OpenAIProvider(use_responses=False).get_model(model_name)
async def run_agent(tools: list[Tool], model_name: str, use_responses: bool):
    model = OpenAIProvider(use_responses=use_responses).get_model(model_name)
    agent = Agent(
        name="flight-search-agent",
        model=model,
@@ -49,18 +51,39 @@ async def run_agent(tools: list[Tool]):


async def main():
    parser = argparse.ArgumentParser(description="MCP-enabled flight search agent")
    parser.add_argument("--use-responses-api", action="store_true", help="Use Responses API instead of Agents")
    args = parser.parse_args()

    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
    mcp_url = os.getenv("MCP_URL", "https://mcp.kiwi.com")
    mcp_headers = dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h)

    if args.use_responses_api:
        # Server-side MCP via Responses API
        tools = [
            HostedMCPTool(
                tool_config=Mcp(
                    type="mcp",
                    server_url=mcp_url,
                    server_label="kiwi-flights",
                    headers=mcp_headers,
                    require_approval="never",
                )
            )
        ]
        await run_agent(tools, model_name, use_responses=True)
        return

    # Client-side MCP orchestration
    async with MCPServerStreamableHttp(
        {
            "url": os.getenv("MCP_URL", "https://mcp.kiwi.com"),
            "headers": dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h),
            "timeout": 30.0,
        },
        {"url": mcp_url, "headers": mcp_headers, "timeout": 30.0},
        client_session_timeout_seconds=60.0,
    ) as server:
        tools = await server.list_tools()
        util = MCPUtil()
        tools = [util.to_function_tool(tool, server, False) for tool in tools]
        await run_agent(tools)
        await run_agent(tools, model_name, use_responses=False)


if __name__ == "__main__":
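For reference, the `MCP_HEADERS` environment variable read above is a comma-separated list of `NAME=VALUE` pairs; the comprehension splits each entry on the first `=` only. A minimal sketch of that parsing (the header names here are illustrative, not required by the agent):

```python
import os

# Illustrative value only; any comma-separated NAME=VALUE pairs work.
os.environ["MCP_HEADERS"] = "Authorization=Bearer abc123,X-Client=demo"

# Same expression as in agent.py: split on the first "=" so values may contain "=".
mcp_headers = dict(
    h.split("=", 1)
    for h in os.getenv("MCP_HEADERS", "").split(",")
    if h  # an unset or empty variable yields an empty dict
)
print(mcp_headers)  # {'Authorization': 'Bearer abc123', 'X-Client': 'demo'}
```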
27 changes: 20 additions & 7 deletions inference-platforms/llama-stack/README.md
@@ -1,10 +1,9 @@
# Llama Stack

This shows how to use [Llama Stack][docs] to proxy Ollama, accessible via an
OpenAI compatible API.
This shows how to use [Llama Stack][docs] to proxy Ollama via an OpenAI
compatible API.

This uses the [`otel` telemetry sink][otel-sink] to export OpenTelemetry traces
and metrics from signals recorded with Llama Stack's observability SDK.
**Note**: Telemetry is currently broken in v0.3.1, but not on main.

## Prerequisites

@@ -36,16 +35,30 @@ Or, for the OpenAI Responses API
uv run --exact -q --env-file env.local ../chat.py --use-responses-api
```

### MCP Agent

```bash
uv run --exact -q --env-file env.local ../agent.py --use-responses-api
```

## Notes

Here are some constraints about the LlamaStack implementation:
* Only supports llama models (so not Qwen)
* Bridges its tracing and metrics APIs to `otel_trace` and `otel_metric` sinks.
* Llama Stack's Responses API connects to MCP servers server-side (unlike aigw
which proxies MCP). The agent passes MCP configuration via `HostedMCPTool`.
* Until [this PR][openai-agents-pr] merges, the agent requires the fix branch
for handling providers that don't return token usage details.

* Uses the `starter` distribution with its built-in `remote::openai` provider,
pointing to Ollama via `OPENAI_BASE_URL` environment variable.
* Models require `provider_id/` prefix (e.g., `openai/qwen3:0.6b`) as of
[PR #3822][prefix-pr]; see the sketch after this list.
* Until [this issue][docker] resolves, running docker on Apple Silicon
requires emulation.
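
As a concrete illustration of the endpoint and prefix notes above, here is a minimal sketch (values taken from `env.local`; not part of the distribution itself):

```python
from openai import OpenAI

# Llama Stack's OpenAI-compatible endpoint, as configured in env.local.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="unused")

# Model ids carry the provider prefix, e.g. "openai/qwen3:0.6b".
resp = client.chat.completions.create(
    model="openai/qwen3:0.6b",
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(resp.choices[0].message.content)
```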

---
[docs]: https://llama-stack.readthedocs.io/en/latest/index.html
[otel-sink]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
[uv]: https://docs.astral.sh/uv/getting-started/installation/
[prefix-pr]: https://github.com/meta-llama/llama-stack/pull/3822
[docker]: https://github.com/llamastack/llama-stack/issues/406
[openai-agents-pr]: https://github.com/openai/openai-agents-python/pull/2034
10 changes: 6 additions & 4 deletions inference-platforms/llama-stack/docker-compose.yml
@@ -7,17 +7,19 @@ services:
    env_file:
      - env.local
    entrypoint: sh
    command: -c 'env | grep _MODEL | cut -d= -f2 | xargs -I{} ollama pull {}'
    # sed strips any provider prefix (e.g. "openai/") so ollama pulls the bare model name
    command: -c 'env | grep _MODEL | cut -d= -f2 | sed "s/^[^/]*\///" | xargs -I{} ollama pull {}'
    extra_hosts: # send localhost traffic to the docker host, e.g. your laptop
      - "localhost:host-gateway"

  llama-stack:
    depends_on:
      ollama-pull:
        condition: service_completed_successfully
    image: llamastack/distribution-starter:0.2.20
    # TODO: switch to 0.3.2 or 0.4.0
    image: llamastack/distribution-starter:local
    container_name: llama-stack
    platform: linux/amd64 # Force amd64 with emulation
    # TODO: put back as published images are amd64 only
    # platform: linux/amd64 # Force amd64 with emulation
    tty: true
    env_file:
      - env.local
@@ -26,7 +28,7 @@ services:
    # Ensure the container, which treats localhost specially, routes back to
    # the host machine, e.g. your laptop.
    environment:
      - OLLAMA_URL=http://host.docker.internal:11434
      - OPENAI_BASE_URL=http://host.docker.internal:11434/v1
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4318
    extra_hosts:
      - "host.docker.internal:host-gateway"
13 changes: 5 additions & 8 deletions inference-platforms/llama-stack/env.local
@@ -1,14 +1,11 @@
# Override default ENV variables for llama-stack
OPENAI_BASE_URL=http://localhost:8321/v1/openai/v1
# OpenAI-compatible endpoint configuration
OPENAI_BASE_URL=http://localhost:8321/v1
OPENAI_API_KEY=unused
CHAT_MODEL=llama3.2:1b

# Variables used by llama-stack
OLLAMA_URL=http://localhost:11434
INFERENCE_MODEL=llama3.2:1b
# Models require `provider_id/` prefix, in this case `openai`
CHAT_MODEL=openai/qwen3:0.6b
AGENT_MODEL=openai/qwen3:1.7b

# OpenTelemetry configuration
TELEMETRY_SINKS=otel_trace,otel_metric
OTEL_SERVICE_NAME=llama-stack
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf