diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 8cb15bbe..99857cc6 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -50,6 +50,7 @@ jobs: # Set dummy secrets for unit tests sed -i 's/HF_TOKEN=.*/HF_TOKEN=dummy_token/' .env sed -i 's/BRAVE_SEARCH_API=.*/BRAVE_SEARCH_API=dummy_api/' .env + sed -i 's/E2B_API_KEY=.*/E2B_API_KEY=dummy_token/' .env - name: pyright run: uv run pyright @@ -73,7 +74,7 @@ jobs: with: aws-access-key-id: ${{ secrets.GH_AWS_ACCESS_KEY }} aws-secret-access-key: ${{ secrets.GH_AWS_SECRET_KEY }} - aws-region: "eu-west-1" + aws-region: "us-east-1" - name: Start EC2 runner id: start-ec2-runner uses: NillionNetwork/ec2-github-runner@v2.2 @@ -82,12 +83,12 @@ jobs: github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} runners-per-machine: 3 number-of-machines: 1 - ec2-image-id: ami-0174a246556e8750b - ec2-instance-type: g4dn.xlarge - subnet-id: subnet-0ec4c353621eabae2 - security-group-id: sg-03ee5c56e1f467aa0 - key-name: production-github-runner-key - iam-role-name: github-runners-production-github-runner-ec2 + ec2-image-id: ami-0e70d84403fc045d7 + ec2-instance-type: g6.xlarge + subnet-id: subnet-0bb357f46d1bc355c + security-group-id: sg-022a5cdcf57e9618b + key-name: us-east-1-github-runner-key + iam-role-name: github-runners-us-east-1-github-runner-ec2 aws-resource-tags: > [ {"Key": "Name", "Value": "github-runner-${{ github.run_id }}-${{ github.run_number }}"}, @@ -96,7 +97,7 @@ jobs: {"Key": "Deployment", "Value": "github-runners"}, {"Key": "Type", "Value": "GithubRunner"}, {"Key": "User", "Value": "ec2-user"}, - {"Key": "Environment", "Value": "production"} + {"Key": "Environment", "Value": "us-east-1"} ] build-images: @@ -149,7 +150,7 @@ jobs: sed -i 's/NILDB_COLLECTION=.*/NILDB_COLLECTION=${{ secrets.NILDB_COLLECTION }}/' .env - name: Compose docker-compose.yml - run: python3 ./scripts/docker-composer.py --dev -f docker/compose/docker-compose.llama-1b-gpu.ci.yml -o development-compose.yml + run: python3 ./scripts/docker-composer.py --dev -f docker/compose/docker-compose.gpt-20b-gpu.ci.yml -o development-compose.yml - name: GPU stack versions (non-fatal) shell: bash @@ -327,7 +328,7 @@ jobs: with: aws-access-key-id: ${{ secrets.GH_AWS_ACCESS_KEY }} aws-secret-access-key: ${{ secrets.GH_AWS_SECRET_KEY }} - aws-region: "eu-west-1" + aws-region: "us-east-1" - name: Stop EC2 runner uses: NillionNetwork/ec2-github-runner@v2.2 diff --git a/docker/compose/docker-compose.gpt-20b-gpu.ci.yml b/docker/compose/docker-compose.gpt-20b-gpu.ci.yml new file mode 100644 index 00000000..dcfef4cb --- /dev/null +++ b/docker/compose/docker-compose.gpt-20b-gpu.ci.yml @@ -0,0 +1,45 @@ +services: + gpt_20b_gpu: + image: nillion/nilai-vllm:latest + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + + ulimits: + memlock: -1 + stack: 67108864 + env_file: + - .env + restart: unless-stopped + depends_on: + etcd: + condition: service_healthy + command: > + --model openai/gpt-oss-20b + --gpu-memory-utilization 0.95 + --max-model-len 10000 + --max-num-batched-tokens 10000 + --max-num-seqs 2 + --tensor-parallel-size 1 + --uvicorn-log-level warning + --async-scheduling + environment: + - SVC_HOST=gpt_20b_gpu + - SVC_PORT=8000 + - ETCD_HOST=etcd + - ETCD_PORT=2379 + - TOOL_SUPPORT=true + volumes: + - hugging_face_models:/root/.cache/huggingface # cache models + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + retries: 10 + start_period: 900s + timeout: 15s 
+volumes: + hugging_face_models: diff --git a/docker/nilauth/config.yaml b/docker/nilauth/config.yaml index 01819219..9c4a3135 100644 --- a/docker/nilauth/config.yaml +++ b/docker/nilauth/config.yaml @@ -12,7 +12,7 @@ payments: subscriptions: renewal_threshold_seconds: 1000 - length_seconds: 120 + length_seconds: 900 dollar_cost: nilai: 1 nildb: 1 diff --git a/nilai-api/pyproject.toml b/nilai-api/pyproject.toml index 2edc960d..d98386ec 100644 --- a/nilai-api/pyproject.toml +++ b/nilai-api/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "uvicorn>=0.32.1", "httpx>=0.27.2", "nilrag>=0.1.11", - "openai>=1.59.9", + "openai>=1.99.2", "pg8000>=1.31.2", "prometheus_fastapi_instrumentator>=7.0.2", "asyncpg>=0.30.0", diff --git a/nilai-api/src/nilai_api/config/__init__.py b/nilai-api/src/nilai_api/config/__init__.py index fc768f26..59f1d601 100644 --- a/nilai-api/src/nilai_api/config/__init__.py +++ b/nilai-api/src/nilai_api/config/__init__.py @@ -1,5 +1,7 @@ # Import all configuration models import json +import logging +from pydantic import BaseModel from .environment import EnvironmentConfig from .database import DatabaseConfig, EtcdConfig, RedisConfig from .auth import AuthConfig, DocsConfig @@ -7,8 +9,6 @@ from .web_search import WebSearchSettings from .rate_limiting import RateLimitingConfig from .utils import create_config_model, CONFIG_DATA -from pydantic import BaseModel -import logging class NilAIConfig(BaseModel): @@ -38,19 +38,25 @@ class NilAIConfig(BaseModel): def prettify(self): """Print the config in a pretty format removing passwords and other sensitive information""" - config_dict = self.model_dump() - keywords = ["pass", "token", "key"] - for key, value in config_dict.items(): - if isinstance(value, str): - for keyword in keywords: - print(key, keyword, keyword in key) - if keyword in key and value is not None: - config_dict[key] = "***************" - if isinstance(value, dict): - for k, v in value.items(): - for keyword in keywords: - if keyword in k and v is not None: - value[k] = "***************" + config_dict = self.model_dump(mode="json") + + keywords = {"pass", "token", "key"} + for key, value in list(config_dict.items()): + if ( + isinstance(value, str) + and any(k in key for k in keywords) + and value is not None + ): + config_dict[key] = "***************" + elif isinstance(value, dict): + for k, v in list(value.items()): + if ( + isinstance(v, str) + and any(kw in k for kw in keywords) + and v is not None + ): + value[k] = "***************" + return json.dumps(config_dict, indent=4) diff --git a/nilai-api/src/nilai_api/config/config.yaml b/nilai-api/src/nilai_api/config/config.yaml index 2f9d9d8a..574bbed3 100644 --- a/nilai-api/src/nilai_api/config/config.yaml +++ b/nilai-api/src/nilai_api/config/config.yaml @@ -13,6 +13,7 @@ auth: docs: token: null + # Web Search Configuration web_search: api_key: null @@ -30,8 +31,8 @@ rate_limiting: user_rate_limit_minute: 100 user_rate_limit_hour: 1000 user_rate_limit_day: 10000 - web_search_rate_limit_minute: 1 - web_search_rate_limit_hour: 3 + web_search_rate_limit_minute: 6 + web_search_rate_limit_hour: 18 web_search_rate_limit_day: 72 web_search_rate_limit: null # For-good rate limit model_concurrent_rate_limit: diff --git a/nilai-api/src/nilai_api/handlers/tools/responses_tool_router.py b/nilai-api/src/nilai_api/handlers/tools/responses_tool_router.py new file mode 100644 index 00000000..54e88eb4 --- /dev/null +++ b/nilai-api/src/nilai_api/handlers/tools/responses_tool_router.py @@ -0,0 +1,221 @@ +from __future__ import 
annotations + +import json +import asyncio +import logging +import uuid +from typing import List, Tuple, Union, cast + +from openai import AsyncOpenAI +from nilai_common import ( + ResponseRequest, + Response, + ResponseFunctionToolCall, + FunctionCallOutput, + ResponseInputItemParam, + EasyInputMessageParam, + ResponseFunctionToolCallParam, +) + +from . import code_execution + +logger = logging.getLogger(__name__) + +SUPPORTED_TOOLS = {"execute_python"} + + +async def route_and_execute_tool_call( + tool_call: ResponseFunctionToolCallParam, +) -> FunctionCallOutput: + """Route and execute a single tool call, returning the result as a FunctionCallOutput. + + Currently supports: + - execute_python: Executes Python code in a sandbox environment + + Args: + tool_call: Tool call parameter containing name, arguments, and call_id + + Returns: + FunctionCallOutput object with execution result + """ + tool_name = tool_call["name"] + arguments_json = tool_call["arguments"] or "{}" + + match tool_name: + case "execute_python": + try: + parsed_arguments = json.loads(arguments_json) + code = parsed_arguments.get("code", "") + if not str(code).strip(): + output_json_string = json.dumps( + {"error": "No code provided by the model."} + ) + else: + result = await code_execution.execute_python(code) + output_json_string = json.dumps({"result": str(result).strip()}) + except json.JSONDecodeError: + logger.error("[responses_tool] invalid JSON in tool call arguments") + output_json_string = json.dumps( + {"error": "Invalid JSON in tool call arguments."} + ) + except Exception as e: + logger.error(f"[responses_tool] error executing tool: {e}") + output_json_string = json.dumps({"error": f"Error executing tool: {e}"}) + case _: + output_json_string = json.dumps({"result": ""}) + + return FunctionCallOutput( + id=str(uuid.uuid4()), + call_id=tool_call["call_id"], + output=output_json_string, + type="function_call_output", + ) + + +async def process_tool_calls( + tool_calls: List[ResponseFunctionToolCallParam], +) -> List[FunctionCallOutput]: + """Process multiple tool calls concurrently using asyncio.gather. + + Executes all tool calls in parallel for optimal performance. + + Args: + tool_calls: List of tool call parameters to execute + + Returns: + List of FunctionCallOutput objects with execution results + """ + tasks = [route_and_execute_tool_call(tc) for tc in tool_calls] + return await asyncio.gather(*tasks) + + +def extract_function_tool_calls_from_response( + response: Response, +) -> List[ResponseFunctionToolCallParam]: + """Extract all function tool calls from a Response object's output. + + Filters the response output for ResponseFunctionToolCall items and converts + them to the parameter format required for tool execution. + + Args: + response: Response object from the model containing potential tool calls + + Returns: + List of ResponseFunctionToolCallParam objects ready for execution + """ + if not response.output: + return [] + return [ + ResponseFunctionToolCallParam( + call_id=item.call_id, + name=item.name, + arguments=item.arguments, + type="function_call", + ) + for item in response.output + if isinstance(item, ResponseFunctionToolCall) + ] + + +async def handle_responses_tool_workflow( + client: AsyncOpenAI, + req: ResponseRequest, + input_items: Union[str, List[ResponseInputItemParam]], + first_response: Response, +) -> Tuple[Response, int, int]: + """Handle the complete tool workflow for responses API. + + This function manages the multi-turn tool execution flow: + 1. 
Extracts tool calls from the model's first response + 2. Validates all tools are available in the registry + 3. If any tool is unavailable, shortcuts the workflow and returns the first response + 4. If all tools are available, executes them concurrently + 5. Constructs a new input with original messages + tool calls + tool results + 6. Makes a follow-up API call with tool results + 7. Returns the final response with aggregated token usage + + Args: + client: AsyncOpenAI client for making API calls + req: Original request parameters + input_items: Original input messages (string or list) + first_response: Initial response from the model containing tool calls + + Returns: + Tuple of (final_response, total_prompt_tokens, total_completion_tokens) + """ + logger.info("[responses_tool] evaluating tool workflow for response") + + prompt_tokens = first_response.usage.input_tokens if first_response.usage else 0 + completion_tokens = ( + first_response.usage.output_tokens if first_response.usage else 0 + ) + + tool_calls = extract_function_tool_calls_from_response(first_response) + logger.info(f"[responses_tool] extracted tool_calls: {tool_calls}") + + if not tool_calls: + return first_response, prompt_tokens, completion_tokens + + unsupported_tool_calls = [ + tc for tc in tool_calls if tc["name"] not in SUPPORTED_TOOLS + ] + if unsupported_tool_calls: + logger.info( + "[responses_tool] unknown tool(s): %s. Returning first response unchanged.", + [tc["name"] for tc in unsupported_tool_calls], + ) + return first_response, prompt_tokens, completion_tokens + + tool_results = await process_tool_calls(tool_calls) + logger.info(f"[responses_tool] tool_results: {tool_results}") + + new_input_items: List[ResponseInputItemParam] = [] + if isinstance(input_items, str): + new_input_items.append( + EasyInputMessageParam( + role="user", + content=input_items, + type="message", + ) + ) + elif isinstance(input_items, list): + new_input_items = list(input_items) + + if first_response.output: + new_input_items.extend( + [ + cast( + ResponseInputItemParam, + item.model_dump(exclude_unset=True, mode="json"), + ) + for item in first_response.output + ] + ) + + new_input_items.extend( + [ + cast( + ResponseInputItemParam, + result.model_dump(exclude_unset=True, mode="json"), + ) + for result in tool_results + ] + ) + + request_kwargs = { + "model": req.model, + "input": new_input_items, + "instructions": req.instructions, + "top_p": req.top_p, + "temperature": req.temperature, + "max_output_tokens": req.max_output_tokens, + } + + logger.info("[responses_tool] performing follow-up completion with tool outputs") + second_response: Response = await client.responses.create(**request_kwargs) + + if second_response.usage: + prompt_tokens += second_response.usage.input_tokens + completion_tokens += second_response.usage.output_tokens + + return second_response, prompt_tokens, completion_tokens diff --git a/nilai-api/src/nilai_api/handlers/tools/tool_router.py b/nilai-api/src/nilai_api/handlers/tools/tool_router.py index 6fd8041e..69df7c21 100644 --- a/nilai-api/src/nilai_api/handlers/tools/tool_router.py +++ b/nilai-api/src/nilai_api/handlers/tools/tool_router.py @@ -20,6 +20,8 @@ logger = logging.getLogger(__name__) +SUPPORTED_TOOLS = {"execute_python"} + async def route_and_execute_tool_call( tool_call: ChatCompletionMessageToolCall, @@ -30,29 +32,33 @@ async def route_and_execute_tool_call( with role="tool". 
""" func_name = tool_call.function.name - arguments = tool_call.function.arguments or "{}" - - if func_name == "execute_python": - # arguments is a JSON string - try: - args = json.loads(arguments) - except Exception: - args = {} - code = args.get("code", "") - result = await code_execution.execute_python(code) - logger.info(f"[tool] execute_python result: {result}") - return MessageAdapter.new_tool_message( - name="execute_python", - content=result, - tool_call_id=tool_call.id, - ) - - # Unknown tool: return an error message to the model - return MessageAdapter.new_tool_message( - name=func_name, - content=f"Tool '{func_name}' not implemented", - tool_call_id=tool_call.id, - ) + arguments_json = tool_call.function.arguments or "{}" + + match func_name: + case "execute_python": + try: + parsed_arguments = json.loads(arguments_json) + code = parsed_arguments.get("code", "") + if not str(code).strip(): + content = json.dumps({"error": "No code provided by the model."}) + else: + result = await code_execution.execute_python(code) + content = str(result).strip() + except json.JSONDecodeError: + logger.error("[tools] invalid JSON in tool call arguments") + content = json.dumps({"error": "Invalid JSON in tool call arguments."}) + except Exception as e: + logger.error(f"[tools] error executing tool: {e}") + content = json.dumps({"error": f"Error executing tool: {e}"}) + return MessageAdapter.new_tool_message( + name="execute_python", + content=content, + tool_call_id=tool_call.id, + ) + case _: + return MessageAdapter.new_tool_message( + name=func_name, content="", tool_call_id=tool_call.id + ) async def process_tool_calls( @@ -163,6 +169,14 @@ async def handle_tool_workflow( logger.info(f"[tools] extracted tool_calls: {tool_calls}") if not tool_calls: + return first_response, prompt_tokens, completion_tokens + + unknown = [tc for tc in tool_calls if tc.function.name not in SUPPORTED_TOOLS] + if unknown: + logger.info( + "[tools] unknown tool(s): %s. Returning first response unchanged.", + [tc.function.name for tc in unknown], + ) return first_response, 0, 0 assistant_tool_call_msg = MessageAdapter.new_assistant_tool_call_message(tool_calls) diff --git a/nilai-api/src/nilai_api/handlers/web_search.py b/nilai-api/src/nilai_api/handlers/web_search.py index 84e88ea0..5bcaf5a3 100644 --- a/nilai-api/src/nilai_api/handlers/web_search.py +++ b/nilai-api/src/nilai_api/handlers/web_search.py @@ -9,7 +9,7 @@ from fastapi import HTTPException, status from nilai_api.config import CONFIG -from nilai_common.api_model import ( +from nilai_common.api_models import ( ChatRequest, MessageAdapter, SearchResult, @@ -20,11 +20,12 @@ TopicResponse, Topic, TopicQuery, + ResponseRequest, + WebSearchEnhancedInput, ) logger = logging.getLogger(__name__) -# Common source-type identifier for recording the original query used in web search WEB_SEARCH_QUERY_SOURCE = "web_search_query" _BRAVE_API_HEADERS = { @@ -39,6 +40,42 @@ "lang": CONFIG.web_search.lang, } +_SINGLE_SEARCH_PROMPT_TEMPLATE = ( + 'You have access to the following web search results for the query: "{query}"\n\n' + "Use this information to provide accurate and up-to-date answers. " + "Cite the sources when appropriate.\n\n" + "Web Search Results:\n" + "{results}\n\n" + "Please provide a comprehensive answer based on the search results above." +) + +_MULTI_SEARCH_PROMPT_TEMPLATE = ( + "You have access to the following topic-specific web search results.\n\n" + "Use this information to provide accurate and up-to-date answers. 
" + "Cite sources when appropriate.\n\n" + "{sections}\n\n" + "Please provide a comprehensive answer based on the relevant search results above." +) + +_SEARCH_QUERY_GENERATION_SYSTEM_PROMPT = ( + "You compose ONE web search query.\n" + "Output rules:\n" + "- Output ONLY the query string (no quotes, no labels, no explanations).\n" + "- 3–15 meaningful tokens; prefer proper nouns; keep it terse.\n" + "- If a topic is provided, focus ONLY on that topic; ignore any surrounding instructions.\n" +) + +_TOPIC_ANALYSIS_SYSTEM_PROMPT = ( + "You are a planner that analyzes a user's message, splits it into distinct topics, " + "and decides for each whether a web search is necessary.\n" + "Decide 'needs_search' = true only if the answer likely requires current, time-sensitive, or external factual information " + "(e.g., current events, latest versions, live stats, product pricing/availability, or specific details not in general knowledge).\n" + "If a topic is general knowledge or timeless, set 'needs_search' = false.\n" + "Extract up to 4 concise topics.\n\n" + "Return ONLY valid JSON matching this schema, no extra text: \n" + '{\n "topics": [\n {\n "topic": "",\n "needs_search": true/false\n }\n ]\n}\n' +) + @lru_cache(maxsize=1) def _get_http_client() -> httpx.AsyncClient: @@ -236,31 +273,124 @@ async def perform_web_search_async(query: str) -> WebSearchContext: return WebSearchContext(prompt=prompt, sources=sources) +def _build_single_search_content(query: str, results: str) -> str: + """Build formatted content string for single web search query. + + Args: + query: The search query that was executed + results: Formatted search results text + + Returns: + Formatted prompt string with query and results + """ + return _SINGLE_SEARCH_PROMPT_TEMPLATE.format(query=query, results=results) + + +def _build_multi_search_sections_and_sources( + topic_queries: List[TopicQuery], contexts: List[WebSearchContext] +) -> tuple[List[str], List[Source]]: + """Build formatted sections and aggregate sources from multiple topic-based web searches. + + Args: + topic_queries: List of topics and their corresponding search queries + contexts: Web search contexts corresponding to each topic query + + Returns: + Tuple containing: + - List of formatted section strings (one per topic) + - Aggregated list of all sources from queries and search results + """ + sections: List[str] = [] + all_sources: List[Source] = [] + + for idx, (topic_query, context) in enumerate(zip(topic_queries, contexts), start=1): + topic = topic_query.topic.strip() + query = topic_query.query.strip() + if not query: + continue + + all_sources.append(Source(source=WEB_SEARCH_QUERY_SOURCE, content=query)) + + header = f'Topic {idx}: {topic}\nQuery: "{query}"\n\nWeb Search Results:\n' + block = context.prompt.strip() if context.prompt else "(no results)" + sections.append(header + block) + all_sources.extend(context.sources) + + return sections, all_sources + + +def _build_multi_search_content(sections: List[str]) -> str: + """Build formatted content string for multiple topic-based web searches. + + Args: + sections: List of formatted section strings (one per topic) + + Returns: + Formatted prompt string with all topic sections + """ + return _MULTI_SEARCH_PROMPT_TEMPLATE.format(sections="\n\n".join(sections)) + + +async def _generate_topic_query( + topic_obj: Topic, user_query: str, model_name: str, client: Any +) -> TopicQuery | None: + """Generate a search query for a specific topic using the LLM. 
+ + Args: + topic_obj: Topic object containing the topic string + user_query: Original user query for context + model_name: Name of the LLM model to use + client: LLM client instance for API calls + + Returns: + TopicQuery object with topic and generated query, or None if generation fails + """ + topic_str = topic_obj.topic.strip() + if not topic_str: + return None + try: + query = await generate_search_query_from_llm( + user_query, model_name, client, topic=topic_str + ) + return TopicQuery(topic=topic_str, query=query) + except Exception: + logger.exception("Failed generating query for topic '%s'", topic_str) + return None + + +async def _perform_search(query: str) -> WebSearchContext: + """Execute a web search with error handling. + + Args: + query: Search query string + + Returns: + WebSearchContext with results, or empty context if search fails + """ + try: + return await perform_web_search_async(query) + except Exception: + logger.exception("Search failed for query '%s'", query) + return WebSearchContext(prompt="", sources=[]) + + async def enhance_messages_with_web_search( req: ChatRequest, query: str ) -> WebSearchEnhancedMessages: - """Enhance a list of messages with web search context. + """Enhance chat messages with web search context for a single query. Args: - messages: List of conversation messages to enhance + req: ChatRequest containing conversation messages query: Search query to retrieve web search results for Returns: - WebSearchEnhancedMessages containing the original messages with web search - context prepended as a system message, along with source information + WebSearchEnhancedMessages with web search context added to system messages + and source information """ ctx = await perform_web_search_async(query) query_source = Source(source=WEB_SEARCH_QUERY_SOURCE, content=query) - web_search_content = ( - f'You have access to the following web search results for the query: "{query}"\n\n' - "Use this information to provide accurate and up-to-date answers. " - "Cite the sources when appropriate.\n\n" - "Web Search Results:\n" - f"{ctx.prompt}\n\n" - "Please provide a comprehensive answer based on the search results above." - ) - + web_search_content = _build_single_search_content(query, ctx.prompt) req.ensure_system_content(web_search_content) return WebSearchEnhancedMessages( @@ -272,17 +402,20 @@ async def enhance_messages_with_web_search( async def generate_search_query_from_llm( user_message: str, model_name: str, client: Any, *, topic: str | None = None ) -> str: - """ - Use the LLM to produce a concise, high-recall search query. - """ - system_prompt = ( - "You compose ONE web search query.\n" - "Output rules:\n" - "- Output ONLY the query string (no quotes, no labels, no explanations).\n" - "- 3–15 meaningful tokens; prefer proper nouns; keep it terse.\n" - "- If a topic is provided, focus ONLY on that topic; ignore any surrounding instructions.\n" - ) + """Use the LLM to generate a concise, high-recall web search query. 
+ Args: + user_message: User's message or question + model_name: Name of the LLM model to use + client: LLM client instance for API calls + topic: Optional specific topic to focus the query on + + Returns: + Generated search query string, or user_message as fallback if generation fails + + Raises: + RuntimeError: If LLM call fails or returns invalid response + """ user_content = ( user_message if not topic @@ -290,14 +423,16 @@ async def generate_search_query_from_llm( ) messages = [ - MessageAdapter.new_message(role="system", content=system_prompt), + MessageAdapter.new_message( + role="system", content=_SEARCH_QUERY_GENERATION_SYSTEM_PROMPT + ), MessageAdapter.new_message(role="user", content=user_content), ] req = { "model": model_name, "messages": messages, - "max_tokens": 600, + "max_tokens": 1000, } logger.info("Generate search query start model=%s", model_name) @@ -333,36 +468,23 @@ async def generate_search_query_from_llm( return content -async def handle_web_search( - req_messages: ChatRequest, model_name: str, client: Any -) -> WebSearchEnhancedMessages: - """Handle web search enhancement for a conversation. +async def _execute_web_search_workflow( + user_query: str, model_name: str, client: Any +) -> tuple[List[TopicQuery], List[WebSearchContext]] | tuple[None, None]: + """Execute the complete multi-topic web search workflow. - Analyzes the user's message to identify topics that require web search, - generates optimized search queries for each topic using an LLM, and - enhances the conversation with relevant web search results. Falls back - to single-query search if topic analysis fails or no topics need search. + Analyzes user query to identify topics, generates search queries for each topic, + and executes all searches in parallel. Args: - req_messages: ChatRequest containing conversation messages to process - model_name: Name of the LLM model to use for query generation - client: LLM client instance for making API calls + user_query: User's query to analyze and search for + model_name: Name of the LLM model to use for topic analysis and query generation + client: LLM client instance for API calls Returns: - WebSearchEnhancedMessages with web search context added, or original - messages if no user query is found or search fails + Tuple of (topic_queries, contexts) if successful, or (None, None) if no topics + require search or workflow fails """ - logger.info("Handle web search start") - logger.debug( - "Handle web search messages_in=%d model=%s", - len(req_messages.messages), - model_name, - ) - user_query = req_messages.get_last_user_query() - if not user_query: - logger.info("No user query found") - return WebSearchEnhancedMessages(messages=req_messages.messages, sources=[]) - try: topics = await analyze_web_search_topics(user_query, model_name, client) topics_to_search = [t for t in topics if t.needs_search][:3] @@ -371,25 +493,12 @@ async def handle_web_search( logger.info( "No topics require web search; falling back to single-query enrichment" ) - concise_query = await generate_search_query_from_llm( - user_query, model_name, client - ) - return await enhance_messages_with_web_search(req_messages, concise_query) + return None, None - async def _generate_query(topic_obj: Topic) -> TopicQuery | None: - topic_str = topic_obj.topic.strip() - if not topic_str: - return None - try: - query = await generate_search_query_from_llm( - user_query, model_name, client, topic=topic_str - ) - return TopicQuery(topic=topic_str, query=query) - except Exception: - logger.exception("Failed 
generating query for topic '%s'", topic_str) - return None - - query_generation_tasks = [_generate_query(t) for t in topics_to_search] + query_generation_tasks = [ + _generate_topic_query(t, user_query, model_name, client) + for t in topics_to_search + ] generated_results = await asyncio.gather(*query_generation_tasks) topic_queries: List[TopicQuery] = [res for res in generated_results if res] @@ -397,21 +506,43 @@ async def _generate_query(topic_obj: Topic) -> TopicQuery | None: logger.info( "No valid topic queries generated; falling back to single query" ) + return None, None + + search_tasks = [_perform_search(tq.query) for tq in topic_queries] + contexts = await asyncio.gather(*search_tasks) + + return topic_queries, contexts + + except Exception: + logger.exception("Error during web search workflow") + return None, None + + +async def handle_web_search( + req_messages: ChatRequest, model_name: str, client: Any +) -> WebSearchEnhancedMessages: + logger.info("Handle web search start") + logger.debug( + "Handle web search messages_in=%d model=%s", + len(req_messages.messages), + model_name, + ) + user_query = req_messages.get_last_user_query() + if not user_query: + logger.info("No user query found") + return WebSearchEnhancedMessages(messages=req_messages.messages, sources=[]) + + try: + topic_queries, contexts = await _execute_web_search_workflow( + user_query, model_name, client + ) + + if topic_queries is None or contexts is None: concise_query = await generate_search_query_from_llm( user_query, model_name, client ) return await enhance_messages_with_web_search(req_messages, concise_query) - async def _search(q: str) -> WebSearchContext: - try: - return await perform_web_search_async(q) - except Exception: - logger.exception("Search failed for query '%s'", q) - return WebSearchContext(prompt="", sources=[]) - - search_tasks = [_search(tq.query) for tq in topic_queries] - contexts = await asyncio.gather(*search_tasks) - return await enhance_messages_with_multi_web_search( req_messages, topic_queries, contexts ) @@ -428,23 +559,21 @@ async def _search(q: str) -> WebSearchContext: async def analyze_web_search_topics( user_message: str, model_name: str, client: Any ) -> List[Topic]: - """Use the LLM to identify topics and whether each needs web search. + """Use the LLM to identify topics in user message and determine which need web search. - Returns a list of Pydantic Topic objects. - """ - system_prompt = ( - "You are a planner that analyzes a user's message, splits it into distinct topics, " - "and decides for each whether a web search is necessary.\n" - "Decide 'needs_search' = true only if the answer likely requires current, time-sensitive, or external factual information " - "(e.g., current events, latest versions, live stats, product pricing/availability, or specific details not in general knowledge).\n" - "If a topic is general knowledge or timeless, set 'needs_search' = false.\n" - "Extract up to 4 concise topics.\n\n" - "Return ONLY valid JSON matching this schema, no extra text: \n" - '{\n "topics": [\n {\n "topic": "",\n "needs_search": true/false\n }\n ]\n}\n' - ) + Args: + user_message: User's message to analyze + model_name: Name of the LLM model to use + client: LLM client instance for API calls + Returns: + List of Topic objects, each indicating whether it needs web search. Empty list if + analysis fails. 
+ """ messages = [ - MessageAdapter.new_message(role="system", content=system_prompt), + MessageAdapter.new_message( + role="system", content=_TOPIC_ANALYSIS_SYSTEM_PROMPT + ), MessageAdapter.new_message(role="user", content=user_message), ] @@ -471,38 +600,149 @@ async def enhance_messages_with_multi_web_search( topic_queries: List[TopicQuery], contexts: List[WebSearchContext], ) -> WebSearchEnhancedMessages: - """Enhance messages with multiple topic-specific web search contexts.""" + """Enhance chat messages with multiple topic-specific web search contexts. + + Args: + req: ChatRequest containing conversation messages + topic_queries: List of topics and their corresponding search queries + contexts: Web search contexts corresponding to each topic query + + Returns: + WebSearchEnhancedMessages with all topic-specific web search contexts added + to system messages and aggregated source information + """ if not topic_queries or not contexts: return WebSearchEnhancedMessages(messages=req.messages, sources=[]) - # Build a merged content block - sections: List[str] = [] - all_sources: List[Source] = [] + sections, all_sources = _build_multi_search_sections_and_sources( + topic_queries, contexts + ) - for idx, (topic_query, context) in enumerate(zip(topic_queries, contexts), start=1): - topic = topic_query.topic.strip() - query = topic_query.query.strip() - if not query: - continue + if not sections: + return WebSearchEnhancedMessages(messages=req.messages, sources=[]) - all_sources.append(Source(source=WEB_SEARCH_QUERY_SOURCE, content=query)) + web_search_content = _build_multi_search_content(sections) + req.ensure_system_content(web_search_content) - header = f'Topic {idx}: {topic}\nQuery: "{query}"\n\nWeb Search Results:\n' - block = context.prompt.strip() if context.prompt else "(no results)" - sections.append(header + block) - all_sources.extend(context.sources) + return WebSearchEnhancedMessages(messages=req.messages, sources=all_sources) + + +async def enhance_input_with_web_search( + req: ResponseRequest, query: str +) -> WebSearchEnhancedInput: + """Enhance response input with web search context for a single query. + + Args: + req: ResponseRequest containing input and instructions + query: Search query to retrieve web search results for + + Returns: + WebSearchEnhancedInput with web search context added to instructions + and source information + """ + ctx = await perform_web_search_async(query) + query_source = Source(source=WEB_SEARCH_QUERY_SOURCE, content=query) + + web_search_instructions = _build_single_search_content(query, ctx.prompt) + req.ensure_instructions(web_search_instructions) + + return WebSearchEnhancedInput( + input=req.input, + instructions=req.instructions, + sources=[query_source] + ctx.sources, + ) + + +async def enhance_input_with_multi_web_search( + req: ResponseRequest, + topic_queries: List[TopicQuery], + contexts: List[WebSearchContext], +) -> WebSearchEnhancedInput: + """Enhance response input with multiple topic-specific web search contexts. 
+ + Args: + req: ResponseRequest containing input and instructions + topic_queries: List of topics and their corresponding search queries + contexts: Web search contexts corresponding to each topic query + + Returns: + WebSearchEnhancedInput with all topic-specific web search contexts added + to instructions and aggregated source information + """ + if not topic_queries or not contexts: + return WebSearchEnhancedInput( + input=req.input, instructions=req.instructions, sources=[] + ) + + sections, all_sources = _build_multi_search_sections_and_sources( + topic_queries, contexts + ) if not sections: - return WebSearchEnhancedMessages(messages=req.messages, sources=[]) + return WebSearchEnhancedInput( + input=req.input, instructions=req.instructions, sources=[] + ) + + web_search_instructions = _build_multi_search_content(sections) + req.ensure_instructions(web_search_instructions) - web_search_content = ( - "You have access to the following topic-specific web search results.\n\n" - "Use this information to provide accurate and up-to-date answers. " - "Cite sources when appropriate.\n\n" - + "\n\n".join(sections) - + "\n\nPlease provide a comprehensive answer based on the relevant search results above." + return WebSearchEnhancedInput( + input=req.input, instructions=req.instructions, sources=all_sources ) - req.ensure_system_content(web_search_content) - return WebSearchEnhancedMessages(messages=req.messages, sources=all_sources) +async def handle_web_search_for_responses( + req: ResponseRequest, model_name: str, client: Any +) -> WebSearchEnhancedInput: + """Handle web search enhancement for response requests. + + Analyzes the user's input to identify topics that require web search, + generates optimized search queries for each topic using an LLM, and + enhances the request with relevant web search results. Falls back to + single-query search if topic analysis fails or no topics need search. 
+ + Args: + req: ResponseRequest containing input to process + model_name: Name of the LLM model to use for query generation + client: LLM client instance for making API calls + + Returns: + WebSearchEnhancedInput with web search context added, or original + input if no user query is found or search fails + """ + logger.info("Handle web search for responses start") + logger.debug( + "Handle web search for responses model=%s", + model_name, + ) + user_query = req.get_last_user_query() + if not user_query: + logger.info("No user query found") + return WebSearchEnhancedInput( + input=req.input, instructions=req.instructions, sources=[] + ) + + try: + topic_queries, contexts = await _execute_web_search_workflow( + user_query, model_name, client + ) + + if topic_queries is None or contexts is None: + concise_query = await generate_search_query_from_llm( + user_query, model_name, client + ) + return await enhance_input_with_web_search(req, concise_query) + + return await enhance_input_with_multi_web_search(req, topic_queries, contexts) + + except HTTPException: + logger.exception("Web search provider error") + return WebSearchEnhancedInput( + input=req.input, instructions=req.instructions, sources=[] + ) + + except Exception: + logger.exception("Unexpected error during web search handling") + return WebSearchEnhancedInput( + input=req.input, instructions=req.instructions, sources=[] + ) diff --git a/nilai-api/src/nilai_api/rate_limiting.py b/nilai-api/src/nilai_api/rate_limiting.py index 8205b553..c2d03273 100644 --- a/nilai-api/src/nilai_api/rate_limiting.py +++ b/nilai-api/src/nilai_api/rate_limiting.py @@ -1,16 +1,15 @@ -import asyncio from asyncio import iscoroutine from typing import Callable, Tuple, Awaitable, Annotated from nilai_api.db.users import RateLimits from pydantic import BaseModel -from nilai_api.config import CONFIG from fastapi.params import Depends from fastapi import status, HTTPException, Request from redis.asyncio import from_url, Redis from nilai_api.auth import get_auth_info, AuthenticationInfo, TokenRateLimits +from nilai_api.config import CONFIG LUA_RATE_LIMIT_SCRIPT = """ local key = KEYS[1] @@ -157,53 +156,42 @@ async def __call__( ) if web_search_enabled: - allowed_rps = min( - CONFIG.web_search.rps, - max( - 1, - CONFIG.web_search.max_concurrent_requests - // CONFIG.web_search.count, - ), + await self.check_bucket( + redis, + redis_rate_limit_command, + f"web_search_minute:{user_limits.subscription_holder}", + user_limits.rate_limits.web_search_rate_limit_minute, + MINUTE_MS, ) - await self.wait_for_bucket( + await self.check_bucket( + redis, + redis_rate_limit_command, + f"web_search_hour:{user_limits.subscription_holder}", + user_limits.rate_limits.web_search_rate_limit_hour, + HOUR_MS, + ) + await self.check_bucket( + redis, + redis_rate_limit_command, + f"web_search_day:{user_limits.subscription_holder}", + user_limits.rate_limits.web_search_rate_limit_day, + DAY_MS, + ) + await self.check_bucket( redis, redis_rate_limit_command, - "global:web_search:rps", - allowed_rps, + "web_search_rps", + CONFIG.web_search.rps, 1000, ) - await self.check_bucket( redis, redis_rate_limit_command, f"web_search:{user_limits.subscription_holder}", user_limits.rate_limits.web_search_rate_limit, - 0, # No expiration for for-good rate limit + 0, ) - web_search_limits = [ - ( - user_limits.rate_limits.web_search_rate_limit_minute, - MINUTE_MS, - "minute", - ), - ( - user_limits.rate_limits.web_search_rate_limit_hour, - HOUR_MS, - "hour", - ), - 
(user_limits.rate_limits.web_search_rate_limit_day, DAY_MS, "day"), - ] - - for limit, milliseconds, time_unit in web_search_limits: - await self.check_bucket( - redis, - redis_rate_limit_command, - f"web_search_{time_unit}:{user_limits.subscription_holder}", - limit, - milliseconds, - ) - key = await self.check_concurrent_and_increment(redis, request) try: yield @@ -231,24 +219,6 @@ async def check_bucket( headers={"Retry-After": str(expire)}, ) - @staticmethod - async def wait_for_bucket( - redis: Redis, - redis_rate_limit_command: str, - key: str, - times: int | None, - milliseconds: int, - ): - if times is None: - return - while True: - expire = await redis.evalsha( - redis_rate_limit_command, 1, key, str(times), str(milliseconds) - ) # type: ignore - if int(expire) == 0: - return - await asyncio.sleep((int(expire) + 50) / 1000) - async def check_concurrent_and_increment( self, redis: Redis, request: Request ) -> str | None: diff --git a/nilai-api/src/nilai_api/routers/endpoints/__init__.py b/nilai-api/src/nilai_api/routers/endpoints/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nilai-api/src/nilai_api/routers/endpoints/chat.py b/nilai-api/src/nilai_api/routers/endpoints/chat.py new file mode 100644 index 00000000..7e1bc424 --- /dev/null +++ b/nilai-api/src/nilai_api/routers/endpoints/chat.py @@ -0,0 +1,355 @@ +import json +import logging +import time +import uuid +from base64 import b64encode +from typing import AsyncGenerator, Optional, Union, List, Tuple + +from fastapi import APIRouter, Body, Depends, HTTPException, status, Request +from fastapi.responses import StreamingResponse +from openai import AsyncOpenAI + +from nilai_api.auth import get_auth_info, AuthenticationInfo +from nilai_api.config import CONFIG +from nilai_api.crypto import sign_message +from nilai_api.db.logs import QueryLogManager +from nilai_api.db.users import UserManager +from nilai_api.handlers.nildb.handler import get_prompt_from_nildb +from nilai_api.handlers.nilrag import handle_nilrag +from nilai_api.handlers.tools.tool_router import handle_tool_workflow +from nilai_api.handlers.web_search import handle_web_search +from nilai_api.rate_limiting import RateLimit +from nilai_api.state import state + +from nilai_common import ( + ChatRequest, + MessageAdapter, + SignedChatCompletion, + Source, +) + +logger = logging.getLogger(__name__) + +chat_completion_router = APIRouter() + + +async def chat_completion_concurrent_rate_limit(request: Request) -> Tuple[int, str]: + body = await request.json() + try: + chat_request = ChatRequest(**body) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid request body") + key = f"chat:{chat_request.model}" + limit = CONFIG.rate_limiting.model_concurrent_rate_limit.get( + chat_request.model, + CONFIG.rate_limiting.model_concurrent_rate_limit.get("default", 50), + ) + return limit, key + + +async def chat_completion_web_search_rate_limit(request: Request) -> bool: + """Extract web_search flag from request body for rate limiting.""" + body = await request.json() + try: + chat_request = ChatRequest(**body) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid request body") + return bool(chat_request.web_search) + + +@chat_completion_router.post("/v1/chat/completions", tags=["Chat"], response_model=None) +async def chat_completion( + req: ChatRequest = Body( + ChatRequest( + model="meta-llama/Llama-3.2-1B-Instruct", + messages=[ + MessageAdapter.new_message( + role="system", content="You are a helpful 
assistant." + ), + MessageAdapter.new_message(role="user", content="What is your name?"), + ], + ) + ), + _rate_limit=Depends( + RateLimit( + concurrent_extractor=chat_completion_concurrent_rate_limit, + web_search_extractor=chat_completion_web_search_rate_limit, + ) + ), + auth_info: AuthenticationInfo = Depends(get_auth_info), +) -> Union[SignedChatCompletion, StreamingResponse]: + """ + Generate a chat completion response from the AI model. + + - **req**: Chat completion request containing messages and model specifications + - **user**: Authenticated user information (through HTTP Bearer header) + - **Returns**: Full chat response with model output, usage statistics, and cryptographic signature + + ### Request Requirements + - Must include non-empty list of messages + - Must specify a model + - Supports multiple message formats (system, user, assistant) + - Optional web_search parameter to enhance context with current information + + ### Response Components + - Model-generated text completion + - Token usage metrics + - Cryptographically signed response for verification + + ### Processing Steps + 1. Validate input request parameters + 2. If web_search is enabled, perform web search and enhance context + 3. Prepare messages for model processing + 4. Generate AI model response + 5. Track and update token usage + 6. Cryptographically sign the response + + ### Web Search Feature + When web_search=True, the system will: + - Extract the user's query from the last user message + - Perform a web search using Brave API + - Enhance the conversation context with current information + - Add search results as a system message for better responses + + ### Potential HTTP Errors + - **400 Bad Request**: + - Missing messages list + - No model specified + - **500 Internal Server Error**: + - Model fails to generate a response + + ### Example + ```python + # Generate a chat completion with web search + request = ChatRequest( + model="meta-llama/Llama-3.2-1B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "What is your name?"} + ], + ) + response = await chat_completion(request, user) + """ + + if len(req.messages) == 0: + raise HTTPException( + status_code=400, + detail="Request contained 0 messages", + ) + model_name = req.model + request_id = str(uuid.uuid4()) + t_start = time.monotonic() + logger.info(f"[chat] call start request_id={req.messages}") + endpoint = await state.get_model(model_name) + if endpoint is None: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Invalid model name {model_name}, check /v1/models for options", + ) + + has_multimodal = req.has_multimodal_content() + logger.info(f"[chat] has_multimodal: {has_multimodal}") + if has_multimodal and (not endpoint.metadata.multimodal_support or req.web_search): + raise HTTPException( + status_code=400, + detail="Model does not support multimodal content, remove image inputs from request", + ) + + model_url = endpoint.url + "/v1/" + + logger.info( + f"[chat] start request_id={request_id} user={auth_info.user.userid} model={model_name} stream={req.stream} web_search={bool(req.web_search)} tools={bool(req.tools)} multimodal={has_multimodal} url={model_url}" + ) + + client = AsyncOpenAI(base_url=model_url, api_key="") + if auth_info.prompt_document: + try: + nildb_prompt: str = await get_prompt_from_nildb(auth_info.prompt_document) + req.messages.insert( + 0, MessageAdapter.new_message(role="system", content=nildb_prompt) + ) + except Exception as 
e: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"Unable to extract prompt from nilDB: {str(e)}", + ) + + if req.nilrag: + logger.info(f"[chat] nilrag start request_id={request_id}") + t_nilrag = time.monotonic() + await handle_nilrag(req) + logger.info( + f"[chat] nilrag done request_id={request_id} duration_ms={(time.monotonic() - t_nilrag) * 1000:.0f}" + ) + + messages = req.messages + sources: Optional[List[Source]] = None + + if req.web_search: + logger.info(f"[chat] web_search start request_id={request_id}") + t_ws = time.monotonic() + web_search_result = await handle_web_search(req, model_name, client) + messages = web_search_result.messages + sources = web_search_result.sources + logger.info( + f"[chat] web_search done request_id={request_id} sources={len(sources) if sources else 0} duration_ms={(time.monotonic() - t_ws) * 1000:.0f}" + ) + logger.info(f"[chat] web_search messages: {messages}") + + request_kwargs = { + "model": req.model, + "messages": messages, + "top_p": req.top_p, + "temperature": req.temperature, + "max_tokens": req.max_tokens, + } + + if req.tools: + if not endpoint.metadata.tool_support: + raise HTTPException( + status_code=400, + detail="Model does not support tool usage, remove tools from request", + ) + if model_name == "openai/gpt-oss-20b": + raise HTTPException( + status_code=400, + detail="This model only supports tool calls with responses endpoint", + ) + request_kwargs["tools"] = req.tools + request_kwargs["tool_choice"] = req.tool_choice + + if req.stream: + + async def chat_completion_stream_generator() -> AsyncGenerator[str, None]: + t_call = time.monotonic() + prompt_token_usage = 0 + completion_token_usage = 0 + + try: + logger.info(f"[chat] stream start request_id={request_id}") + + request_kwargs["stream"] = True + request_kwargs["extra_body"] = { + "stream_options": { + "include_usage": True, + "continuous_usage_stats": False, + } + } + + response = await client.chat.completions.create(**request_kwargs) + + async for chunk in response: + if chunk.usage is not None: + prompt_token_usage = chunk.usage.prompt_tokens + completion_token_usage = chunk.usage.completion_tokens + + payload = chunk.model_dump(exclude_unset=True) + + if chunk.usage is not None and sources: + payload["sources"] = [ + s.model_dump(mode="json") for s in sources + ] + + yield f"data: {json.dumps(payload)}\n\n" + + await UserManager.update_token_usage( + auth_info.user.userid, + prompt_tokens=prompt_token_usage, + completion_tokens=completion_token_usage, + ) + await QueryLogManager.log_query( + auth_info.user.userid, + model=req.model, + prompt_tokens=prompt_token_usage, + completion_tokens=completion_token_usage, + web_search_calls=len(sources) if sources else 0, + ) + logger.info( + "[chat] stream done request_id=%s prompt_tokens=%d completion_tokens=%d " + "duration_ms=%.0f total_ms=%.0f", + request_id, + prompt_token_usage, + completion_token_usage, + (time.monotonic() - t_call) * 1000, + (time.monotonic() - t_start) * 1000, + ) + + except Exception as e: + logger.error( + "[chat] stream error request_id=%s error=%s", request_id, e + ) + yield f"data: {json.dumps({'error': 'stream_failed', 'message': str(e)})}\n\n" + + return StreamingResponse( + chat_completion_stream_generator(), + media_type="text/event-stream", + ) + + logger.info(f"[chat] call start request_id={request_id}") + logger.info(f"[chat] call message: {request_kwargs['messages']}") + t_call = time.monotonic() + response = await client.chat.completions.create(**request_kwargs) + 
logger.info( + f"[chat] call done request_id={request_id} duration_ms={(time.monotonic() - t_call) * 1000:.0f}" + ) + logger.info(f"[chat] call response: {response}") + + ( + final_completion, + agg_prompt_tokens, + agg_completion_tokens, + ) = await handle_tool_workflow(client, req, request_kwargs["messages"], response) + logger.info(f"[chat] call final_completion: {final_completion}") + model_response = SignedChatCompletion( + **final_completion.model_dump(), + signature="", + sources=sources, + ) + + logger.info( + f"[chat] model_response request_id={request_id} duration_ms={(time.monotonic() - t_call) * 1000:.0f}" + ) + + if model_response.usage is None: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Model response does not contain usage statistics", + ) + + if agg_prompt_tokens or agg_completion_tokens: + total_prompt_tokens = response.usage.prompt_tokens + total_completion_tokens = response.usage.completion_tokens + + total_prompt_tokens += agg_prompt_tokens + total_completion_tokens += agg_completion_tokens + + model_response.usage.prompt_tokens = total_prompt_tokens + model_response.usage.completion_tokens = total_completion_tokens + model_response.usage.total_tokens = ( + total_prompt_tokens + total_completion_tokens + ) + + # Update token usage in DB + await UserManager.update_token_usage( + auth_info.user.userid, + prompt_tokens=model_response.usage.prompt_tokens, + completion_tokens=model_response.usage.completion_tokens, + ) + + await QueryLogManager.log_query( + auth_info.user.userid, + model=req.model, + prompt_tokens=model_response.usage.prompt_tokens, + completion_tokens=model_response.usage.completion_tokens, + web_search_calls=len(sources) if sources else 0, + ) + + # Sign the response + response_json = model_response.model_dump_json() + signature = sign_message(state.private_key, response_json) + model_response.signature = b64encode(signature).decode() + + logger.info( + f"[chat] done request_id={request_id} prompt_tokens={model_response.usage.prompt_tokens} completion_tokens={model_response.usage.completion_tokens} total_ms={(time.monotonic() - t_start) * 1000:.0f}" + ) + return model_response diff --git a/nilai-api/src/nilai_api/routers/endpoints/responses.py b/nilai-api/src/nilai_api/routers/endpoints/responses.py new file mode 100644 index 00000000..a5af3dc4 --- /dev/null +++ b/nilai-api/src/nilai_api/routers/endpoints/responses.py @@ -0,0 +1,327 @@ +import json +import logging +import time +import uuid +from base64 import b64encode +from typing import AsyncGenerator, Optional, Union, List, Tuple + +from fastapi import APIRouter, Body, Depends, HTTPException, status, Request +from fastapi.responses import StreamingResponse +from openai import AsyncOpenAI + +from nilai_api.auth import get_auth_info, AuthenticationInfo +from nilai_api.config import CONFIG +from nilai_api.crypto import sign_message +from nilai_api.db.logs import QueryLogManager +from nilai_api.db.users import UserManager +from nilai_api.handlers.nildb.handler import get_prompt_from_nildb + +# from nilai_api.handlers.nilrag import handle_nilrag_for_responses +from nilai_api.handlers.tools.responses_tool_router import ( + handle_responses_tool_workflow, +) +from nilai_api.handlers.web_search import handle_web_search_for_responses +from nilai_api.rate_limiting import RateLimit +from nilai_api.state import state + +from nilai_common import ResponseRequest, SignedResponse, Source, ResponseCompletedEvent + +logger = logging.getLogger(__name__) + +responses_router = 
APIRouter() + + +async def responses_concurrent_rate_limit(request: Request) -> Tuple[int, str]: + """Rate limit extractor for concurrent requests to the responses endpoint.""" + body = await request.json() + try: + response_request = ResponseRequest(**body) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid request body") + key = f"responses:{response_request.model}" + limit = CONFIG.rate_limiting.model_concurrent_rate_limit.get( + response_request.model, + CONFIG.rate_limiting.model_concurrent_rate_limit.get("default", 50), + ) + return limit, key + + +async def responses_web_search_rate_limit(request: Request) -> bool: + """Extracts web_search flag from the request body for rate limiting.""" + body = await request.json() + try: + response_request = ResponseRequest(**body) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid request body") + return bool(response_request.web_search) + + +@responses_router.post( + "/v1/responses", tags=["Responses"], response_model=SignedResponse +) +async def create_response( + req: ResponseRequest = Body( + { + "model": "openai/gpt-oss-20b", + "instructions": "You are a helpful assistant.", + "input": [ + {"role": "user", "content": "What is your name?"}, + ], + "stream": False, + "web_search": False, + } + ), + _rate_limit=Depends( + RateLimit( + concurrent_extractor=responses_concurrent_rate_limit, + web_search_extractor=responses_web_search_rate_limit, + ) + ), + auth_info: AuthenticationInfo = Depends(get_auth_info), +) -> Union[SignedResponse, StreamingResponse]: + """ + Generate a response from the AI model using the Responses API. + + This endpoint provides a more flexible and powerful way to interact with models, + supporting complex inputs and a structured event stream. + + - **req**: Response request containing input and model specifications + - **Returns**: Full response with model output, usage statistics, and cryptographic signature + + ### Request Requirements + - Must include non-empty input (string or structured input) + - Must specify a model + - Supports optional instructions to guide the model's behavior + - Optional web_search parameter to enhance context with current information + + ### Response Components + - Model-generated text completion + - Token usage metrics + - Cryptographically signed response for verification + + ### Processing Steps + 1. Validate input request parameters + 2. If web_search is enabled, perform web search and enhance context + 3. Prepare input for model processing + 4. Generate AI model response + 5. Track and update token usage + 6. 
Cryptographically sign the response + + ### Web Search Feature + When web_search=True, the system will: + - Extract the user's query from the input + - Perform a web search using Brave API + - Enhance the context with current information + - Add search results to instructions for better responses + + ### Potential HTTP Errors + - **400 Bad Request**: + - Missing or empty input + - No model specified + - **500 Internal Server Error**: + - Model fails to generate a response + """ + if not req.input: + raise HTTPException( + status_code=400, + detail="Request 'input' field cannot be empty.", + ) + model_name = req.model + request_id = str(uuid.uuid4()) + t_start = time.monotonic() + + endpoint = await state.get_model(model_name) + if endpoint is None: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Invalid model name {model_name}, check /v1/models for options", + ) + + if not endpoint.metadata.tool_support and req.tools: + raise HTTPException( + status_code=400, + detail="Model does not support tool usage, remove tools from request", + ) + + has_multimodal = req.has_multimodal_content() + if has_multimodal and (not endpoint.metadata.multimodal_support or req.web_search): + raise HTTPException( + status_code=400, + detail="Model does not support multimodal content, remove image inputs from request", + ) + + model_url = endpoint.url + "/v1/" + + client = AsyncOpenAI(base_url=model_url, api_key="") + if auth_info.prompt_document: + try: + nildb_prompt: str = await get_prompt_from_nildb(auth_info.prompt_document) + req.ensure_instructions(nildb_prompt) + except Exception as e: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"Unable to extract prompt from nilDB: {str(e)}", + ) + + input_items = req.input + instructions = req.instructions + sources: Optional[List[Source]] = None + + if req.web_search: + logger.info(f"[responses] web_search start request_id={request_id}") + t_ws = time.monotonic() + web_search_result = await handle_web_search_for_responses( + req, model_name, client + ) + input_items = web_search_result.input + instructions = web_search_result.instructions + sources = web_search_result.sources + logger.info( + f"[responses] web_search done request_id={request_id} sources={len(sources) if sources else 0} duration_ms={(time.monotonic() - t_ws) * 1000:.0f}" + ) + + if req.stream: + + async def response_stream_generator() -> AsyncGenerator[str, None]: + t_call = time.monotonic() + prompt_token_usage = 0 + completion_token_usage = 0 + + try: + logger.info(f"[responses] stream start request_id={request_id}") + request_kwargs = { + "model": req.model, + "input": input_items, + "instructions": instructions, + "stream": True, + "top_p": req.top_p, + "temperature": req.temperature, + "max_output_tokens": req.max_output_tokens, + "extra_body": { + "stream_options": { + "include_usage": True, + "continuous_usage_stats": False, + } + }, + } + if req.tools: + request_kwargs["tools"] = req.tools + + stream = await client.responses.create(**request_kwargs) + + async for event in stream: + payload = event.model_dump(exclude_unset=True) + + if isinstance(event, ResponseCompletedEvent): + if event.response and event.response.usage: + usage = event.response.usage + prompt_token_usage = usage.input_tokens + completion_token_usage = usage.output_tokens + payload["response"]["usage"] = usage.model_dump(mode="json") + + if sources: + if "data" not in payload: + payload["data"] = {} + payload["data"]["sources"] = [ + s.model_dump(mode="json") for s in 
sources + ] + + yield f"data: {json.dumps(payload)}\n\n" + + await UserManager.update_token_usage( + auth_info.user.userid, + prompt_tokens=prompt_token_usage, + completion_tokens=completion_token_usage, + ) + await QueryLogManager.log_query( + auth_info.user.userid, + model=req.model, + prompt_tokens=prompt_token_usage, + completion_tokens=completion_token_usage, + web_search_calls=len(sources) if sources else 0, + ) + logger.info( + "[responses] stream done request_id=%s prompt_tokens=%d completion_tokens=%d duration_ms=%.0f total_ms=%.0f", + request_id, + prompt_token_usage, + completion_token_usage, + (time.monotonic() - t_call) * 1000, + (time.monotonic() - t_start) * 1000, + ) + + except Exception as e: + logger.error( + "[responses] stream error request_id=%s error=%s", request_id, e + ) + yield f"data: {json.dumps({'error': 'stream_failed', 'message': str(e)})}\n\n" + + return StreamingResponse( + response_stream_generator(), media_type="text/event-stream" + ) + + request_kwargs = { + "model": req.model, + "input": input_items, + "instructions": instructions, + "top_p": req.top_p, + "temperature": req.temperature, + "max_output_tokens": req.max_output_tokens, + } + if req.tools: + request_kwargs["tools"] = req.tools + request_kwargs["tool_choice"] = req.tool_choice + + logger.info(f"[responses] call start request_id={request_id}") + t_call = time.monotonic() + + response = await client.responses.create(**request_kwargs) + logger.info( + f"[responses] call done request_id={request_id} duration_ms={(time.monotonic() - t_call) * 1000:.0f}" + ) + + ( + final_response, + agg_prompt_tokens, + agg_completion_tokens, + ) = await handle_responses_tool_workflow(client, req, input_items, response) + + model_response = SignedResponse( + **final_response.model_dump(), + signature="", + sources=sources, + ) + + if model_response.usage is None: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Model response does not contain usage statistics", + ) + + if agg_prompt_tokens or agg_completion_tokens: + model_response.usage.input_tokens += agg_prompt_tokens + model_response.usage.output_tokens += agg_completion_tokens + + prompt_tokens = model_response.usage.input_tokens + completion_tokens = model_response.usage.output_tokens + + await UserManager.update_token_usage( + auth_info.user.userid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + await QueryLogManager.log_query( + auth_info.user.userid, + model=req.model, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + web_search_calls=len(sources) if sources else 0, + ) + + response_json = model_response.model_dump_json() + signature = sign_message(state.private_key, response_json) + model_response.signature = b64encode(signature).decode() + + logger.info( + f"[responses] done request_id={request_id} prompt_tokens={prompt_tokens} completion_tokens={completion_tokens} total_ms={(time.monotonic() - t_start) * 1000:.0f}" + ) + return model_response diff --git a/nilai-api/src/nilai_api/routers/private.py b/nilai-api/src/nilai_api/routers/private.py index 93540db7..b2956553 100644 --- a/nilai-api/src/nilai_api/routers/private.py +++ b/nilai-api/src/nilai_api/routers/private.py @@ -1,53 +1,32 @@ -# Fast API and serving -import json import logging -import time -import uuid -from base64 import b64encode -from typing import AsyncGenerator, Optional, Union, List, Tuple -from nilai_api.attestation import get_attestation_report -from nilai_api.handlers.nilrag import handle_nilrag 
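# A minimal client-side sketch of the /v1/responses endpoint introduced above,
# assuming the service is reachable at a placeholder URL and the caller already
# holds a valid invocation token; neither value is part of this diff.
import httpx
from openai import OpenAI

nilai = OpenAI(
    base_url="https://nilai.example.com/v1",       # placeholder deployment URL
    api_key="<invocation-token>",                   # placeholder credential
    http_client=httpx.Client(verify=False),         # mirrors the e2e test transport
)

resp = nilai.responses.create(
    model="openai/gpt-oss-20b",
    instructions="You are a helpful assistant.",
    input=[{"role": "user", "content": "What is your name?"}],
    extra_body={"web_search": True},                # maps to ResponseRequest.web_search
)

# SignedResponse extends the SDK Response with `signature` and `sources`;
# read them defensively, the same way the e2e tests read extra fields.
print(resp.output_text)
print(getattr(resp, "signature", None))
print(getattr(resp, "sources", None))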
-from nilai_api.handlers.web_search import handle_web_search -from nilai_api.handlers.tools.tool_router import handle_tool_workflow +from typing import Optional, List -from fastapi import APIRouter, Body, Depends, HTTPException, status, Request -from fastapi.responses import StreamingResponse -from nilai_api.auth import get_auth_info, AuthenticationInfo -from nilai_api.config import CONFIG -from nilai_api.crypto import sign_message -from nilai_api.db.logs import QueryLogManager -from nilai_api.db.users import UserManager -from nilai_api.rate_limiting import RateLimit -from nilai_api.state import state +from fastapi import APIRouter, Depends, HTTPException, status +from nilai_api.attestation import get_attestation_report +from nilai_api.auth import get_auth_info, AuthenticationInfo from nilai_api.handlers.nildb.api_model import ( PromptDelegationRequest, PromptDelegationToken, ) -from nilai_api.handlers.nildb.handler import ( - get_nildb_delegation_token, - get_prompt_from_nildb, -) +from nilai_api.handlers.nildb.handler import get_nildb_delegation_token +from nilai_api.routers.endpoints.chat import chat_completion_router +from nilai_api.routers.endpoints.responses import responses_router +from nilai_api.state import state -# Internal libraries from nilai_common import ( AttestationReport, - ChatRequest, ModelMetadata, - MessageAdapter, - SignedChatCompletion, Nonce, - Source, Usage, ) -from openai import AsyncOpenAI - - logger = logging.getLogger(__name__) router = APIRouter() +router.include_router(chat_completion_router) +router.include_router(responses_router) @router.get("/v1/delegation") @@ -135,331 +114,3 @@ async def get_models( ``` """ return [endpoint.metadata for endpoint in (await state.models).values()] - - -async def chat_completion_concurrent_rate_limit(request: Request) -> Tuple[int, str]: - body = await request.json() - try: - chat_request = ChatRequest(**body) - except ValueError: - raise HTTPException(status_code=400, detail="Invalid request body") - key = f"chat:{chat_request.model}" - limit = CONFIG.rate_limiting.model_concurrent_rate_limit.get( - chat_request.model, - CONFIG.rate_limiting.model_concurrent_rate_limit.get("default", 50), - ) - return limit, key - - -async def chat_completion_web_search_rate_limit(request: Request) -> bool: - """Extract web_search flag from request body for rate limiting.""" - body = await request.json() - try: - chat_request = ChatRequest(**body) - except ValueError: - raise HTTPException(status_code=400, detail="Invalid request body") - return bool(chat_request.web_search) - - -@router.post("/v1/chat/completions", tags=["Chat"], response_model=None) -async def chat_completion( - req: ChatRequest = Body( - ChatRequest( - model="meta-llama/Llama-3.2-1B-Instruct", - messages=[ - MessageAdapter.new_message( - role="system", content="You are a helpful assistant." - ), - MessageAdapter.new_message(role="user", content="What is your name?"), - ], - ) - ), - _rate_limit=Depends( - RateLimit( - concurrent_extractor=chat_completion_concurrent_rate_limit, - web_search_extractor=chat_completion_web_search_rate_limit, - ) - ), - auth_info: AuthenticationInfo = Depends(get_auth_info), -) -> Union[SignedChatCompletion, StreamingResponse]: - """ - Generate a chat completion response from the AI model. 
- - - **req**: Chat completion request containing messages and model specifications - - **user**: Authenticated user information (through HTTP Bearer header) - - **Returns**: Full chat response with model output, usage statistics, and cryptographic signature - - ### Request Requirements - - Must include non-empty list of messages - - Must specify a model - - Supports multiple message formats (system, user, assistant) - - Optional web_search parameter to enhance context with current information - - ### Response Components - - Model-generated text completion - - Token usage metrics - - Cryptographically signed response for verification - - ### Processing Steps - 1. Validate input request parameters - 2. If web_search is enabled, perform web search and enhance context - 3. Prepare messages for model processing - 4. Generate AI model response - 5. Track and update token usage - 6. Cryptographically sign the response - - ### Web Search Feature - When web_search=True, the system will: - - Extract the user's query from the last user message - - Perform a web search using Brave API - - Enhance the conversation context with current information - - Add search results as a system message for better responses - - ### Potential HTTP Errors - - **400 Bad Request**: - - Missing messages list - - No model specified - - **500 Internal Server Error**: - - Model fails to generate a response - - ### Example - ```python - # Generate a chat completion with web search - request = ChatRequest( - model="meta-llama/Llama-3.2-1B-Instruct", - messages=[ - {"role": "system", "content": "You are a helpful assistant"}, - {"role": "user", "content": "What is your name?"} - ], - ) - response = await chat_completion(request, user) - """ - - if len(req.messages) == 0: - raise HTTPException( - status_code=400, - detail="Request contained 0 messages", - ) - model_name = req.model - request_id = str(uuid.uuid4()) - t_start = time.monotonic() - logger.info(f"[chat] call start request_id={req.messages}") - endpoint = await state.get_model(model_name) - if endpoint is None: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Invalid model name {model_name}, check /v1/models for options", - ) - - if not endpoint.metadata.tool_support and req.tools: - raise HTTPException( - status_code=400, - detail="Model does not support tool usage, remove tools from request", - ) - - has_multimodal = req.has_multimodal_content() - logger.info(f"[chat] has_multimodal: {has_multimodal}") - if has_multimodal and (not endpoint.metadata.multimodal_support or req.web_search): - raise HTTPException( - status_code=400, - detail="Model does not support multimodal content, remove image inputs from request", - ) - - model_url = endpoint.url + "/v1/" - - logger.info( - f"[chat] start request_id={request_id} user={auth_info.user.userid} model={model_name} stream={req.stream} web_search={bool(req.web_search)} tools={bool(req.tools)} multimodal={has_multimodal} url={model_url}" - ) - - client = AsyncOpenAI(base_url=model_url, api_key="") - if auth_info.prompt_document: - try: - nildb_prompt: str = await get_prompt_from_nildb(auth_info.prompt_document) - req.messages.insert( - 0, MessageAdapter.new_message(role="system", content=nildb_prompt) - ) - except Exception as e: - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, - detail=f"Unable to extract prompt from nilDB: {str(e)}", - ) - - if req.nilrag: - logger.info(f"[chat] nilrag start request_id={request_id}") - t_nilrag = time.monotonic() - await handle_nilrag(req) - 
logger.info( - f"[chat] nilrag done request_id={request_id} duration_ms={(time.monotonic() - t_nilrag) * 1000:.0f}" - ) - - messages = req.messages - sources: Optional[List[Source]] = None - - if req.web_search: - logger.info(f"[chat] web_search start request_id={request_id}") - t_ws = time.monotonic() - web_search_result = await handle_web_search(req, model_name, client) - messages = web_search_result.messages - sources = web_search_result.sources - logger.info( - f"[chat] web_search done request_id={request_id} sources={len(sources) if sources else 0} duration_ms={(time.monotonic() - t_ws) * 1000:.0f}" - ) - logger.info(f"[chat] web_search messages: {messages}") - - if req.stream: - - async def chat_completion_stream_generator() -> AsyncGenerator[str, None]: - t_call = time.monotonic() - prompt_token_usage = 0 - completion_token_usage = 0 - - try: - logger.info(f"[chat] stream start request_id={request_id}") - - request_kwargs = { - "model": req.model, - "messages": messages, - "stream": True, - "top_p": req.top_p, - "temperature": req.temperature, - "max_tokens": req.max_tokens, - "extra_body": { - "stream_options": { - "include_usage": True, - "continuous_usage_stats": False, - } - }, - } - if req.tools: - request_kwargs["tools"] = req.tools - - response = await client.chat.completions.create(**request_kwargs) - - async for chunk in response: - if chunk.usage is not None: - prompt_token_usage = chunk.usage.prompt_tokens - completion_token_usage = chunk.usage.completion_tokens - - payload = chunk.model_dump(exclude_unset=True) - - if chunk.usage is not None and sources: - payload["sources"] = [ - s.model_dump(mode="json") for s in sources - ] - - yield f"data: {json.dumps(payload)}\n\n" - - await UserManager.update_token_usage( - auth_info.user.userid, - prompt_tokens=prompt_token_usage, - completion_tokens=completion_token_usage, - ) - await QueryLogManager.log_query( - auth_info.user.userid, - model=req.model, - prompt_tokens=prompt_token_usage, - completion_tokens=completion_token_usage, - web_search_calls=len(sources) if sources else 0, - ) - logger.info( - "[chat] stream done request_id=%s prompt_tokens=%d completion_tokens=%d " - "duration_ms=%.0f total_ms=%.0f", - request_id, - prompt_token_usage, - completion_token_usage, - (time.monotonic() - t_call) * 1000, - (time.monotonic() - t_start) * 1000, - ) - - except Exception as e: - logger.error( - "[chat] stream error request_id=%s error=%s", request_id, e - ) - yield f"data: {json.dumps({'error': 'stream_failed', 'message': str(e)})}\n\n" - - return StreamingResponse( - chat_completion_stream_generator(), - media_type="text/event-stream", - ) - - current_messages = messages - request_kwargs = { - "model": req.model, - "messages": current_messages, # type: ignore - "top_p": req.top_p, - "temperature": req.temperature, - "max_tokens": req.max_tokens, - } - if req.tools: - request_kwargs["tools"] = req.tools # type: ignore - request_kwargs["tool_choice"] = req.tool_choice - - logger.info(f"[chat] call start request_id={request_id}") - logger.info(f"[chat] call message: {current_messages}") - t_call = time.monotonic() - response = await client.chat.completions.create(**request_kwargs) # type: ignore - logger.info( - f"[chat] call done request_id={request_id} duration_ms={(time.monotonic() - t_call) * 1000:.0f}" - ) - logger.info(f"[chat] call response: {response}") - - # Handle tool workflow fully inside tools.router - ( - final_completion, - agg_prompt_tokens, - agg_completion_tokens, - ) = await handle_tool_workflow(client, req, 
current_messages, response) - logger.info(f"[chat] call final_completion: {final_completion}") - model_response = SignedChatCompletion( - **final_completion.model_dump(), - signature="", - sources=sources, - ) - - logger.info( - f"[chat] model_response request_id={request_id} duration_ms={(time.monotonic() - t_call) * 1000:.0f}" - ) - - if model_response.usage is None: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Model response does not contain usage statistics", - ) - - if agg_prompt_tokens or agg_completion_tokens: - total_prompt_tokens = response.usage.prompt_tokens - total_completion_tokens = response.usage.completion_tokens - - total_prompt_tokens += agg_prompt_tokens - total_completion_tokens += agg_completion_tokens - - model_response.usage.prompt_tokens = total_prompt_tokens - model_response.usage.completion_tokens = total_completion_tokens - model_response.usage.total_tokens = ( - total_prompt_tokens + total_completion_tokens - ) - - # Update token usage in DB - await UserManager.update_token_usage( - auth_info.user.userid, - prompt_tokens=model_response.usage.prompt_tokens, - completion_tokens=model_response.usage.completion_tokens, - ) - - await QueryLogManager.log_query( - auth_info.user.userid, - model=req.model, - prompt_tokens=model_response.usage.prompt_tokens, - completion_tokens=model_response.usage.completion_tokens, - web_search_calls=len(sources) if sources else 0, - ) - - # Sign the response - response_json = model_response.model_dump_json() - signature = sign_message(state.private_key, response_json) - model_response.signature = b64encode(signature).decode() - - logger.info( - f"[chat] done request_id={request_id} prompt_tokens={model_response.usage.prompt_tokens} completion_tokens={model_response.usage.completion_tokens} total_ms={(time.monotonic() - t_start) * 1000:.0f}" - ) - return model_response diff --git a/nilai-api/src/nilai_api/state.py b/nilai-api/src/nilai_api/state.py index 14e0e903..cf046c74 100644 --- a/nilai-api/src/nilai_api/state.py +++ b/nilai-api/src/nilai_api/state.py @@ -6,7 +6,7 @@ from nilai_api.config import CONFIG from nilai_api.crypto import generate_key_pair from nilai_common import ModelServiceDiscovery -from nilai_common.api_model import ModelEndpoint +from nilai_common.api_models import ModelEndpoint logger = logging.getLogger("uvicorn.error") diff --git a/nilai-attestation/src/nilai_attestation/attestation/nvtrust/nv_verifier.py b/nilai-attestation/src/nilai_attestation/attestation/nvtrust/nv_verifier.py index 755b5666..9131507f 100644 --- a/nilai-attestation/src/nilai_attestation/attestation/nvtrust/nv_verifier.py +++ b/nilai-attestation/src/nilai_attestation/attestation/nvtrust/nv_verifier.py @@ -4,7 +4,7 @@ # Verifier: Validate an attestation token against a remote policy # Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
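# A short sketch of the nilai_common.api_model -> nilai_common.api_models rename
# applied in this diff: the same names stay importable from the package root,
# while the new sub-modules can also be used directly. Field values below are
# placeholders, not part of the change.
from nilai_common import ModelEndpoint, ModelMetadata            # root re-exports
from nilai_common.api_models import AttestationReport            # path used by nv_verifier below
from nilai_common.api_models.common_model import Source          # shared domain models

endpoint = ModelEndpoint(
    url="http://gpt_20b_gpu:8000",                                # placeholder service URL
    metadata=ModelMetadata(
        name="openai/gpt-oss-20b",
        version="1.0",
        description="example entry",
        author="example",
        license="apache-2.0",
        source="https://huggingface.co/openai/gpt-oss-20b",
        supported_features=["chat_completion", "responses"],
        tool_support=True,
    ),
)
source = Source(source="https://example.com", content="example snippet")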
# -from nilai_common.api_model import AttestationReport +from nilai_common.api_models import AttestationReport import json import base64 from nilai_common.logger import setup_logger diff --git a/packages/nilai-common/pyproject.toml b/packages/nilai-common/pyproject.toml index 8c9d3110..56b20bd1 100644 --- a/packages/nilai-common/pyproject.toml +++ b/packages/nilai-common/pyproject.toml @@ -9,7 +9,7 @@ authors = [ requires-python = ">=3.12" dependencies = [ "etcd3gw>=2.4.2", - "openai>=1.59.9", + "openai>=1.99.2", "pydantic>=2.10.1", "tenacity>=9.0.0", ] diff --git a/packages/nilai-common/src/nilai_common/__init__.py b/packages/nilai-common/src/nilai_common/__init__.py index bf2889ea..385deb3b 100644 --- a/packages/nilai-common/src/nilai_common/__init__.py +++ b/packages/nilai-common/src/nilai_common/__init__.py @@ -1,12 +1,5 @@ -from nilai_common.api_model import ( +from nilai_common.api_models import ( AttestationReport, - ChatRequest, - SignedChatCompletion, - Choice, - ChatCompletion, - ChatCompletionMessage, - ChatCompletionMessageToolCall, - ChatToolFunction, HealthCheckResponse, ModelEndpoint, ModelMetadata, @@ -15,13 +8,31 @@ NVAttestationToken, SearchResult, Source, - WebSearchEnhancedMessages, - WebSearchContext, ResultContent, TopicResponse, Topic, + ChatRequest, + SignedChatCompletion, + Choice, + ChatCompletion, + ChatCompletionMessage, + ChatCompletionMessageToolCall, + ChatToolFunction, + WebSearchEnhancedMessages, + WebSearchContext, Message, MessageAdapter, + Response, + ResponseCompletedEvent, + ResponseRequest, + SignedResponse, + ResponseFunctionToolCall, + FunctionCallOutput, + ResponseFunctionToolCallOutputItem, + ResponseInputParam, + ResponseInputItemParam, + EasyInputMessageParam, + ResponseFunctionToolCallParam, ) from nilai_common.config import SETTINGS, MODEL_SETTINGS from nilai_common.discovery import ModelServiceDiscovery @@ -55,4 +66,15 @@ "WebSearchEnhancedMessages", "WebSearchContext", "ResultContent", + "Response", + "ResponseCompletedEvent", + "ResponseRequest", + "SignedResponse", + "ResponseFunctionToolCall", + "FunctionCallOutput", + "ResponseFunctionToolCallOutputItem", + "ResponseInputParam", + "ResponseInputItemParam", + "EasyInputMessageParam", + "ResponseFunctionToolCallParam", ] diff --git a/packages/nilai-common/src/nilai_common/api_models/__init__.py b/packages/nilai-common/src/nilai_common/api_models/__init__.py new file mode 100644 index 00000000..95243ff2 --- /dev/null +++ b/packages/nilai-common/src/nilai_common/api_models/__init__.py @@ -0,0 +1,90 @@ +from openai.types.responses import ResponseCompletedEvent + +from nilai_common.api_models.common_model import ( + AttestationReport, + HealthCheckResponse, + ModelEndpoint, + ModelMetadata, + Nonce, + AMDAttestationToken, + NVAttestationToken, + SearchResult, + Source, + ResultContent, + TopicResponse, + Topic, + TopicQuery, +) + +from nilai_common.api_models.chat_completion_model import ( + ChatRequest, + SignedChatCompletion, + Choice, + ChatCompletion, + ChatCompletionMessage, + ChatCompletionMessageToolCall, + ChatToolFunction, + WebSearchEnhancedMessages, + WebSearchContext, + Message, + MessageAdapter, + ImageContent, + TextContent, +) + +from nilai_common.api_models.responses_model import ( + Response, + ResponseRequest, + SignedResponse, + ToolChoice, + WebSearchEnhancedInput, + ResponseFunctionToolCall, + FunctionCallOutput, + ResponseFunctionToolCallOutputItem, + ResponseInputParam, + ResponseInputItemParam, + EasyInputMessageParam, + ResponseFunctionToolCallParam, +) + +__all__ = [ 
+ "AttestationReport", + "HealthCheckResponse", + "ModelEndpoint", + "ModelMetadata", + "Nonce", + "AMDAttestationToken", + "NVAttestationToken", + "SearchResult", + "Source", + "ResultContent", + "TopicResponse", + "Topic", + "TopicQuery", + "ChatRequest", + "SignedChatCompletion", + "Choice", + "ChatCompletion", + "ChatCompletionMessage", + "ChatCompletionMessageToolCall", + "ChatToolFunction", + "WebSearchEnhancedMessages", + "WebSearchContext", + "Message", + "MessageAdapter", + "ImageContent", + "TextContent", + "Response", + "ResponseCompletedEvent", + "ResponseRequest", + "SignedResponse", + "ToolChoice", + "WebSearchEnhancedInput", + "ResponseFunctionToolCall", + "FunctionCallOutput", + "ResponseFunctionToolCallOutputItem", + "ResponseInputParam", + "ResponseInputItemParam", + "EasyInputMessageParam", + "ResponseFunctionToolCallParam", +] diff --git a/packages/nilai-common/src/nilai_common/api_model.py b/packages/nilai-common/src/nilai_common/api_models/chat_completion_model.py similarity index 76% rename from packages/nilai-common/src/nilai_common/api_model.py rename to packages/nilai-common/src/nilai_common/api_models/chat_completion_model.py index 07a59a75..1256279d 100644 --- a/packages/nilai-common/src/nilai_common/api_model.py +++ b/packages/nilai-common/src/nilai_common/api_models/chat_completion_model.py @@ -1,8 +1,6 @@ from __future__ import annotations -import uuid from typing import ( - Annotated, Iterable, List, Optional, @@ -34,60 +32,18 @@ from openai.types.chat.chat_completion import Choice as OpenaAIChoice from pydantic import BaseModel, Field -ChatToolFunction: TypeAlias = Function +from nilai_common.api_models.common_model import Source -# ---------- Aliases from the OpenAI SDK ---------- +ChatToolFunction: TypeAlias = Function ImageContent: TypeAlias = ChatCompletionContentPartImageParam TextContent: TypeAlias = ChatCompletionContentPartTextParam -Message: TypeAlias = ChatCompletionMessageParam # SDK union of message shapes - - -# ---------- Domain-specific objects for web search ---------- -class ResultContent(BaseModel): - text: str - truncated: bool = False +Message: TypeAlias = ChatCompletionMessageParam class Choice(OpenaAIChoice): pass -class Source(BaseModel): - source: str - content: str - - -class SearchResult(BaseModel): - title: str - body: str - url: str - content: ResultContent | None = None - - def as_source(self) -> "Source": - text = self.content.text if self.content else self.body - return Source(source=self.url, content=text) - - def model_post_init(self, __context) -> None: - # Auto-derive structured fields when not provided - if self.content is None and isinstance(self.body, str) and self.body: - self.content = ResultContent(text=self.body) - - -class Topic(BaseModel): - topic: str - needs_search: bool = Field(..., alias="needs_search") - - -class TopicResponse(BaseModel): - topics: List[Topic] - - -class TopicQuery(BaseModel): - topic: str - query: str - - -# ---------- Helpers ---------- def _extract_text_from_content(content: Any) -> Optional[str]: """ - If content is a str -> return it (stripped) if non-empty. 
@@ -109,7 +65,6 @@ def _extract_text_from_content(content: Any) -> Optional[str]: return None -# ---------- Adapter over the raw SDK message ---------- class MessageAdapter(BaseModel): """Thin wrapper around an OpenAI ChatCompletionMessageParam with convenience methods.""" @@ -126,8 +81,6 @@ def role( ) -> None: if not isinstance(value, str): raise TypeError("role must be a string") - # Update the underlying SDK message dict - # Cast to Any to bypass TypedDict restrictions cast(Any, self.raw)["role"] = value @property @@ -136,8 +89,6 @@ def content(self) -> Any: @content.setter def content(self, value: Any) -> None: - # Update the underlying SDK message dict - # Cast to Any to bypass TypedDict restrictions cast(Any, self.raw)["content"] = value @staticmethod @@ -234,7 +185,6 @@ def adapt_messages(msgs: List[Message]) -> List[MessageAdapter]: return [MessageAdapter(raw=m) for m in msgs] -# ---------- Your additional containers ---------- class WebSearchEnhancedMessages(BaseModel): messages: List[Message] sources: List[Source] @@ -247,7 +197,6 @@ class WebSearchContext(BaseModel): sources: List[Source] -# ---------- Request/response models ---------- class ChatRequest(BaseModel): model: str messages: List[Message] = Field(..., min_length=1) @@ -264,7 +213,6 @@ class ChatRequest(BaseModel): ) def model_post_init(self, __context) -> None: - # Process messages after model initialization for i, msg in enumerate(self.messages): content = msg.get("content") if ( @@ -272,7 +220,6 @@ def model_post_init(self, __context) -> None: and hasattr(content, "__iter__") and hasattr(content, "__next__") ): - # Convert iterator to list in place cast(Any, msg)["content"] = list(content) @property @@ -338,52 +285,3 @@ class SignedChatCompletion(ChatCompletion): sources: Optional[List[Source]] = Field( default=None, description="Sources used for web search when enabled" ) - - -class ModelMetadata(BaseModel): - id: str = Field(default_factory=lambda: str(uuid.uuid4())) - name: str - version: str - description: str - author: str - license: str - source: str - supported_features: List[str] - tool_support: bool - multimodal_support: bool = False - - -class ModelEndpoint(BaseModel): - url: str - metadata: ModelMetadata - - -class HealthCheckResponse(BaseModel): - status: str - uptime: str - - -# ---------- Attestation ---------- -Nonce = Annotated[ - str, - Field( - max_length=64, - min_length=64, - description="The nonce to be used for the attestation", - ), -] - -AMDAttestationToken = Annotated[ - str, Field(description="The attestation token from AMD's attestation service") -] - -NVAttestationToken = Annotated[ - str, Field(description="The attestation token from NVIDIA's attestation service") -] - - -class AttestationReport(BaseModel): - nonce: Nonce - verifying_key: Annotated[str, Field(description="PEM encoded public key")] - cpu_attestation: AMDAttestationToken - gpu_attestation: NVAttestationToken diff --git a/packages/nilai-common/src/nilai_common/api_models/common_model.py b/packages/nilai-common/src/nilai_common/api_models/common_model.py new file mode 100644 index 00000000..2934a212 --- /dev/null +++ b/packages/nilai-common/src/nilai_common/api_models/common_model.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +import uuid +from typing import Annotated, List +from pydantic import BaseModel, Field + + +class ResultContent(BaseModel): + text: str + truncated: bool = False + + +class Source(BaseModel): + source: str + content: str + + +class SearchResult(BaseModel): + title: str + body: str + url: 
str + content: ResultContent | None = None + + def as_source(self) -> "Source": + text = self.content.text if self.content else self.body + return Source(source=self.url, content=text) + + def model_post_init(self, __context) -> None: + if self.content is None and isinstance(self.body, str) and self.body: + self.content = ResultContent(text=self.body) + + +class Topic(BaseModel): + topic: str + needs_search: bool = Field(..., alias="needs_search") + + +class TopicResponse(BaseModel): + topics: List[Topic] + + +class TopicQuery(BaseModel): + topic: str + query: str + + +class ModelMetadata(BaseModel): + id: str = Field(default_factory=lambda: str(uuid.uuid4())) + name: str + version: str + description: str + author: str + license: str + source: str + supported_features: List[str] + tool_support: bool + multimodal_support: bool = False + + +class ModelEndpoint(BaseModel): + url: str + metadata: ModelMetadata + + +class HealthCheckResponse(BaseModel): + status: str + uptime: str + + +Nonce = Annotated[ + str, + Field( + max_length=64, + min_length=64, + description="The nonce to be used for the attestation", + ), +] + +AMDAttestationToken = Annotated[ + str, Field(description="The attestation token from AMD's attestation service") +] + +NVAttestationToken = Annotated[ + str, Field(description="The attestation token from NVIDIA's attestation service") +] + + +class AttestationReport(BaseModel): + nonce: Nonce + verifying_key: Annotated[str, Field(description="PEM encoded public key")] + cpu_attestation: AMDAttestationToken + gpu_attestation: NVAttestationToken diff --git a/packages/nilai-common/src/nilai_common/api_models/responses_model.py b/packages/nilai-common/src/nilai_common/api_models/responses_model.py new file mode 100644 index 00000000..ad6db3d4 --- /dev/null +++ b/packages/nilai-common/src/nilai_common/api_models/responses_model.py @@ -0,0 +1,202 @@ +from __future__ import annotations + +from typing import Iterable, List, Optional, Union +from typing_extensions import Literal, TypeAlias + +from pydantic import BaseModel, Field + +from openai.types.responses import ( + Response, + ResponseInputParam, + ToolParam, + ResponseFunctionToolCall as OpenAIResponseFunctionToolCall, + ResponseFunctionToolCallOutputItem, +) +from openai.types.responses.response_includable import ResponseIncludable +from openai.types.responses.tool_choice_options import ToolChoiceOptions +from openai.types.responses.tool_choice_allowed_param import ToolChoiceAllowedParam +from openai.types.responses.tool_choice_types_param import ToolChoiceTypesParam +from openai.types.responses.tool_choice_function_param import ToolChoiceFunctionParam +from openai.types.responses.tool_choice_mcp_param import ToolChoiceMcpParam +from openai.types.responses.tool_choice_custom_param import ToolChoiceCustomParam +from openai.types.responses.response_prompt_param import ResponsePromptParam +from openai.types.responses.response_text_config_param import ResponseTextConfigParam +from openai.types.responses.response_conversation_param import ResponseConversationParam +from openai.types.responses.response_input_param import ( + ResponseInputItemParam, +) +from openai.types.responses.easy_input_message_param import EasyInputMessageParam +from openai.types.responses.response_function_tool_call_param import ( + ResponseFunctionToolCallParam, +) +from openai.types.shared_params.metadata import Metadata +from openai.types.shared_params.reasoning import Reasoning +from openai.types.shared_params.responses_model import ResponsesModel + +from 
.common_model import Source + +ToolChoice: TypeAlias = Union[ + ToolChoiceOptions, + ToolChoiceAllowedParam, + ToolChoiceTypesParam, + ToolChoiceFunctionParam, + ToolChoiceMcpParam, + ToolChoiceCustomParam, +] + +Conversation: TypeAlias = Union[str, ResponseConversationParam] + +ResponseFunctionToolCall: TypeAlias = OpenAIResponseFunctionToolCall +FunctionCallOutput: TypeAlias = ResponseFunctionToolCallOutputItem + +__all__ = [ + "Response", + "ResponseInputParam", + "ToolParam", + "ResponseInputItemParam", + "EasyInputMessageParam", + "ResponseFunctionToolCallParam", + "ResponseFunctionToolCall", + "FunctionCallOutput", + "ResponseFunctionToolCallOutputItem", + "ToolChoice", + "Conversation", + "StreamOptions", + "ResponseRequest", + "WebSearchEnhancedInput", + "SignedResponse", +] + + +class StreamOptions(BaseModel): + """Configuration options for streaming responses.""" + + include_obfuscation: Optional[bool] = None + + +class ResponseRequest(BaseModel): + """Request model for generating AI responses with various configuration options.""" + + model: ResponsesModel + + input: Union[str, ResponseInputParam] + instructions: Optional[str] = None + + stream: Optional[bool] = Field( + default=False, + description=( + "If true, the response will stream using SSE. Matches the official " + "ResponseCreateParamsStreaming vs NonStreaming union behavior." + ), + ) + stream_options: Optional[StreamOptions] = None + + tools: Optional[Iterable[ToolParam]] = None + tool_choice: Optional[ToolChoice] = None + parallel_tool_calls: Optional[bool] = None + max_tool_calls: Optional[int] = Field(default=None, ge=0) + + background: Optional[bool] = None + conversation: Optional[Conversation] = None + previous_response_id: Optional[str] = None + include: Optional[List[ResponseIncludable]] = None + prompt: Optional[ResponsePromptParam] = None + + max_output_tokens: Optional[int] = Field(default=None, ge=1) + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_logprobs: Optional[int] = Field(default=None, ge=0, le=20) + + text: Optional[ResponseTextConfigParam] = None + + reasoning: Optional[Reasoning] = None + + prompt_cache_key: Optional[str] = None + safety_identifier: Optional[str] = None + service_tier: Optional[Literal["auto", "default", "flex", "scale", "priority"]] = ( + None + ) + store: Optional[bool] = None + truncation: Optional[Literal["auto", "disabled"]] = None + user: Optional[str] = None + + metadata: Optional[Metadata] = None + + web_search: Optional[bool] = Field( + default=False, + description="Enable web search to enhance context with current information", + ) + + class Config: + arbitrary_types_allowed = True + + def has_multimodal_content(self) -> bool: + if isinstance(self.input, str): + return False + + if isinstance(self.input, list): + for item in self.input: + if isinstance(item, dict): + if item.get("type") == "input_image": + return True + + content = item.get("content") + if isinstance(content, list): + for part in content: + if ( + isinstance(part, dict) + and part.get("type") == "input_image" + ): + return True + return False + + def get_last_user_query(self) -> Optional[str]: + if isinstance(self.input, str): + return self.input + + if isinstance(self.input, list): + for item in reversed(self.input): + if isinstance(item, dict): + role = item.get("role") + if role == "user": + content = item.get("content") + if isinstance(content, str): + return content.strip() or None + elif isinstance(content, 
list): + text_parts = [] + for part in content: + if ( + isinstance(part, dict) + and part.get("type") == "input_text" + ): + text = part.get("text") + if isinstance(text, str) and text.strip(): + text_parts.append(text.strip()) + if text_parts: + return "\n".join(text_parts) + return None + + def ensure_instructions(self, additional_instructions: str) -> None: + if self.instructions: + self.instructions = self.instructions + "\n\n" + additional_instructions + else: + self.instructions = additional_instructions + + +class WebSearchEnhancedInput(BaseModel): + """Input model enhanced with web search sources for context-aware responses.""" + + input: Union[str, ResponseInputParam] + instructions: Optional[str] + sources: List[Source] + + +class SignedResponse(Response): + """ + An extension of the official Response object with cryptographic signature and web search sources. + """ + + signature: str + sources: Optional[List[Source]] = Field( + default=None, description="Sources used for web search when enabled" + ) diff --git a/packages/nilai-common/src/nilai_common/discovery.py b/packages/nilai-common/src/nilai_common/discovery.py index 7d3b1cf8..a604fecb 100644 --- a/packages/nilai-common/src/nilai_common/discovery.py +++ b/packages/nilai-common/src/nilai_common/discovery.py @@ -9,7 +9,7 @@ from etcd3gw import Lease from etcd3gw.client import Etcd3Client -from nilai_common.api_model import ModelEndpoint, ModelMetadata +from nilai_common.api_models import ModelEndpoint, ModelMetadata # Configure logging logging.basicConfig(level=logging.INFO) diff --git a/scripts/wait_for_ci_services.sh b/scripts/wait_for_ci_services.sh index 163fc50c..71d05ee2 100755 --- a/scripts/wait_for_ci_services.sh +++ b/scripts/wait_for_ci_services.sh @@ -2,16 +2,22 @@ # Wait for the services to be ready API_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-api 2>/dev/null) -MODEL_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-llama_1b_gpu 2>/dev/null) +MODEL_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai_gpt_20b_gpu_1 2>/dev/null) NUC_API_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-nuc-api 2>/dev/null) MAX_ATTEMPTS=30 ATTEMPT=1 while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do echo "Waiting for nilai to become healthy... 
API:[$API_HEALTH_STATUS] MODEL:[$MODEL_HEALTH_STATUS] NUC_API:[$NUC_API_HEALTH_STATUS] (Attempt $ATTEMPT/$MAX_ATTEMPTS)" + + echo "===== Model Container Logs (last 50 lines) =====" + docker logs --tail 50 nilai_gpt_20b_gpu_1 2>&1 + docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" + echo "=================================================" + sleep 30 API_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-api 2>/dev/null) - MODEL_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-llama_1b_gpu 2>/dev/null) + MODEL_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai_gpt_20b_gpu_1 2>/dev/null) NUC_API_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-nuc-api 2>/dev/null) if [ "$API_HEALTH_STATUS" = "healthy" ] && [ "$MODEL_HEALTH_STATUS" = "healthy" ] && [ "$NUC_API_HEALTH_STATUS" = "healthy" ]; then break @@ -28,7 +34,7 @@ fi echo "MODEL_HEALTH_STATUS: $MODEL_HEALTH_STATUS" if [ "$MODEL_HEALTH_STATUS" != "healthy" ]; then - echo "Error: nilai-llama_1b_gpu failed to become healthy after $MAX_ATTEMPTS attempts" + echo "Error: nilai_gpt_20b_gpu_1 failed to become healthy after $MAX_ATTEMPTS attempts" exit 1 fi diff --git a/tests/e2e/config.py b/tests/e2e/config.py index 2ce0c652..81797ead 100644 --- a/tests/e2e/config.py +++ b/tests/e2e/config.py @@ -37,7 +37,7 @@ def api_key_getter() -> str: "meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.1-8B-Instruct", ], - "ci": ["meta-llama/Llama-3.2-1B-Instruct"], + "ci": ["openai/gpt-oss-20b"], } if ENVIRONMENT not in models: @@ -46,3 +46,4 @@ def api_key_getter() -> str: f"Environment {ENVIRONMENT} not found in models, using {ENVIRONMENT} as default" ) test_models = models[ENVIRONMENT] +WEB_SEARCH_RPS = getattr(CONFIG.web_search, "rps", None) diff --git a/tests/e2e/test_openai.py b/tests/e2e/test_chat_completions.py similarity index 74% rename from tests/e2e/test_openai.py rename to tests/e2e/test_chat_completions.py index 987365d4..72b491f9 100644 --- a/tests/e2e/test_openai.py +++ b/tests/e2e/test_chat_completions.py @@ -1,17 +1,21 @@ """ -Test suite for nilAI OpenAI client +Test suite for nilAI Chat Completions endpoint using OpenAI client This test suite uses the OpenAI client to make requests to the nilAI API. 
To run the tests, use the following command: -pytest tests/e2e/test_openai.py +pytest tests/e2e/test_chat_completions.py """ import json +import os +import re import httpx import pytest +import pytest_asyncio from openai import OpenAI +from openai import AsyncOpenAI from openai.types.chat import ChatCompletion from .config import BASE_URL, test_models, AUTH_STRATEGY, api_key_getter from .nuc import ( @@ -31,6 +35,15 @@ def _create_openai_client(api_key: str) -> OpenAI: ) +def _create_async_openai_client(api_key: str) -> AsyncOpenAI: + transport = httpx.AsyncHTTPTransport(verify=False) + return AsyncOpenAI( + base_url=BASE_URL, + api_key=api_key, + http_client=httpx.AsyncClient(transport=transport), + ) + + @pytest.fixture def client(): """Create an OpenAI client configured to use the Nilai API""" @@ -39,6 +52,18 @@ def client(): return _create_openai_client(invocation_token) +@pytest_asyncio.fixture +async def async_client(): + invocation_token: str = api_key_getter() + transport = httpx.AsyncHTTPTransport(verify=False) + httpx_client = httpx.AsyncClient(transport=transport) + client = AsyncOpenAI( + base_url=BASE_URL, api_key=invocation_token, http_client=httpx_client + ) + yield client + await httpx_client.aclose() + + @pytest.fixture def rate_limited_client(): """Create an OpenAI client configured to use the Nilai API with rate limiting""" @@ -60,6 +85,33 @@ def nildb_client(): return _create_openai_client(invocation_token.token) +@pytest.fixture +def high_web_search_rate_limit(monkeypatch): + """Set high rate limits for web search for RPS tests""" + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_MINUTE", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_HOUR", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_DAY", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT", "9999") + monkeypatch.setenv("USER_RATE_LIMIT_MINUTE", "9999") + monkeypatch.setenv("USER_RATE_LIMIT_HOUR", "9999") + monkeypatch.setenv("USER_RATE_LIMIT_DAY", "9999") + monkeypatch.setenv("USER_RATE_LIMIT", "9999") + monkeypatch.setenv( + "MODEL_CONCURRENT_RATE_LIMIT", + ( + '{"meta-llama/Llama-3.2-1B-Instruct": 500, ' + '"meta-llama/Llama-3.2-3B-Instruct": 500, ' + '"meta-llama/Llama-3.1-8B-Instruct": 300, ' + '"cognitivecomputations/Dolphin3.0-Llama3.1-8B": 300, ' + '"deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 50, ' + '"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 50, ' + '"openai/gpt-oss-20b": 500, ' + '"google/gemma-3-27b-it": 500, ' + '"default": 500}' + ), + ) + + @pytest.mark.parametrize( "model", test_models, @@ -229,10 +281,12 @@ def test_streaming_chat_completion(client, model): "role": "system", "content": "You are a helpful assistant that provides accurate and concise information.", }, - {"role": "user", "content": "Write a short poem about mountains."}, + { + "role": "user", + "content": "Write a short poem about mountains. 
It must be 20 words maximum.", + }, ], temperature=0.2, - max_tokens=100, stream=True, ) @@ -244,14 +298,13 @@ def test_streaming_chat_completion(client, model): for chunk in stream: chunk_count += 1 if chunk.choices and chunk.choices[0].delta.content: - content_piece = chunk.choices[0].delta.content + content_piece = chunk.choices[0].delta.content or "" full_content += content_piece print(f"Model {model} stream chunk {chunk_count}: {chunk}") if chunk.usage: had_usage = True print(f"Model {model} usage: {chunk.usage}") - break assert had_usage, f"No usage data received for {model} streaming request" assert chunk_count > 0, f"No chunks received for {model} streaming request" assert full_content, f"No content assembled from stream for {model}" @@ -272,6 +325,11 @@ def test_streaming_chat_completion(client, model): ) def test_function_calling(client, model): """Test function calling with different models""" + if model == "openai/gpt-oss-20b": + pytest.skip( + "Skipping test for openai/gpt-oss-20b model as it only supports responses endpoint" + ) + try: response = client.chat.completions.create( model=model, @@ -407,13 +465,18 @@ def test_function_calling(client, model): ) def test_function_calling_with_streaming(client, model): """Test function calling with different models""" + if model == "openai/gpt-oss-20b": + pytest.skip( + "Skipping test for openai/gpt-oss-20b model as it only supports responses endpoint" + ) + try: response = client.chat.completions.create( model=model, messages=[ { "role": "system", - "content": "You are a helpful assistant that provides accurate and concise information.", + "content": "You are a helpful assistant that provides accurate and concise information. You must use the get_weather tool to get the weather information.", }, { "role": "user", @@ -441,6 +504,7 @@ def test_function_calling_with_streaming(client, model): }, } ], + tool_choice={"type": "function", "function": {"name": "get_weather"}}, temperature=0.2, stream=True, ) @@ -686,13 +750,16 @@ def test_chat_completion_high_temperature(client): "content": "Write an imaginative story about a wizard.", }, ], - temperature=5.0, # Extremely high temperature for creative responses + temperature=1.99, # Extremely high temperature for creative responses max_tokens=50, ) assert response, "High temperature request should return a valid response" assert response.choices, "Response should contain choices" assert len(response.choices) > 0, "At least one choice should be present" - assert response.choices[0].message.content, "Response should contain content" + assert ( + response.choices[0].message.content + or response.choices[0].message.reasoning_content + ), "Response should contain content or reasoning_content" def test_model_streaming_request_high_token(client): @@ -726,228 +793,121 @@ def test_model_streaming_request_high_token(client): "model", test_models, ) -def test_web_search(client, model): +def test_web_search(client, model, high_web_search_rate_limit): """Test web_search functionality with proper source validation.""" - import time - import openai - max_retries = 5 - last_exception = None - - for attempt in range(max_retries): - try: - print(f"\nAttempt {attempt + 1}/{max_retries}...") + response = client.chat.completions.create( + model=model, + messages=[ + { + "role": "system", + "content": "You are a helpful assistant that provides accurate and up-to-date information.", + }, + { + "role": "user", + "content": "Who won the Roland Garros Open in 2024? 
Just reply with the winner's name.", + }, + ], + extra_body={"web_search": True}, + temperature=0.2, + max_tokens=150, + ) - response = client.chat.completions.create( - model=model, - messages=[ - { - "role": "system", - "content": "You are a helpful assistant that provides accurate and up-to-date information.", - }, - { - "role": "user", - "content": "Who won the Roland Garros Open in 2024? Just reply with the winner's name.", - }, - ], - extra_body={"web_search": True}, - temperature=0.2, - max_tokens=150, - ) + assert isinstance(response, ChatCompletion), ( + "Response should be a ChatCompletion object" + ) + assert response.model == model, f"Response model should be {model}" + assert len(response.choices) > 0, "Response should contain at least one choice" - assert isinstance(response, ChatCompletion), ( - "Response should be a ChatCompletion object" - ) - assert response.model == model, f"Response model should be {model}" - assert len(response.choices) > 0, ( - "Response should contain at least one choice" - ) + content = response.choices[0].message.content + reasoning_content = getattr(response.choices[0].message, "reasoning_content", None) + assert content or reasoning_content, ( + "Response should contain content or reasoning_content" + ) - content = response.choices[0].message.content - assert content, "Response should contain content" - - sources = getattr(response, "sources", None) - assert sources is not None, "Sources field should not be None" - assert isinstance(sources, list), "Sources should be a list" - assert len(sources) > 0, "Sources should not be empty" - - print(f"Success on attempt {attempt + 1}") - return - except openai.RateLimitError as e: - print(f"Rate limit hit on attempt {attempt + 1}: {e}") - except AssertionError as e: - print(f"Assertion failed on attempt {attempt + 1}: {e}") - last_exception = e - if attempt < max_retries - 1: - print("Retrying...") - time.sleep(1) - else: - print("All retries failed.") - raise last_exception - - -def test_web_search_brave_rps_e2e(client): - """Test that web search requests are rate limited to 20 per second globally for the Brave API.""" - import threading - import time - import openai - from concurrent.futures import ThreadPoolExecutor, as_completed + sources = getattr(response, "sources", None) + assert sources is not None, "Sources field should not be None" + assert isinstance(sources, list), "Sources should be a list" + assert len(sources) > 0, "Sources should not be empty" - # Use a barrier to ensure all requests start simultaneously - request_barrier = threading.Barrier(40) - responses = [] - start_time = None - def make_request(): - request_barrier.wait() +@pytest.mark.skipif( + not os.environ.get("E2B_API_KEY"), + reason="Requires E2B_API_KEY for code execution sandbox", +) +@pytest.mark.parametrize("model", test_models) +def test_execute_python_sha256_simple_e2e(client, model): + # Some tiny models don't support tool calls; skip those explicitly. 
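    # The skip below targets openai/gpt-oss-20b, whose tool calls are exercised
    # only through the /v1/responses endpoint in this suite. The `expected`
    # constant further down is the SHA-256 digest of the decimal string "7919"
    # (the 1,000th prime) and can be re-derived locally with:
    #   python -c "import hashlib; print(hashlib.sha256(b'7919').hexdigest())"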
+ if model == "openai/gpt-oss-20b": + pytest.skip( + "Skipping test for openai/gpt-oss-20b model as it only supports responses endpoint" + ) - nonlocal start_time - if start_time is None: - start_time = time.time() + # Expected SHA-256 of the *string* "7919" (the 1,000th prime) + expected = "a8054ef7fc192135dd8dc07d4d9832c9fa9bd39d01ba383e29e378f5cc72cacd" - try: - response = client.chat.completions.create( - model=test_models[0], - messages=[{"role": "user", "content": "What is the latest news?"}], - extra_body={"web_search": True}, - max_tokens=10, - temperature=0.0, - ) - completion_time = time.time() - start_time - responses.append((completion_time, response, "success")) - except openai.RateLimitError as e: - completion_time = time.time() - start_time - responses.append((completion_time, e, "rate_limited")) - except Exception as e: - completion_time = time.time() - start_time - responses.append((completion_time, e, "error")) - - with ThreadPoolExecutor(max_workers=40) as executor: - futures = [executor.submit(make_request) for _ in range(40)] - - for future in as_completed(futures): - try: - future.result() - except Exception as e: - print(f"Thread execution error: {e}") - - assert len(responses) == 40, "All requests should complete" - - successful_responses = [(t, r) for t, r, status in responses if status == "success"] - rate_limited_responses = [ - (t, r) for t, r, status in responses if status == "rate_limited" - ] - error_responses = [(t, r) for t, r, status in responses if status == "error"] - - print( - f"Successful: {len(successful_responses)}, Rate limited: {len(rate_limited_responses)}, Errors: {len(error_responses)}" + system_msg = ( + "You are a helpful assistant. If a question requires running code, " + "you MUST use the execute_python tool to run it and use the tool's output " + "to produce your final answer. Do not include any code or JSON in the final answer; " + "just the result." ) - # Verify rate limiting behavior - # At least some requests should be rate limited or delayed - assert len(rate_limited_responses) > 0 or len(successful_responses) < 40, ( - "Rate limiting should be enforced - either some requests should be rate limited or delayed" + # Simple, concrete task that requires code execution for a reliable result. + user_msg = ( + "Find the 1,000th prime number using Python (by writing and running code). " + "Then output ONLY the SHA-256 hash of that prime, as a 64-character lowercase hex string." 
) - for t, response in successful_responses: - assert isinstance(response, ChatCompletion), ( - "Response should be a ChatCompletion object" - ) - sources = getattr(response, "sources", None) - assert sources is not None, ( - "Successful web search responses should have sources" - ) - assert isinstance(sources, list), "Sources should be a list" - assert len(sources) > 0, "Sources should not be empty" + trials = 2 + pattern = rf"\b{re.escape(expected)}\b" + last_completion = None + last_content = "" - for t, error in rate_limited_responses: - assert isinstance(error, openai.RateLimitError), ( - "Rate limited responses should be RateLimitError" + for _ in range(trials): + completion = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_msg}, + {"role": "user", "content": user_msg}, + ], + temperature=0, + tools=[ + { + "type": "function", + "function": { + "name": "execute_python", + "description": "Executes a snippet of Python code in a secure sandbox and returns the standard output.", + "parameters": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "The Python code to be executed.", + } + }, + "required": ["code"], + "additionalProperties": False, + }, + }, + } + ], ) + last_completion = completion + if not completion.choices: + continue -def test_web_search_queueing_next_second_e2e(client): - """Test that web search requests are properly queued and processed in batches.""" - import threading - import time - import openai - from concurrent.futures import ThreadPoolExecutor, as_completed - - request_barrier = threading.Barrier(25) - responses = [] - start_time = None - - def make_request(): - request_barrier.wait() - - nonlocal start_time - if start_time is None: - start_time = time.time() - - try: - response = client.chat.completions.create( - model=test_models[0], - messages=[{"role": "user", "content": "What is the weather like?"}], - extra_body={"web_search": True}, - max_tokens=10, - temperature=0.0, + content = completion.choices[0].message.content or "" + last_content = content + if re.search(pattern, content.strip()): + break + else: + pytest.fail( + ( + "Expected exact SHA-256 hash not found after retries.\n" + f"Got: {last_content[:200]}...\n" + f"Expected: {expected}\n" + f"Full: {last_completion.model_dump_json() if last_completion else ''}" ) - completion_time = time.time() - start_time - responses.append((completion_time, response, "success")) - except openai.RateLimitError as e: - completion_time = time.time() - start_time - responses.append((completion_time, e, "rate_limited")) - except Exception as e: - completion_time = time.time() - start_time - responses.append((completion_time, e, "error")) - - with ThreadPoolExecutor(max_workers=25) as executor: - futures = [executor.submit(make_request) for _ in range(25)] - - for future in as_completed(futures): - try: - future.result() - except Exception as e: - print(f"Thread execution error: {e}") - - assert len(responses) == 25, "All requests should complete" - - # Categorize responses - successful_responses = [(t, r) for t, r, status in responses if status == "success"] - rate_limited_responses = [ - (t, r) for t, r, status in responses if status == "rate_limited" - ] - error_responses = [(t, r) for t, r, status in responses if status == "error"] - - print( - f"Successful: {len(successful_responses)}, Rate limited: {len(rate_limited_responses)}, Errors: {len(error_responses)}" - ) - - # Verify queuing behavior - # With 25 requests and 20 RPS 
limit, some should be queued or rate limited - assert len(rate_limited_responses) > 0 or len(successful_responses) < 25, ( - "Queuing should be enforced - either some requests should be rate limited or delayed" - ) - - for t, response in successful_responses: - assert isinstance(response, ChatCompletion), ( - "Response should be a ChatCompletion object" - ) - assert len(response.choices) > 0, "Response should contain at least one choice" - assert response.choices[0].message.content, "Response should contain content" - - sources = getattr(response, "sources", None) - assert sources is not None, "Web search responses should have sources" - assert isinstance(sources, list), "Sources should be a list" - assert len(sources) > 0, "Sources should not be empty" - - first_source = sources[0] - assert isinstance(first_source, dict), "First source should be a dictionary" - assert "title" in first_source, "First source should have title" - assert "url" in first_source, "First source should have url" - assert "snippet" in first_source, "First source should have snippet" - - for t, error in rate_limited_responses: - assert isinstance(error, openai.RateLimitError), ( - "Rate limited responses should be RateLimitError" ) diff --git a/tests/e2e/test_http.py b/tests/e2e/test_chat_completions_http.py similarity index 82% rename from tests/e2e/test_http.py rename to tests/e2e/test_chat_completions_http.py index 47db7136..32538f0b 100644 --- a/tests/e2e/test_http.py +++ b/tests/e2e/test_chat_completions_http.py @@ -1,15 +1,16 @@ """ -Test suite for nilAI HTTP API +Test suite for nilAI Chat Completions endpoint using HTTP client This test suite uses httpx to make requests to the nilAI HTTP API. To run the tests, use the following command: -pytest tests/e2e/test_http.py +pytest tests/e2e/test_chat_completions_http.py """ import json - +import os +import re from .config import BASE_URL, test_models, AUTH_STRATEGY, api_key_getter from .nuc import ( @@ -305,7 +306,10 @@ def test_model_streaming_request(client, model): "role": "system", "content": "You are a helpful assistant that provides accurate and concise information.", }, - {"role": "user", "content": "Write a short poem about mountains."}, + { + "role": "user", + "content": "Write a short poem about mountains. It must be 20 words maximum.", + }, ], "temperature": 0.2, "stream": True, @@ -355,12 +359,16 @@ def test_model_streaming_request(client, model): ) def test_model_tools_request(client, model): """Test tools request for different models""" + if model == "openai/gpt-oss-20b": + pytest.skip( + "openai/gpt-oss-20b model only supports tool calls with responses endpoint" + ) payload = { "model": model, "messages": [ { "role": "system", - "content": "You are a helpful assistant. When a user asks a question that requires calculation, use the execute_python tool to find the answer. After the tool provides its result, you must use that result to formulate a clear, final answer to the user's original question. Do not include any code or JSON in your final response.", + "content": "You are a helpful assistant. 
When a user asks a question that requires weather, use the get_weather tool to get the weather information.", }, {"role": "user", "content": "What is the weather like in Paris today?"}, ], @@ -442,12 +450,17 @@ def test_model_tools_request(client, model): @pytest.mark.parametrize("model", test_models) def test_function_calling_with_streaming_httpx(client, model): """Test function calling with streaming using httpx, verifying tool calls and usage data.""" + if model == "openai/gpt-oss-20b": + pytest.skip( + "Skipping test for openai/gpt-oss-20b model as it only supports responses endpoint" + ) + payload = { "model": model, "messages": [ { "role": "system", - "content": "You are a helpful assistant that provides accurate and concise information.", + "content": "You are a helpful assistant that provides accurate and concise information. For getting the weather, use a function call with get_weather.", }, { "role": "user", @@ -475,6 +488,7 @@ def test_function_calling_with_streaming_httpx(client, model): }, } ], + "tool_choice": {"type": "function", "function": {"name": "get_weather"}}, "temperature": 0.2, "stream": True, } @@ -620,7 +634,7 @@ def test_invalid_nildb_command_nucs(nildb_client): def test_large_payload_handling(client): """Test handling of large input payloads""" # Create a very large system message - large_system_message = "Hello " * 10000 # 100KB of text + large_system_message = "Hello " * 1000 # ~6KB of text payload = { "model": test_models[0], @@ -848,6 +862,9 @@ def test_nildb_delegation(client: httpx.Client): ) +@pytest.mark.skip( + reason="prompt cannot be accessed because of a secretvaults-py update" +) @pytest.mark.parametrize( "model", test_models, @@ -860,10 +877,6 @@ def test_nildb_prompt_document(document_id_client: httpx.Client, model): payload = { "model": model, "messages": [ - { - "role": "system", - "content": "You are a helpful assistant.", - }, {"role": "user", "content": "Can you make a small rhyme?"}, ], "temperature": 0.2, @@ -877,3 +890,150 @@ # Response must talk about cheese which is what the prompt document contains message: str = response.json()["choices"][0].get("message", {}).get("content", None) assert "cheese" in message.lower(), "Response should contain cheese" + + +@pytest.fixture +def high_web_search_rate_limit(monkeypatch): + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_MINUTE", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_HOUR", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_DAY", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT", "9999") + + +@pytest.mark.parametrize("model", test_models) +def test_web_search(client, model, high_web_search_rate_limit): + """Test web_search functionality with proper source validation.""" + payload = { + "model": model, + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant that provides accurate and up-to-date information.", + }, + { + "role": "user", + "content": "Who won the Roland Garros Open in 2024? 
Just reply with the winner's name.", + }, + ], + "extra_body": {"web_search": True}, + "temperature": 0.2, + "max_tokens": 150, + } + + response = client.post("/chat/completions", json=payload, timeout=60) + assert response.status_code == 200, ( + f"Response for {model} failed with status {response.status_code}" + ) + + response_json = response.json() + assert response_json.get("model") == model, f"Response model should be {model}" + assert "choices" in response_json, "Response should contain choices" + assert len(response_json["choices"]) > 0, ( + "Response should contain at least one choice" + ) + + message = response_json["choices"][0].get("message", {}) + content = message.get("content", "") + reasoning_content = message.get("reasoning_content", "") + + assert content or reasoning_content, ( + "Response should contain content or reasoning_content" + ) + + sources = response_json.get("sources") + if sources is not None: + assert isinstance(sources, list), "Sources should be a list" + assert len(sources) > 0, "Sources should not be empty" + print(f"Sources found: {len(sources)}") + else: + print( + "Warning: Sources field is None - web search may not be enabled or working properly" + ) + + print( + f"\nModel {model} web search response: {content[:100] if content else 'No content'}..." + ) + + +@pytest.mark.skipif( + not os.environ.get("E2B_API_KEY"), + reason="Requires E2B_API_KEY for code execution sandbox", +) +@pytest.mark.parametrize("model", test_models) +def test_execute_python_sha256_e2e(client, model): + if model == "openai/gpt-oss-20b": + pytest.skip( + "Skipping test for openai/gpt-oss-20b model as it only supports responses endpoint" + ) + + expected = "75cc238b167a05ab7336d773cb096735d459df2f0df9c8df949b1c44075df8a5" + + system_msg = ( + "You are a helpful assistant. When a user asks a question that requires code execution, " + "use the execute_python tool to find the answer. After the tool provides its result, " + "you must use that result to formulate a clear, final answer to the user's original question. " + "Do not include any code or JSON in your final response." 
+ ) + user_msg = "Execute this exact Python code and return the result: import hashlib; print(hashlib.sha256('Nillion'.encode()).hexdigest())" + + payload = { + "model": model, + "messages": [ + {"role": "system", "content": system_msg}, + {"role": "user", "content": user_msg}, + ], + "temperature": 0, + "tools": [ + { + "type": "function", + "function": { + "name": "execute_python", + "description": "Executes a snippet of Python code in a secure sandbox and returns the standard output.", + "parameters": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "The Python code to be executed.", + } + }, + "required": ["code"], + "additionalProperties": False, + }, + "strict": True, + }, + } + ], + } + + trials = 3 + escaped_expected = re.escape(expected) + pattern = rf"\b{escaped_expected}\b" + last_data = None + last_content = "" + last_status = None + for _ in range(trials): + response = client.post("/chat/completions", json=payload) + last_status = response.status_code + if response.status_code != 200: + continue + data = response.json() + last_data = data + if not ("choices" in data and data["choices"]): + continue + message = data["choices"][0].get("message", {}) + content = message.get("content") or "" + last_content = content + normalized_content = re.sub(r"\s+", " ", content) + if re.search(pattern, normalized_content): + break + else: + pytest.fail( + ( + "Expected exact SHA-256 hash not found after retries.\n" + f"Last status: {last_status}\n" + f"Got: {last_content[:200]}...\n" + f"Expected: {expected}\n" + f"Full: {json.dumps(last_data, indent=2)[:1000] if last_data else ''}" + ) + ) diff --git a/tests/e2e/test_code_execution.py b/tests/e2e/test_code_execution.py deleted file mode 100644 index f0ea43d9..00000000 --- a/tests/e2e/test_code_execution.py +++ /dev/null @@ -1,109 +0,0 @@ -import os -import json -import re - -import httpx -import pytest - -from .config import BASE_URL, test_models, api_key_getter - - -# Skip entire module if sandbox key not present -pytestmark = pytest.mark.skipif( - not os.environ.get("E2B_API_KEY"), - reason="Requires E2B_API_KEY for code execution sandbox", -) - - -@pytest.fixture -def client(): - try: - token = api_key_getter() - except Exception as e: - pytest.skip(f"Skipping: missing auth token for e2e ({e})") - return httpx.Client( - base_url=BASE_URL, - headers={ - "accept": "application/json", - "Content-Type": "application/json", - "Authorization": f"Bearer {token}", - }, - verify=False, - timeout=None, - ) - - -@pytest.mark.parametrize("model", test_models) -def test_execute_python_sha256_e2e(client, model): - expected = "75cc238b167a05ab7336d773cb096735d459df2f0df9c8df949b1c44075df8a5" - - system_msg = ( - "You are a helpful assistant. When a user asks a question that requires code execution, " - "use the execute_python tool to find the answer. After the tool provides its result, " - "you must use that result to formulate a clear, final answer to the user's original question. " - "Do not include any code or JSON in your final response." 
- ) - user_msg = "Execute this exact Python code and return the result: import hashlib; print(hashlib.sha256('Nillion'.encode()).hexdigest())" - - payload = { - "model": model, - "messages": [ - {"role": "system", "content": system_msg}, - {"role": "user", "content": user_msg}, - ], - "temperature": 0, - "tools": [ - { - "type": "function", - "function": { - "name": "execute_python", - "description": "Executes a snippet of Python code in a secure sandbox and returns the standard output.", - "parameters": { - "type": "object", - "properties": { - "code": { - "type": "string", - "description": "The Python code to be executed.", - } - }, - "required": ["code"], - "additionalProperties": False, - }, - "strict": True, - }, - } - ], - } - - # Retry up to 3 times since small models can be non-deterministic - trials = 3 - escaped_expected = re.escape(expected) - pattern = rf"\b{escaped_expected}\b" - last_data = None - last_content = "" - last_status = None - for _ in range(trials): - response = client.post("/chat/completions", json=payload) - last_status = response.status_code - if response.status_code != 200: - continue - data = response.json() - last_data = data - if not ("choices" in data and data["choices"]): - continue - message = data["choices"][0].get("message", {}) - content = message.get("content") or "" - last_content = content - normalized_content = re.sub(r"\s+", " ", content) - if re.search(pattern, normalized_content): - break - else: - pytest.fail( - ( - "Expected exact SHA-256 hash not found after retries.\n" - f"Last status: {last_status}\n" - f"Got: {last_content[:200]}...\n" - f"Expected: {expected}\n" - f"Full: {json.dumps(last_data, indent=2)[:1000] if last_data else ''}" - ) - ) diff --git a/tests/e2e/test_responses.py b/tests/e2e/test_responses.py new file mode 100644 index 00000000..f5f931c5 --- /dev/null +++ b/tests/e2e/test_responses.py @@ -0,0 +1,804 @@ +import json +import os +import httpx +import pytest +import pytest_asyncio +from openai import OpenAI +from openai import AsyncOpenAI + +from .config import BASE_URL, test_models, AUTH_STRATEGY, api_key_getter +from .nuc import ( + get_rate_limited_nuc_token, + get_invalid_rate_limited_nuc_token, + get_nildb_nuc_token, +) + + +def _create_openai_client(api_key: str) -> OpenAI: + """Helper function to create an OpenAI client with SSL verification disabled""" + transport = httpx.HTTPTransport(verify=False) + return OpenAI( + base_url=BASE_URL, + api_key=api_key, + http_client=httpx.Client(transport=transport), + ) + + +def _create_async_openai_client(api_key: str) -> AsyncOpenAI: + transport = httpx.AsyncHTTPTransport(verify=False) + return AsyncOpenAI( + base_url=BASE_URL, + api_key=api_key, + http_client=httpx.AsyncClient(transport=transport), + ) + + +@pytest.fixture +def client(): + invocation_token: str = api_key_getter() + return _create_openai_client(invocation_token) + + +@pytest_asyncio.fixture +async def async_client(): + invocation_token: str = api_key_getter() + transport = httpx.AsyncHTTPTransport(verify=False) + httpx_client = httpx.AsyncClient(transport=transport) + client = AsyncOpenAI( + base_url=BASE_URL, api_key=invocation_token, http_client=httpx_client + ) + yield client + await httpx_client.aclose() + + +@pytest.fixture +def rate_limited_client(): + invocation_token = get_rate_limited_nuc_token(rate_limit=1) + return _create_openai_client(invocation_token.token) + + +@pytest.fixture +def invalid_rate_limited_client(): + invocation_token = get_invalid_rate_limited_nuc_token() + return 
_create_openai_client(invocation_token.token) + + +@pytest.fixture +def nildb_client(): + invocation_token = get_nildb_nuc_token() + return _create_openai_client(invocation_token.token) + + +@pytest.fixture +def high_web_search_rate_limit(monkeypatch): + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_MINUTE", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_HOUR", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_DAY", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT", "9999") + + +@pytest.mark.parametrize("model", test_models) +def test_response_generation(client, model): + """Test basic response generation with different models""" + try: + response = client.responses.create( + model=model, + input="What is the capital of France?", + instructions="You are a helpful assistant that provides accurate and concise information.", + temperature=0.2, + max_output_tokens=100, + ) + + assert hasattr(response, "output"), "Response should contain output" + assert hasattr(response, "signature"), "Response should contain signature" + assert hasattr(response, "usage"), "Response should contain usage" + assert response.model == model, f"Response model should be {model}" + + output = response.output + assert isinstance(output, list), "Output should be a list" + assert len(output) > 0, "Output should contain at least one item" + + message_item = next( + (item for item in output if getattr(item, "type", None) == "message"), None + ) + assert message_item is not None, "Output should contain a message item" + + message_content_list = getattr(message_item, "content", []) + assert len(message_content_list) > 0, "Message item should have content" + + text_item = next( + ( + c + for c in message_content_list + if getattr(c, "type", None) == "output_text" + ), + None, + ) + assert text_item is not None, ( + "Message content should contain an output_text item" + ) + + content = getattr(text_item, "text", "") + + assert content, f"No content returned for {model}" + print( + f"\nModel {model} response: {content[:100]}..." 
+ if len(content) > 100 + else content + ) + if model == "openai/gpt-oss-20b": + return + assert response.usage.input_tokens > 0, f"No input tokens returned for {model}" + assert response.usage.output_tokens > 0, ( + f"No output tokens returned for {model}" + ) + assert response.usage.total_tokens > 0, f"No total tokens returned for {model}" + + assert "paris" in content.lower(), ( + "Response should mention Paris as the capital of France" + ) + + except Exception as e: + pytest.fail(f"Error testing response generation with {model}: {str(e)}") + + +@pytest.mark.parametrize("model", test_models) +@pytest.mark.skipif( + AUTH_STRATEGY != "nuc", reason="NUC rate limiting not used with API key" +) +def test_rate_limiting_nucs(rate_limited_client, model): + """Test rate limiting by sending multiple rapid requests""" + import openai + + rate_limited = False + for _ in range(4): + try: + _ = rate_limited_client.responses.create( + model=model, + input="What is the capital of France?", + instructions="You are a helpful assistant that provides accurate and concise information.", + temperature=0.2, + max_output_tokens=100, + ) + except openai.RateLimitError: + rate_limited = True + + assert rate_limited, "No NUC rate limiting detected, when expected" + + +@pytest.mark.parametrize("model", test_models) +@pytest.mark.skipif( + AUTH_STRATEGY != "nuc", reason="NUC rate limiting not used with API key" +) +def test_invalid_rate_limiting_nucs(invalid_rate_limited_client, model): + """Test invalid rate limiting by sending multiple rapid requests""" + import openai + + forbidden = False + for _ in range(4): + try: + invalid_rate_limited_client.responses.create( + model=model, + input="What is the capital of France?", + instructions="You are a helpful assistant that provides accurate and concise information.", + temperature=0.2, + max_output_tokens=100, + ) + except openai.AuthenticationError: + forbidden = True + break + + assert forbidden, "No NUC rate limiting detected, when expected" + + +@pytest.mark.parametrize("model", test_models) +@pytest.mark.skipif( + AUTH_STRATEGY != "nuc", reason="NUC rate limiting not used with API key" +) +def test_invalid_nildb_command_nucs(nildb_client, model): + """Test invalid NILDB command handling""" + import openai + + forbidden = False + for _ in range(4): + try: + nildb_client.responses.create( + model=model, + input="What is the capital of France?", + instructions="You are a helpful assistant that provides accurate and concise information.", + temperature=0.2, + max_output_tokens=100, + ) + except openai.AuthenticationError: + forbidden = True + break + + assert forbidden, "No NILDB command detected, when expected" + + +@pytest.mark.parametrize("model", test_models) +def test_streaming_response(client, model): + """Test streaming response generation with different models""" + try: + stream = client.responses.create( + model=model, + input="Write a short poem about mountains. 
It must be 20 words maximum.", + instructions="You are a helpful assistant that provides accurate and concise information.", + temperature=0.2, + max_output_tokens=1000, + stream=True, + ) + + chunk_count = 0 + full_content = "" + had_usage = False + + for chunk in stream: + chunk_count += 1 + print(f"Model {model} stream chunk {chunk_count}: {chunk}") + + if hasattr(chunk, "type"): + if chunk.type == "response.output_item.added": + item = getattr(chunk, "item", None) + if item and hasattr(item, "type") and item.type == "message": + content_list = getattr(item, "content", []) + if isinstance(content_list, list): + for content_item in content_list: + if ( + hasattr(content_item, "type") + and content_item.type == "text" + ): + full_content += getattr(content_item, "text", "") + + if chunk.type == "response.output_text.delta": + delta = getattr(chunk, "delta", "") + full_content += delta + + if chunk.type == "response.completed": + response_obj = getattr(chunk, "response", None) + if response_obj: + usage = getattr(response_obj, "usage", None) + if usage: + had_usage = True + print(f"Model {model} usage: {usage}") + + assert had_usage, f"No usage data received for {model} streaming request" + assert chunk_count > 0, f"No chunks received for {model} streaming request" + assert full_content, f"No content assembled from stream for {model}" + print(f"Received {chunk_count} chunks for {model} streaming request") + print( + f"Assembled content: {full_content[:100]}..." + if len(full_content) > 100 + else full_content + ) + + except Exception as e: + pytest.fail(f"Error testing streaming response with {model}: {str(e)}") + + +@pytest.mark.parametrize("model", test_models) +def test_function_calling(client, model): + """Test function calling with different models""" + try: + tools_def = [ + { + "type": "function", + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Paris, France", + } + }, + "required": ["location"], + "additionalProperties": False, + }, + "strict": True, + } + ] + + first = client.responses.create( + model=model, + input="What is the weather like in Paris today?", + instructions="You are a helpful assistant that provides accurate and concise information.", + tools=tools_def, + tool_choice="auto", + temperature=0.2, + ) + + assert hasattr(first, "output") + calls = [o for o in first.output if getattr(o, "type", None) == "function_call"] + + if not calls: + msg_items = [ + o for o in first.output if getattr(o, "type", None) == "message" + ] + if msg_items: + parts = getattr(msg_items[0], "content", []) or [] + text = "" + for p in parts: + t = getattr(p, "text", None) or ( + p.get("text") if isinstance(p, dict) else None + ) + if t: + text += t + assert text, f"No content or tool calls returned for {model}" + return + texts = [o for o in first.output if getattr(o, "type", None) == "text"] + assert texts, f"No content or tool calls returned for {model}" + assert getattr(texts[0], "text", "") + return + + fc = calls[0] + assert getattr(fc, "name", None) == "get_weather" + args_str = getattr(fc, "arguments", None) + assert args_str + args = json.loads(args_str) + assert "location" in args + assert "paris" in args["location"].lower() + + tool_result = "The weather in Paris is currently 22°C and sunny." 
+ prompt = ( + "You are Llama 1B, a detail-oriented AI tasked with verifying and analyzing the output of a recent tool call. " + "Review the provided tool output and answer the user's question succinctly." + ) + + second = client.responses.create( + model=model, + input=[ + {"type": "message", "role": "system", "content": prompt}, + { + "type": "message", + "role": "user", + "content": "What is the weather like in Paris today?", + }, + { + "type": "message", + "role": "user", + "content": f"Tool output for get_weather with arguments {json.dumps({'location': args['location']})}: {tool_result}", + }, + ], + temperature=0.2, + tool_choice="auto", + ) + + out = second.output + msg_items = [o for o in out if getattr(o, "type", None) == "message"] + txt = "" + if msg_items: + parts = getattr(msg_items[0], "content", []) or [] + for p in parts: + t = getattr(p, "text", None) or ( + p.get("text") if isinstance(p, dict) else None + ) + if t: + txt += t + else: + texts = [o for o in out if getattr(o, "type", None) == "text"] + if texts: + txt = getattr(texts[0], "text", "") or "" + + assert txt, "No content in follow-up response" + assert ("22°C" in txt) or ("sunny" in txt.lower()) or ("weather" in txt.lower()) + + except Exception as e: + pytest.fail(f"Error testing function calling with {model}: {str(e)}") + + +@pytest.mark.parametrize("model", test_models) +def test_function_calling_with_streaming(client, model): + """Test function calling with streaming""" + if model == "openai/gpt-oss-20b": + pytest.skip( + "Skipping test for openai/gpt-oss-20b model as it only supports non-streaming with the responses endpoint" + ) + + try: + stream = client.responses.create( + model=model, + input="What is the weather like in Paris today?", + instructions="You are a helpful assistant that provides accurate and concise information.", + tools=[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. 
Paris, France", + } + }, + "required": ["location"], + "additionalProperties": False, + }, + "strict": True, + }, + } + ], + temperature=0.2, + stream=True, + ) + + had_tool_call = False + had_usage = False + + for chunk in stream: + print(f"Model {model} stream chunk: {chunk}") + + if hasattr(chunk, "type"): + if chunk.type == "response.function_call_arguments.delta": + had_tool_call = True + + if chunk.type == "response.output_item.added": + item = getattr(chunk, "item", None) + if item and hasattr(item, "type") and item.type == "function_call": + had_tool_call = True + + if chunk.type == "response.completed": + response_obj = getattr(chunk, "response", None) + if response_obj: + usage = getattr(response_obj, "usage", None) + if usage: + had_usage = True + print(f"Model {model} usage: {usage}") + + assert had_tool_call, f"No tool calls received for {model} streaming request" + assert had_usage, f"No usage data received for {model} streaming request" + + except Exception as e: + pytest.fail(f"Error testing streaming function calling with {model}: {str(e)}") + + +def test_usage_endpoint(client): + """Test retrieving usage statistics""" + try: + import requests + + invocation_token = api_key_getter() + + url = BASE_URL + "/usage" + response = requests.get( + url, + headers={ + "Authorization": f"Bearer {invocation_token}", + "Content-Type": "application/json", + }, + verify=False, + ) + assert response.status_code == 200, "Usage endpoint should return 200 OK" + + usage_data = response.json() + assert isinstance(usage_data, dict), "Usage data should be a dictionary" + + expected_keys = [ + "total_tokens", + "completion_tokens", + "prompt_tokens", + "queries", + ] + for key in expected_keys: + assert key in usage_data, f"Expected key {key} not found in usage data" + + print(f"\nUsage data: {json.dumps(usage_data, indent=2)}") + + except Exception as e: + pytest.fail(f"Error testing usage endpoint: {str(e)}") + + +def test_attestation_endpoint(client): + """Test retrieving attestation report""" + try: + import requests + + invocation_token = api_key_getter() + + url = BASE_URL + "/attestation/report" + response = requests.get( + url, + headers={ + "Authorization": f"Bearer {invocation_token}", + "Content-Type": "application/json", + }, + params={"nonce": "0" * 64}, + verify=False, + ) + + assert response.status_code == 200, "Attestation endpoint should return 200 OK" + + report = response.json() + assert isinstance(report, dict), "Attestation report should be a dictionary" + + expected_keys = ["cpu_attestation", "gpu_attestation", "verifying_key"] + for key in expected_keys: + assert key in report, f"Expected key {key} not found in attestation report" + + print(f"\nAttestation report received with keys: {list(report.keys())}") + + except Exception as e: + pytest.fail(f"Error testing attestation endpoint: {str(e)}") + + +def test_health_endpoint(client): + """Test health check endpoint""" + try: + import requests + + url = BASE_URL + "/health" + response = requests.get( + url, + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + verify=False, + ) + + print(f"Health response: {response.status_code} {response.text}") + assert response.status_code == 200, "Health endpoint should return 200 OK" + + health_data = response.json() + assert isinstance(health_data, dict), "Health data should be a dictionary" + assert "status" in health_data, "Health response should contain status" + + print(f"\nHealth status: {health_data.get('status')}") + + except Exception as e: + 
pytest.fail(f"Error testing health endpoint: {str(e)}") + + +@pytest.mark.parametrize("invalid_model", ["nonexistent-model/v1", "", None, " "]) +def test_invalid_model_handling(client, invalid_model): + """Test handling of invalid or non-existent models""" + try: + client.responses.create( + model=invalid_model, + input="Test invalid model", + ) + pytest.fail(f"Invalid model {invalid_model} should raise an error") + except Exception as e: + assert True, ( + f"Invalid model {invalid_model} raised an error as expected: {str(e)}" + ) + + +def test_timeout_handling(client): + """Test request timeout behavior""" + try: + client.responses.create( + model=test_models[0], + input="Generate a very long response that might take a while", + max_output_tokens=1000, + timeout=0.01, + ) + pytest.fail("Request should have timed out") + except Exception as e: + assert "time" in str(e).lower(), "Request timed out as expected" + + +def test_empty_input_handling(client): + """Test handling of empty input""" + try: + client.responses.create( + model=test_models[0], + input="", + ) + pytest.fail("Empty input should raise an error") + except Exception as e: + assert True, f"Empty input raised an error as expected: {str(e)}" + + +def test_unsupported_parameters(client): + """Test handling of unsupported or unexpected parameters""" + try: + response = client.responses.create( + model=test_models[0], + input="Test unsupported parameters", + unsupported_param="some_value", + another_weird_param=42, + ) + assert response, "Request with unsupported parameters should still work" + except Exception as e: + assert True, f"Unsupported parameters handled as expected: {str(e)}" + + +def test_response_invalid_temperature(client): + """Test response with invalid temperature type""" + try: + client.responses.create( + model=test_models[0], + input="What is the weather like?", + temperature="hot", + ) + pytest.fail("Invalid temperature type should raise an error") + except Exception as e: + assert True, f"Invalid temperature raised an error as expected: {str(e)}" + + +def test_response_missing_model(client): + """Test response with missing model field""" + try: + client.responses.create( + input="What is your name?", + temperature=0.2, + ) + pytest.fail("Missing model should raise an error") + except Exception as e: + assert True, f"Missing model raised an error as expected: {str(e)}" + + +def test_response_negative_max_tokens(client): + """Test response with negative max_output_tokens value""" + try: + client.responses.create( + model=test_models[0], + input="Tell me a joke.", + temperature=0.2, + max_output_tokens=-10, + ) + pytest.fail("Negative max_output_tokens should raise an error") + except Exception as e: + assert True, f"Negative max_output_tokens raised an error as expected: {str(e)}" + + +def test_response_high_temperature(client): + """Test response with a high temperature value""" + response = client.responses.create( + model=test_models[0], + input="Write an imaginative story about a wizard. 
Only write 10 words", + instructions="You are a creative assistant.", + temperature=1.30, + max_output_tokens=1500, + ) + + assert response, "High temperature request should return a valid response" + assert hasattr(response, "output"), "Response should contain output" + assert len(response.output) > 0, "At least one output item should be present" + + message_items = [ + item for item in response.output if getattr(item, "type", None) == "message" + ] + + assert len(message_items) > 0, "Response should contain a 'message' object" + + message = message_items[0] + assert hasattr(message, "content") and len(message.content) > 0, ( + "Message object should have content" + ) + + final_text = getattr(message.content[0], "text", "") + + assert len(final_text) > 0, "The message content should not be empty" + + +def test_streaming_request_high_token(client): + """Test streaming request with high max_output_tokens""" + stream = client.responses.create( + model=test_models[0], + input="Tell me a long story about a superhero's journey.", + instructions="You are a creative assistant.", + temperature=0.7, + max_output_tokens=100, + stream=True, + ) + + chunk_count = 0 + for chunk in stream: + chunk_count += 1 + if hasattr(chunk, "type") and chunk.type == "response.text.delta": + delta = getattr(chunk, "delta", None) + assert delta is not None, "Chunk should contain delta" + if chunk_count >= 20: + break + + assert chunk_count > 0, ( + "Should receive at least one chunk for high token streaming request" + ) + + +@pytest.mark.parametrize("model", test_models) +def test_web_search(client, model, high_web_search_rate_limit): + """Test web_search functionality with proper source validation""" + + response = client.responses.create( + model=model, + input="Who won the Roland Garros Open in 2024? Just reply with the winner's name.", + instructions="You are a helpful assistant that provides accurate and up-to-date information.", + extra_body={"web_search": True}, + temperature=0.2, + ) + + assert response is not None, "Response should not be None" + assert response.model == model, f"Response model should be {model}" + assert hasattr(response, "output"), "Response should contain output" + assert len(response.output) > 0, "Response should contain at least one output item" + + output_types = [getattr(item, "type", None) for item in response.output] + + message_items = [ + item for item in response.output if getattr(item, "type", None) == "message" + ] + assert len(message_items) > 0, ( + f"Response should contain message items. 
Found types: {output_types}" + ) + + message = message_items[0] + assert hasattr(message, "content") and len(message.content) > 0, ( + "Message should have content" + ) + + text_item = next( + (c for c in message.content if getattr(c, "type", None) == "output_text"), None + ) + assert text_item is not None, "Message content should contain an output_text item" + + content = getattr(text_item, "text", "") + assert content, "Response should contain content" + + sources = getattr(response, "sources", None) + assert sources is not None, "Sources field should not be None" + assert isinstance(sources, list), "Sources should be a list" + assert len(sources) > 0, "Sources should not be empty" + + +@pytest.mark.skipif( + not os.environ.get("E2B_API_KEY"), + reason="Requires E2B_API_KEY for code execution sandbox", +) +@pytest.mark.parametrize("model", test_models) +def test_execute_python_sha256_e2e(client, model): + """Test Python code execution via execute_python tool""" + expected = "75cc238b167a05ab7336d773cb096735d459df2f0df9c8df949b1c44075df8a5" + + instructions = ( + "You are a helpful assistant. When a user asks a question that requires code execution, " + "use the execute_python tool to find the answer. After the tool provides its result, " + "reply with the value ONLY. No prose, no explanations, no code blocks, no JSON, no quotes." + ) + user_input = ( + "Execute this exact Python code and return ONLY the result: " + "import hashlib; print(hashlib.sha256('Nillion'.encode()).hexdigest())" + ) + + response = client.responses.create( + model=model, + input=user_input, + instructions=instructions, + temperature=0, + tools=[ + { + "type": "function", + "name": "execute_python", + "description": "Executes a snippet of Python code in a secure sandbox and returns the standard output.", + "parameters": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "The Python code to be executed.", + } + }, + "required": ["code"], + "additionalProperties": False, + }, + "strict": True, + } + ], + tool_choice="auto", # let the model decide when to call the execute_python tool + ) + + # The final message (output[1]) must contain a non-empty text output + assert response.output[1].content[0].text, ( + f"No output. 
Full: {response.model_dump_json()}" + ) + + # Enforce "only the result": exact 64-hex chars and equals expected + assert response.output[1].content[0].text == expected, ( + f"Got: {response.output[1].content[0].text!r} Expected: {expected}" + ) diff --git a/tests/e2e/test_responses_http.py b/tests/e2e/test_responses_http.py new file mode 100644 index 00000000..03db990a --- /dev/null +++ b/tests/e2e/test_responses_http.py @@ -0,0 +1,1046 @@ +import json +import os +import re +import httpx +import pytest + +from .config import BASE_URL, test_models, AUTH_STRATEGY, api_key_getter +from .nuc import ( + get_rate_limited_nuc_token, + get_invalid_rate_limited_nuc_token, + get_nildb_nuc_token, + get_document_id_nuc_token, +) + + +@pytest.fixture +def client(): + invocation_token: str = api_key_getter() + return httpx.Client( + base_url=BASE_URL, + headers={ + "accept": "application/json", + "Content-Type": "application/json", + "Authorization": f"Bearer {invocation_token}", + }, + verify=False, + timeout=None, + ) + + +@pytest.fixture +def rate_limited_client(): + invocation_token = get_rate_limited_nuc_token(rate_limit=1) + return httpx.Client( + base_url=BASE_URL, + headers={ + "accept": "application/json", + "Content-Type": "application/json", + "Authorization": f"Bearer {invocation_token.token}", + }, + timeout=None, + verify=False, + ) + + +@pytest.fixture +def invalid_rate_limited_client(): + invocation_token = get_invalid_rate_limited_nuc_token() + return httpx.Client( + base_url=BASE_URL, + headers={ + "accept": "application/json", + "Content-Type": "application/json", + "Authorization": f"Bearer {invocation_token.token}", + }, + timeout=None, + verify=False, + ) + + +@pytest.fixture +def nildb_client(): + invocation_token = get_nildb_nuc_token() + return httpx.Client( + base_url=BASE_URL, + headers={ + "accept": "application/json", + "Content-Type": "application/json", + "Authorization": f"Bearer {invocation_token.token}", + }, + timeout=None, + verify=False, + ) + + +@pytest.fixture +def nillion_2025_client(): + return httpx.Client( + base_url=BASE_URL, + headers={ + "accept": "application/json", + "Content-Type": "application/json", + "Authorization": "Bearer Nillion2025", + }, + verify=False, + timeout=None, + ) + + +@pytest.fixture +def document_id_client(): + invocation_token = get_document_id_nuc_token() + return httpx.Client( + base_url=BASE_URL, + headers={ + "accept": "application/json", + "Content-Type": "application/json", + "Authorization": f"Bearer {invocation_token.token}", + }, + verify=False, + timeout=None, + ) + + +@pytest.mark.parametrize("model", test_models) +def test_model_standard_request(client, model): + payload = { + "model": model, + "input": "What is the capital of France?", + "instructions": "You are a helpful assistant that provides accurate and concise information.", + "temperature": 0.2, + "max_output_tokens": 100, + } + + try: + response = client.post("/responses", json=payload, timeout=30) + assert response.status_code == 200, ( + f"Standard request for {model} failed with status {response.status_code}" + ) + + response_json = response.json() + print(response_json) + assert "output" in response_json, "Response should contain output" + assert "signature" in response_json, "Response should contain signature" + assert "usage" in response_json, "Response should contain usage" + assert response_json.get("model") == model, f"Response model should be {model}" + + assert len(response_json["output"]) > 0, ( + "At least one output item should be present" + ) + + 
message_items = [ + item for item in response_json["output"] if item.get("type") == "message" + ] + + if message_items: + message = message_items[0] + content_list = message.get("content", []) + assert len(content_list) > 0, "Message item should have content" + + text_item = next( + (c for c in content_list if c.get("type") == "output_text"), None + ) + assert text_item is not None, ( + "Message content should contain an output_text item" + ) + + content = text_item.get("text", "") + else: + text_items = [ + item for item in response_json["output"] if item.get("type") == "text" + ] + assert len(text_items) > 0, "Response should contain text items" + content = text_items[0].get("text", "") + + assert content, f"No content returned for {model}" + assert content.strip(), f"Empty response returned for {model}" + + print( + f"\nModel {model} response: {content[:100]}..." + if len(content) > 100 + else content + ) + + if model == "openai/gpt-oss-20b": + return + + assert response_json["usage"]["input_tokens"] > 0, ( + f"No input tokens returned for {model}" + ) + assert response_json["usage"]["output_tokens"] > 0, ( + f"No output tokens returned for {model}" + ) + assert response_json["usage"]["total_tokens"] > 0, ( + f"No total tokens returned for {model}" + ) + + assert "paris" in content.lower(), ( + "Response should mention Paris as the capital of France" + ) + + except Exception as e: + pytest.fail(f"Error testing response generation with {model}: {str(e)}") + + +@pytest.mark.parametrize("model", test_models) +def test_model_standard_request_nillion_2025(nillion_2025_client, model): + payload = { + "model": model, + "input": "What is the capital of France?", + "instructions": "You are a helpful assistant that provides accurate and concise information.", + "temperature": 0.2, + } + + response = nillion_2025_client.post("/responses", json=payload, timeout=30) + assert response.status_code == 200, ( + f"Standard request for {model} failed with status {response.status_code}" + ) + + response_json = response.json() + print(response_json) + assert "output" in response_json, "Response should contain output" + assert len(response_json["output"]) > 0, ( + "At least one output item should be present" + ) + + message_items = [i for i in response_json["output"] if i.get("type") == "message"] + text_items = [i for i in response_json["output"] if i.get("type") == "text"] + if message_items: + content_parts = message_items[0].get("content", []) + text_part = next( + (c for c in content_parts if c.get("type") == "output_text"), None + ) + assert text_part is not None, ( + "Message content should contain an output_text item" + ) + content = text_part.get("text", "") + elif text_items: + content = text_items[0].get("text", "") + else: + raise AssertionError("Response should contain a message or text item") + assert content, f"No content returned for {model}" + + assert content.strip(), f"Empty response returned for {model}" + + if model == "openai/gpt-oss-20b": + return + assert response_json["usage"]["input_tokens"] > 0, f"Input tokens are 0 for {model}" + assert response_json["usage"]["output_tokens"] > 0, ( + f"Output tokens are 0 for {model}" + ) + assert response_json["usage"]["total_tokens"] > 0, f"Total tokens are 0 for {model}" + + print( + f"\nModel {model} standard response: {content[:100]}..." 
+ if len(content) > 100 + else content + ) + + +@pytest.mark.parametrize("model", test_models) +def test_model_streaming_request(client, model): + payload = { + "model": model, + "input": "Write a short poem about mountains.", + "instructions": "You are a helpful assistant that provides accurate and concise information.", + "temperature": 0.2, + "stream": True, + } + + with client.stream("POST", "/responses", json=payload) as response: + assert response.status_code == 200, ( + f"Streaming request for {model} failed with status {response.status_code}" + ) + + assert response.headers.get("Transfer-Encoding") == "chunked", ( + "Response should be streamed" + ) + + chunk_count = 0 + content = "" + had_completed_or_error = False + + for chunk in response.iter_lines(): + if chunk and chunk.strip() and chunk.startswith("data:"): + chunk_count += 1 + chunk_data = chunk[6:].strip() + + if chunk_data == "[DONE]": + continue + + print(f"\nModel {model} stream chunk {chunk_count}: {chunk_data}") + chunk_json = json.loads(chunk_data) + + if chunk_json.get("type") in ( + "response.text.delta", + "response.reasoning_text.delta", + ): + delta = chunk_json.get("delta", "") + content += delta + + if chunk_json.get("type") == "response.output_item.added": + item = chunk_json.get("item", {}) + if item.get("type") == "message" and isinstance( + item.get("content"), list + ): + for content_item in item["content"]: + if content_item.get("type") == "text": + content += content_item.get("text", "") + + if chunk_json.get("type") in ("response.completed", "response.error"): + had_completed_or_error = True + if chunk_json.get("usage"): + print(f"Usage: {chunk_json.get('usage')}") + + assert had_completed_or_error, ( + f"No completed or error event received for {model}" + ) + assert chunk_count > 0, f"No chunks received for {model} streaming request" + print(f"Received {chunk_count} chunks for {model} streaming request") + + +@pytest.mark.parametrize("model", test_models) +def test_model_tools_request(client, model): + payload = { + "model": model, + "instructions": "You are a helpful assistant. When a user asks a question that requires weather, use the get_weather tool to get the weather information.", + "input": "What is the weather like in Paris today?", + "temperature": 0.2, + "tool_choice": "auto", + "tools": [ + { + "type": "function", + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. 
Paris, France", + } + }, + "required": ["location"], + "additionalProperties": False, + }, + "strict": True, + } + ], + } + + try: + response = client.post("/responses", json=payload) + assert response.status_code == 200, ( + f"Tools request for {model} failed with status {response.status_code}" + ) + + response_json = response.json() + assert "output" in response_json, "Response should contain output" + assert len(response_json["output"]) > 0, ( + "At least one output item should be present" + ) + + output = response_json["output"] + + tool_calls = [item for item in output if item.get("type") == "function_call"] + + if tool_calls: + print(f"\nModel {model} tool calls: {json.dumps(tool_calls, indent=2)}") + assert len(tool_calls) > 0, f"Tool calls array is empty for {model}" + + first_call = tool_calls[0] + assert first_call.get("name") == "get_weather", ( + "Function name should be get_weather" + ) + assert "arguments" in first_call, "Function should have arguments" + + args = json.loads(first_call["arguments"]) + assert "location" in args, "Arguments should contain location" + assert "paris" in args["location"].lower(), "Location should be Paris" + else: + text_items = [item for item in output if item.get("type") == "text"] + if text_items: + content = text_items[0].get("text", "") + print( + f"\nModel {model} response (no tool call): {content[:100]}..." + if len(content) > 100 + else content + ) + assert content, f"No content or tool calls returned for {model}" + except Exception as e: + print(f"\nError testing tools with {model}: {str(e)}") + raise e + + +@pytest.mark.parametrize("model", test_models) +def test_function_calling_with_streaming_httpx(client, model): + if model == "openai/gpt-oss-20b": + pytest.skip( + "Skipping test for openai/gpt-oss-20b model as it only supports non streaming with responses endpoint" + ) + + payload = { + "model": model, + "input": "What is the weather like in Paris today?", + "instructions": "You are a helpful assistant that provides accurate and concise information.", + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. 
Paris, France", + } + }, + "required": ["location"], + "additionalProperties": False, + }, + "strict": True, + }, + } + ], + "temperature": 0.2, + "stream": True, + } + + with client.stream("POST", "/responses", json=payload) as response: + assert response.status_code == 200, ( + f"Streaming request for {model} failed with status {response.status_code}" + ) + had_tool_call = False + had_usage = False + for line in response.iter_lines(): + if line and line.strip() and line.startswith("data:"): + data_line = line[6:].strip() + if data_line == "[DONE]": + continue + try: + chunk_json = json.loads(data_line) + except json.JSONDecodeError: + continue + + if chunk_json.get("type") == "response.function_call_arguments.delta": + had_tool_call = True + + if chunk_json.get("type") == "response.output_item.added": + item = chunk_json.get("item", {}) + if item.get("type") == "function_call": + had_tool_call = True + + if chunk_json.get("type") == "response.completed": + usage = chunk_json.get("usage") + if usage: + had_usage = True + + assert had_tool_call, f"No tool calls received for {model} streaming request" + assert had_usage, f"No usage data received for {model} streaming request" + + +def test_invalid_auth_token(): + invalid_client = httpx.Client( + base_url=BASE_URL, + headers={ + "accept": "application/json", + "Content-Type": "application/json", + "Authorization": "Bearer invalid_token_123", + }, + verify=False, + ) + + payload = { + "model": test_models[0], + "input": "Test", + } + + response = invalid_client.post("/responses", json=payload) + assert response.status_code in [401, 403], ( + "Invalid token should result in unauthorized access" + ) + + +def test_rate_limiting(client): + payload = { + "model": test_models[0], + "input": "Generate a short poem", + } + + responses = [] + for _ in range(20): + response = client.post("/responses", json=payload) + responses.append(response) + + rate_limit_statuses = [429, 403, 503] + rate_limited_responses = [ + r for r in responses if r.status_code in rate_limit_statuses + ] + + if len(rate_limited_responses) == 0: + pytest.skip("No rate limiting detected. 
Manual review may be needed.") + + +@pytest.mark.skipif( + AUTH_STRATEGY != "nuc", reason="NUC rate limiting not used with API key" +) +def test_rate_limiting_nucs(rate_limited_client): + payload = { + "model": test_models[0], + "input": "What is your name?", + } + + responses = [] + for _ in range(4): + response = rate_limited_client.post("/responses", json=payload) + responses.append(response) + + rate_limit_statuses = [429, 403, 503] + rate_limited_responses = [ + r for r in responses if r.status_code in rate_limit_statuses + ] + + assert len(rate_limited_responses) > 0, ( + "No NUC rate limiting detected, when expected" + ) + + +@pytest.mark.skipif( + AUTH_STRATEGY != "nuc", reason="NUC rate limiting not used with API key" +) +def test_invalid_rate_limiting_nucs(invalid_rate_limited_client): + payload = { + "model": test_models[0], + "input": "What is your name?", + } + + responses = [] + for _ in range(4): + response = invalid_rate_limited_client.post("/responses", json=payload) + responses.append(response) + + rate_limit_statuses = [401] + rate_limited_responses = [ + r for r in responses if r.status_code in rate_limit_statuses + ] + + assert len(rate_limited_responses) > 0, ( + "No NUC rate limiting detected, when expected" + ) + + +@pytest.mark.skipif( + AUTH_STRATEGY != "nuc", reason="NUC rate limiting not used with API key" +) +def test_invalid_nildb_command_nucs(nildb_client): + payload = { + "model": test_models[0], + "input": "What is your name?", + } + response = nildb_client.post("/responses", json=payload) + assert response.status_code == 401, "Invalid NILDB command should return 401" + + +def test_large_payload_handling(client): + large_instructions = "Hello " * 1000 + + payload = { + "model": test_models[0], + "input": "Respond briefly", + "instructions": large_instructions, + "max_output_tokens": 50, + } + + response = client.post("/responses", json=payload, timeout=30) + print(response) + + assert response.status_code in [200, 413], ( + "Large payload should be handled gracefully" + ) + + if response.status_code == 200: + response_json = response.json() + assert "output" in response_json, "Response should contain output" + assert len(response_json["output"]) > 0, ( + "At least one output item should be present" + ) + + +@pytest.mark.parametrize("invalid_model", ["nonexistent-model/v1", "", None, " "]) +def test_invalid_model_handling(client, invalid_model): + payload = { + "model": invalid_model, + "input": "Test invalid model", + } + + response = client.post("/responses", json=payload) + + assert response.status_code in [400, 404], ( + f"Invalid model {invalid_model} should return an error" + ) + + +def test_timeout_handling(client): + payload = { + "model": test_models[0], + "input": "Generate a very long response that might take a while", + "max_output_tokens": 1000, + } + + try: + _ = client.post("/responses", json=payload, timeout=0.1) + pytest.fail("Request should have timed out") + except httpx.TimeoutException: + assert True, "Request timed out as expected" + + +def test_empty_input_handling(client): + payload = { + "model": test_models[0], + "input": "", + } + + response = client.post("/responses", json=payload) + print(response) + + assert response.status_code == 400, "Empty input should return a Bad Request" + + response_json = response.json() + assert "detail" in response_json, "Error response should contain detail" + + +def test_unsupported_parameters(client): + payload = { + "model": test_models[0], + "input": "Test unsupported parameters", + 
"unsupported_param": "some_value", + "another_weird_param": 42, + } + + response = client.post("/responses", json=payload) + + assert response.status_code in [200, 400], ( + "Unsupported parameters should be handled gracefully" + ) + + +def test_response_invalid_temperature(client): + payload = { + "model": test_models[0], + "input": "What is the weather like?", + "temperature": "hot", + } + response = client.post("/responses", json=payload) + print(response) + assert response.status_code == 400, ( + "Invalid temperature type should return a 400 error" + ) + + +def test_response_missing_model(client): + payload = { + "input": "What is your name?", + "temperature": 0.2, + } + response = client.post("/responses", json=payload) + assert response.status_code == 400, ( + "Missing model should return a 400 validation error" + ) + + +def test_response_negative_max_tokens(client): + payload = { + "model": test_models[0], + "input": "Tell me a joke.", + "temperature": 0.2, + "max_output_tokens": -10, + } + response = client.post("/responses", json=payload) + assert response.status_code == 400, ( + "Negative max_output_tokens should return a 400 validation error" + ) + + +def test_response_high_temperature(client): + payload = { + "model": test_models[0], + "input": "Write an imaginative story about a wizard.", + "instructions": "You are a creative assistant.", + "temperature": 2.0, + "max_output_tokens": 50, + } + response = client.post("/responses", json=payload) + assert response.status_code == 200, ( + "High temperature request should return a valid response" + ) + response_json = response.json() + assert "output" in response_json, "Response should contain output" + assert len(response_json["output"]) > 0, ( + "At least one output item should be present" + ) + + +def test_model_streaming_request_high_token(client): + payload = { + "model": test_models[0], + "input": "Tell me a long story about a superhero's journey.", + "instructions": "You are a creative assistant.", + "temperature": 0.7, + "max_output_tokens": 100, + "stream": True, + } + with client.stream("POST", "/responses", json=payload) as response: + assert response.status_code == 200, ( + "Streaming with high max_output_tokens should return 200 status" + ) + chunk_count = 0 + for line in response.iter_lines(): + if line and line.strip() and line.startswith("data:"): + chunk_count += 1 + assert chunk_count > 0, ( + "Should receive at least one chunk for high token streaming request" + ) + + +def test_usage_endpoint(client): + try: + import requests + + invocation_token = api_key_getter() + + url = BASE_URL + "/usage" + response = requests.get( + url, + headers={ + "Authorization": f"Bearer {invocation_token}", + "Content-Type": "application/json", + }, + verify=False, + ) + assert response.status_code == 200, "Usage endpoint should return 200 OK" + + usage_data = response.json() + assert isinstance(usage_data, dict), "Usage data should be a dictionary" + + expected_keys = [ + "total_tokens", + "completion_tokens", + "prompt_tokens", + "queries", + ] + for key in expected_keys: + assert key in usage_data, f"Expected key {key} not found in usage data" + + print(f"\nUsage data: {json.dumps(usage_data, indent=2)}") + + except Exception as e: + pytest.fail(f"Error testing usage endpoint: {str(e)}") + + +def test_attestation_endpoint(client): + try: + import requests + + invocation_token = api_key_getter() + + url = BASE_URL + "/attestation/report" + response = requests.get( + url, + headers={ + "Authorization": f"Bearer {invocation_token}", 
+ "Content-Type": "application/json", + }, + params={"nonce": "0" * 64}, + verify=False, + ) + + assert response.status_code == 200, "Attestation endpoint should return 200 OK" + + report = response.json() + assert isinstance(report, dict), "Attestation report should be a dictionary" + + expected_keys = ["cpu_attestation", "gpu_attestation", "verifying_key"] + for key in expected_keys: + assert key in report, f"Expected key {key} not found in attestation report" + + print(f"\nAttestation report received with keys: {list(report.keys())}") + + except Exception as e: + pytest.fail(f"Error testing attestation endpoint: {str(e)}") + + +def test_health_endpoint(client): + try: + import requests + + url = BASE_URL + "/health" + response = requests.get( + url, + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + verify=False, + ) + + print(f"Health response: {response.status_code} {response.text}") + assert response.status_code == 200, "Health endpoint should return 200 OK" + + health_data = response.json() + assert isinstance(health_data, dict), "Health data should be a dictionary" + assert "status" in health_data, "Health response should contain status" + + print(f"\nHealth status: {health_data.get('status')}") + + except Exception as e: + pytest.fail(f"Error testing health endpoint: {str(e)}") + + +@pytest.fixture +def high_web_search_rate_limit(monkeypatch): + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_MINUTE", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_HOUR", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT_DAY", "9999") + monkeypatch.setenv("WEB_SEARCH_RATE_LIMIT", "9999") + + +@pytest.mark.parametrize("model", test_models) +def test_web_search(client, model, high_web_search_rate_limit): + payload = { + "model": model, + "input": "Who won the Roland Garros Open in 2024? Just reply with the winner's name.", + "instructions": "You are a helpful assistant that provides accurate and up-to-date information. 
Answer in 10 words maximum and do not reason.", + "temperature": 0.2, + "max_output_tokens": 15000, + "extra_body": {"web_search": True}, + } + + response = client.post("/responses", json=payload, timeout=180) + assert response.status_code == 200, ( + f"Response for {model} failed with status {response.status_code}" + ) + + response_json = response.json() + assert response_json.get("model") == model, f"Response model should be {model}" + assert "output" in response_json, "Response should contain output" + assert len(response_json["output"]) > 0, ( + "Response should contain at least one output item" + ) + + message_items = [i for i in response_json["output"] if i.get("type") == "message"] + text_items = [i for i in response_json["output"] if i.get("type") == "text"] + reasoning_items = [ + i for i in response_json["output"] if i.get("type") == "reasoning" + ] + + assert message_items or text_items or reasoning_items, ( + "Response should contain message, text, or reasoning items" + ) + + if message_items: + message = message_items[0] + content_list = message.get("content", []) + assert len(content_list) > 0, "Message should have content" + text_item = next( + (c for c in content_list if c.get("type") == "output_text"), None + ) + assert text_item is not None, ( + "Message content should contain an output_text item" + ) + content = text_item.get("text", "") + elif text_items: + content = text_items[0].get("text", "") + else: + parts = reasoning_items[0].get("content") or [] + text_part = next( + (c for c in parts if c.get("type") in ("output_text", "reasoning_text")), + None, + ) + assert text_part and text_part.get("text", ""), ( + "Reasoning item missing text content" + ) + content = text_part.get("text", "") + + assert content, "Response should contain content" + + sources = response_json.get("sources") + if sources is not None: + assert isinstance(sources, list), "Sources should be a list" + assert len(sources) > 0, "Sources should not be empty" + print(f"Sources found: {len(sources)}") + else: + print( + "Warning: Sources field is None - web search may not be enabled or working properly" + ) + + +@pytest.mark.skipif( + AUTH_STRATEGY != "nuc", reason="NUC required for this tests on nilDB" +) +def test_nildb_delegation(client: httpx.Client): + from secretvaults.common.keypair import Keypair + from nuc.envelope import NucTokenEnvelope + from nuc.validate import NucTokenValidator, ValidationParameters + from nuc.nilauth import NilauthClient + from nilai_api.config import CONFIG + from nuc.token import Did + + keypair = Keypair.generate() + did = keypair.to_did_string() + + response = client.get("/delegation", params={"prompt_delegation_request": did}) + + assert response.status_code == 200, ( + f"Delegation token should be returned: {response.text}" + ) + assert "token" in response.json(), "Delegation token should be returned" + assert "did" in response.json(), "Delegation did should be returned" + token = response.json()["token"] + did = response.json()["did"] + assert token is not None, "Delegation token should be returned" + assert did is not None, "Delegation did should be returned" + + nuc_token_envelope = NucTokenEnvelope.parse(token) + nilauth_public_keys = [ + Did(NilauthClient(CONFIG.nildb.nilauth_url).about().public_key.serialize()) + ] + NucTokenValidator(nilauth_public_keys).validate( + nuc_token_envelope, context={}, parameters=ValidationParameters.default() + ) + + +@pytest.mark.skip( + reason="prompt cannot be accessed because of a secretvaults-py update" +) 
+@pytest.mark.parametrize("model", test_models)
+@pytest.mark.skipif(
+    AUTH_STRATEGY != "nuc", reason="NUC required for this test on nilDB"
+)
+def test_nildb_prompt_document(document_id_client: httpx.Client, model):
+    payload = {
+        "model": model,
+        "input": "Can you make a small rhyme?",
+        "instructions": "You are a helpful assistant.",
+        "temperature": 0.2,
+    }
+
+    response = document_id_client.post("/responses", json=payload, timeout=30)
+
+    assert response.status_code == 200, (
+        f"Response should be successful: {response.text}"
+    )
+
+    response_json = response.json()
+
+    message_items = [
+        item for item in response_json["output"] if item.get("type") == "message"
+    ]
+    text_items = [
+        item for item in response_json["output"] if item.get("type") == "text"
+    ]
+
+    if message_items:
+        content_parts = message_items[0].get("content", [])
+        text_part = next(
+            (c for c in content_parts if c.get("type") == "output_text"), None
+        )
+        assert text_part is not None, (
+            "Message content should contain an output_text item"
+        )
+        message = text_part.get("text", "")
+    elif text_items:
+        message = text_items[0].get("text", "")
+    else:
+        raise AssertionError("Response should contain a message or text item")
+
+    assert message, "Response should contain content"
+    assert "cheese" in message.lower(), "Response should contain cheese"
+
+
+@pytest.mark.skipif(
+    not os.environ.get("E2B_API_KEY"),
+    reason="Requires E2B_API_KEY for code execution sandbox",
+)
+@pytest.mark.parametrize("model", test_models)
+def test_execute_python_sha256_e2e(client, model):
+    if model == "openai/gpt-oss-20b":
+        pytest.skip("Model/back-end does not support execute_python tool")
+
+    expected = "75cc238b167a05ab7336d773cb096735d459df2f0df9c8df949b1c44075df8a5"
+
+    instructions = (
+        "You are a helpful assistant. When a user asks a question that requires code execution, "
+        "use the execute_python tool to find the answer. After the tool provides its result, "
+        "you must use that result to formulate a clear, final answer to the user's original question. "
+        "Do not include any code or JSON in your final response."
+ ) + user_input = "Execute this exact Python code and return the result: import hashlib; print(hashlib.sha256('Nillion'.encode()).hexdigest())" + + payload = { + "model": model, + "input": user_input, + "instructions": instructions, + "temperature": 0, + "tools": [ + { + "type": "function", + "function": { + "name": "execute_python", + "description": "Executes a snippet of Python code in a secure sandbox and returns the standard output.", + "parameters": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "The Python code to be executed.", + } + }, + "required": ["code"], + "additionalProperties": False, + }, + "strict": True, + }, + } + ], + } + + trials = 3 + escaped_expected = re.escape(expected) + pattern = rf"\b{escaped_expected}\b" + last_data = None + last_content = "" + last_status = None + + for _ in range(trials): + response = client.post("/responses", json=payload) + last_status = response.status_code + if response.status_code != 200: + continue + data = response.json() + last_data = data + if not ("output" in data and data["output"]): + continue + + text_items = [item for item in data["output"] if item.get("type") == "text"] + if not text_items: + continue + + content = text_items[0].get("text", "") + last_content = content + normalized_content = re.sub(r"\s+", " ", content) + + if re.search(pattern, normalized_content): + break + else: + pytest.fail( + ( + "Expected exact SHA-256 hash not found after retries.\n" + f"Last status: {last_status}\n" + f"Got: {last_content[:200]}...\n" + f"Expected: {expected}\n" + f"Full: {json.dumps(last_data, indent=2)[:1000] if last_data else ''}" + ) + ) diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py index 4b43a545..a0eb15b2 100644 --- a/tests/unit/__init__.py +++ b/tests/unit/__init__.py @@ -1,7 +1,13 @@ from openai.types.chat.chat_completion import ChoiceLogprobs +from openai.types.responses import Response as OpenAIResponse, ResponseUsage +from openai.types.responses.response_usage import ( + InputTokensDetails, + OutputTokensDetails, +) from nilai_common import ( SignedChatCompletion, + SignedResponse, ModelEndpoint, ModelMetadata, Usage, @@ -41,3 +47,29 @@ usage=Usage(prompt_tokens=100, completion_tokens=50, total_tokens=150), signature="test-signature", ) + +responses_usage: ResponseUsage = ResponseUsage( + input_tokens=100, + input_tokens_details=InputTokensDetails(cached_tokens=0), + output_tokens=50, + output_tokens_details=OutputTokensDetails(reasoning_tokens=0), + total_tokens=150, +) + +RESPONSES_RESPONSE: OpenAIResponse = OpenAIResponse( + id="test-response-id", + object="response", + model="test-model", + created_at=123456.0, + status="completed", + output=[], + parallel_tool_calls=False, + tool_choice="auto", + tools=[], + usage=responses_usage, +) + +SIGNED_RESPONSES_RESPONSE: SignedResponse = SignedResponse( + **RESPONSES_RESPONSE.model_dump(), + signature="test-signature", +) diff --git a/tests/unit/nilai-common/test_api_model.py b/tests/unit/nilai-common/test_api_model.py index d3ae5d4b..4f6b72df 100644 --- a/tests/unit/nilai-common/test_api_model.py +++ b/tests/unit/nilai-common/test_api_model.py @@ -1,5 +1,5 @@ import pytest -from nilai_common.api_model import ModelMetadata +from nilai_common.api_models import ModelMetadata from pydantic import ValidationError diff --git a/tests/unit/nilai-common/test_discovery.py b/tests/unit/nilai-common/test_discovery.py index c8b5f2cc..34066125 100644 --- a/tests/unit/nilai-common/test_discovery.py +++ 
b/tests/unit/nilai-common/test_discovery.py @@ -2,7 +2,7 @@ from unittest.mock import AsyncMock, MagicMock, patch import pytest -from nilai_common.api_model import ModelEndpoint, ModelMetadata +from nilai_common.api_models import ModelEndpoint, ModelMetadata from nilai_common.discovery import ModelServiceDiscovery diff --git a/tests/unit/nilai_api/handlers/tools/__init__.py b/tests/unit/nilai_api/handlers/tools/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/nilai_api/handlers/tools/test_responses_tool_router.py b/tests/unit/nilai_api/handlers/tools/test_responses_tool_router.py new file mode 100644 index 00000000..96af35ce --- /dev/null +++ b/tests/unit/nilai_api/handlers/tools/test_responses_tool_router.py @@ -0,0 +1,175 @@ +import json +from typing import Any, cast +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from nilai_api.handlers.tools import responses_tool_router +from nilai_common import ( + ResponseRequest, + ResponseFunctionToolCallParam, +) +from openai.types.responses import ( + Response, + ResponseFunctionToolCall, + ResponseUsage, + FunctionToolParam, +) +from openai.types.responses.response_usage import ( + InputTokensDetails, + OutputTokensDetails, +) + + +@pytest.mark.asyncio +async def test_route_and_execute_tool_call_invokes_code_execution(mocker): + tool_call: ResponseFunctionToolCallParam = { + "type": "function_call", + "call_id": "call_123", + "name": "execute_python", + "arguments": json.dumps({"code": "print(6*7)"}), + } + + mock_exec = mocker.patch( + "nilai_api.handlers.tools.responses_tool_router.code_execution.execute_python", + new_callable=AsyncMock, + return_value="42", + ) + + result = await responses_tool_router.route_and_execute_tool_call(tool_call) + + mock_exec.assert_awaited_once_with("print(6*7)") + assert result.type == "function_call_output" + assert result.call_id == "call_123" + output_str = result.output if isinstance(result.output, str) else "{}" + payload = json.loads(output_str) + assert payload == {"result": "42"} + + +def make_response_usage(prompt: int, completion: int) -> ResponseUsage: + return ResponseUsage( + input_tokens=prompt, + input_tokens_details=InputTokensDetails(cached_tokens=0), + output_tokens=completion, + output_tokens_details=OutputTokensDetails(reasoning_tokens=0), + total_tokens=prompt + completion, + ) + + +def make_tool_call_response(code: str) -> Response: + return Response( + id="resp_tool", + object="response", + model="openai/gpt-oss-20b", + created_at=123456.0, + status="completed", + output=[ + ResponseFunctionToolCall( + id="call_abc", + type="function_call", + call_id="call_abc", + name="execute_python", + arguments=json.dumps({"code": code}), + status="completed", + ) + ], + parallel_tool_calls=False, + tool_choice="auto", + tools=[], + usage=make_response_usage(prompt=10, completion=5), + ) + + +@pytest.mark.asyncio +async def test_handle_responses_tool_workflow_executes_and_uses_result(mocker): + tool: FunctionToolParam = { + "type": "function", + "name": "execute_python", + "description": "Execute small Python code snippets.", + "parameters": { + "type": "object", + "properties": {"code": {"type": "string"}}, + "required": ["code"], + }, + "strict": None, + } + + req = ResponseRequest( + model="openai/gpt-oss-20b", + input=[ + { + "role": "user", + "type": "message", + "content": [ + {"type": "input_text", "text": "What is 6*7?"}, + ], + } + ], + tools=[tool], + ) + + first_response = make_tool_call_response("print(6*7)") + + mock_exec = mocker.patch( 
+ "nilai_api.handlers.tools.responses_tool_router.code_execution.execute_python", + new_callable=AsyncMock, + return_value="42", + ) + + second_response = Response( + id="resp_final", + object="response", + model=req.model, + created_at=123457.0, + status="completed", + output=[], + parallel_tool_calls=False, + tool_choice="auto", + tools=[], + usage=make_response_usage(prompt=7, completion=2), + ) + + mock_client = MagicMock() + mock_responses = MagicMock() + mock_responses.create = AsyncMock(return_value=second_response) + mock_client.responses = mock_responses + + ( + final, + prompt_tokens, + completion_tokens, + ) = await responses_tool_router.handle_responses_tool_workflow( + mock_client, + req, + cast(Any, req.input), + first_response, + ) + + mock_exec.assert_awaited_once_with("print(6*7)") + assert final == second_response + assert first_response.usage is not None + assert second_response.usage is not None + assert ( + prompt_tokens + == first_response.usage.input_tokens + second_response.usage.input_tokens + ) + assert ( + completion_tokens + == first_response.usage.output_tokens + second_response.usage.output_tokens + ) + + +def test_extract_function_tool_calls_from_response(): + response = make_tool_call_response("print(2+3)") + + tool_calls = responses_tool_router.extract_function_tool_calls_from_response( + response + ) + + assert len(tool_calls) == 1 + tc = tool_calls[0] + assert tc["type"] == "function_call" + assert tc["name"] == "execute_python" + assert tc["call_id"] == "call_abc" + args = json.loads(tc["arguments"] or "{}") + assert args == {"code": "print(2+3)"} diff --git a/tests/unit/nilai_api/routers/test_private.py b/tests/unit/nilai_api/routers/test_chat_completions_private.py similarity index 95% rename from tests/unit/nilai_api/routers/test_private.py rename to tests/unit/nilai_api/routers/test_chat_completions_private.py index 13d9d781..7179688e 100644 --- a/tests/unit/nilai_api/routers/test_private.py +++ b/tests/unit/nilai_api/routers/test_chat_completions_private.py @@ -9,7 +9,11 @@ from nilai_common import AttestationReport, Source from nilai_api.state import state -from ... import model_endpoint, model_metadata, response as RESPONSE +from ... 
import ( + model_endpoint, + model_metadata, + response as RESPONSE, +) @pytest.mark.asyncio @@ -100,7 +104,6 @@ def mock_user_manager(mock_user, mocker): @pytest.fixture def mock_state(mocker): # Prepare expected models data - expected_models = {"ABC": model_endpoint} # Create a mock discovery service that returns the expected models @@ -200,7 +203,12 @@ def test_chat_completion(mock_user, mock_state, mock_user_manager, mocker, clien mock_async_openai_instance = MagicMock() mock_async_openai_instance.chat = mock_chat mocker.patch( - "nilai_api.routers.private.AsyncOpenAI", return_value=mock_async_openai_instance + "nilai_api.routers.endpoints.chat.AsyncOpenAI", + return_value=mock_async_openai_instance, + ) + mocker.patch( + "nilai_api.routers.endpoints.chat.handle_tool_workflow", + return_value=(response_data, 0, 0), ) response = client.post( "/v1/chat/completions", @@ -227,6 +235,8 @@ def test_chat_completion(mock_user, mock_state, mock_user_manager, mocker, clien def test_chat_completion_stream_includes_sources( mock_user, mock_state, mock_user_manager, mocker, client ): + mock_user.rate_limits_obj.web_search_rate_limit_minute = 100 + source = Source(source="https://example.com", content="Example result") mock_web_search_result = MagicMock() @@ -237,7 +247,7 @@ def test_chat_completion_stream_includes_sources( mock_web_search_result.sources = [source] mocker.patch( - "nilai_api.routers.private.handle_web_search", + "nilai_api.routers.endpoints.chat.handle_web_search", new=AsyncMock(return_value=mock_web_search_result), ) @@ -294,7 +304,7 @@ async def chunk_generator(): mock_async_openai_instance.chat = mock_chat mocker.patch( - "nilai_api.routers.private.AsyncOpenAI", + "nilai_api.routers.endpoints.chat.AsyncOpenAI", return_value=mock_async_openai_instance, ) diff --git a/tests/unit/nilai_api/routers/test_nildb_endpoints.py b/tests/unit/nilai_api/routers/test_nildb_endpoints.py index 54209b69..7036fecc 100644 --- a/tests/unit/nilai_api/routers/test_nildb_endpoints.py +++ b/tests/unit/nilai_api/routers/test_nildb_endpoints.py @@ -8,6 +8,7 @@ PromptDelegationToken, ) from datetime import datetime, timezone +from nilai_common import ResponseRequest class TestNilDBEndpoints: @@ -141,7 +142,7 @@ async def test_get_prompt_store_delegation_handler_error( @pytest.mark.asyncio async def test_chat_completion_with_prompt_document_injection(self): """Test chat completion with prompt document injection""" - from nilai_api.routers.private import chat_completion + from nilai_api.routers.endpoints.chat import chat_completion from nilai_common import ChatRequest mock_prompt_document = PromptDocument( @@ -163,19 +164,26 @@ async def test_chat_completion_with_prompt_document_injection(self): ) with ( - patch("nilai_api.routers.private.get_prompt_from_nildb") as mock_get_prompt, - patch("nilai_api.routers.private.AsyncOpenAI") as mock_openai_client, - patch("nilai_api.routers.private.state.get_model") as mock_get_model, - patch("nilai_api.routers.private.handle_nilrag") as mock_handle_nilrag, patch( - "nilai_api.routers.private.handle_web_search" + "nilai_api.routers.endpoints.chat.get_prompt_from_nildb" + ) as mock_get_prompt, + patch("nilai_api.routers.endpoints.chat.AsyncOpenAI") as mock_openai_client, + patch("nilai_api.routers.endpoints.chat.state.get_model") as mock_get_model, + patch( + "nilai_api.routers.endpoints.chat.handle_nilrag" + ) as mock_handle_nilrag, + patch( + "nilai_api.routers.endpoints.chat.handle_web_search" ) as mock_handle_web_search, patch( - 
"nilai_api.routers.private.UserManager.update_token_usage" + "nilai_api.routers.endpoints.chat.UserManager.update_token_usage" ) as mock_update_usage, patch( - "nilai_api.routers.private.QueryLogManager.log_query" + "nilai_api.routers.endpoints.chat.QueryLogManager.log_query" ) as mock_log_query, + patch( + "nilai_api.routers.endpoints.chat.handle_tool_workflow" + ) as mock_handle_tool_workflow, ): mock_get_prompt.return_value = "System prompt from nilDB" @@ -219,6 +227,9 @@ async def test_chat_completion_with_prompt_document_injection(self): "total_tokens": 15, }, } + mock_response.usage = MagicMock() + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 # Make the create method itself an AsyncMock that returns the response mock_client_instance.chat.completions.create = AsyncMock( return_value=mock_response @@ -226,23 +237,18 @@ async def test_chat_completion_with_prompt_document_injection(self): mock_client_instance.close = AsyncMock() mock_openai_client.return_value = mock_client_instance - # + # Mock handle_tool_workflow to return the response and token counts + mock_handle_tool_workflow.return_value = (mock_response, 0, 0) + # Call the function (this will test the prompt injection logic) - # Note: We can't easily test the full endpoint without setting up the FastAPI app - # But we can test that get_prompt_from_nildb is called - try: - await chat_completion(req=request, auth_info=mock_auth_info) - except Exception as e: - # Expected to fail due to incomplete mocking, but we should still see the prompt call - print("The exception is: ", str(e)) - raise e + await chat_completion(req=request, auth_info=mock_auth_info) mock_get_prompt.assert_called_once_with(mock_prompt_document) @pytest.mark.asyncio async def test_chat_completion_prompt_document_extraction_error(self): """Test chat completion when prompt document extraction fails""" - from nilai_api.routers.private import chat_completion + from nilai_api.routers.endpoints.chat import chat_completion from nilai_common import ChatRequest mock_prompt_document = PromptDocument( @@ -264,8 +270,10 @@ async def test_chat_completion_prompt_document_extraction_error(self): ) with ( - patch("nilai_api.routers.private.get_prompt_from_nildb") as mock_get_prompt, - patch("nilai_api.routers.private.state.get_model") as mock_get_model, + patch( + "nilai_api.routers.endpoints.chat.get_prompt_from_nildb" + ) as mock_get_prompt, + patch("nilai_api.routers.endpoints.chat.state.get_model") as mock_get_model, ): # Mock state.get_model() to return a ModelEndpoint mock_model_endpoint = MagicMock() @@ -288,7 +296,7 @@ async def test_chat_completion_prompt_document_extraction_error(self): @pytest.mark.asyncio async def test_chat_completion_without_prompt_document(self): """Test chat completion when no prompt document is present""" - from nilai_api.routers.private import chat_completion + from nilai_api.routers.endpoints.chat import chat_completion from nilai_common import ChatRequest mock_user = MagicMock() @@ -300,7 +308,7 @@ async def test_chat_completion_without_prompt_document(self): mock_auth_info = AuthenticationInfo( user=mock_user, token_rate_limit=None, - prompt_document=None, # No prompt document + prompt_document=None, ) request = ChatRequest( @@ -308,19 +316,26 @@ async def test_chat_completion_without_prompt_document(self): ) with ( - patch("nilai_api.routers.private.get_prompt_from_nildb") as mock_get_prompt, - patch("nilai_api.routers.private.AsyncOpenAI") as mock_openai_client, - 
patch("nilai_api.routers.private.state.get_model") as mock_get_model, - patch("nilai_api.routers.private.handle_nilrag") as mock_handle_nilrag, patch( - "nilai_api.routers.private.handle_web_search" + "nilai_api.routers.endpoints.chat.get_prompt_from_nildb" + ) as mock_get_prompt, + patch("nilai_api.routers.endpoints.chat.AsyncOpenAI") as mock_openai_client, + patch("nilai_api.routers.endpoints.chat.state.get_model") as mock_get_model, + patch( + "nilai_api.routers.endpoints.chat.handle_nilrag" + ) as mock_handle_nilrag, + patch( + "nilai_api.routers.endpoints.chat.handle_web_search" ) as mock_handle_web_search, patch( - "nilai_api.routers.private.UserManager.update_token_usage" + "nilai_api.routers.endpoints.chat.UserManager.update_token_usage" ) as mock_update_usage, patch( - "nilai_api.routers.private.QueryLogManager.log_query" + "nilai_api.routers.endpoints.chat.QueryLogManager.log_query" ) as mock_log_query, + patch( + "nilai_api.routers.endpoints.chat.handle_tool_workflow" + ) as mock_handle_tool_workflow, ): # Mock state.get_model() to return a ModelEndpoint mock_model_endpoint = MagicMock() @@ -362,6 +377,9 @@ async def test_chat_completion_without_prompt_document(self): "total_tokens": 15, }, } + mock_response.usage = MagicMock() + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 # Make the create method itself an AsyncMock that returns the response mock_client_instance.chat.completions.create = AsyncMock( return_value=mock_response @@ -369,16 +387,227 @@ async def test_chat_completion_without_prompt_document(self): mock_client_instance.close = AsyncMock() mock_openai_client.return_value = mock_client_instance + # Mock handle_tool_workflow to return the response and token counts + mock_handle_tool_workflow.return_value = (mock_response, 0, 0) + # Call the function - try: - await chat_completion(req=request, auth_info=mock_auth_info) - except Exception: - # Expected to fail due to incomplete mocking - pass + await chat_completion(req=request, auth_info=mock_auth_info) # Should not call get_prompt_from_nildb when no prompt document mock_get_prompt.assert_not_called() + @pytest.mark.asyncio + async def test_responses_with_prompt_document_injection(self): + """Test responses endpoint with prompt document injection""" + from nilai_api.routers.endpoints.responses import create_response + + mock_prompt_document = PromptDocument( + document_id="test-doc-123", owner_did="did:nil:" + "1" * 66 + ) + + mock_user = MagicMock() + mock_user.userid = "test-user-id" + mock_user.name = "Test User" + mock_user.apikey = "test-api-key" + mock_user.rate_limits = RateLimits().get_effective_limits() + + mock_auth_info = AuthenticationInfo( + user=mock_user, token_rate_limit=None, prompt_document=mock_prompt_document + ) + + request = ResponseRequest(model="test-model", input="Hello") + + response_payload = { + "id": "test-response-id", + "object": "response", + "model": "test-model", + "created_at": 123456.0, + "status": "completed", + "output": [], + "parallel_tool_calls": False, + "tool_choice": "auto", + "tools": [], + "usage": { + "input_tokens": 10, + "input_tokens_details": {"cached_tokens": 0}, + "output_tokens": 5, + "output_tokens_details": {"reasoning_tokens": 0}, + "total_tokens": 15, + }, + } + + with ( + patch( + "nilai_api.routers.endpoints.responses.get_prompt_from_nildb" + ) as mock_get_prompt, + patch( + "nilai_api.routers.endpoints.responses.AsyncOpenAI" + ) as mock_openai_client, + patch( + "nilai_api.routers.endpoints.responses.state.get_model" + ) as 
mock_get_model, + patch( + "nilai_api.routers.endpoints.responses.UserManager.update_token_usage" + ) as mock_update_usage, + patch( + "nilai_api.routers.endpoints.responses.QueryLogManager.log_query" + ) as mock_log_query, + patch( + "nilai_api.routers.endpoints.responses.handle_responses_tool_workflow" + ) as mock_handle_tool_workflow, + ): + mock_get_prompt.return_value = "System prompt from nilDB" + + mock_model_endpoint = MagicMock() + mock_model_endpoint.url = "http://test-model-endpoint" + mock_model_endpoint.metadata.tool_support = True + mock_model_endpoint.metadata.multimodal_support = True + mock_get_model.return_value = mock_model_endpoint + + mock_update_usage.return_value = None + mock_log_query.return_value = None + + mock_client_instance = MagicMock() + mock_response = MagicMock() + mock_response.model_dump.return_value = response_payload + mock_client_instance.responses.create = AsyncMock( + return_value=mock_response + ) + mock_openai_client.return_value = mock_client_instance + + mock_handle_tool_workflow.return_value = (mock_response, 0, 0) + + await create_response(req=request, auth_info=mock_auth_info) + + mock_get_prompt.assert_called_once_with(mock_prompt_document) + + @pytest.mark.asyncio + async def test_responses_prompt_document_extraction_error(self): + """Test responses endpoint when prompt document extraction fails""" + from nilai_api.routers.endpoints.responses import create_response + + mock_prompt_document = PromptDocument( + document_id="test-doc-123", owner_did="did:nil:" + "1" * 66 + ) + + mock_user = MagicMock() + mock_user.userid = "test-user-id" + mock_user.name = "Test User" + mock_user.apikey = "test-api-key" + mock_user.rate_limits = RateLimits().get_effective_limits() + + mock_auth_info = AuthenticationInfo( + user=mock_user, token_rate_limit=None, prompt_document=mock_prompt_document + ) + + request = ResponseRequest(model="test-model", input="Hello") + + with ( + patch( + "nilai_api.routers.endpoints.responses.get_prompt_from_nildb" + ) as mock_get_prompt, + patch( + "nilai_api.routers.endpoints.responses.state.get_model" + ) as mock_get_model, + ): + mock_model_endpoint = MagicMock() + mock_model_endpoint.url = "http://test-model-endpoint" + mock_model_endpoint.metadata.tool_support = True + mock_model_endpoint.metadata.multimodal_support = True + mock_get_model.return_value = mock_model_endpoint + + mock_get_prompt.side_effect = Exception("Unable to extract prompt") + + with pytest.raises(HTTPException) as exc_info: + await create_response(req=request, auth_info=mock_auth_info) + + assert exc_info.value.status_code == status.HTTP_403_FORBIDDEN + assert ( + "Unable to extract prompt from nilDB: Unable to extract prompt" + in str(exc_info.value.detail) + ) + + @pytest.mark.asyncio + async def test_responses_without_prompt_document(self): + """Test responses endpoint when no prompt document is present""" + from nilai_api.routers.endpoints.responses import create_response + + mock_user = MagicMock() + mock_user.userid = "test-user-id" + mock_user.name = "Test User" + mock_user.apikey = "test-api-key" + mock_user.rate_limits = RateLimits().get_effective_limits() + + mock_auth_info = AuthenticationInfo( + user=mock_user, + token_rate_limit=None, + prompt_document=None, + ) + + request = ResponseRequest(model="test-model", input="Hello") + + response_payload = { + "id": "test-response-id", + "object": "response", + "model": "test-model", + "created_at": 123456.0, + "status": "completed", + "output": [], + "parallel_tool_calls": False, + 
"tool_choice": "auto", + "tools": [], + "usage": { + "input_tokens": 10, + "input_tokens_details": {"cached_tokens": 0}, + "output_tokens": 5, + "output_tokens_details": {"reasoning_tokens": 0}, + "total_tokens": 15, + }, + } + + with ( + patch( + "nilai_api.routers.endpoints.responses.get_prompt_from_nildb" + ) as mock_get_prompt, + patch( + "nilai_api.routers.endpoints.responses.AsyncOpenAI" + ) as mock_openai_client, + patch( + "nilai_api.routers.endpoints.responses.state.get_model" + ) as mock_get_model, + patch( + "nilai_api.routers.endpoints.responses.UserManager.update_token_usage" + ) as mock_update_usage, + patch( + "nilai_api.routers.endpoints.responses.QueryLogManager.log_query" + ) as mock_log_query, + patch( + "nilai_api.routers.endpoints.responses.handle_responses_tool_workflow" + ) as mock_handle_tool_workflow, + ): + mock_model_endpoint = MagicMock() + mock_model_endpoint.url = "http://test-model-endpoint" + mock_model_endpoint.metadata.tool_support = True + mock_model_endpoint.metadata.multimodal_support = True + mock_get_model.return_value = mock_model_endpoint + + mock_update_usage.return_value = None + mock_log_query.return_value = None + + mock_client_instance = MagicMock() + mock_response = MagicMock() + mock_response.model_dump.return_value = response_payload + mock_client_instance.responses.create = AsyncMock( + return_value=mock_response + ) + mock_openai_client.return_value = mock_client_instance + + mock_handle_tool_workflow.return_value = (mock_response, 0, 0) + + await create_response(req=request, auth_info=mock_auth_info) + + mock_get_prompt.assert_not_called() + def test_prompt_delegation_request_model_validation(self): """Test PromptDelegationRequest model validation""" # Valid request diff --git a/tests/unit/nilai_api/routers/test_responses_private.py b/tests/unit/nilai_api/routers/test_responses_private.py new file mode 100644 index 00000000..5c65c648 --- /dev/null +++ b/tests/unit/nilai_api/routers/test_responses_private.py @@ -0,0 +1,330 @@ +import asyncio +import json +from unittest.mock import AsyncMock, MagicMock + +import pytest +from fastapi.testclient import TestClient + +from nilai_api.db.users import RateLimits, UserModel +from nilai_common import AttestationReport, Source + +from nilai_api.state import state +from ... 
import ( + model_endpoint, + model_metadata, + RESPONSES_RESPONSE, +) + + +@pytest.mark.asyncio +async def test_runs_in_a_loop(): + assert asyncio.get_running_loop() + + +@pytest.fixture +def mock_user(): + mock = MagicMock(spec=UserModel) + mock.userid = "test-user-id" + mock.name = "Test User" + mock.apikey = "test-api-key" + mock.prompt_tokens = 100 + mock.completion_tokens = 50 + mock.total_tokens = 150 + mock.completion_tokens_details = None + mock.prompt_tokens_details = None + mock.queries = 10 + mock.rate_limits = RateLimits().get_effective_limits().model_dump_json() + mock.rate_limits_obj = RateLimits().get_effective_limits() + return mock + + +@pytest.fixture +def mock_user_manager(mock_user, mocker): + from nilai_api.db.users import UserManager + from nilai_api.db.logs import QueryLogManager + + mocker.patch.object( + UserManager, + "get_token_usage", + return_value={ + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150, + "queries": 10, + }, + ) + mocker.patch.object(UserManager, "update_token_usage") + mocker.patch.object( + UserManager, + "get_user_token_usage", + return_value={ + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150, + "completion_tokens_details": None, + "prompt_tokens_details": None, + "queries": 10, + }, + ) + mocker.patch.object( + UserManager, + "insert_user", + return_value={ + "userid": "test-user-id", + "apikey": "test-api-key", + "rate_limits": RateLimits().get_effective_limits().model_dump_json(), + }, + ) + mocker.patch.object( + UserManager, + "check_api_key", + return_value=mock_user, + ) + mocker.patch.object( + UserManager, + "get_all_users", + return_value=[ + { + "userid": "test-user-id", + "apikey": "test-api-key", + "rate_limits": RateLimits().get_effective_limits().model_dump_json(), + }, + { + "userid": "test-user-id-2", + "apikey": "test-api-key", + "rate_limits": RateLimits().get_effective_limits().model_dump_json(), + }, + ], + ) + mocker.patch.object(QueryLogManager, "log_query") + mocker.patch.object(UserManager, "update_last_activity") + return UserManager + + +@pytest.fixture +def mock_state(mocker): + expected_models = {"ABC": model_endpoint} + + mock_discovery_service = mocker.Mock() + mock_discovery_service.discover_models = AsyncMock(return_value=expected_models) + + mocker.patch.object(state, "discovery_service", mock_discovery_service) + + mocker.patch.object(state, "b64_public_key", "test-verifying-key") + + mocker.patch.object(state, "get_model", return_value=model_endpoint) + + attestation_response = AttestationReport( + verifying_key="test-verifying-key", + nonce="0" * 64, + cpu_attestation="test-cpu-attestation", + gpu_attestation="test-gpu-attestation", + ) + mocker.patch( + "nilai_api.routers.private.get_attestation_report", + new_callable=AsyncMock, + return_value=attestation_response, + ) + + return state + + +@pytest.fixture +def client(mock_user_manager): + from nilai_api.app import app + + with TestClient(app) as client: + yield client + + +@pytest.mark.asyncio +async def test_models_property(mock_state): + models = await state.models + + assert models == {"ABC": model_endpoint} + + +def test_get_usage(mock_user, mock_user_manager, mock_state, client): + response = client.get("/v1/usage", headers={"Authorization": "Bearer test-api-key"}) + assert response.status_code == 200 + assert response.json() == { + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150, + "completion_tokens_details": None, + "prompt_tokens_details": None, + "queries": 10, + } + + +def 
test_get_attestation(mock_user, mock_user_manager, mock_state, client): + response = client.get( + "/v1/attestation/report", + headers={"Authorization": "Bearer test-api-key"}, + params={"nonce": "0" * 64}, + ) + assert response.status_code == 200 + assert response.json()["verifying_key"] == "test-verifying-key" + assert response.json()["cpu_attestation"] == "test-cpu-attestation" + assert response.json()["gpu_attestation"] == "test-gpu-attestation" + + +def test_get_models(mock_user, mock_user_manager, mock_state, client): + response = client.get( + "/v1/models", headers={"Authorization": "Bearer test-api-key"} + ) + assert response.status_code == 200 + assert response.json() == [model_metadata.model_dump()] + + +def test_create_response(mock_user, mock_state, mock_user_manager, mocker, client): + mocker.patch("openai.api_key", new="test-api-key") + + response_data = RESPONSES_RESPONSE + + mock_responses = MagicMock() + mock_responses.create = mocker.AsyncMock(return_value=response_data) + mock_async_openai_instance = MagicMock() + mock_async_openai_instance.responses = mock_responses + + mocker.patch( + "nilai_api.routers.endpoints.responses.AsyncOpenAI", + return_value=mock_async_openai_instance, + ) + mocker.patch( + "nilai_api.routers.endpoints.responses.handle_responses_tool_workflow", + return_value=(response_data, 0, 0), + ) + + payload = { + "model": "meta-llama/Llama-3.2-1B-Instruct", + "instructions": "You are a helpful assistant.", + "input": "What is your name?", + } + + response = client.post( + "/v1/responses", + json=payload, + headers={"Authorization": "Bearer test-api-key"}, + ) + + assert response.status_code == 200 + assert "usage" in response.json() + assert response_data.usage is not None + assert response.json()["usage"] == response_data.usage.model_dump(mode="json") + + +def test_create_response_stream_includes_sources( + mock_user, mock_state, mock_user_manager, mocker, client +): + from openai.types.responses import Response as OpenAIResponse, ResponseUsage + from openai.types.responses.response_usage import ( + InputTokensDetails, + OutputTokensDetails, + ) + from nilai_common import ResponseCompletedEvent + + mock_user.rate_limits_obj.web_search_rate_limit_minute = 100 + + source = Source(source="https://example.com", content="Example result") + + mock_web_search_result = MagicMock() + mock_web_search_result.input = [ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "Tell me something new."}, + ], + "type": "message", + } + ] + mock_web_search_result.instructions = "You are a helpful assistant." 
+ mock_web_search_result.sources = [source] + + mocker.patch( + "nilai_api.routers.endpoints.responses.handle_web_search_for_responses", + new=AsyncMock(return_value=mock_web_search_result), + ) + + class MockEvent: + def __init__(self, data): + self._data = data + + def model_dump(self, exclude_unset=True): + return self._data + + streaming_usage = ResponseUsage( + input_tokens=5, + input_tokens_details=InputTokensDetails(cached_tokens=0), + output_tokens=7, + output_tokens_details=OutputTokensDetails(reasoning_tokens=0), + total_tokens=12, + ) + + streaming_response = OpenAIResponse( + **{ + **RESPONSES_RESPONSE.model_dump(), + "usage": streaming_usage, + } + ) + + first_event = MockEvent( + { + "type": "response.output_text.delta", + "response": {"id": "resp-stream-1"}, + "delta": {"text": "Hello"}, + } + ) + + final_event = ResponseCompletedEvent( + response=streaming_response, sequence_number=1, type="response.completed" + ) + + async def chunk_generator(): + yield first_event + yield final_event + + mock_responses = MagicMock() + mock_responses.create = AsyncMock(return_value=chunk_generator()) + mock_async_openai_instance = MagicMock() + mock_async_openai_instance.responses = mock_responses + + mocker.patch( + "nilai_api.routers.endpoints.responses.AsyncOpenAI", + return_value=mock_async_openai_instance, + ) + + payload = { + "model": "meta-llama/Llama-3.2-1B-Instruct", + "instructions": "You are a helpful assistant.", + "input": [ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "Tell me something new."}, + ], + "type": "message", + } + ], + "stream": True, + "web_search": True, + } + + headers = {"Authorization": "Bearer test-api-key"} + + with client.stream("POST", "/v1/responses", json=payload, headers=headers) as resp: + assert resp.status_code == 200 + data_lines = [ + line for line in resp.iter_lines() if line and line.startswith("data: ") + ] + + assert data_lines, "Expected SSE data from stream response" + + first_payload = json.loads(data_lines[0][len("data: ") :]) + assert "data" not in first_payload or "sources" not in first_payload.get("data", {}) + + final_payload = json.loads(data_lines[-1][len("data: ") :]) + assert "data" in final_payload + assert "sources" in final_payload["data"] + assert len(final_payload["data"]["sources"]) == 1 + assert final_payload["data"]["sources"][0]["source"] == "https://example.com" diff --git a/tests/unit/nilai_api/test_rate_limiting.py b/tests/unit/nilai_api/test_rate_limiting.py index 4cf53b0e..27a5c1bc 100644 --- a/tests/unit/nilai_api/test_rate_limiting.py +++ b/tests/unit/nilai_api/test_rate_limiting.py @@ -11,6 +11,7 @@ from fastapi import HTTPException, Request from nilai_api.rate_limiting import RateLimit, UserRateLimits, setup_redis_conn +from nilai_api.config import CONFIG @pytest_asyncio.fixture @@ -72,6 +73,45 @@ async def test_concurrent_rate_limit(req): await asyncio.gather(*futures) +@pytest.mark.asyncio +async def test_web_search_rps_limit(redis_client): + mock_request = MagicMock(spec=Request) + mock_request.state.redis = redis_client[0] + mock_request.state.redis_rate_limit_command = redis_client[1] + # Ensure a clean slate for the global RPS key used by the limiter + await redis_client[0].delete("web_search_rps") + + async def web_search_extractor(_): + return True + + rate_limit = RateLimit(web_search_extractor=web_search_extractor) + user_limits = UserRateLimits( + subscription_holder=random_id(), + token_rate_limit=None, + rate_limits=RateLimits( + user_rate_limit_day=None, + 
user_rate_limit_hour=None, + user_rate_limit_minute=None, + web_search_rate_limit_day=None, + web_search_rate_limit_hour=None, + web_search_rate_limit_minute=None, + user_rate_limit=None, + web_search_rate_limit=None, + ), + ) + + old_rps = CONFIG.web_search.rps + CONFIG.web_search.rps = 2 + try: + await consume_generator(rate_limit(mock_request, user_limits)) + await consume_generator(rate_limit(mock_request, user_limits)) + with pytest.raises(HTTPException): + await consume_generator(rate_limit(mock_request, user_limits)) + finally: + CONFIG.web_search.rps = old_rps + await redis_client[0].delete("web_search_rps") + + @pytest.mark.asyncio @pytest.mark.parametrize( "user_limits", @@ -199,86 +239,3 @@ async def web_search_extractor(request): # Second request should be rejected due to minute limit (1 per minute) with pytest.raises(HTTPException): await consume_generator(rate_limit(mock_request, user_limits)) - - -@pytest.mark.asyncio -async def test_global_web_search_rps_limit(req, redis_client, monkeypatch): - from nilai_api.config import CONFIG - - await redis_client[0].delete("global:web_search:rps") - monkeypatch.setattr(CONFIG.web_search, "rps", 20) - monkeypatch.setattr(CONFIG.web_search, "max_concurrent_requests", 20) - monkeypatch.setattr(CONFIG.web_search, "count", 1) - - rate_limit = RateLimit(web_search_extractor=lambda _: True) - user_limits = UserRateLimits( - subscription_holder=random_id(), - token_rate_limit=None, - rate_limits=RateLimits( - user_rate_limit_day=None, - user_rate_limit_hour=None, - user_rate_limit_minute=None, - web_search_rate_limit_day=None, - web_search_rate_limit_hour=None, - web_search_rate_limit_minute=None, - user_rate_limit=None, - web_search_rate_limit=None, - ), - ) - - async def run_guarded(i, times, t0): - async for _ in rate_limit(req, user_limits): - times[i] = asyncio.get_event_loop().time() - t0 - await asyncio.sleep(0.01) - - n = 40 - times = [0.0] * n - t0 = asyncio.get_event_loop().time() - tasks = [asyncio.create_task(run_guarded(i, times, t0)) for i in range(n)] - await asyncio.gather(*tasks) - - within_first_second = [t for t in times if t < 1.0] - assert len(within_first_second) <= 20 - assert max(times) >= 1.0 - - -@pytest.mark.asyncio -async def test_queueing_across_seconds(req, redis_client, monkeypatch): - from nilai_api.config import CONFIG - - await redis_client[0].delete("global:web_search:rps") - monkeypatch.setattr(CONFIG.web_search, "rps", 20) - monkeypatch.setattr(CONFIG.web_search, "max_concurrent_requests", 20) - monkeypatch.setattr(CONFIG.web_search, "count", 1) - - rate_limit = RateLimit(web_search_extractor=lambda _: True) - user_limits = UserRateLimits( - subscription_holder=random_id(), - token_rate_limit=None, - rate_limits=RateLimits( - user_rate_limit_day=None, - user_rate_limit_hour=None, - user_rate_limit_minute=None, - web_search_rate_limit_day=None, - web_search_rate_limit_hour=None, - web_search_rate_limit_minute=None, - user_rate_limit=None, - web_search_rate_limit=None, - ), - ) - - async def run_guarded(i, times, t0): - async for _ in rate_limit(req, user_limits): - times[i] = asyncio.get_event_loop().time() - t0 - await asyncio.sleep(0.01) - - n = 25 - times = [0.0] * n - t0 = asyncio.get_event_loop().time() - tasks = [asyncio.create_task(run_guarded(i, times, t0)) for i in range(n)] - await asyncio.gather(*tasks) - - first_window = [t for t in times if t < 1.0] - second_window = [t for t in times if 1.0 <= t < 2.0] - assert len(first_window) <= 20 - assert len(second_window) >= 1 diff --git 
a/tests/unit/nilai_api/test_web_search.py b/tests/unit/nilai_api/test_web_search.py index d7afc712..ef94a1f5 100644 --- a/tests/unit/nilai_api/test_web_search.py +++ b/tests/unit/nilai_api/test_web_search.py @@ -6,7 +6,7 @@ enhance_messages_with_web_search, ) from nilai_common import MessageAdapter, ChatRequest -from nilai_common.api_model import ( +from nilai_common.api_models import ( WebSearchContext, Source, )