Skip to content

Commit 4bb351d

Browse files
committed
Make query endpoints compatible with OLS
1 parent 780bac2 commit 4bb351d

File tree

13 files changed

+254
-114
lines changed

13 files changed

+254
-114
lines changed

src/app/endpoints/query.py

Lines changed: 15 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,13 @@
88
from typing import Annotated, Any, Optional, cast
99

1010
from fastapi import APIRouter, Depends, HTTPException, Request
11-
from litellm.exceptions import RateLimitError
1211
from llama_stack_client import (
1312
APIConnectionError,
1413
AsyncLlamaStackClient, # type: ignore
1514
)
1615
from llama_stack_client.types import Shield, UserMessage # type: ignore
1716
from llama_stack_client.types.alpha.agents.turn import Turn
1817
from llama_stack_client.types.alpha.agents.turn_create_params import (
19-
Document,
2018
Toolgroup,
2119
ToolgroupAgentToolGroupWithArgs,
2220
)
@@ -42,10 +40,10 @@
4240
InternalServerErrorResponse,
4341
NotFoundResponse,
4442
QueryResponse,
43+
PromptTooLongResponse,
4544
QuotaExceededResponse,
4645
ReferencedDocument,
4746
ServiceUnavailableResponse,
48-
ToolCall,
4947
UnauthorizedResponse,
5048
UnprocessableEntityResponse,
5149
)
@@ -84,6 +82,7 @@
8482
404: NotFoundResponse.openapi_response(
8583
examples=["model", "conversation", "provider"]
8684
),
85+
413: PromptTooLongResponse.openapi_response(),
8786
422: UnprocessableEntityResponse.openapi_response(),
8887
429: QuotaExceededResponse.openapi_response(),
8988
500: InternalServerErrorResponse.openapi_response(examples=["configuration"]),
@@ -379,20 +378,6 @@ async def query_endpoint_handler_base( # pylint: disable=R0914
379378

380379
# Convert tool calls to response format
381380
logger.info("Processing tool calls...")
382-
tool_calls = [
383-
ToolCall(
384-
tool_name=tc.name,
385-
arguments=(
386-
tc.args if isinstance(tc.args, dict) else {"query": str(tc.args)}
387-
),
388-
result=(
389-
{"response": tc.response}
390-
if tc.response and tc.name != constants.DEFAULT_RAG_TOOL
391-
else None
392-
),
393-
)
394-
for tc in summary.tool_calls
395-
]
396381

397382
logger.info("Using referenced documents from response...")
398383

@@ -403,7 +388,8 @@ async def query_endpoint_handler_base( # pylint: disable=R0914
403388
conversation_id=conversation_id,
404389
response=summary.llm_response,
405390
rag_chunks=summary.rag_chunks if summary.rag_chunks else [],
406-
tool_calls=tool_calls if tool_calls else None,
391+
tool_calls=summary.tool_calls if summary.tool_calls else None,
392+
tool_results=summary.tool_results if summary.tool_results else None,
407393
referenced_documents=referenced_documents,
408394
truncated=False, # TODO: implement truncation detection
409395
input_tokens=token_usage.input_tokens,
@@ -427,7 +413,7 @@ async def query_endpoint_handler_base( # pylint: disable=R0914
427413
logger.exception("Error persisting conversation details: %s", e)
428414
response = InternalServerErrorResponse.database_error()
429415
raise HTTPException(**response.model_dump()) from e
430-
except RateLimitError as e:
416+
except Exception as e:
431417
used_model = getattr(e, "model", "")
432418
response = QuotaExceededResponse.model(used_model)
433419
raise HTTPException(**response.model_dump()) from e
@@ -743,14 +729,14 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
743729
toolgroups = None
744730

745731
# TODO: LCORE-881 - Remove if Llama Stack starts to support these mime types
746-
documents: list[Document] = [
747-
(
748-
{"content": doc["content"], "mime_type": "text/plain"}
749-
if doc["mime_type"].lower() in ("application/json", "application/xml")
750-
else doc
751-
)
752-
for doc in query_request.get_documents()
753-
]
732+
# documents: list[Document] = [
733+
# (
734+
# {"content": doc["content"], "mime_type": "text/plain"}
735+
# if doc["mime_type"].lower() in ("application/json", "application/xml")
736+
# else doc
737+
# )
738+
# for doc in query_request.get_documents()
739+
# ]
754740

755741
response = await agent.create_turn(
756742
messages=[UserMessage(role="user", content=query_request.query).model_dump()],
@@ -771,6 +757,8 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
771757
else ""
772758
),
773759
tool_calls=[],
760+
tool_results=[],
761+
rag_chunks=[],
774762
)
775763

776764
referenced_documents = parse_referenced_documents(response)

src/app/endpoints/query_v2.py

Lines changed: 40 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
# pylint: disable=too-many-locals,too-many-branches,too-many-nested-blocks
2+
13
"""Handler for REST API call to provide answer to query using Response API."""
24

3-
import json
45
import logging
56
from typing import Annotated, Any, cast
67

@@ -24,6 +25,7 @@
2425
from models.requests import QueryRequest
2526
from models.responses import (
2627
ForbiddenResponse,
28+
PromptTooLongResponse,
2729
InternalServerErrorResponse,
2830
NotFoundResponse,
2931
QueryResponse,
@@ -59,6 +61,7 @@
5961
404: NotFoundResponse.openapi_response(
6062
examples=["conversation", "model", "provider"]
6163
),
64+
413: PromptTooLongResponse.openapi_response(),
6265
422: UnprocessableEntityResponse.openapi_response(),
6366
429: QuotaExceededResponse.openapi_response(),
6467
500: InternalServerErrorResponse.openapi_response(examples=["configuration"]),
@@ -96,7 +99,7 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-
9699
id=str(call_id),
97100
name=getattr(output_item, "name", "function_call"),
98101
args=args,
99-
response=None,
102+
type="tool_call",
100103
)
101104

102105
if item_type == "file_search_call":
@@ -105,36 +108,38 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-
105108
"status": getattr(output_item, "status", None),
106109
}
107110
results = getattr(output_item, "results", None)
108-
response_payload: Any | None = None
111+
# response_payload: Any | None = None
109112
if results is not None:
110113
# Store only the essential result metadata to avoid large payloads
111-
response_payload = {
112-
"results": [
113-
{
114-
"file_id": (
115-
getattr(result, "file_id", None)
116-
if not isinstance(result, dict)
117-
else result.get("file_id")
118-
),
119-
"filename": (
120-
getattr(result, "filename", None)
121-
if not isinstance(result, dict)
122-
else result.get("filename")
123-
),
124-
"score": (
125-
getattr(result, "score", None)
126-
if not isinstance(result, dict)
127-
else result.get("score")
128-
),
129-
}
130-
for result in results
131-
]
132-
}
114+
# response_payload = {
115+
# "results": [
116+
# {
117+
# "file_id": (
118+
# getattr(result, "file_id", None)
119+
# if not isinstance(result, dict)
120+
# else result.get("file_id")
121+
# ),
122+
# "filename": (
123+
# getattr(result, "filename", None)
124+
# if not isinstance(result, dict)
125+
# else result.get("filename")
126+
# ),
127+
# "score": (
128+
# getattr(result, "score", None)
129+
# if not isinstance(result, dict)
130+
# else result.get("score")
131+
# ),
132+
# }
133+
# for result in results
134+
# ]
135+
# }
136+
... # Handle response_payload
133137
return ToolCallSummary(
134138
id=str(getattr(output_item, "id")),
135139
name=DEFAULT_RAG_TOOL,
136140
args=args,
137-
response=json.dumps(response_payload) if response_payload else None,
141+
# response=json.dumps(response_payload) if response_payload else None,
142+
type="tool_call",
138143
)
139144

140145
if item_type == "web_search_call":
@@ -143,7 +148,7 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-
143148
id=str(getattr(output_item, "id")),
144149
name="web_search",
145150
args=args,
146-
response=None,
151+
type="tool_call",
147152
)
148153

149154
if item_type == "mcp_call":
@@ -160,7 +165,8 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-
160165
id=str(getattr(output_item, "id")),
161166
name=getattr(output_item, "name", "mcp_call"),
162167
args=args,
163-
response=getattr(output_item, "output", None),
168+
# response=getattr(output_item, "output", None),
169+
type="tool_call",
164170
)
165171

166172
if item_type == "mcp_list_tools":
@@ -178,7 +184,8 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-
178184
id=str(getattr(output_item, "id")),
179185
name="mcp_list_tools",
180186
args=args,
181-
response=None,
187+
# response=None,
188+
type="tool_call",
182189
)
183190

184191
if item_type == "mcp_approval_request":
@@ -191,7 +198,8 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-
191198
id=str(getattr(output_item, "id")),
192199
name=getattr(output_item, "name", "mcp_approval_request"),
193200
args=args,
194-
response=None,
201+
# response=None,
202+
type="tool_call",
195203
)
196204

197205
return None
@@ -400,6 +408,8 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
400408
summary = TurnSummary(
401409
llm_response=llm_response,
402410
tool_calls=tool_calls,
411+
tool_results=[],
412+
rag_chunks=[],
403413
)
404414

405415
# Extract referenced documents and token usage from Responses API response

src/app/endpoints/streaming_query.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
from llama_stack_client.types.alpha.agents.agent_turn_response_stream_chunk import (
2121
AgentTurnResponseStreamChunk,
2222
)
23-
from llama_stack_client.types.alpha.agents.turn_create_params import Document
2423
from llama_stack_client.types.shared import ToolCall
2524
from llama_stack_client.types.shared.interleaved_content_item import TextContentItem
2625

@@ -51,6 +50,7 @@
5150
from models.responses import (
5251
ForbiddenResponse,
5352
InternalServerErrorResponse,
53+
PromptTooLongResponse,
5454
NotFoundResponse,
5555
QuotaExceededResponse,
5656
ServiceUnavailableResponse,
@@ -86,6 +86,7 @@
8686
404: NotFoundResponse.openapi_response(
8787
examples=["conversation", "model", "provider"]
8888
),
89+
413: PromptTooLongResponse.openapi_response(),
8990
422: UnprocessableEntityResponse.openapi_response(),
9091
429: QuotaExceededResponse.openapi_response(),
9192
500: InternalServerErrorResponse.openapi_response(examples=["configuration"]),
@@ -704,7 +705,10 @@ async def response_generator(
704705
complete response for transcript storage if enabled.
705706
"""
706707
chunk_id = 0
707-
summary = TurnSummary(llm_response="No response from the model", tool_calls=[])
708+
summary = TurnSummary(
709+
llm_response="No response from the model",
710+
tool_calls=[], tool_results=[], rag_chunks=[]
711+
)
708712

709713
# Determine media type for response formatting
710714
media_type = context.query_request.media_type or MEDIA_TYPE_JSON
@@ -1064,14 +1068,14 @@ async def retrieve_response(
10641068
toolgroups = None
10651069

10661070
# TODO: LCORE-881 - Remove if Llama Stack starts to support these mime types
1067-
documents: list[Document] = [
1068-
(
1069-
{"content": doc["content"], "mime_type": "text/plain"}
1070-
if doc["mime_type"].lower() in ("application/json", "application/xml")
1071-
else doc
1072-
)
1073-
for doc in query_request.get_documents()
1074-
]
1071+
# documents: list[Document] = [
1072+
# (
1073+
# {"content": doc["content"], "mime_type": "text/plain"}
1074+
# if doc["mime_type"].lower() in ("application/json", "application/xml")
1075+
# else doc
1076+
# )
1077+
# for doc in query_request.get_documents()
1078+
# ]
10751079

10761080
response = await agent.create_turn(
10771081
messages=[UserMessage(role="user", content=query_request.query).model_dump()],

src/app/endpoints/streaming_query_v2.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
ForbiddenResponse,
3939
InternalServerErrorResponse,
4040
NotFoundResponse,
41+
PromptTooLongResponse,
4142
QuotaExceededResponse,
4243
ServiceUnavailableResponse,
4344
StreamingQueryResponse,
@@ -70,6 +71,7 @@
7071
404: NotFoundResponse.openapi_response(
7172
examples=["conversation", "model", "provider"]
7273
),
74+
413: PromptTooLongResponse.openapi_response(),
7375
422: UnprocessableEntityResponse.openapi_response(),
7476
429: QuotaExceededResponse.openapi_response(),
7577
500: InternalServerErrorResponse.openapi_response(examples=["configuration"]),
@@ -108,7 +110,9 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
108110
complete response for transcript storage if enabled.
109111
"""
110112
chunk_id = 0
111-
summary = TurnSummary(llm_response="", tool_calls=[])
113+
summary = TurnSummary(
114+
llm_response="", tool_calls=[], tool_results=[], rag_chunks=[]
115+
)
112116

113117
# Determine media type for response formatting
114118
media_type = context.query_request.media_type or MEDIA_TYPE_JSON
@@ -216,8 +220,10 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
216220
ToolCallSummary(
217221
id=meta.get("call_id", item_id or "unknown"),
218222
name=meta.get("name", "tool_call"),
219-
args=arguments,
220-
response=None,
223+
args=(
224+
arguments if isinstance(arguments, dict) else {}
225+
), # Handle non-dict arguments
226+
type="tool_call",
221227
)
222228
)
223229

src/app/routers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def include_routers(app: FastAPI) -> None:
4747
app.include_router(conversations_v2.router, prefix="/v2")
4848

4949
# Note: query_v2, streaming_query_v2, and conversations_v3 are now exposed at /v1 above
50-
# The old query, streaming_query, and conversations modules are deprecated but kept for reference
50+
# The old query, streaming_query, and conversations modules are deprecated
5151

5252
# road-core does not version these endpoints
5353
app.include_router(health.router)

0 commit comments

Comments
 (0)