quadratichq · davidkircos · Feb 11, 2025 · Feb 11, 2025 · Feb 11, 2025 · Feb 11, 2025
diff --git a/quadratic-api/src/ai/handler/anthropic.ts b/quadratic-api/src/ai/handler/anthropic.ts
@@ -1,7 +1,7 @@
 import Anthropic from '@anthropic-ai/sdk';
 import type { Response } from 'express';
 import { getModelOptions } from 'quadratic-shared/ai/helpers/model.helper';
-import type { AIMessagePrompt, AIRequestHelperArgs, AnthropicModel } from 'quadratic-shared/typesAndSchemasAI';
+import type { AIRequestHelperArgs, AnthropicModel, ParsedAIResponse } from 'quadratic-shared/typesAndSchemasAI';
 import { ANTHROPIC_API_KEY } from '../../env-vars';
 import { getAnthropicApiArgs, parseAnthropicResponse, parseAnthropicStream } from '../helpers/anthropic.helper';
 
@@ -13,7 +13,7 @@ export const handleAnthropicRequest = async (
   model: AnthropicModel,
   args: AIRequestHelperArgs,
   response: Response
-): Promise<AIMessagePrompt | undefined> => {
+): Promise<ParsedAIResponse | undefined> => {
   const { system, messages, tools, tool_choice } = getAnthropicApiArgs(args);
   const { stream, temperature, max_tokens } = getModelOptions(model, args);
 
@@ -34,8 +34,8 @@ export const handleAnthropicRequest = async (
       response.setHeader('Cache-Control', 'no-cache');
       response.setHeader('Connection', 'keep-alive');
 
-      const responseMessage = await parseAnthropicStream(chunks, response, model);
-      return responseMessage;
+      const parsedResponse = await parseAnthropicStream(chunks, response, model);
+      return parsedResponse;
     } catch (error: any) {
       if (!response.headersSent) {
         if (error instanceof Anthropic.APIError) {
@@ -62,8 +62,8 @@ export const handleAnthropicRequest = async (
         tool_choice,
       });
 
-      const responseMessage = parseAnthropicResponse(result, response, model);
-      return responseMessage;
+      const parsedResponse = parseAnthropicResponse(result, response, model);
+      return parsedResponse;
     } catch (error: any) {
       if (error instanceof Anthropic.APIError) {
         response.status(error.status ?? 400).json(error.message);

diff --git a/quadratic-api/src/ai/handler/bedrock.ts b/quadratic-api/src/ai/handler/bedrock.ts
@@ -3,7 +3,7 @@ import Anthropic from '@anthropic-ai/sdk';
 import { BedrockRuntimeClient, ConverseCommand, ConverseStreamCommand } from '@aws-sdk/client-bedrock-runtime';
 import { type Response } from 'express';
 import { getModelOptions, isBedrockAnthropicModel } from 'quadratic-shared/ai/helpers/model.helper';
-import type { AIMessagePrompt, AIRequestBody, BedrockModel } from 'quadratic-shared/typesAndSchemasAI';
+import type { AIRequestBody, BedrockModel, ParsedAIResponse } from 'quadratic-shared/typesAndSchemasAI';
 import { AWS_S3_ACCESS_KEY_ID, AWS_S3_REGION, AWS_S3_SECRET_ACCESS_KEY } from '../../env-vars';
 import { getAnthropicApiArgs, parseAnthropicResponse, parseAnthropicStream } from '../helpers/anthropic.helper';
 import { getBedrockApiArgs, parseBedrockResponse, parseBedrockStream } from '../helpers/bedrock.helper';
@@ -25,7 +25,7 @@ export const handleBedrockRequest = async (
   model: BedrockModel,
   args: Omit<AIRequestBody, 'chatId' | 'fileUuid' | 'source' | 'model'>,
   response: Response
-): Promise<AIMessagePrompt | undefined> => {
+): Promise<ParsedAIResponse | undefined> => {
   const { stream, temperature, max_tokens } = getModelOptions(model, args);
 
   if (isBedrockAnthropicModel(model)) {
@@ -47,8 +47,8 @@ export const handleBedrockRequest = async (
         response.setHeader('Cache-Control', 'no-cache');
         response.setHeader('Connection', 'keep-alive');
 
-        const responseMessage = await parseAnthropicStream(chunks, response, model);
-        return responseMessage;
+        const parsedResponse = await parseAnthropicStream(chunks, response, model);
+        return parsedResponse;
       } catch (error: any) {
         if (!response.headersSent) {
           if (error instanceof Anthropic.APIError) {
@@ -74,8 +74,8 @@ export const handleBedrockRequest = async (
           tool_choice,
         });
 
-        const responseMessage = parseAnthropicResponse(result, response, model);
-        return responseMessage;
+        const parsedResponse = parseAnthropicResponse(result, response, model);
+        return parsedResponse;
       } catch (error: any) {
         if (error instanceof Anthropic.APIError) {
           response.status(error.status ?? 400).json(error.message);
@@ -108,8 +108,8 @@ export const handleBedrockRequest = async (
         response.setHeader('Cache-Control', 'no-cache');
         response.setHeader('Connection', 'keep-alive');
 
-        const responseMessage = await parseBedrockStream(chunks, response, model);
-        return responseMessage;
+        const parsedResponse = await parseBedrockStream(chunks, response, model);
+        return parsedResponse;
       } catch (error: any) {
         if (!response.headersSent) {
           if (error.response) {
@@ -138,8 +138,8 @@ export const handleBedrockRequest = async (
         });
 
         const result = await bedrock.send(command);
-        const responseMessage = parseBedrockResponse(result.output, response, model);
-        return responseMessage;
+        const parsedResponse = parseBedrockResponse(result, response, model);
+        return parsedResponse;
       } catch (error: any) {
         if (error.response) {
           response.status(error.response.status).json(error.response.data);

diff --git a/quadratic-api/src/ai/handler/openai.ts b/quadratic-api/src/ai/handler/openai.ts
@@ -1,7 +1,7 @@
 import { type Response } from 'express';
 import OpenAI from 'openai';
 import { getModelOptions } from 'quadratic-shared/ai/helpers/model.helper';
-import type { AIMessagePrompt, AIRequestHelperArgs, OpenAIModel } from 'quadratic-shared/typesAndSchemasAI';
+import type { AIRequestHelperArgs, OpenAIModel, ParsedAIResponse } from 'quadratic-shared/typesAndSchemasAI';
 import { OPENAI_API_KEY } from '../../env-vars';
 import { getOpenAIApiArgs, parseOpenAIResponse, parseOpenAIStream } from '../helpers/openai.helper';
 
@@ -13,7 +13,7 @@ export const handleOpenAIRequest = async (
   model: OpenAIModel,
   args: AIRequestHelperArgs,
   response: Response
-): Promise<AIMessagePrompt | undefined> => {
+): Promise<ParsedAIResponse | undefined> => {
   const { messages, tools, tool_choice } = getOpenAIApiArgs(args);
   const { stream, temperature } = getModelOptions(model, args);
 
@@ -26,14 +26,17 @@ export const handleOpenAIRequest = async (
         stream: true,
         tools,
         tool_choice,
+        stream_options: {
+          include_usage: true,
+        },
       });
 
       response.setHeader('Content-Type', 'text/event-stream');
       response.setHeader('Cache-Control', 'no-cache');
       response.setHeader('Connection', 'keep-alive');
 
-      const responseMessage = await parseOpenAIStream(completion, response, model);
-      return responseMessage;
+      const parsedResponse = await parseOpenAIStream(completion, response, model);
+      return parsedResponse;
     } catch (error: any) {
       if (!response.headersSent) {
         if (error instanceof OpenAI.APIError) {
@@ -57,8 +60,8 @@ export const handleOpenAIRequest = async (
         tool_choice,
       });
 
-      const responseMessage = parseOpenAIResponse(result, response, model);
-      return responseMessage;
+      const parsedResponse = parseOpenAIResponse(result, response, model);
+      return parsedResponse;
     } catch (error: any) {
       if (error instanceof OpenAI.APIError) {
         response.status(error.status ?? 400).json(error.message);

diff --git a/quadratic-api/src/ai/helpers/anthropic.helper.ts b/quadratic-api/src/ai/helpers/anthropic.helper.ts
@@ -10,7 +10,9 @@ import type {
   AIRequestBody,
   AnthropicModel,
   BedrockAnthropicModel,
+  ParsedAIResponse,
 } from 'quadratic-shared/typesAndSchemasAI';
+import { calculateUsage } from './usage.helper';
 
 export function getAnthropicApiArgs(args: Omit<AIRequestBody, 'chatId' | 'fileUuid' | 'source' | 'model'>): {
   system: string | TextBlockParam[] | undefined;
@@ -21,7 +23,17 @@ export function getAnthropicApiArgs(args: Omit<AIRequestBody, 'chatId' | 'fileUu
   const { messages: chatMessages, useTools, toolName } = args;
 
   const { systemMessages, promptMessages } = getSystemPromptMessages(chatMessages);
+
+  // without prompt caching of system messages
   const system = systemMessages.join('\n\n');
+
+  // with prompt caching of system messages
+  // const system: TextBlockParam[] = systemMessages.map((message, index) => ({
+  //   type: 'text' as const,
+  //   text: message,
+  //   ...(index < 4 ? { cache_control: { type: 'ephemeral' } } : {}),
+  // }));
+
   const messages: MessageParam[] = promptMessages.reduce<MessageParam[]>((acc, message) => {
     if (message.role === 'assistant' && message.contextType === 'userPrompt' && message.toolCalls.length > 0) {
       const anthropicMessages: MessageParam[] = [
@@ -120,7 +132,7 @@ export async function parseAnthropicStream(
   chunks: Stream<Anthropic.Messages.RawMessageStreamEvent>,
   response: Response,
   model: AnthropicModel | BedrockAnthropicModel
-) {
+): Promise<ParsedAIResponse> {
   const responseMessage: AIMessagePrompt = {
     role: 'assistant',
     content: '',
@@ -129,46 +141,70 @@ export async function parseAnthropicStream(
     model,
   };
 
+  let input_tokens = 0;
+  let output_tokens = 0;
+  let cache_read_tokens = 0;
+  let cache_write_tokens = 0;
+
   for await (const chunk of chunks) {
     if (!response.writableEnded) {
-      if (chunk.type === 'content_block_start') {
-        if (chunk.content_block.type === 'text') {
-          responseMessage.content += chunk.content_block.text;
-        } else if (chunk.content_block.type === 'tool_use') {
-          const toolCalls = [...responseMessage.toolCalls];
-          const toolCall = {
-            id: chunk.content_block.id,
-            name: chunk.content_block.name,
-            arguments: '',
-            loading: true,
-          };
-          toolCalls.push(toolCall);
-          responseMessage.toolCalls = toolCalls;
-        }
-      } else if (chunk.type === 'content_block_delta') {
-        if (chunk.delta.type === 'text_delta') {
-          responseMessage.content += chunk.delta.text;
-        } else if (chunk.delta.type === 'input_json_delta') {
-          const toolCalls = [...responseMessage.toolCalls];
-          const toolCall = {
-            ...(toolCalls.pop() ?? {
-              id: '',
-              name: '',
+      switch (chunk.type) {
+        case 'content_block_start':
+          if (chunk.content_block.type === 'text') {
+            responseMessage.content += chunk.content_block.text;
+          } else if (chunk.content_block.type === 'tool_use') {
+            const toolCalls = [...responseMessage.toolCalls];
+            const toolCall = {
+              id: chunk.content_block.id,
+              name: chunk.content_block.name,
               arguments: '',
               loading: true,
-            }),
-          };
-          toolCall.arguments += chunk.delta.partial_json;
-          toolCalls.push(toolCall);
-          responseMessage.toolCalls = toolCalls;
-        }
-      } else if (chunk.type === 'content_block_stop') {
-        const toolCalls = [...responseMessage.toolCalls];
-        const toolCall = toolCalls.pop();
-        if (toolCall) {
-          toolCalls.push({ ...toolCall, loading: false });
-          responseMessage.toolCalls = toolCalls;
-        }
+            };
+            toolCalls.push(toolCall);
+            responseMessage.toolCalls = toolCalls;
+          }
+          break;
+        case 'content_block_delta':
+          if (chunk.delta.type === 'text_delta') {
+            responseMessage.content += chunk.delta.text;
+          } else if (chunk.delta.type === 'input_json_delta') {
+            const toolCalls = [...responseMessage.toolCalls];
+            const toolCall = {
+              ...(toolCalls.pop() ?? {
+                id: '',
+                name: '',
+                arguments: '',
+                loading: true,
+              }),
+            };
+            toolCall.arguments += chunk.delta.partial_json;
+            toolCalls.push(toolCall);
+            responseMessage.toolCalls = toolCalls;
+          }
+          break;
+        case 'content_block_stop':
+          {
+            const toolCalls = [...responseMessage.toolCalls];
+            const toolCall = toolCalls.pop();
+            if (toolCall) {
+              toolCalls.push({ ...toolCall, loading: false });
+              responseMessage.toolCalls = toolCalls;
+            }
+          }
+          break;
+        case 'message_start':
+          if (chunk.message.usage) {
+            input_tokens = Math.max(input_tokens, chunk.message.usage.input_tokens);
+            output_tokens = Math.max(output_tokens, chunk.message.usage.output_tokens);
+            cache_read_tokens = Math.max(cache_read_tokens, chunk.message.usage.cache_read_input_tokens ?? 0);
+            cache_write_tokens = Math.max(cache_write_tokens, chunk.message.usage.cache_creation_input_tokens ?? 0);
+          }
+          break;
+        case 'message_delta':
+          if (chunk.usage) {
+            output_tokens = Math.max(output_tokens, chunk.usage.output_tokens);
+          }
+          break;
       }
 
       response.write(`data: ${JSON.stringify(responseMessage)}\n\n`);
@@ -187,14 +223,16 @@ export async function parseAnthropicStream(
     response.end();
   }
 
-  return responseMessage;
+  const usage = calculateUsage({ model, input_tokens, output_tokens, cache_read_tokens, cache_write_tokens });
+
+  return { responseMessage, usage };
 }
 
 export function parseAnthropicResponse(
   result: Anthropic.Messages.Message,
   response: Response,
   model: AnthropicModel | BedrockAnthropicModel
-): AIMessagePrompt {
+): ParsedAIResponse {
   const responseMessage: AIMessagePrompt = {
     role: 'assistant',
     content: '',
@@ -233,5 +271,11 @@ export function parseAnthropicResponse(
 
   response.json(responseMessage);
 
-  return responseMessage;
+  const input_tokens = result.usage.input_tokens;
+  const output_tokens = result.usage.output_tokens;
+  const cache_read_tokens = result.usage.cache_read_input_tokens ?? 0;
+  const cache_write_tokens = result.usage.cache_creation_input_tokens ?? 0;
+  const usage = calculateUsage({ model, input_tokens, output_tokens, cache_read_tokens, cache_write_tokens });
+
+  return { responseMessage, usage };
 }