From 0209b06a3dfddc90312ec92ac52188169ebcaa90 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 13:46:56 -0500 Subject: [PATCH 01/20] add cache writes Signed-off-by: Aaron Choo --- api/v1alpha1/ai_gateway_route.go | 2 + api/v1alpha1/shared_types.go | 11 +++-- examples/token_ratelimit/token_ratelimit.yaml | 2 + internal/apischema/openai/openai.go | 8 ++++ internal/apischema/openai/openai_test.go | 38 +++++++++------- internal/controller/gateway.go | 2 + internal/extproc/processor_impl.go | 4 ++ internal/filterapi/filterconfig.go | 4 +- internal/metrics/metrics.go | 31 ++++++++++--- .../openinference/anthropic/messages.go | 6 ++- .../openinference/anthropic/messages_test.go | 1 + .../openinference/openai/response_attrs.go | 4 ++ .../openai/response_attrs_test.go | 1 + .../openinference/openai/responses_test.go | 43 +++++++++++++++++++ .../tracing/openinference/openinference.go | 5 +++ .../translator/openai_gcpanthropic_stream.go | 8 ++++ 16 files changed, 143 insertions(+), 27 deletions(-) diff --git a/api/v1alpha1/ai_gateway_route.go b/api/v1alpha1/ai_gateway_route.go index d77f7d1f24..b4dac38660 100644 --- a/api/v1alpha1/ai_gateway_route.go +++ b/api/v1alpha1/ai_gateway_route.go @@ -108,6 +108,8 @@ type AIGatewayRouteSpec struct { // type: TotalToken // - metadataKey: llm_cached_input_token // type: CachedInputToken + // - metadataKey: llm_cached_write_input_token + // type: CachedWriteInputToken // ``` // Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three // rate limit buckets for each unique x-user-id header value. One bucket is for the input token, diff --git a/api/v1alpha1/shared_types.go b/api/v1alpha1/shared_types.go index 2241beb051..7a34cf4128 100644 --- a/api/v1alpha1/shared_types.go +++ b/api/v1alpha1/shared_types.go @@ -100,9 +100,9 @@ type LLMRequestCost struct { MetadataKey string `json:"metadataKey"` // Type specifies the type of the request cost. 
The default is "OutputToken", // and it uses "output token" as the cost. The other types are "InputToken", "TotalToken", - // and "CEL". + // "CachedInputToken", "CachedWriteInputToken", and "CEL". // - // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;TotalToken;CEL + // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;CachedWriteInputToken;TotalToken;CEL Type LLMRequestCostType `json:"type"` // CEL is the CEL expression to calculate the cost of the request. // The CEL expression must return a signed or unsigned integer. If the @@ -113,7 +113,8 @@ type LLMRequestCost struct { // * model: the model name extracted from the request content. Type: string. // * backend: the backend name in the form of "name.namespace". Type: string. // * input_tokens: the number of input tokens. Type: unsigned integer. - // * cached_input_tokens: the number of cached input tokens. Type: unsigned integer. + // * cached_input_tokens: the number of cached read input tokens. Type: unsigned integer. + // * cached_write_input_tokens: the number of cached write input tokens. Type: unsigned integer. // * output_tokens: the number of output tokens. Type: unsigned integer. // * total_tokens: the total number of tokens. Type: unsigned integer. // @@ -121,7 +122,7 @@ type LLMRequestCost struct { // // * "model == 'llama' ? input_tokens + output_token * 0.5 : total_tokens" // * "backend == 'foo.default' ? input_tokens + output_tokens : total_tokens" - // * "backend == 'bar.default' ? (input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + output_tokens : total_tokens" + // * "backend == 'bar.default' ? 
(input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cached_write_input_tokens * 1.25 + output_tokens : total_tokens" // * "input_tokens + output_tokens + total_tokens" // * "input_tokens * output_tokens" // @@ -137,6 +138,8 @@ const ( LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" // LLMRequestCostTypeCachedInputToken is the cost type of the cached input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" + // LLMRequestCostTypeCachedWriteInputToken is the cost type of the cached write input token. + LLMRequestCostTypeCachedWriteInputToken LLMRequestCostType = "CachedWriteInputToken" // LLMRequestCostTypeOutputToken is the cost type of the output token. LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken" // LLMRequestCostTypeTotalToken is the cost type of the total token. diff --git a/examples/token_ratelimit/token_ratelimit.yaml b/examples/token_ratelimit/token_ratelimit.yaml index 8c803f21d2..2224ed724a 100644 --- a/examples/token_ratelimit/token_ratelimit.yaml +++ b/examples/token_ratelimit/token_ratelimit.yaml @@ -51,6 +51,8 @@ spec: type: InputToken - metadataKey: llm_cached_input_token type: CachedInputToken + - metadataKey: llm_cached_write_input_token + type: CachedWriteInputToken - metadataKey: llm_output_token type: OutputToken - metadataKey: llm_total_token diff --git a/internal/apischema/openai/openai.go b/internal/apischema/openai/openai.go index bfe6584b29..0c74a3e244 100644 --- a/internal/apischema/openai/openai.go +++ b/internal/apischema/openai/openai.go @@ -1382,6 +1382,8 @@ type PromptTokensDetails struct { AudioTokens int `json:"audio_tokens,omitzero"` // Cached tokens present in the prompt. CachedTokens int `json:"cached_tokens,omitzero"` + // Tokens written to the cache. 
+ CachedWriteTokens int `json:"cached_write_tokens,omitzero"` } // ChatCompletionResponseChunk is described in the OpenAI API documentation: @@ -2535,6 +2537,9 @@ type ResponseUsageInputTokensDetails struct { // The number of tokens that were retrieved from the cache. // [More on prompt caching](https://platform.openai.com/docs/guides/prompt-caching). CachedTokens int64 `json:"cached_tokens"` + + // The number of tokens that were written to the cache. + CachedWriteTokens int64 `json:"cached_write_tokens"` } // A detailed breakdown of the output tokens. @@ -2548,6 +2553,9 @@ type ResponseTokensDetails struct { // CachedTokens: Number of cached tokens. CachedTokens int `json:"cached_tokens,omitempty"` //nolint:tagliatelle //follow openai api + // CachedWriteTokens: number of tokens that were written to the cache. + CachedWriteTokens int64 `json:"cached_write_tokens"` //nolint:tagliatelle + // ReasoningTokens: Number of reasoning tokens (for reasoning models). ReasoningTokens int `json:"reasoning_tokens,omitempty"` //nolint:tagliatelle //follow openai api diff --git a/internal/apischema/openai/openai_test.go b/internal/apischema/openai/openai_test.go index 974efe11ee..d9df99e6a4 100644 --- a/internal/apischema/openai/openai_test.go +++ b/internal/apischema/openai/openai_test.go @@ -1742,26 +1742,30 @@ func TestPromptTokensDetails(t *testing.T) { { name: "with text tokens", details: PromptTokensDetails{ - TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CachedWriteTokens: 10, }, expected: `{ "text_tokens": 15, "audio_tokens": 8, - "cached_tokens": 384 + "cached_tokens": 384, + "cached_write_tokens": 10 }`, }, { name: "with zero text tokens omitted", details: PromptTokensDetails{ - TextTokens: 0, - AudioTokens: 8, - CachedTokens: 384, + TextTokens: 0, + AudioTokens: 8, + CachedTokens: 384, + CachedWriteTokens: 10, }, expected: `{ "audio_tokens": 8, - "cached_tokens": 384 + "cached_tokens": 384, + 
"cached_write_tokens": 10 }`, }, } @@ -1818,8 +1822,9 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - AudioTokens: 8, - CachedTokens: 384, + AudioTokens: 8, + CachedTokens: 384, + CachedWriteTokens: 13, }, }, expected: `{ @@ -1832,7 +1837,8 @@ func TestChatCompletionResponseUsage(t *testing.T) { }, "prompt_tokens_details": { "audio_tokens": 8, - "cached_tokens": 384 + "cached_tokens": 384, + "cached_write_tokens": 13 } }`, }, @@ -1850,9 +1856,10 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CachedWriteTokens: 21, }, }, expected: `{ @@ -1867,7 +1874,8 @@ func TestChatCompletionResponseUsage(t *testing.T) { "prompt_tokens_details": { "text_tokens": 15, "audio_tokens": 8, - "cached_tokens": 384 + "cached_tokens": 384, + "cached_write_tokens": 21 } }`, }, diff --git a/internal/controller/gateway.go b/internal/controller/gateway.go index 2ae6286f6d..db952afb21 100644 --- a/internal/controller/gateway.go +++ b/internal/controller/gateway.go @@ -404,6 +404,8 @@ func (c *GatewayController) reconcileFilterConfigSecret( fc.Type = filterapi.LLMRequestCostTypeInputToken case aigv1a1.LLMRequestCostTypeCachedInputToken: fc.Type = filterapi.LLMRequestCostTypeCachedInputToken + case aigv1a1.LLMRequestCostTypeCachedWriteInputToken: + fc.Type = filterapi.LLMRequestCostTypeCachedWriteInputToken case aigv1a1.LLMRequestCostTypeOutputToken: fc.Type = filterapi.LLMRequestCostTypeOutputToken case aigv1a1.LLMRequestCostTypeTotalToken: diff --git a/internal/extproc/processor_impl.go b/internal/extproc/processor_impl.go index 0c9e859934..887fa89e0c 100644 --- a/internal/extproc/processor_impl.go +++ b/internal/extproc/processor_impl.go @@ -533,6 +533,8 @@ func buildDynamicMetadata(config 
*filterapi.RuntimeConfig, costs *metrics.TokenU cost, _ = costs.InputTokens() case filterapi.LLMRequestCostTypeCachedInputToken: cost, _ = costs.CachedInputTokens() + case filterapi.LLMRequestCostTypeCachedWriteInputToken: + cost, _ = costs.CachedWriteInputTokens() case filterapi.LLMRequestCostTypeOutputToken: cost, _ = costs.OutputTokens() case filterapi.LLMRequestCostTypeTotalToken: @@ -540,6 +542,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU case filterapi.LLMRequestCostTypeCEL: in, _ := costs.InputTokens() cachedIn, _ := costs.CachedInputTokens() + cachedWrite, _ := costs.CachedInputWriteTokens() out, _ := costs.OutputTokens() total, _ := costs.TotalTokens() costU64, err := llmcostcel.EvaluateProgram( @@ -548,6 +551,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU backendName, in, cachedIn, + cachedWrite, out, total, ) diff --git a/internal/filterapi/filterconfig.go b/internal/filterapi/filterconfig.go index 226cf95446..f191d6eb01 100644 --- a/internal/filterapi/filterconfig.go +++ b/internal/filterapi/filterconfig.go @@ -79,8 +79,10 @@ const ( LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken" // LLMRequestCostTypeInputToken specifies that the request cost is calculated from the input token. LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" - // LLMRequestCostTypeCachedInputToken specifies that the request cost is calculated from the cached input token. + // LLMRequestCostTypeCachedInputToken specifies that the request cost is calculated from the cached read input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" + // LLMRequestCostTypeCachedWriteInputToken specifies that the request cost is calculated from the cached write input token. + LLMRequestCostTypeCachedWriteInputToken LLMRequestCostType = "CachedWriteInputToken" // LLMRequestCostTypeTotalToken specifies that the request cost is calculated from the total token. 
LLMRequestCostTypeTotalToken LLMRequestCostType = "TotalToken" // LLMRequestCostTypeCEL specifies that the request cost is calculated from the CEL expression. diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 4a4fab54db..6c473863bd 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -149,8 +149,10 @@ type TokenUsage struct { totalTokens uint32 // CachedInputTokens is the total number of tokens read from cache. cachedInputTokens uint32 + // CachedWriteInputTokens is the total number of tokens written to cache. + cachedWriteInputTokens uint32 - inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet bool + inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet, cachedWriteInputTokenSet bool } // InputTokens returns the number of input tokens and whether it was set. @@ -173,6 +175,11 @@ func (u *TokenUsage) CachedInputTokens() (uint32, bool) { return u.cachedInputTokens, u.cachedInputTokenSet } +// CachedWriteInputTokens returns the number of cached write input tokens and whether it was set. +func (u *TokenUsage) CachedWriteInputTokens() (uint32, bool) { + return u.cachedWriteInputTokens, u.cachedWriteInputTokenSet +} + // SetInputTokens sets the number of input tokens and marks the field as set. func (u *TokenUsage) SetInputTokens(tokens uint32) { u.inputTokens = tokens @@ -197,6 +204,12 @@ func (u *TokenUsage) SetCachedInputTokens(tokens uint32) { u.cachedInputTokenSet = true } +// SetCachedWriteInputTokens sets the number of cached write input tokens and marks the field as set. +func (u *TokenUsage) SetCachedWriteInputTokens(tokens uint32) { + u.cachedWriteInputTokens = tokens + u.cachedWriteInputTokenSet = true +} + // AddInputTokens increments the recorded input tokens and marks the field as set. 
func (u *TokenUsage) AddInputTokens(tokens uint32) { u.inputTokenSet = true @@ -215,6 +228,12 @@ func (u *TokenUsage) AddCachedInputTokens(tokens uint32) { u.cachedInputTokens += tokens } +// AddCachedWriteInputTokens increments the recorded cached write input tokens and marks the field as set. +func (u *TokenUsage) AddCachedWriteInputTokens(tokens uint32) { + u.cachedWriteInputTokenSet = true + u.cachedWriteInputTokens += tokens +} + // Override updates the TokenUsage fields with values from another TokenUsage instance. // Only fields that are marked as set in the other instance will override the current values. func (u *TokenUsage) Override(other TokenUsage) { @@ -234,6 +253,10 @@ func (u *TokenUsage) Override(other TokenUsage) { u.cachedInputTokens = other.cachedInputTokens u.cachedInputTokenSet = true } + if other.cachedWriteInputTokenSet { + u.cachedWriteInputTokens = other.cachedWriteInputTokens + u.cachedWriteInputTokenSet = true + } } // ExtractTokenUsageFromAnthropic extracts the correct token usage from Anthropic API response. 
@@ -246,13 +269,11 @@ func ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, // Calculate total input tokens as per Anthropic API documentation totalInputTokens := inputTokens + cacheCreationTokens + cacheReadTokens - // Cache tokens include both read and creation tokens - totalCachedTokens := cacheReadTokens + cacheCreationTokens - var usage TokenUsage usage.SetInputTokens(uint32(totalInputTokens)) //nolint:gosec usage.SetOutputTokens(uint32(outputTokens)) //nolint:gosec usage.SetTotalTokens(uint32(totalInputTokens + outputTokens)) //nolint:gosec - usage.SetCachedInputTokens(uint32(totalCachedTokens)) //nolint:gosec + usage.SetCachedInputTokens(uint32(cacheReadTokens)) //nolint:gosec + usage.SetCachedWriteInputTokens(uint32(cacheCreationTokens)) //nolint:gosec return usage } diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index 4b6fe1f771..340e37c459 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -214,13 +214,15 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer int64(u.CacheCreationInputTokens), ) input, _ := cost.InputTokens() - cache, _ := cost.CachedInputTokens() + cacheRead, _ := cost.CachedInputTokens() + cacheCreation, _ := cost.CachedIWritenputTokens() output, _ := cost.OutputTokens() total, _ := cost.TotalTokens() attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPrompt, int(input)), - attribute.Int(openinference.LLMTokenCountPromptCacheHit, int(cache)), + attribute.Int(openinference.LLMTokenCountPromptCacheHit, int(cacheRead)), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(cacheCreation)), attribute.Int(openinference.LLMTokenCountCompletion, int(output)), attribute.Int(openinference.LLMTokenCountTotal, int(total)), ) diff --git a/internal/tracing/openinference/anthropic/messages_test.go 
b/internal/tracing/openinference/anthropic/messages_test.go index 7f6384a624..6a6cf3bbff 100644 --- a/internal/tracing/openinference/anthropic/messages_test.go +++ b/internal/tracing/openinference/anthropic/messages_test.go @@ -326,6 +326,7 @@ func TestMessageRecorder_RecordResponse(t *testing.T) { attribute.String(openinference.OutputMessageToolCallAttribute(1, 0, openinference.ToolCallFunctionArguments), `{"timezone":"UTC"}`), attribute.Int(openinference.LLMTokenCountPrompt, 10), attribute.Int(openinference.LLMTokenCountPromptCacheHit, 0), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, 0), attribute.Int(openinference.LLMTokenCountCompletion, 5), attribute.Int(openinference.LLMTokenCountTotal, 15), }, diff --git a/internal/tracing/openinference/openai/response_attrs.go b/internal/tracing/openinference/openai/response_attrs.go index a771014b0b..178c22f0da 100644 --- a/internal/tracing/openinference/openai/response_attrs.go +++ b/internal/tracing/openinference/openai/response_attrs.go @@ -58,6 +58,7 @@ func buildResponseAttributes(resp *openai.ChatCompletionResponse, config *openin attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptAudio, td.AudioTokens), attribute.Int(openinference.LLMTokenCountPromptCacheHit, td.CachedTokens), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, td.CachedWriteTokens), ) } } @@ -193,6 +194,9 @@ func buildResponsesResponseAttributes(resp *openai.Response, _ *openinference.Tr if resp.Usage.InputTokensDetails.CachedTokens > 0 { attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheHit, int(resp.Usage.InputTokensDetails.CachedTokens))) } + if resp.Usage.InputTokensDetails.CachedWriteTokens > 0 { + attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(resp.Usage.InputTokensDetails.CachedWriteTokens))) + } } return attrs diff --git a/internal/tracing/openinference/openai/response_attrs_test.go 
b/internal/tracing/openinference/openai/response_attrs_test.go index c5de581f30..f147b891fe 100644 --- a/internal/tracing/openinference/openai/response_attrs_test.go +++ b/internal/tracing/openinference/openai/response_attrs_test.go @@ -182,6 +182,7 @@ func TestBuildResponseAttributes(t *testing.T) { attribute.Int(openinference.LLMTokenCountPrompt, 9), attribute.Int(openinference.LLMTokenCountPromptAudio, 0), attribute.Int(openinference.LLMTokenCountPromptCacheHit, 0), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, 0), attribute.Int(openinference.LLMTokenCountCompletion, 9), attribute.Int(openinference.LLMTokenCountCompletionAudio, 0), attribute.Int(openinference.LLMTokenCountCompletionReasoning, 0), diff --git a/internal/tracing/openinference/openai/responses_test.go b/internal/tracing/openinference/openai/responses_test.go index d623b124f7..76fbdc9eb5 100644 --- a/internal/tracing/openinference/openai/responses_test.go +++ b/internal/tracing/openinference/openai/responses_test.go @@ -58,6 +58,34 @@ var ( } basicResponseRespBody = mustJSON(basicResponseResp) + responseWithCacheWrite = &openai.Response{ + ID: "resp-456", + Model: openai.ModelGPT5Nano, + Output: []responses.ResponseOutputItemUnion{ + { + ID: "msg_02", + Type: "message", + Role: "assistant", + Content: []responses.ResponseOutputMessageContentUnion{ + { + Type: "output_text", + Text: "This response includes cache write tokens.", + }, + }, + }, + }, + Usage: &openai.ResponseUsage{ + InputTokens: 100, + InputTokensDetails: openai.ResponseUsageInputTokensDetails{ + CachedTokens: 10, + CachedWriteTokens: 50, + }, + OutputTokens: 25, + TotalTokens: 125, + }, + } + responseWithCacheWriteBody = mustJSON(responseWithCacheWrite) + responseReqWithStreaming = &openai.ResponseRequest{ Model: openai.ModelGPT5Nano, Input: responses.ResponseNewParamsInputUnion{ @@ -173,6 +201,21 @@ func TestResponsesRecorder_RecordResponse(t *testing.T) { }, expectedStatus: trace.Status{Code: codes.Ok, Description: 
""}, }, + { + name: "response with cache write", + resp: responseWithCacheWrite, + config: &openinference.TraceConfig{}, + expectedAttrs: []attribute.KeyValue{ + attribute.String(openinference.LLMModelName, openai.ModelGPT5Nano), + attribute.Int(openinference.LLMTokenCountPrompt, 100), + attribute.Int(openinference.LLMTokenCountCompletion, 25), + attribute.Int(openinference.LLMTokenCountTotal, 125), + attribute.Int(openinference.LLMTokenCountPromptCacheHit, 10), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, 50), + attribute.String(openinference.OutputValue, string(responseWithCacheWriteBody)), + }, + expectedStatus: trace.Status{Code: codes.Ok, Description: ""}, + }, } for _, tt := range tests { diff --git a/internal/tracing/openinference/openinference.go b/internal/tracing/openinference/openinference.go index 7913b69f57..aa50bca94c 100644 --- a/internal/tracing/openinference/openinference.go +++ b/internal/tracing/openinference/openinference.go @@ -160,6 +160,11 @@ const ( // and cost savings from cached prompts. LLMTokenCountPromptCacheHit = "llm.token_count.prompt_details.cache_read" // #nosec G101 + // LLMTokenCountPromptCacheWrite represents the number of prompt tokens + // written to cache (cache writes). This enables tracking of cache efficiency + // and cost savings from cached prompts. + LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cache_write" // #nosec G101 + // LLMTokenCountPromptAudio represents the number of audio tokens in the prompt. // Used for multimodal models that support audio input. 
LLMTokenCountPromptAudio = "llm.token_count.prompt_details.audio" // #nosec G101 diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 60f4a60b05..17b6f11f75 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -211,6 +211,9 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat if cached, ok := usage.CachedInputTokens(); ok { p.tokenUsage.SetCachedInputTokens(cached) } + if cachedWrite, ok := usage.CachedWriteInputTokens(); ok { + p.tokenUsage.SetCachedWriteInputTokens(cachedWrite) + } // reset the toolIndex for each message p.toolIndex = -1 @@ -292,6 +295,11 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat // Accumulate any additional cache tokens from delta p.tokenUsage.AddCachedInputTokens(cached) } + if cached, ok := usage.CachedWriteInputTokens(); ok { + p.tokenUsage.AddInputTokens(cached) + // Accumulate any additional cache tokens from delta + p.tokenUsage.AddCachedWriteInputTokens(cached) + } if event.Delta.StopReason != "" { p.stopReason = event.Delta.StopReason } From 985d4a90bd7424ee0379405fc60b3c88fc421b85 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 14:59:50 -0500 Subject: [PATCH 02/20] found more cache needing update Signed-off-by: Aaron Choo --- internal/controller/gateway_test.go | 8 +- internal/extproc/mocks_test.go | 27 +- internal/extproc/processor_impl.go | 2 +- internal/extproc/processor_impl_test.go | 7 + internal/llmcostcel/cel.go | 29 ++- internal/metrics/genai.go | 5 +- internal/metrics/metrics_impl.go | 6 + internal/metrics/metrics_impl_test.go | 27 +- .../openinference/anthropic/messages.go | 2 +- internal/translator/anthropic_anthropic.go | 3 + .../translator/anthropic_anthropic_test.go | 6 +- .../translator/anthropic_gcpanthropic_test.go | 31 ++- internal/translator/anthropic_usage_test.go | 241 ++++++++++-------- 
internal/translator/cohere_rerank_v2_test.go | 2 +- .../imagegeneration_openai_openai_test.go | 4 +- internal/translator/openai_awsbedrock.go | 14 +- internal/translator/openai_awsbedrock_test.go | 6 +- .../openai_azureopenai_embeddings_test.go | 6 +- .../translator/openai_azureopenai_test.go | 4 +- internal/translator/openai_completions.go | 3 +- .../translator/openai_completions_test.go | 12 +- internal/translator/openai_embeddings_test.go | 6 +- internal/translator/openai_gcpanthropic.go | 4 +- .../translator/openai_gcpanthropic_stream.go | 7 +- .../translator/openai_gcpanthropic_test.go | 10 +- internal/translator/openai_gcpvertexai.go | 2 + .../translator/openai_gcpvertexai_test.go | 18 +- internal/translator/openai_openai.go | 3 +- internal/translator/openai_openai_test.go | 28 +- internal/translator/openai_responses.go | 11 +- internal/translator/openai_responses_test.go | 22 ++ .../testdata/aigatewayroutes/llmcosts.yaml | 2 + 32 files changed, 338 insertions(+), 220 deletions(-) diff --git a/internal/controller/gateway_test.go b/internal/controller/gateway_test.go index 3a45512524..6c3be8df2f 100644 --- a/internal/controller/gateway_test.go +++ b/internal/controller/gateway_test.go @@ -197,6 +197,7 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { {MetadataKey: "bar", Type: aigv1a1.LLMRequestCostTypeOutputToken}, {MetadataKey: "baz", Type: aigv1a1.LLMRequestCostTypeTotalToken}, {MetadataKey: "qux", Type: aigv1a1.LLMRequestCostTypeCachedInputToken}, + {MetadataKey: "zoo", Type: aigv1a1.LLMRequestCostTypeCachedWriteInputToken}, }, }, }, @@ -274,13 +275,14 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { var fc filterapi.Config require.NoError(t, yaml.Unmarshal([]byte(configStr), &fc)) require.Equal(t, "dev", fc.Version) - require.Len(t, fc.LLMRequestCosts, 5) + require.Len(t, fc.LLMRequestCosts, 6) require.Equal(t, filterapi.LLMRequestCostTypeInputToken, fc.LLMRequestCosts[0].Type) require.Equal(t, 
filterapi.LLMRequestCostTypeOutputToken, fc.LLMRequestCosts[1].Type) require.Equal(t, filterapi.LLMRequestCostTypeTotalToken, fc.LLMRequestCosts[2].Type) require.Equal(t, filterapi.LLMRequestCostTypeCachedInputToken, fc.LLMRequestCosts[3].Type) - require.Equal(t, filterapi.LLMRequestCostTypeCEL, fc.LLMRequestCosts[4].Type) - require.Equal(t, `backend == 'foo.default' ? input_tokens + output_tokens : total_tokens`, fc.LLMRequestCosts[4].CEL) + require.Equal(t, filterapi.LLMRequestCostTypeCachedWriteInputToken, fc.LLMRequestCosts[4].Type) + require.Equal(t, filterapi.LLMRequestCostTypeCEL, fc.LLMRequestCosts[5].Type) + require.Equal(t, `backend == 'foo.default' ? input_tokens + output_tokens : total_tokens`, fc.LLMRequestCosts[5].CEL) require.Len(t, fc.Models, 1) require.Equal(t, "mymodel", fc.Models[0].Name) diff --git a/internal/extproc/mocks_test.go b/internal/extproc/mocks_test.go index 0536762878..a2997f14f9 100644 --- a/internal/extproc/mocks_test.go +++ b/internal/extproc/mocks_test.go @@ -171,16 +171,17 @@ func (m *mockMetricsFactory) NewMetrics() metrics.Metrics { // mockMetrics implements [metrics.Metrics] for testing. type mockMetrics struct { - requestStart time.Time - originalModel string - requestModel string - responseModel string - backend string - requestSuccessCount int - requestErrorCount int - inputTokenCount int - cachedInputTokenCount int - outputTokenCount int + requestStart time.Time + originalModel string + requestModel string + responseModel string + backend string + requestSuccessCount int + requestErrorCount int + inputTokenCount int + cachedInputTokenCount int + cachedWriteInputTokenCount int + outputTokenCount int // streamingOutputTokens tracks the cumulative output tokens recorded via RecordTokenLatency. 
streamingOutputTokens int timeToFirstToken float64 @@ -218,6 +219,9 @@ func (m *mockMetrics) RecordTokenUsage(_ context.Context, usage metrics.TokenUsa if cachedInput, ok := usage.CachedInputTokens(); ok { m.cachedInputTokenCount += int(cachedInput) } + if cachedWriteInput, ok := usage.CachedWriteInputTokens(); ok { + m.cachedWriteInputTokenCount += int(cachedWriteInput) + } if output, ok := usage.OutputTokens(); ok { m.outputTokenCount += int(output) } @@ -278,9 +282,10 @@ func (m *mockMetrics) RequireRequestFailure(t *testing.T) { require.Equal(t, 1, m.requestErrorCount) } -func (m *mockMetrics) RequireTokensRecorded(t *testing.T, expectedInput, expectedCachedInput, expectedOutput int) { +func (m *mockMetrics) RequireTokensRecorded(t *testing.T, expectedInput, expectedCachedInput, expectedWriteCachedInput, expectedOutput int) { require.Equal(t, expectedInput, m.inputTokenCount) require.Equal(t, expectedCachedInput, m.cachedInputTokenCount) + require.Equal(t, expectedWriteCachedInput, m.cachedWriteInputTokenCount) require.Equal(t, expectedOutput, m.outputTokenCount) } diff --git a/internal/extproc/processor_impl.go b/internal/extproc/processor_impl.go index 887fa89e0c..93c35aa7cc 100644 --- a/internal/extproc/processor_impl.go +++ b/internal/extproc/processor_impl.go @@ -542,7 +542,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU case filterapi.LLMRequestCostTypeCEL: in, _ := costs.InputTokens() cachedIn, _ := costs.CachedInputTokens() - cachedWrite, _ := costs.CachedInputWriteTokens() + cachedWrite, _ := costs.CachedWriteInputTokens() out, _ := costs.OutputTokens() total, _ := costs.TotalTokens() costU64, err := llmcostcel.EvaluateProgram( diff --git a/internal/extproc/processor_impl_test.go b/internal/extproc/processor_impl_test.go index e8006f0632..a590a38d24 100644 --- a/internal/extproc/processor_impl_test.go +++ b/internal/extproc/processor_impl_test.go @@ -259,6 +259,7 @@ func 
Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.retUsedToken.SetOutputTokens(123) mt.retUsedToken.SetInputTokens(1) mt.retUsedToken.SetCachedInputTokens(1) + mt.retUsedToken.SetCachedWriteInputTokens(3) celProgInt, err := llmcostcel.NewProgram("54321") require.NoError(t, err) @@ -274,6 +275,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeOutputToken, MetadataKey: "output_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeInputToken, MetadataKey: "input_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedInputToken, MetadataKey: "cached_input_token_usage"}}, + {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedWriteInputToken, MetadataKey: "cached_write_input_token_usage"}}, { CELProg: celProgInt, LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCEL, MetadataKey: "cel_int"}, @@ -309,6 +311,8 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T GetStructValue().Fields["input_token_usage"].GetNumberValue()) require.Equal(t, float64(1), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cached_input_token_usage"].GetNumberValue()) + require.Equal(t, float64(3), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. + GetStructValue().Fields["cached_write_input_token_usage"].GetNumberValue()) require.Equal(t, float64(54321), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cel_int"].GetNumberValue()) require.Equal(t, float64(9999), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. 
@@ -371,6 +375,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.expResponseBody = final mt.retUsedToken.SetInputTokens(5) mt.retUsedToken.SetCachedInputTokens(3) + mt.retUsedToken.SetCachedWriteInputTokens(21) mt.retUsedToken.SetOutputTokens(138) mt.retUsedToken.SetTotalTokens(143) _, err = p.ProcessResponseBody(t.Context(), final) @@ -379,6 +384,8 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, 5, mm.inputTokenCount) require.Equal(t, 138, mm.outputTokenCount) require.Equal(t, 138, mm.streamingOutputTokens) // accumulated output tokens from stream + require.Equal(t, 3, mm.cachedInputTokenCount) + require.Equal(t, 21, mm.cachedWriteInputTokenCount) }) } diff --git a/internal/llmcostcel/cel.go b/internal/llmcostcel/cel.go index 071b9b13ab..0c2f19f913 100644 --- a/internal/llmcostcel/cel.go +++ b/internal/llmcostcel/cel.go @@ -16,12 +16,13 @@ import ( ) const ( - celModelNameKey = "model" - celBackendKey = "backend" - celInputTokensKey = "input_tokens" - celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 - celOutputTokensKey = "output_tokens" - celTotalTokensKey = "total_tokens" + celModelNameKey = "model" + celBackendKey = "backend" + celInputTokensKey = "input_tokens" + celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 + celCachedWriteInputTokensKey = "cached_write_input_tokens" // #nosec G101 + celOutputTokensKey = "output_tokens" + celTotalTokensKey = "total_tokens" ) var env *cel.Env @@ -33,6 +34,7 @@ func init() { cel.Variable(celBackendKey, cel.StringType), cel.Variable(celInputTokensKey, cel.UintType), cel.Variable(celCachedInputTokensKey, cel.UintType), + cel.Variable(celCachedWriteInputTokensKey, cel.UintType), cel.Variable(celOutputTokensKey, cel.UintType), cel.Variable(celTotalTokensKey, cel.UintType), ) @@ -62,14 +64,15 @@ func NewProgram(expr string) (prog cel.Program, err error) { } // EvaluateProgram evaluates the given CEL program with 
the given variables. -func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, outputTokens, totalTokens uint32) (uint64, error) { +func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, cachedWriteInputTokens, outputTokens, totalTokens uint32) (uint64, error) { out, _, err := prog.Eval(map[string]any{ - celModelNameKey: modelName, - celBackendKey: backend, - celInputTokensKey: inputTokens, - celCachedInputTokensKey: cachedInputTokens, - celOutputTokensKey: outputTokens, - celTotalTokensKey: totalTokens, + celModelNameKey: modelName, + celBackendKey: backend, + celInputTokensKey: inputTokens, + celCachedInputTokensKey: cachedInputTokens, + celCachedWriteInputTokensKey: cachedWriteInputTokens, + celOutputTokensKey: outputTokens, + celTotalTokensKey: totalTokens, }) if err != nil || out == nil { return 0, fmt.Errorf("failed to evaluate CEL expression: %w", err) diff --git a/internal/metrics/genai.go b/internal/metrics/genai.go index 1aaee2dd97..3c4a3dc62f 100644 --- a/internal/metrics/genai.go +++ b/internal/metrics/genai.go @@ -39,8 +39,9 @@ const ( // https://github.com/open-telemetry/semantic-conventions/issues/1959 // // However, the spec says "a custom value MAY be used.", so we can use it now. - genaiTokenTypeCachedInput = "cached_input" - genaiErrorTypeFallback = "_OTHER" + genaiTokenTypeCachedInput = "cached_input" + genaiTokenTypeCachedWriteInput = "cached_write_input" + genaiErrorTypeFallback = "_OTHER" ) // GenAIOperation represents the type of generative AI operation i.e. the endpoint being called. 
diff --git a/internal/metrics/metrics_impl.go b/internal/metrics/metrics_impl.go index c946c541db..e4a85e4d23 100644 --- a/internal/metrics/metrics_impl.go +++ b/internal/metrics/metrics_impl.go @@ -148,6 +148,12 @@ func (b *metricsImpl) RecordTokenUsage(ctx context.Context, usage TokenUsage, re metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput)), ) } + if cachedWriteInputTokens, ok := usage.CachedWriteInputTokens(); ok { + b.metrics.tokenUsage.Record(ctx, float64(cachedWriteInputTokens), + metric.WithAttributeSet(attrs), + metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput)), + ) + } if outputTokens, ok := usage.OutputTokens(); ok { b.metrics.tokenUsage.Record(ctx, float64(outputTokens), metric.WithAttributeSet(attrs), diff --git a/internal/metrics/metrics_impl_test.go b/internal/metrics/metrics_impl_test.go index 7f697c7218..8bf0ff2cce 100644 --- a/internal/metrics/metrics_impl_test.go +++ b/internal/metrics/metrics_impl_test.go @@ -71,9 +71,10 @@ func TestRecordTokenUsage(t *testing.T) { attribute.Key(genaiAttributeResponseModel).String("test-model"), } // gen_ai.token.type values - https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-metrics/#common-attributes - inputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...) - outputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...) - cachedInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...) + inputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...) + outputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...) 
+ cachedInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...) + cachedWriteInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput))...) ) pm.SetOriginalModel("test-model") @@ -81,7 +82,7 @@ pm.SetResponseModel("test-model") pm.SetBackend(&filterapi.Backend{Schema: filterapi.VersionedAPISchema{Name: filterapi.APISchemaOpenAI}}) pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 10, cachedInputTokens: 8, outputTokens: 5, - inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, + inputTokens: 10, cachedInputTokens: 8, cachedWriteInputTokens: 2, outputTokens: 5, + inputTokenSet: true, cachedInputTokenSet: true, cachedWriteInputTokenSet: true, outputTokenSet: true, }, nil) @@ -93,6 +94,10 @@ assert.Equal(t, uint64(1), count) assert.Equal(t, 8.0, sum) + count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedWriteInputAttrs) + assert.Equal(t, uint64(1), count) + assert.Equal(t, 2.0, sum) + count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, outputAttrs) assert.Equal(t, uint64(1), count) assert.Equal(t, 5.0, sum) @@ -295,7 +300,7 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { pm.SetRequestModel("req-model") pm.SetResponseModel("res-model") pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 2, cachedInputTokens: 1, outputTokens: 3, - inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, + inputTokens: 2, cachedInputTokens: 1, cachedWriteInputTokens: 6, outputTokens: 3, + inputTokenSet: true, cachedInputTokenSet: true, cachedWriteInputTokenSet: true, outputTokenSet: true, }, nil) @@ -323,6 +328,18 @@ assert.Equal(t, uint64(1), count) assert.Equal(t, 1.0, sum) + cachedWriteInputAttrs := attribute.NewSet( + attribute.Key(genaiAttributeOperationName).String(string(GenAIOperationCompletion)), + attribute.Key(genaiAttributeProviderName).String(genaiProviderOpenAI), +
attribute.Key(genaiAttributeOriginalModel).String("orig-model"), + attribute.Key(genaiAttributeRequestModel).String("req-model"), + attribute.Key(genaiAttributeResponseModel).String("res-model"), + attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput), + ) + count, sum = getHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedWriteInputAttrs) + assert.Equal(t, uint64(1), count) + assert.Equal(t, 6.0, sum) + outputAttrs := attribute.NewSet( attribute.Key(genaiAttributeOperationName).String(string(GenAIOperationCompletion)), attribute.Key(genaiAttributeProviderName).String(genaiProviderOpenAI), diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index 340e37c459..1232609a37 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -215,7 +215,7 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer ) input, _ := cost.InputTokens() cacheRead, _ := cost.CachedInputTokens() - cacheCreation, _ := cost.CachedIWritenputTokens() + cacheCreation, _ := cost.CachedWriteInputTokens() output, _ := cost.OutputTokens() total, _ := cost.TotalTokens() diff --git a/internal/translator/anthropic_anthropic.go b/internal/translator/anthropic_anthropic.go index e1462a32f8..bbbde960cc 100644 --- a/internal/translator/anthropic_anthropic.go +++ b/internal/translator/anthropic_anthropic.go @@ -181,6 +181,9 @@ func (a *anthropicToAnthropicTranslator) updateTotalTokens() { if _, cachedSet := a.streamingTokenUsage.CachedInputTokens(); !cachedSet { a.streamingTokenUsage.SetCachedInputTokens(0) } + if _, cachedSet := a.streamingTokenUsage.CachedWriteInputTokens(); !cachedSet { + a.streamingTokenUsage.SetCachedWriteInputTokens(0) + } } if inputSet && outputSet { diff --git a/internal/translator/anthropic_anthropic_test.go b/internal/translator/anthropic_anthropic_test.go index da8f220e6d..20fc793ab5 
100644 --- a/internal/translator/anthropic_anthropic_test.go +++ b/internal/translator/anthropic_anthropic_test.go @@ -95,7 +95,7 @@ func TestAnthropicToAnthropic_ResponseBody_non_streaming(t *testing.T) { require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - expected := tokenUsageFrom(9, 0, 16, 25) + expected := tokenUsageFrom(9, 0, -1, 16, 25) require.Equal(t, expected, tokenUsage) require.Equal(t, "claude-sonnet-4-5-20250929", responseModel) } @@ -141,7 +141,7 @@ data: {"type":"message_stop" }` require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - expected := tokenUsageFrom(10, 1, 0, 10) + expected := tokenUsageFrom(10, 1, 0, 0, 10) require.Equal(t, expected, tokenUsage) require.Equal(t, "claude-sonnet-4-5-20250929", responseModel) @@ -149,7 +149,7 @@ data: {"type":"message_stop" }` require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - expected = tokenUsageFrom(10, 1, 16, 26) + expected = tokenUsageFrom(10, 1, 0, 16, 26) require.Equal(t, expected, tokenUsage) require.Equal(t, "claude-sonnet-4-5-20250929", responseModel) } diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index 7116fd0261..875265d94d 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -467,7 +467,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_ZeroTokenUsage(t *testin _, _, tokenUsage, _, err := translator.ResponseBody(respHeaders, bodyReader, true, nil) require.NoError(t, err) - expected := tokenUsageFrom(0, 0, 0, 0) + expected := tokenUsageFrom(0, 0, 0, 0, 0) assert.Equal(t, expected, tokenUsage) } @@ -482,31 +482,31 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingTokenUsage(t *t name: "regular streaming chunk without usage", chunk: "event: content_block_delta\ndata: 
{\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" to me.\"}}\n\n", endOfStream: false, - expectedUsage: tokenUsageFrom(-1, -1, -1, -1), + expectedUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "message_delta chunk with token usage", chunk: "event: message_delta\ndata: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"end_turn\",\"stop_sequence\":null},\"usage\":{\"output_tokens\":84}}\n\n", endOfStream: false, - expectedUsage: tokenUsageFrom(0, 0, 84, 84), + expectedUsage: tokenUsageFrom(0, 0, 0, 84, 84), }, { name: "message_stop chunk without usage", chunk: "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n", endOfStream: false, - expectedUsage: tokenUsageFrom(-1, -1, -1, -1), + expectedUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "invalid json chunk", chunk: "event: invalid\ndata: {\"invalid\": \"json\"}\n\n", endOfStream: false, - expectedUsage: tokenUsageFrom(-1, -1, -1, -1), + expectedUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "message_delta with decimal output_tokens", chunk: "event: message_delta\ndata: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"tool_use\"},\"usage\":{\"output_tokens\":42.0}}\n\n", endOfStream: false, - expectedUsage: tokenUsageFrom(0, 0, 42, 42), + expectedUsage: tokenUsageFrom(0, 0, 0, 42, 42), }, } @@ -545,12 +545,12 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingEdgeCases(t *te { name: "message_delta without usage field", chunk: "event: message_delta\ndata: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"end_turn\"}}\n\n", - expectedUsage: tokenUsageFrom(0, 0, 0, 0), + expectedUsage: tokenUsageFrom(0, 0, 0, 0, 0), }, { name: "invalid json in data", chunk: "event: message_start\ndata: {invalid json}\n\n", - expectedUsage: tokenUsageFrom(-1, -1, -1, -1), + expectedUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, } @@ -570,7 +570,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingEdgeCases(t *te } 
} -func tokenUsageFrom(in, cachedInput, out, total int32) metrics.TokenUsage { +func tokenUsageFrom(in, cachedInput, cachedWriteInput, out, total int32) metrics.TokenUsage { var usage metrics.TokenUsage if in >= 0 { usage.SetInputTokens(uint32(in)) @@ -578,6 +578,9 @@ func tokenUsageFrom(in, cachedInput, out, total int32) metrics.TokenUsage { if cachedInput >= 0 { usage.SetCachedInputTokens(uint32(cachedInput)) } + if cachedWriteInput >= 0 { + usage.SetCachedWriteInputTokens(uint32(cachedWriteInput)) + } if out >= 0 { usage.SetOutputTokens(uint32(out)) } @@ -608,7 +611,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingFullScenario(t // 3. message_delta at the end provides output_tokens=5 but no input_tokens // 4. message_stop ends the stream messageStartChunk := `event: message_start -data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "output_tokens": 0}}} +data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cache_write_input_tokens": 1, "output_tokens": 0}}} ` contentBlockStartChunk := `event: content_block_start data: {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}} @@ -635,6 +638,7 @@ data: {"type": "message_stop"} outputTokens, outputSet := tokenUsage.OutputTokens() totalTokens, totalSet := tokenUsage.TotalTokens() cachedTokens, cachedSet := tokenUsage.CachedInputTokens() + cachedWriteTokens, cachedWriteSet := tokenUsage.CachedWriteInputTokens() // Assertions assert.True(t, inputSet, "Input tokens should be set") @@ -649,6 +653,9 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in 
this scenario") + assert.True(t, cachedWriteSet, "Cached write tokens should be set") + assert.Equal(t, uint32(1), cachedWriteTokens, "No cached write tokens in this scenario") + _, _, tokenUsage, _, err = translator.ResponseBody(nil, strings.NewReader(contentBlockStartChunk), false, nil) require.NoError(t, err) _, _, tokenUsage, _, err = translator.ResponseBody(nil, strings.NewReader(contentBlockDeltaChunk), false, nil) @@ -665,6 +672,7 @@ data: {"type": "message_stop"} outputTokens, outputSet = tokenUsage.OutputTokens() totalTokens, totalSet = tokenUsage.TotalTokens() cachedTokens, cachedSet = tokenUsage.CachedInputTokens() + cachedWriteTokens, cachedWriteSet = tokenUsage.CachedWriteInputTokens() assert.True(t, inputSet, "Input tokens should be set") assert.Equal(t, uint32(20), inputTokens, "Input tokens should be preserved from message_start") @@ -677,4 +685,7 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") + + assert.True(t, cachedWriteSet, "Cached write tokens should be set") + assert.Equal(t, uint32(1), cachedWriteTokens, "No cached write tokens in this scenario") } diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 427eff0c5a..bc1a8a063c 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -16,81 +16,88 @@ import ( func TestExtractLLMTokenUsage(t *testing.T) { tests := []struct { - name string - inputTokens int64 - outputTokens int64 - cacheReadTokens int64 - cacheCreationTokens int64 - expectedInputTokens uint32 - expectedOutputTokens uint32 - expectedTotalTokens uint32 - expectedCachedTokens uint32 + name string + inputTokens int64 + outputTokens int64 + cacheReadTokens int64 + cacheCreationTokens int64 + expectedInputTokens uint32 + expectedOutputTokens uint32 + expectedTotalTokens uint32 + expectedCachedTokens uint32 + 
expectedCachedWriteTokens uint32 }{ { - name: "basic usage without cache", - inputTokens: 100, - outputTokens: 50, - cacheReadTokens: 0, - cacheCreationTokens: 0, - expectedInputTokens: 100, - expectedOutputTokens: 50, - expectedTotalTokens: 150, - expectedCachedTokens: 0, + name: "basic usage without cache", + inputTokens: 100, + outputTokens: 50, + cacheReadTokens: 0, + cacheCreationTokens: 0, + expectedInputTokens: 100, + expectedOutputTokens: 50, + expectedTotalTokens: 150, + expectedCachedTokens: 0, + expectedCachedWriteTokens: 0, }, { - name: "usage with cache read tokens", - inputTokens: 80, - outputTokens: 30, - cacheReadTokens: 20, - cacheCreationTokens: 0, - expectedInputTokens: 100, // 80 + 0 + 20 - expectedOutputTokens: 30, - expectedTotalTokens: 130, // 100 + 30 - expectedCachedTokens: 20, // 20 + 0 + name: "usage with cache read tokens", + inputTokens: 80, + outputTokens: 30, + cacheReadTokens: 20, + cacheCreationTokens: 0, + expectedInputTokens: 100, // 80 + 0 + 20 + expectedOutputTokens: 30, + expectedTotalTokens: 130, // 100 + 30 + expectedCachedTokens: 20, // 20 + expectedCachedWriteTokens: 0, }, { - name: "usage with cache creation tokens", - inputTokens: 60, - outputTokens: 40, - cacheReadTokens: 0, - cacheCreationTokens: 15, - expectedInputTokens: 75, // 60 + 15 + 0 - expectedOutputTokens: 40, - expectedTotalTokens: 115, // 75 + 40 - expectedCachedTokens: 15, // 0 + 15 + name: "usage with cache creation tokens", + inputTokens: 60, + outputTokens: 40, + cacheReadTokens: 0, + cacheCreationTokens: 15, + expectedInputTokens: 75, // 60 + 15 + 0 + expectedOutputTokens: 40, + expectedTotalTokens: 115, // 75 + 40 + expectedCachedTokens: 0, // 0 + expectedCachedWriteTokens: 15, // 15 }, { - name: "usage with both cache types", - inputTokens: 70, - outputTokens: 25, - cacheReadTokens: 10, - cacheCreationTokens: 5, - expectedInputTokens: 85, // 70 + 5 + 10 - expectedOutputTokens: 25, - expectedTotalTokens: 110, // 85 + 25 - expectedCachedTokens: 15, // 10 
+ 5 + name: "usage with both cache types", + inputTokens: 70, + outputTokens: 25, + cacheReadTokens: 10, + cacheCreationTokens: 5, + expectedInputTokens: 85, // 70 + 5 + 10 + expectedOutputTokens: 25, + expectedTotalTokens: 110, // 85 + 25 + expectedCachedTokens: 10, // 10 + expectedCachedWriteTokens: 5, // 5 }, { - name: "zero values", - inputTokens: 0, - outputTokens: 0, - cacheReadTokens: 0, - cacheCreationTokens: 0, - expectedInputTokens: 0, - expectedOutputTokens: 0, - expectedTotalTokens: 0, - expectedCachedTokens: 0, + name: "zero values", + inputTokens: 0, + outputTokens: 0, + cacheReadTokens: 0, + cacheCreationTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 0, + expectedTotalTokens: 0, + expectedCachedTokens: 0, + expectedCachedWriteTokens: 0, }, { - name: "large values", - inputTokens: 100000, - outputTokens: 50000, - cacheReadTokens: 25000, - cacheCreationTokens: 15000, - expectedInputTokens: 140000, // 100000 + 15000 + 25000 - expectedOutputTokens: 50000, - expectedTotalTokens: 190000, // 140000 + 50000 - expectedCachedTokens: 40000, // 25000 + 15000 + name: "large values", + inputTokens: 100000, + outputTokens: 50000, + cacheReadTokens: 25000, + cacheCreationTokens: 15000, + expectedInputTokens: 140000, // 100000 + 15000 + 25000 + expectedOutputTokens: 50000, + expectedTotalTokens: 190000, // 140000 + 50000 + expectedCachedTokens: 25000, // 25000 + expectedCachedWriteTokens: 15000, }, } @@ -104,12 +111,12 @@ func TestExtractLLMTokenUsage(t *testing.T) { ) expected := tokenUsageFrom( - int32(tt.expectedInputTokens), // nolint:gosec - -1, - int32(tt.expectedOutputTokens), // nolint:gosec - int32(tt.expectedTotalTokens), // nolint:gosec + int32(tt.expectedInputTokens), // nolint:gosec + int32(tt.expectedCachedTokens), // nolint:gosec + int32(tt.expectedCachedWriteTokens), // nolint:gosec + int32(tt.expectedOutputTokens), // nolint:gosec + int32(tt.expectedTotalTokens), // nolint:gosec ) - expected.SetCachedInputTokens(tt.expectedCachedTokens) 
assert.Equal(t, expected, result) }) } @@ -117,12 +124,13 @@ func TestExtractLLMTokenUsage(t *testing.T) { func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.Usage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 + name string + usage anthropic.Usage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + expectedCachedTokens uint32 + expectedCachedWriteTokens uint32 }{ { name: "non-streaming response without cache", @@ -132,10 +140,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 0, CacheCreationInputTokens: 0, }, - expectedInputTokens: 150, - expectedOutputTokens: 75, - expectedTotalTokens: 225, - expectedCachedTokens: 0, + expectedInputTokens: 150, + expectedOutputTokens: 75, + expectedTotalTokens: 225, + expectedCachedTokens: 0, + expectedCachedWriteTokens: 0, }, { name: "non-streaming response with cache read", @@ -145,10 +154,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 25, CacheCreationInputTokens: 0, }, - expectedInputTokens: 125, // 100 + 0 + 25 - expectedOutputTokens: 50, - expectedTotalTokens: 175, // 125 + 50 - expectedCachedTokens: 25, // 25 + 0 + expectedInputTokens: 125, // 100 + 0 + 25 + expectedOutputTokens: 50, + expectedTotalTokens: 175, // 125 + 50 + expectedCachedTokens: 25, // 25 + expectedCachedWriteTokens: 0, // 0 }, { name: "non-streaming response with both cache types", @@ -158,10 +168,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 15, CacheCreationInputTokens: 10, }, - expectedInputTokens: 115, // 90 + 10 + 15 - expectedOutputTokens: 60, - expectedTotalTokens: 175, // 115 + 60 - expectedCachedTokens: 25, // 15 + 10 + expectedInputTokens: 115, // 90 + 10 + 15 + expectedOutputTokens: 60, + expectedTotalTokens: 175, // 115 + 60 + expectedCachedTokens: 25, // 15 + 
expectedCachedWriteTokens: 10, // 10 }, } @@ -172,8 +183,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, 0, tt.expectedOutputTokens, tt.expectedTotalTokens) - expected.SetCachedInputTokens(tt.expectedCachedTokens) + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) }) } @@ -181,12 +191,13 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.MessageDeltaUsage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 + name string + usage anthropic.MessageDeltaUsage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + expectedCachedTokens uint32 + expectedCachedWriteTokens uint32 }{ { name: "message_delta event with final totals", @@ -196,10 +207,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 30, CacheCreationInputTokens: 0, }, - expectedInputTokens: 280, // 250 + 0 + 30 - expectedOutputTokens: 120, - expectedTotalTokens: 400, // 280 + 120 - expectedCachedTokens: 30, // 30 + 0 + expectedInputTokens: 280, // 250 + 0 + 30 + expectedOutputTokens: 120, + expectedTotalTokens: 400, // 280 + 120 + expectedCachedTokens: 30, // 30 + expectedCachedWriteTokens: 0, }, { name: "message_delta event with only output tokens", @@ -209,10 +221,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 0, CacheCreationInputTokens: 0, }, - expectedInputTokens: 0, - expectedOutputTokens: 85, - expectedTotalTokens: 85, - expectedCachedTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 85, + expectedTotalTokens: 85, + 
expectedCachedTokens: 0, + expectedCachedWriteTokens: 0, }, { name: "message_delta with cache creation tokens", @@ -222,10 +235,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 10, CacheCreationInputTokens: 5, }, - expectedInputTokens: 165, // 150 + 5 + 10 - expectedOutputTokens: 75, - expectedTotalTokens: 240, // 165 + 75 - expectedCachedTokens: 15, // 10 + 5 + expectedInputTokens: 165, // 150 + 5 + 10 + expectedOutputTokens: 75, + expectedTotalTokens: 240, // 165 + 75 + expectedCachedTokens: 10, // 10 + expectedCachedWriteTokens: 5, // 5 }, } @@ -236,8 +250,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, 0, tt.expectedOutputTokens, tt.expectedTotalTokens) - expected.SetCachedInputTokens(tt.expectedCachedTokens) + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) }) } @@ -286,13 +299,15 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { assert.Equal(t, expectedTotalInput, inputTokensVal, "InputTokens should be sum of input_tokens + cache_creation_input_tokens + cache_read_input_tokens") - // Total cache should be sum of cache token types. 
- expectedCacheTokensInt := cacheCreationTokens + cacheReadTokens - expectedCacheTokens := uint32(expectedCacheTokensInt) // #nosec G115 - test values are small and safe cachedTokens, ok := result.CachedInputTokens() assert.True(t, ok) - assert.Equal(t, expectedCacheTokens, cachedTokens, - "CachedInputTokens should be sum of cache_creation_input_tokens + cache_read_input_tokens") + assert.Equal(t, cacheReadTokens, cachedTokens, + "CachedInputTokens should be cache_read_input_tokens") + + cachedWriteTokens, ok := result.CachedWriteInputTokens() + assert.True(t, ok) + assert.Equal(t, cacheCreationTokens, cachedWriteTokens, + "CachedWriteInputTokens should be cache_creation_input_tokens") // Total tokens should be input + output. expectedTotal := expectedTotalInput + uint32(outputTokens) diff --git a/internal/translator/cohere_rerank_v2_test.go b/internal/translator/cohere_rerank_v2_test.go index a1d64fa504..c3b93b818f 100644 --- a/internal/translator/cohere_rerank_v2_test.go +++ b/internal/translator/cohere_rerank_v2_test.go @@ -180,7 +180,7 @@ func TestCohereToCohereTranslatorV2Rerank_ResponseBody(t *testing.T) { } require.NoError(t, err) - expected := tokenUsageFrom(tc.expectedInput, -1, tc.expectedOutput, tc.expectedTotal) + expected := tokenUsageFrom(tc.expectedInput, -1, -1, tc.expectedOutput, tc.expectedTotal) require.Equal(t, expected, tokenUsage) require.Equal(t, "rerank-english-v3", responseModel) require.Nil(t, headerMutation) diff --git a/internal/translator/imagegeneration_openai_openai_test.go b/internal/translator/imagegeneration_openai_openai_test.go index d30190cd0e..c5b80da903 100644 --- a/internal/translator/imagegeneration_openai_openai_test.go +++ b/internal/translator/imagegeneration_openai_openai_test.go @@ -80,7 +80,7 @@ func TestOpenAIToOpenAIImageTranslator_ResponseBody_OK(t *testing.T) { require.NoError(t, err) require.Nil(t, hm) require.Nil(t, bm) - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), usage) + require.Equal(t, 
tokenUsageFrom(-1, -1, -1, -1, -1), usage) require.Empty(t, responseModel) } @@ -192,5 +192,5 @@ func TestOpenAIToOpenAIImageTranslator_ResponseBody_Usage(t *testing.T) { buf, _ := json.Marshal(resp) _, _, usage, _, err := tr.ResponseBody(map[string]string{}, bytes.NewReader(buf), true, nil) require.NoError(t, err) - require.Equal(t, tokenUsageFrom(40, -1, 60, 100), usage) + require.Equal(t, tokenUsageFrom(40, -1, -1, 60, 100), usage) } diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index cfc419171b..b566766eb9 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -707,6 +707,9 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string if usage.CacheReadInputTokens != nil { tokenUsage.SetCachedInputTokens(uint32(*usage.CacheReadInputTokens)) //nolint:gosec } + if usage.CacheWriteInputTokens != nil { + tokenUsage.SetCachedWriteInputTokens(uint32(*usage.CacheWriteInputTokens)) //nolint:gosec + } } oaiEvent, ok := o.convertEvent(event) if !ok { @@ -749,11 +752,16 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string PromptTokens: bedrockResp.Usage.InputTokens, CompletionTokens: bedrockResp.Usage.OutputTokens, } + if bedrockResp.Usage.CacheReadInputTokens != nil || bedrockResp.Usage.CacheWriteInputTokens != nil { + openAIResp.Usage.PromptTokensDetails = &openai.PromptTokensDetails{} + } if bedrockResp.Usage.CacheReadInputTokens != nil { tokenUsage.SetCachedInputTokens(uint32(*bedrockResp.Usage.CacheReadInputTokens)) //nolint:gosec - openAIResp.Usage.PromptTokensDetails = &openai.PromptTokensDetails{ - CachedTokens: *bedrockResp.Usage.CacheReadInputTokens, - } + openAIResp.Usage.PromptTokensDetails.CachedTokens = *bedrockResp.Usage.CacheReadInputTokens + } + if bedrockResp.Usage.CacheWriteInputTokens != nil { + tokenUsage.SetCachedWriteInputTokens(uint32(*bedrockResp.Usage.CacheWriteInputTokens)) //nolint:gosec + openAIResp.Usage.PromptTokensDetails.CachedWriteTokens =
*bedrockResp.Usage.CacheWriteInputTokens } } diff --git a/internal/translator/openai_awsbedrock_test.go b/internal/translator/openai_awsbedrock_test.go index f469d9fa11..e97e39219a 100644 --- a/internal/translator/openai_awsbedrock_test.go +++ b/internal/translator/openai_awsbedrock_test.go @@ -1715,14 +1715,18 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) expectedUsage = tokenUsageFrom( int32(tt.output.Usage.PromptTokens), // nolint:gosec -1, + -1, int32(tt.output.Usage.CompletionTokens), // nolint:gosec int32(tt.output.Usage.TotalTokens), // nolint:gosec ) if tt.input.Usage.CacheReadInputTokens != nil { expectedUsage.SetCachedInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec } + if tt.input.Usage.CacheWriteInputTokens != nil { + expectedUsage.SetCachedWriteInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec + } } else { - expectedUsage = tokenUsageFrom(-1, -1, -1, -1) + expectedUsage = tokenUsageFrom(-1, -1, -1, -1, -1) } require.Equal(t, expectedUsage, usedToken) }) diff --git a/internal/translator/openai_azureopenai_embeddings_test.go b/internal/translator/openai_azureopenai_embeddings_test.go index 4deffa3bde..7ee6dcae59 100644 --- a/internal/translator/openai_azureopenai_embeddings_test.go +++ b/internal/translator/openai_azureopenai_embeddings_test.go @@ -111,19 +111,19 @@ func TestOpenAIToAzureOpenAITranslatorV1EmbeddingResponseBody(t *testing.T) { "total_tokens": 8 } }`, - expTokenUsage: tokenUsageFrom(8, -1, -1, 8), + expTokenUsage: tokenUsageFrom(8, -1, -1, -1, 8), }, { name: "invalid_json", responseBody: `invalid json`, expError: true, - expTokenUsage: tokenUsageFrom(-1, -1, -1, -1), + expTokenUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "error_response", responseBody: `{"error": {"message": "Invalid input", "type": "BadRequestError"}}`, responseStatus: "400", - expTokenUsage: tokenUsageFrom(0, -1, -1, 0), + expTokenUsage: 
tokenUsageFrom(0, -1, -1, -1, 0), }, } { t.Run(tc.name, func(t *testing.T) { diff --git a/internal/translator/openai_azureopenai_test.go b/internal/translator/openai_azureopenai_test.go index 972b3647d1..c95597f541 100644 --- a/internal/translator/openai_azureopenai_test.go +++ b/internal/translator/openai_azureopenai_test.go @@ -72,7 +72,7 @@ func TestResponseModel_AzureOpenAI(t *testing.T) { _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewBuffer(body), true, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-2024-11-20", responseModel) // Uses response field as authoritative - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) } // TestResponseModel_AzureOpenAIStreaming tests Azure OpenAI streaming returns actual model version @@ -103,5 +103,5 @@ data: [DONE] _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewReader([]byte(sseChunks)), true, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-2024-11-20", responseModel) // Returns actual versioned model from response - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) } diff --git a/internal/translator/openai_completions.go b/internal/translator/openai_completions.go index 30069ba921..3754dc2bff 100644 --- a/internal/translator/openai_completions.go +++ b/internal/translator/openai_completions.go @@ -171,7 +171,8 @@ func (o *openAIToOpenAITranslatorV1Completion) extractUsageFromBufferEvent(span tokenUsage.SetOutputTokens(uint32(usage.CompletionTokens)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(usage.TotalTokens)) //nolint:gosec if usage.PromptTokensDetails != nil { - tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + 
tokenUsage.SetCachedWriteInputTokens(uint32(usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec } // Do not mark buffering done; keep scanning to return the latest usage in this batch. } diff --git a/internal/translator/openai_completions_test.go b/internal/translator/openai_completions_test.go index 3bc6b351a6..cecc894a66 100644 --- a/internal/translator/openai_completions_test.go +++ b/internal/translator/openai_completions_test.go @@ -133,14 +133,14 @@ func TestOpenAIToOpenAITranslatorV1CompletionResponseBody(t *testing.T) { "total_tokens": 13 } }`, - expTokenUsage: tokenUsageFrom(5, -1, 8, 13), + expTokenUsage: tokenUsageFrom(5, -1, -1, 8, 13), expModel: "gpt-3.5-turbo-instruct", }, { name: "invalid_json", responseBody: `invalid json`, expError: true, - expTokenUsage: tokenUsageFrom(-1, -1, -1, -1), + expTokenUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "response_without_usage", @@ -157,7 +157,7 @@ func TestOpenAIToOpenAITranslatorV1CompletionResponseBody(t *testing.T) { } ] }`, - expTokenUsage: tokenUsageFrom(-1, -1, -1, -1), + expTokenUsage: tokenUsageFrom(-1, -1, -1, -1, -1), expModel: "gpt-3.5-turbo-instruct", }, } { @@ -225,7 +225,7 @@ data: [DONE] require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), tokenUsage) + require.Equal(t, tokenUsageFrom(-1, -1, -1, -1, -1), tokenUsage) require.Equal(t, "gpt-3.5-turbo-instruct", responseModel) // Process chunk2. @@ -238,7 +238,7 @@ data: [DONE] require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), tokenUsage) + require.Equal(t, tokenUsageFrom(-1, -1, -1, -1, -1), tokenUsage) require.Equal(t, "gpt-3.5-turbo-instruct", responseModel) // Process chunk3 with usage. 
@@ -251,7 +251,7 @@ data: [DONE] require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - require.Equal(t, tokenUsageFrom(5, -1, 3, 8), tokenUsage) + require.Equal(t, tokenUsageFrom(5, -1, -1, 3, 8), tokenUsage) require.Equal(t, "gpt-3.5-turbo-instruct", responseModel) } diff --git a/internal/translator/openai_embeddings_test.go b/internal/translator/openai_embeddings_test.go index f1750befc2..8f4c9517d1 100644 --- a/internal/translator/openai_embeddings_test.go +++ b/internal/translator/openai_embeddings_test.go @@ -113,19 +113,19 @@ func TestOpenAIToOpenAITranslatorV1EmbeddingResponseBody(t *testing.T) { "total_tokens": 8 } }`, - expTokenUsage: tokenUsageFrom(8, -1, -1, 8), + expTokenUsage: tokenUsageFrom(8, -1, -1, -1, 8), }, { name: "invalid_json", responseBody: `invalid json`, expError: true, - expTokenUsage: tokenUsageFrom(-1, -1, -1, -1), + expTokenUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "error_response", responseBody: `{"error": {"message": "Invalid input", "type": "BadRequestError"}}`, responseStatus: "400", - expTokenUsage: tokenUsageFrom(0, -1, -1, 0), + expTokenUsage: tokenUsageFrom(0, -1, -1, -1, 0), }, } { t.Run(tc.name, func(t *testing.T) { diff --git a/internal/translator/openai_gcpanthropic.go b/internal/translator/openai_gcpanthropic.go index 2127d64f9c..7e11451f31 100644 --- a/internal/translator/openai_gcpanthropic.go +++ b/internal/translator/openai_gcpanthropic.go @@ -839,12 +839,14 @@ func (o *openAIToGCPAnthropicTranslatorV1ChatCompletion) ResponseBody(_ map[stri outputTokens, _ := tokenUsage.OutputTokens() totalTokens, _ := tokenUsage.TotalTokens() cachedTokens, _ := tokenUsage.CachedInputTokens() + cacheWriteTokens, _ := tokenUsage.CachedWriteInputTokens() openAIResp.Usage = openai.Usage{ CompletionTokens: int(outputTokens), PromptTokens: int(inputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), + CachedTokens: 
int(cachedTokens), + CachedWriteTokens: int(cacheWriteTokens), }, } diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 17b6f11f75..191b436545 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -110,6 +110,7 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t p.tokenUsage.SetTotalTokens(inputTokens + outputTokens) totalTokens, _ := p.tokenUsage.TotalTokens() cachedTokens, _ := p.tokenUsage.CachedInputTokens() + cachedWriteTokens, _ := p.tokenUsage.CachedWriteInputTokens() finalChunk := openai.ChatCompletionResponseChunk{ ID: p.activeMessageID, Created: p.created, @@ -120,7 +121,8 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t CompletionTokens: int(outputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), + CachedTokens: int(cachedTokens), + CachedWriteTokens: int(cachedWriteTokens), }, }, Model: p.requestModel, @@ -289,12 +291,13 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat if output, ok := usage.OutputTokens(); ok { p.tokenUsage.AddOutputTokens(output) } - // Update input tokens to include any cache tokens from delta + // Update input tokens to include read cache tokens from delta if cached, ok := usage.CachedInputTokens(); ok { p.tokenUsage.AddInputTokens(cached) // Accumulate any additional cache tokens from delta p.tokenUsage.AddCachedInputTokens(cached) } + // Update input tokens to include write cache tokens from delta if cached, ok := usage.CachedWriteInputTokens(); ok { p.tokenUsage.AddInputTokens(cached) // Accumulate any additional cache tokens from delta diff --git a/internal/translator/openai_gcpanthropic_test.go b/internal/translator/openai_gcpanthropic_test.go index adaaa03fa0..ea0a11d979 100644 --- 
a/internal/translator/openai_gcpanthropic_test.go +++ b/internal/translator/openai_gcpanthropic_test.go @@ -598,12 +598,12 @@ func TestOpenAIToGCPAnthropicTranslatorV1ChatCompletion_ResponseBody(t *testing. require.NoError(t, err) expectedTokenUsage := tokenUsageFrom( - int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec - -1, - int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec + uint32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec ) - expectedTokenUsage.SetCachedInputTokens(uint32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec require.Equal(t, expectedTokenUsage, usedToken) if diff := cmp.Diff(tt.expectedOpenAIResponse, gotResp, cmpopts.IgnoreFields(openai.ChatCompletionResponse{}, "Created")); diff != "" { diff --git a/internal/translator/openai_gcpvertexai.go b/internal/translator/openai_gcpvertexai.go index 716ced8cdd..232ab836cb 100644 --- a/internal/translator/openai_gcpvertexai.go +++ b/internal/translator/openai_gcpvertexai.go @@ -170,6 +170,8 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) ResponseBody(_ map[strin tokenUsage.SetOutputTokens(uint32(gcpResp.UsageMetadata.CandidatesTokenCount)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(gcpResp.UsageMetadata.TotalTokenCount)) //nolint:gosec tokenUsage.SetCachedInputTokens(uint32(gcpResp.UsageMetadata.CachedContentTokenCount)) //nolint:gosec + // Gemini does not return cached write input tokens, set to 0. 
+ tokenUsage.SetCachedWriteInputTokens(0) } if span != nil { diff --git a/internal/translator/openai_gcpvertexai_test.go b/internal/translator/openai_gcpvertexai_test.go index 033b8782b9..93740102ce 100644 --- a/internal/translator/openai_gcpvertexai_test.go +++ b/internal/translator/openai_gcpvertexai_test.go @@ -913,7 +913,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 25 } }`), - wantTokenUsage: tokenUsageFrom(10, 10, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, -1, 15, 25), }, { name: "response with safety ratings", @@ -993,7 +993,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 20 } }`), - wantTokenUsage: tokenUsageFrom(8, 0, 12, 20), + wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), }, { name: "empty response", @@ -1005,7 +1005,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T wantError: false, wantHeaderMut: []internalapi.Header{{contentLengthHeaderName, "28"}}, wantBodyMut: []byte(`{"object":"chat.completion"}`), - wantTokenUsage: tokenUsageFrom(-1, -1, -1, -1), + wantTokenUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "single stream chunk response", @@ -1025,7 +1025,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), }, { name: "response with model version field", @@ -1080,7 +1080,7 @@ data: [DONE] "total_tokens": 14 } }`), - wantTokenUsage: tokenUsageFrom(6, 0, 8, 14), + wantTokenUsage: tokenUsageFrom(6, 0, -1, 8, 14), }, { @@ -1149,7 +1149,7 @@ data: [DONE] "total_tokens": 20 } }`), - wantTokenUsage: tokenUsageFrom(8, 0, 12, 20), + wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), }, { name: "response with thought summary", @@ -1214,7 +1214,7 @@ data: [DONE] } }`), - wantTokenUsage: tokenUsageFrom(10, 10, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, 
-1, 15, 25), }, { name: "stream chunks with thought summary", @@ -1236,7 +1236,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), }, } @@ -1355,7 +1355,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_StreamingResponseBody(t * print(bodyStr) require.Contains(t, bodyStr, "data: ") require.Contains(t, bodyStr, "chat.completion.chunk") - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), tokenUsage) // No usage in this test chunk. + require.Equal(t, tokenUsageFrom(-1, -1, -1, -1, -1), tokenUsage) // No usage in this test chunk. }) } } diff --git a/internal/translator/openai_openai.go b/internal/translator/openai_openai.go index 277f2e982f..9ad3084b3b 100644 --- a/internal/translator/openai_openai.go +++ b/internal/translator/openai_openai.go @@ -141,7 +141,8 @@ func (o *openAIToOpenAITranslatorV1ChatCompletion) ResponseBody(_ map[string]str tokenUsage.SetOutputTokens(uint32(resp.Usage.CompletionTokens)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) //nolint:gosec if resp.Usage.PromptTokensDetails != nil { - tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCachedWriteInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec } // Fallback to request model for test or non-compliant OpenAI backends responseModel = cmp.Or(resp.Model, o.requestModel) diff --git a/internal/translator/openai_openai_test.go b/internal/translator/openai_openai_test.go index a90160fb22..8d95cd503b 100644 --- a/internal/translator/openai_openai_test.go +++ b/internal/translator/openai_openai_test.go @@ -49,7 +49,7 @@ data: [DONE] _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, 
bytes.NewReader([]byte(sseChunks)), true, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-2024-11-20", responseModel) // Returns actual versioned model - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) } // TestResponseModel_EmptyFallback tests the fallback to request model when response model is empty @@ -83,7 +83,7 @@ func TestResponseModel_EmptyFallback(t *testing.T) { _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewReader([]byte(responseJSON)), false, nil) require.NoError(t, err) require.Equal(t, "gpt-4o", responseModel) // Falls back to request model - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) }) t.Run("streaming", func(t *testing.T) { @@ -112,7 +112,7 @@ data: [DONE] _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewReader([]byte(sseChunks)), true, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-mini", responseModel) // Falls back to request model - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) }) t.Run("with model override", func(t *testing.T) { @@ -148,7 +148,7 @@ data: [DONE] _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewReader([]byte(responseJSON)), false, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-2024-11-20", responseModel) // Falls back to overridden model - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) }) } @@ -358,7 +358,7 @@ data: [DONE] o := &openAIToOpenAITranslatorV1ChatCompletion{} _, _, usedToken, _, err := o.ResponseBody(nil, bytes.NewBuffer(body), false, s) require.NoError(t, err) - require.Equal(t, tokenUsageFrom(0, -1, 0, 42), usedToken) + require.Equal(t, tokenUsageFrom(0, -1, -1, 0, 42), usedToken) 
require.Equal(t, &resp, s.Resp) }) t.Run("valid body with different response model", func(t *testing.T) { @@ -373,7 +373,7 @@ data: [DONE] o := &openAIToOpenAITranslatorV1ChatCompletion{} _, _, usedToken, _, err := o.ResponseBody(nil, bytes.NewBuffer(body), false, s) require.NoError(t, err) - require.Equal(t, tokenUsageFrom(10, -1, 20, 30), usedToken) + require.Equal(t, tokenUsageFrom(10, -1, -1, 20, 30), usedToken) require.Equal(t, &resp, s.Resp) }) }) @@ -397,7 +397,7 @@ data: [DONE] o := &openAIToOpenAITranslatorV1ChatCompletion{} _, _, usedToken, _, err := o.ResponseBody(nil, bytes.NewBuffer(body), false, s) require.NoError(t, err) - require.Equal(t, tokenUsageFrom(0, -1, 0, 42), usedToken) + require.Equal(t, tokenUsageFrom(0, -1, -1, 0, 42), usedToken) require.Equal(t, &resp, s.Resp) }) }) @@ -409,7 +409,7 @@ func TestExtractUsageFromBufferEvent(t *testing.T) { o := &openAIToOpenAITranslatorV1ChatCompletion{} o.buffered = []byte("data: {\"usage\": {\"total_tokens\": 42}}\n") usedToken := o.extractUsageFromBufferEvent(s) - require.Equal(t, tokenUsageFrom(0, -1, 0, 42), usedToken) + require.Equal(t, tokenUsageFrom(0, -1, -1, 0, 42), usedToken) require.Empty(t, o.buffered) require.Len(t, s.RespChunks, 1) }) @@ -418,7 +418,7 @@ func TestExtractUsageFromBufferEvent(t *testing.T) { o := &openAIToOpenAITranslatorV1ChatCompletion{} o.buffered = []byte("data: invalid\ndata: {\"usage\": {\"total_tokens\": 42}}\n") usedToken := o.extractUsageFromBufferEvent(nil) - require.Equal(t, tokenUsageFrom(0, -1, 0, 42), usedToken) + require.Equal(t, tokenUsageFrom(0, -1, -1, 0, 42), usedToken) require.Empty(t, o.buffered) }) @@ -426,12 +426,12 @@ func TestExtractUsageFromBufferEvent(t *testing.T) { o := &openAIToOpenAITranslatorV1ChatCompletion{} o.buffered = []byte("data: {}\n\ndata: ") usedToken := o.extractUsageFromBufferEvent(nil) - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), usedToken) + require.Equal(t, tokenUsageFrom(-1, -1, -1, -1, -1), usedToken) 
require.GreaterOrEqual(t, len(o.buffered), 1) o.buffered = append(o.buffered, []byte("{\"usage\": {\"total_tokens\": 42}}\n")...) usedToken = o.extractUsageFromBufferEvent(nil) - require.Equal(t, tokenUsageFrom(0, -1, 0, 42), usedToken) + require.Equal(t, tokenUsageFrom(0, -1, -1, 0, 42), usedToken) require.Empty(t, o.buffered) }) @@ -439,7 +439,7 @@ func TestExtractUsageFromBufferEvent(t *testing.T) { o := &openAIToOpenAITranslatorV1ChatCompletion{} o.buffered = []byte("data: invalid\n") usedToken := o.extractUsageFromBufferEvent(nil) - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), usedToken) + require.Equal(t, tokenUsageFrom(-1, -1, -1, -1, -1), usedToken) require.Empty(t, o.buffered) }) } @@ -461,7 +461,7 @@ func TestResponseModel_OpenAI(t *testing.T) { _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewBuffer(body), true, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-2024-08-06", responseModel) - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) } // TestResponseModel_OpenAIEmbeddings tests OpenAI embeddings (not virtualized but has response field) @@ -480,5 +480,5 @@ func TestResponseModel_OpenAIEmbeddings(t *testing.T) { _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewReader(body), true, nil) require.NoError(t, err) require.Equal(t, "text-embedding-ada-002", responseModel) // Uses response field as authoritative - require.Equal(t, tokenUsageFrom(10, -1, -1, 10), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, -1, 10), tokenUsage) } diff --git a/internal/translator/openai_responses.go b/internal/translator/openai_responses.go index 0accb13b45..9550d0ce4f 100644 --- a/internal/translator/openai_responses.go +++ b/internal/translator/openai_responses.go @@ -128,10 +128,11 @@ func (o *openAIToOpenAITranslatorV1Responses) handleNonStreamingResponse(body io // TODO: Add reasoning token usage if resp.Usage != nil { 
- tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 - tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 - tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 - tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 + tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 + tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 + tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 + tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 + tokenUsage.SetCachedWriteInputTokens(uint32(resp.Usage.InputTokensDetails.CachedWriteTokens)) // #nosec G115 } // Record non-streaming response to span if tracing is enabled. @@ -178,6 +179,8 @@ func (o *openAIToOpenAITranslatorV1Responses) extractUsageFromBufferEvent(span t tokenUsage.SetOutputTokens(uint32(respComplEvent.Response.Usage.OutputTokens)) // #nosec G115 tokenUsage.SetTotalTokens(uint32(respComplEvent.Response.Usage.TotalTokens)) // #nosec G115 tokenUsage.SetCachedInputTokens(uint32(respComplEvent.Response.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 + // OpenAI does not report cache write tokens in responses, so set to 0. + tokenUsage.SetCachedWriteInputTokens(uint32(0)) // #nosec G115 // Record streaming chunk to span if tracing is enabled. 
if span != nil { diff --git a/internal/translator/openai_responses_test.go b/internal/translator/openai_responses_test.go index f136d74ebe..80c574d51e 100644 --- a/internal/translator/openai_responses_test.go +++ b/internal/translator/openai_responses_test.go @@ -246,6 +246,10 @@ func TestResponsesOpenAIToOpenAITranslator_ResponseBody(t *testing.T) { cachedTokens, ok := tokenUsage.CachedInputTokens() require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) + + cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + require.True(t, ok) + require.Equal(t, uint32(0), cachedWriteTokens) }) t.Run("non-streaming response with fallback model", func(t *testing.T) { @@ -358,6 +362,10 @@ data: [DONE] cachedTokens, ok := tokenUsage.CachedInputTokens() require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) + + cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + require.True(t, ok) + require.Equal(t, uint32(0), cachedWriteTokens) }) t.Run("streaming response with fallback model", func(t *testing.T) { @@ -453,6 +461,10 @@ data: [DONE] cachedTokens, _ := tokenUsage.CachedInputTokens() require.Equal(t, uint32(2), cachedTokens) + + cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + require.True(t, ok) + require.Equal(t, uint32(0), cachedWriteTokens) }) t.Run("streaming read error", func(t *testing.T) { @@ -541,6 +553,10 @@ func TestResponses_HandleNonStreamingResponse(t *testing.T) { cachedTokens, _ := tokenUsage.CachedInputTokens() require.Equal(t, uint32(2), cachedTokens) + + cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + require.True(t, ok) + require.Equal(t, uint32(0), cachedWriteTokens) }) t.Run("invalid JSON", func(t *testing.T) { @@ -602,6 +618,10 @@ data: [DONE] cachedTokens, ok := tokenUsage.CachedInputTokens() require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) + + cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + require.True(t, ok) + require.Equal(t, uint32(0), cachedWriteTokens) }) 
t.Run("model extraction", func(t *testing.T) { @@ -666,9 +686,11 @@ data: [DONE] _, outputSet := tokenUsage.OutputTokens() _, totalSet := tokenUsage.TotalTokens() _, cachedSet := tokenUsage.CachedInputTokens() + _, cachedWriteSet := tokenUsage.CachedWriteInputTokens() require.False(t, totalSet) require.False(t, cachedSet) + require.False(t, cachedWriteSet) require.False(t, inputSet) require.False(t, outputSet) }) diff --git a/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml b/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml index b6f1733910..4407e6ae18 100644 --- a/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml +++ b/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml @@ -31,6 +31,8 @@ spec: type: InputToken - metadataKey: llm_input_cached_token type: CachedInputToken + - metadataKey: llm_write_input_cached_token + type: CachedWriteInputToken - metadataKey: llm_output_token type: OutputToken - metadataKey: llm_total_token From 6ed2f434137c76b86bafaa179254057299e1d5cf Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 15:16:21 -0500 Subject: [PATCH 03/20] cache for aws; Signed-off-by: Aaron Choo --- internal/translator/openai_awsbedrock.go | 14 ++++++++------ internal/translator/openai_awsbedrock_test.go | 12 +++++++----- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index b566766eb9..56b4870151 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -853,14 +853,16 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) convertEvent(event *awsbe return chunk, false } chunk.Usage = &openai.Usage{ - TotalTokens: event.Usage.TotalTokens, - PromptTokens: event.Usage.InputTokens, - CompletionTokens: event.Usage.OutputTokens, + TotalTokens: event.Usage.TotalTokens, + PromptTokens: event.Usage.InputTokens, + CompletionTokens: event.Usage.OutputTokens, + PromptTokensDetails: 
&openai.PromptTokensDetails{}, } if event.Usage.CacheReadInputTokens != nil { - chunk.Usage.PromptTokensDetails = &openai.PromptTokensDetails{ - CachedTokens: *event.Usage.CacheReadInputTokens, - } + chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens + } + if event.Usage.CacheWriteInputTokens != nil { + chunk.Usage.PromptTokensDetails.CachedWriteTokens = *event.Usage.CacheWriteInputTokens } // messageStart event. case awsbedrock.ConverseStreamEventTypeMessageStart.String(): diff --git a/internal/translator/openai_awsbedrock_test.go b/internal/translator/openai_awsbedrock_test.go index e97e39219a..e780b8a34c 100644 --- a/internal/translator/openai_awsbedrock_test.go +++ b/internal/translator/openai_awsbedrock_test.go @@ -1447,10 +1447,11 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) name: "basic_testing", input: awsbedrock.ConverseResponse{ Usage: &awsbedrock.TokenUsage{ - InputTokens: 10, - OutputTokens: 20, - TotalTokens: 30, - CacheReadInputTokens: ptr.To(5), + InputTokens: 10, + OutputTokens: 20, + TotalTokens: 30, + CacheReadInputTokens: ptr.To(5), + CacheWriteInputTokens: ptr.To(7), }, Output: &awsbedrock.ConverseOutput{ Message: awsbedrock.Message{ @@ -1473,7 +1474,8 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) PromptTokens: 10, CompletionTokens: 20, PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: 5, + CachedTokens: 5, + CachedWriteTokens: 7, }, }, Choices: []openai.ChatCompletionResponseChoice{ From 21dc66c5dbb12262028af200e7cb27d1a730200f Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 15:26:09 -0500 Subject: [PATCH 04/20] fix cel Signed-off-by: Aaron Choo --- internal/llmcostcel/cel.go | 2 +- internal/llmcostcel/cel_test.go | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/internal/llmcostcel/cel.go b/internal/llmcostcel/cel.go index 0c2f19f913..2d2f4ad834 100644 --- 
a/internal/llmcostcel/cel.go +++ b/internal/llmcostcel/cel.go @@ -56,7 +56,7 @@ func NewProgram(expr string) (prog cel.Program, err error) { } // Sanity check by evaluating the expression with some dummy values. - _, err = EvaluateProgram(prog, "dummy", "dummy", 0, 0, 0, 0) + _, err = EvaluateProgram(prog, "dummy", "dummy", 0, 0, 0, 0, 0) if err != nil { return nil, fmt.Errorf("failed to evaluate CEL expression: %w", err) } diff --git a/internal/llmcostcel/cel_test.go b/internal/llmcostcel/cel_test.go index 7730b181fb..79835b7354 100644 --- a/internal/llmcostcel/cel_test.go +++ b/internal/llmcostcel/cel_test.go @@ -26,13 +26,13 @@ func TestNewProgram(t *testing.T) { require.NoError(t, err) }) t.Run("variables", func(t *testing.T) { - prog, err := NewProgram("model == 'cool_model' ? (input_tokens - cached_input_tokens) * output_tokens : total_tokens") + prog, err := NewProgram("model == 'cool_model' ? (input_tokens - cached_input_tokens - cached_write_input_tokens) * output_tokens : total_tokens") require.NoError(t, err) - v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 200, 100, 2, 3) + v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 200, 100, 1, 2, 3) require.NoError(t, err) - require.Equal(t, uint64(200), v) + require.Equal(t, uint64(198), v) - v, err = EvaluateProgram(prog, "not_cool_model", "cool_backend", 200, 100, 2, 3) + v, err = EvaluateProgram(prog, "not_cool_model", "cool_backend", 200, 100, 1, 2, 3) require.NoError(t, err) require.Equal(t, uint64(3), v) }) @@ -59,13 +59,13 @@ func TestEvaluateProgram(t *testing.T) { t.Run("signed integer negative", func(t *testing.T) { prog, err := NewProgram("int(input_tokens) - int(output_tokens)") require.NoError(t, err) - _, err = EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 2000, 3) + _, err = EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 0, 2000, 3) require.ErrorContains(t, err, "CEL expression result is negative (-1900)") }) t.Run("unsigned integer 
overflow", func(t *testing.T) { prog, err := NewProgram("input_tokens - output_tokens") require.NoError(t, err) - _, err = EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 2000, 3) + _, err = EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 0, 2000, 3) require.ErrorContains(t, err, "failed to evaluate CEL expression: unsigned integer overflow") }) t.Run("ensure concurrency safety", func(t *testing.T) { @@ -76,7 +76,7 @@ func TestEvaluateProgram(t *testing.T) { synctest.Test(t, func(t *testing.T) { for range 100 { go func() { - v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 2, 3) + v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 0, 2, 3) require.NoError(t, err) require.Equal(t, uint64(200), v) }() From 4c1dc960a51e40cb10101f897ec0b8e8991162cb Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 15:27:07 -0500 Subject: [PATCH 05/20] fix -1 Signed-off-by: Aaron Choo --- internal/translator/anthropic_anthropic_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/translator/anthropic_anthropic_test.go b/internal/translator/anthropic_anthropic_test.go index 20fc793ab5..236dd537d9 100644 --- a/internal/translator/anthropic_anthropic_test.go +++ b/internal/translator/anthropic_anthropic_test.go @@ -95,7 +95,7 @@ func TestAnthropicToAnthropic_ResponseBody_non_streaming(t *testing.T) { require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - expected := tokenUsageFrom(9, 0, -1, 16, 25) + expected := tokenUsageFrom(9, 0, 0, 16, 25) require.Equal(t, expected, tokenUsage) require.Equal(t, "claude-sonnet-4-5-20250929", responseModel) } From c58a29a4f574eaad5060d01f4fd7cdbdaffb4242 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:09:22 -0500 Subject: [PATCH 06/20] find+replace cache write with cache creation Signed-off-by: Aaron Choo --- api/v1alpha1/ai_gateway_route.go | 4 +- api/v1alpha1/shared_types.go | 12 +- 
examples/token_ratelimit/token_ratelimit.yaml | 4 +- internal/apischema/anthropic/anthropic.go | 2 +- internal/apischema/openai/openai.go | 8 +- internal/apischema/openai/openai_test.go | 38 +-- internal/controller/gateway.go | 4 +- internal/controller/gateway_test.go | 4 +- internal/extproc/mocks_test.go | 28 +- internal/extproc/processor_impl.go | 2 +- internal/extproc/processor_impl_test.go | 6 +- internal/filterapi/filterconfig.go | 4 +- internal/llmcostcel/cel.go | 32 +-- internal/llmcostcel/cel_test.go | 2 +- internal/metrics/genai.go | 6 +- internal/metrics/metrics.go | 48 ++-- internal/metrics/metrics_impl.go | 2 +- internal/metrics/metrics_impl_test.go | 12 +- .../openinference/anthropic/messages.go | 2 +- .../openinference/openai/response_attrs.go | 6 +- internal/translator/anthropic_anthropic.go | 8 +- .../translator/anthropic_gcpanthropic_test.go | 16 +- internal/translator/anthropic_usage_test.go | 256 +++++++++--------- internal/translator/openai_awsbedrock.go | 8 +- internal/translator/openai_awsbedrock_test.go | 4 +- internal/translator/openai_completions.go | 4 +- internal/translator/openai_gcpanthropic.go | 4 +- .../translator/openai_gcpanthropic_stream.go | 12 +- internal/translator/openai_gcpvertexai.go | 4 +- internal/translator/openai_openai.go | 4 +- internal/translator/openai_responses.go | 2 +- .../testdata/aigatewayroutes/llmcosts.yaml | 2 +- 32 files changed, 275 insertions(+), 275 deletions(-) diff --git a/api/v1alpha1/ai_gateway_route.go b/api/v1alpha1/ai_gateway_route.go index b4dac38660..6404f8e8a6 100644 --- a/api/v1alpha1/ai_gateway_route.go +++ b/api/v1alpha1/ai_gateway_route.go @@ -108,8 +108,8 @@ type AIGatewayRouteSpec struct { // type: TotalToken // - metadataKey: llm_cached_input_token // type: CachedInputToken - // - metadataKey: llm_cached_write_input_token - // type: CachedWriteInputToken + // - metadataKey: llm_cache_creation_input_token + // type: CachedCreationInputToken // ``` // Then, with the following 
BackendTrafficPolicy of Envoy Gateway, you can have three // rate limit buckets for each unique x-user-id header value. One bucket is for the input token, diff --git a/api/v1alpha1/shared_types.go b/api/v1alpha1/shared_types.go index 7a34cf4128..2e391498dd 100644 --- a/api/v1alpha1/shared_types.go +++ b/api/v1alpha1/shared_types.go @@ -100,9 +100,9 @@ type LLMRequestCost struct { MetadataKey string `json:"metadataKey"` // Type specifies the type of the request cost. The default is "OutputToken", // and it uses "output token" as the cost. The other types are "InputToken", "TotalToken", - // "CachedInputToken", "CachedWriteInputToken", and "CEL". + // "CachedInputToken", "CachedCreationInputToken", and "CEL". // - // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;CachedWriteInputToken;TotalToken;CEL + // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;CachedCreationInputToken;TotalToken;CEL Type LLMRequestCostType `json:"type"` // CEL is the CEL expression to calculate the cost of the request. // The CEL expression must return a signed or unsigned integer. If the @@ -114,7 +114,7 @@ type LLMRequestCost struct { // * backend: the backend name in the form of "name.namespace". Type: string. // * input_tokens: the number of input tokens. Type: unsigned integer. // * cached_input_tokens: the number of cached read input tokens. Type: unsigned integer. - // * cached_write_input_tokens: the number of cached write input tokens. Type: unsigned integer. + // * cache_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer. // * output_tokens: the number of output tokens. Type: unsigned integer. // * total_tokens: the total number of tokens. Type: unsigned integer. // @@ -122,7 +122,7 @@ type LLMRequestCost struct { // // * "model == 'llama' ? input_tokens + output_token * 0.5 : total_tokens" // * "backend == 'foo.default' ? input_tokens + output_tokens : total_tokens" - // * "backend == 'bar.default' ? 
(input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cached_write_input_tokens * 1.25 + output_tokens : total_tokens" + // * "backend == 'bar.default' ? (input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cache_creation_input_tokens * 1.25 + output_tokens : total_tokens" // * "input_tokens + output_tokens + total_tokens" // * "input_tokens * output_tokens" // @@ -138,8 +138,8 @@ const ( LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" // LLMRequestCostTypeCachedInputToken is the cost type of the cached input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" - // LLMRequestCostTypeCachedWriteInputToken is the cost type of the cached input token. - LLMRequestCostTypeCachedWriteInputToken LLMRequestCostType = "CachedWriteInputToken" + // LLMRequestCostTypeCachedCreationInputToken is the cost type of the cache creation input token. + LLMRequestCostTypeCachedCreationInputToken LLMRequestCostType = "CachedCreationInputToken" // LLMRequestCostTypeOutputToken is the cost type of the output token. LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken" // LLMRequestCostTypeTotalToken is the cost type of the total token. 
diff --git a/examples/token_ratelimit/token_ratelimit.yaml b/examples/token_ratelimit/token_ratelimit.yaml index 2224ed724a..4ebe6217c7 100644 --- a/examples/token_ratelimit/token_ratelimit.yaml +++ b/examples/token_ratelimit/token_ratelimit.yaml @@ -51,8 +51,8 @@ spec: type: InputToken - metadataKey: llm_cached_input_token type: CachedInputToken - - metadataKey: llm_cached_write_input_token - type: CachedWriteInputToken + - metadataKey: llm_cache_creation_input_token + type: CachedCreationInputToken - metadataKey: llm_output_token type: OutputToken - metadataKey: llm_total_token diff --git a/internal/apischema/anthropic/anthropic.go b/internal/apischema/anthropic/anthropic.go index f65d102761..f02e706d92 100644 --- a/internal/apischema/anthropic/anthropic.go +++ b/internal/apischema/anthropic/anthropic.go @@ -437,7 +437,7 @@ const ( // so we use float64 to be able to unmarshal both 1234 and 1234.0 without errors. type Usage struct { // The number of input tokens used to create the cache entry. - CacheCreationInputTokens float64 `json:"cache_creation_input_tokens"` + CachedCreationInputTokens float64 `json:"cache_creation_input_tokens"` // The number of input tokens read from the cache. CacheReadInputTokens float64 `json:"cache_read_input_tokens"` // The number of input tokens which were used. diff --git a/internal/apischema/openai/openai.go b/internal/apischema/openai/openai.go index 0c74a3e244..1cb6268929 100644 --- a/internal/apischema/openai/openai.go +++ b/internal/apischema/openai/openai.go @@ -1383,7 +1383,7 @@ type PromptTokensDetails struct { // Cached tokens present in the prompt. CachedTokens int `json:"cached_tokens,omitzero"` // Tokens written to the cache. 
- CachedWriteTokens int `json:"cached_write_tokens,omitzero"` + CachedCreationTokens int `json:"cache_creation_input_tokens,omitzero"` } // ChatCompletionResponseChunk is described in the OpenAI API documentation: @@ -2539,7 +2539,7 @@ type ResponseUsageInputTokensDetails struct { CachedTokens int64 `json:"cached_tokens"` // The number of tokens that were written to the cache. - CachedWriteTokens int64 `json:"cached_write_tokens"` + CachedCreationTokens int64 `json:"cache_creation_input_tokens"` } // A detailed breakdown of the output tokens. @@ -2553,8 +2553,8 @@ type ResponseTokensDetails struct { // CachedTokens: Number of cached tokens. CachedTokens int `json:"cached_tokens,omitempty"` //nolint:tagliatelle //follow openai api - // CachedWriteTokens: number of tokens that were written to the cache. - CachedWriteTokens int64 `json:"cached_write_tokens"` //nolint:tagliatelle + // CachedCreationTokens: number of tokens that were written to the cache. + CachedCreationTokens int64 `json:"cache_creation_input_tokens"` //nolint:tagliatelle // ReasoningTokens: Number of reasoning tokens (for reasoning models). 
ReasoningTokens int `json:"reasoning_tokens,omitempty"` //nolint:tagliatelle //follow openai api diff --git a/internal/apischema/openai/openai_test.go b/internal/apischema/openai/openai_test.go index d9df99e6a4..c592a3b712 100644 --- a/internal/apischema/openai/openai_test.go +++ b/internal/apischema/openai/openai_test.go @@ -1742,30 +1742,30 @@ func TestPromptTokensDetails(t *testing.T) { { name: "with text tokens", details: PromptTokensDetails{ - TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, - CachedWriteTokens: 10, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CachedCreationTokens: 10, }, expected: `{ "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cached_write_tokens": 10 + "cache_creation_input_tokens": 10 }`, }, { name: "with zero text tokens omitted", details: PromptTokensDetails{ - TextTokens: 0, - AudioTokens: 8, - CachedTokens: 384, - CachedWriteTokens: 10, + TextTokens: 0, + AudioTokens: 8, + CachedTokens: 384, + CachedCreationTokens: 10, }, expected: `{ "audio_tokens": 8, "cached_tokens": 384, - "cached_write_tokens": 10 + "cache_creation_input_tokens": 10 }`, }, } @@ -1822,9 +1822,9 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - AudioTokens: 8, - CachedTokens: 384, - CachedWriteTokens: 13, + AudioTokens: 8, + CachedTokens: 384, + CachedCreationTokens: 13, }, }, expected: `{ @@ -1838,7 +1838,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "prompt_tokens_details": { "audio_tokens": 8, "cached_tokens": 384, - "cached_write_tokens": 13 + "cache_creation_input_tokens": 13 } }`, }, @@ -1856,10 +1856,10 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, - CachedWriteTokens: 21, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CachedCreationTokens: 21, }, }, expected: `{ @@ -1875,7 
+1875,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cached_write_tokens": 21 + "cache_creation_input_tokens": 21 } }`, }, diff --git a/internal/controller/gateway.go b/internal/controller/gateway.go index db952afb21..b2e577114f 100644 --- a/internal/controller/gateway.go +++ b/internal/controller/gateway.go @@ -404,8 +404,8 @@ func (c *GatewayController) reconcileFilterConfigSecret( fc.Type = filterapi.LLMRequestCostTypeInputToken case aigv1a1.LLMRequestCostTypeCachedInputToken: fc.Type = filterapi.LLMRequestCostTypeCachedInputToken - case aigv1a1.LLMRequestCostTypeCachedWriteInputToken: - fc.Type = filterapi.LLMRequestCostTypeCachedWriteInputToken + case aigv1a1.LLMRequestCostTypeCachedCreationInputToken: + fc.Type = filterapi.LLMRequestCostTypeCachedCreationInputToken case aigv1a1.LLMRequestCostTypeOutputToken: fc.Type = filterapi.LLMRequestCostTypeOutputToken case aigv1a1.LLMRequestCostTypeTotalToken: diff --git a/internal/controller/gateway_test.go b/internal/controller/gateway_test.go index 6c3be8df2f..fb26991883 100644 --- a/internal/controller/gateway_test.go +++ b/internal/controller/gateway_test.go @@ -197,7 +197,7 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { {MetadataKey: "bar", Type: aigv1a1.LLMRequestCostTypeOutputToken}, {MetadataKey: "baz", Type: aigv1a1.LLMRequestCostTypeTotalToken}, {MetadataKey: "qux", Type: aigv1a1.LLMRequestCostTypeCachedInputToken}, - {MetadataKey: "zoo", Type: aigv1a1.LLMRequestCostTypeCachedWriteInputToken}, + {MetadataKey: "zoo", Type: aigv1a1.LLMRequestCostTypeCachedCreationInputToken}, }, }, }, @@ -280,7 +280,7 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { require.Equal(t, filterapi.LLMRequestCostTypeOutputToken, fc.LLMRequestCosts[1].Type) require.Equal(t, filterapi.LLMRequestCostTypeTotalToken, fc.LLMRequestCosts[2].Type) require.Equal(t, filterapi.LLMRequestCostTypeCachedInputToken, 
fc.LLMRequestCosts[3].Type) - require.Equal(t, filterapi.LLMRequestCostTypeCachedWriteInputToken, fc.LLMRequestCosts[4].Type) + require.Equal(t, filterapi.LLMRequestCostTypeCachedCreationInputToken, fc.LLMRequestCosts[4].Type) require.Equal(t, filterapi.LLMRequestCostTypeCEL, fc.LLMRequestCosts[5].Type) require.Equal(t, `backend == 'foo.default' ? input_tokens + output_tokens : total_tokens`, fc.LLMRequestCosts[5].CEL) require.Len(t, fc.Models, 1) diff --git a/internal/extproc/mocks_test.go b/internal/extproc/mocks_test.go index a2997f14f9..6d5087eb9d 100644 --- a/internal/extproc/mocks_test.go +++ b/internal/extproc/mocks_test.go @@ -171,17 +171,17 @@ func (m *mockMetricsFactory) NewMetrics() metrics.Metrics { // mockMetrics implements [metrics.Metrics] for testing. type mockMetrics struct { - requestStart time.Time - originalModel string - requestModel string - responseModel string - backend string - requestSuccessCount int - requestErrorCount int - inputTokenCount int - cachedInputTokenCount int - cachedWriteInputTokenCount int - outputTokenCount int + requestStart time.Time + originalModel string + requestModel string + responseModel string + backend string + requestSuccessCount int + requestErrorCount int + inputTokenCount int + cachedInputTokenCount int + cachedCreationInputTokenCount int + outputTokenCount int // streamingOutputTokens tracks the cumulative output tokens recorded via RecordTokenLatency. 
streamingOutputTokens int timeToFirstToken float64 @@ -219,8 +219,8 @@ func (m *mockMetrics) RecordTokenUsage(_ context.Context, usage metrics.TokenUsa if cachedInput, ok := usage.CachedInputTokens(); ok { m.cachedInputTokenCount += int(cachedInput) } - if cachedWriteInput, ok := usage.CachedWriteInputTokens(); ok { - m.cachedWriteInputTokenCount += int(cachedWriteInput) + if cachedCreationInput, ok := usage.CachedCreationInputTokens(); ok { + m.cachedCreationInputTokenCount += int(cachedCreationInput) } if output, ok := usage.OutputTokens(); ok { m.outputTokenCount += int(output) @@ -285,7 +285,7 @@ func (m *mockMetrics) RequireRequestFailure(t *testing.T) { func (m *mockMetrics) RequireTokensRecorded(t *testing.T, expectedInput, expectedCachedInput, expectedWriteCachedInput, expectedOutput int) { require.Equal(t, expectedInput, m.inputTokenCount) require.Equal(t, expectedCachedInput, m.cachedInputTokenCount) - require.Equal(t, expectedWriteCachedInput, m.cachedWriteInputTokenCount) + require.Equal(t, expectedWriteCachedInput, m.cachedCreationInputTokenCount) require.Equal(t, expectedOutput, m.outputTokenCount) } diff --git a/internal/extproc/processor_impl.go b/internal/extproc/processor_impl.go index 93c35aa7cc..800b880393 100644 --- a/internal/extproc/processor_impl.go +++ b/internal/extproc/processor_impl.go @@ -533,7 +533,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU cost, _ = costs.InputTokens() case filterapi.LLMRequestCostTypeCachedInputToken: cost, _ = costs.CachedInputTokens() - case filterapi.LLMRequestCostTypeCachedWriteInputToken: - cost, _ = costs.CachedWriteInputTokens() + case filterapi.LLMRequestCostTypeCachedCreationInputToken: + cost, _ = costs.CachedCreationInputTokens() case filterapi.LLMRequestCostTypeOutputToken: cost, _ = costs.OutputTokens() diff --git a/internal/extproc/processor_impl_test.go b/internal/extproc/processor_impl_test.go index a590a38d24..e97d2fae45 100644 --- a/internal/extproc/processor_impl_test.go +++ 
b/internal/extproc/processor_impl_test.go @@ -259,7 +259,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.retUsedToken.SetOutputTokens(123) mt.retUsedToken.SetInputTokens(1) mt.retUsedToken.SetCachedInputTokens(1) - mt.retUsedToken.SetCachedWriteInputTokens(3) + mt.retUsedToken.SetCachedCreationInputTokens(3) celProgInt, err := llmcostcel.NewProgram("54321") require.NoError(t, err) @@ -275,7 +275,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeOutputToken, MetadataKey: "output_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeInputToken, MetadataKey: "input_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedInputToken, MetadataKey: "cached_input_token_usage"}}, - {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedWriteInputToken, MetadataKey: "cached_write_input_token_usage"}}, + {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedCreationInputToken, MetadataKey: "cache_creation_input_token_usage"}}, { CELProg: celProgInt, LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCEL, MetadataKey: "cel_int"}, @@ -312,7 +312,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, float64(1), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cached_input_token_usage"].GetNumberValue()) require.Equal(t, float64(3), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. - GetStructValue().Fields["cached_write_input_token_usage"].GetNumberValue()) + GetStructValue().Fields["cache_creation_input_token_usage"].GetNumberValue()) require.Equal(t, float64(54321), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. 
GetStructValue().Fields["cel_int"].GetNumberValue()) require.Equal(t, float64(9999), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. diff --git a/internal/filterapi/filterconfig.go b/internal/filterapi/filterconfig.go index f191d6eb01..b2f10de51f 100644 --- a/internal/filterapi/filterconfig.go +++ b/internal/filterapi/filterconfig.go @@ -81,8 +81,8 @@ const ( LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" // LLMRequestCostTypeCachedInputToken specifies that the request cost is calculated from the cached read input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" - // LLMRequestCostTypeCachedWriteInputToken specifies that the request cost is calculated from the cached write input token. - LLMRequestCostTypeCachedWriteInputToken LLMRequestCostType = "CachedWriteInputToken" + // LLMRequestCostTypeCachedCreationInputToken specifies that the request cost is calculated from the cache creation input token. + LLMRequestCostTypeCachedCreationInputToken LLMRequestCostType = "CachedCreationInputToken" // LLMRequestCostTypeTotalToken specifies that the request cost is calculated from the total token. LLMRequestCostTypeTotalToken LLMRequestCostType = "TotalToken" // LLMRequestCostTypeCEL specifies that the request cost is calculated from the CEL expression. 
diff --git a/internal/llmcostcel/cel.go b/internal/llmcostcel/cel.go index 2d2f4ad834..5bc0008d59 100644 --- a/internal/llmcostcel/cel.go +++ b/internal/llmcostcel/cel.go @@ -16,13 +16,13 @@ import ( ) const ( - celModelNameKey = "model" - celBackendKey = "backend" - celInputTokensKey = "input_tokens" - celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 - celCachedWriteInputTokensKey = "cached_write_input_tokens" // #nosec G101 - celOutputTokensKey = "output_tokens" - celTotalTokensKey = "total_tokens" + celModelNameKey = "model" + celBackendKey = "backend" + celInputTokensKey = "input_tokens" + celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 + celCachedCreationInputTokensKey = "cache_creation_input_tokens" // #nosec G101 + celOutputTokensKey = "output_tokens" + celTotalTokensKey = "total_tokens" ) var env *cel.Env @@ -34,7 +34,7 @@ func init() { cel.Variable(celBackendKey, cel.StringType), cel.Variable(celInputTokensKey, cel.UintType), cel.Variable(celCachedInputTokensKey, cel.UintType), - cel.Variable(celCachedWriteInputTokensKey, cel.UintType), + cel.Variable(celCachedCreationInputTokensKey, cel.UintType), cel.Variable(celOutputTokensKey, cel.UintType), cel.Variable(celTotalTokensKey, cel.UintType), ) @@ -64,15 +64,15 @@ func NewProgram(expr string) (prog cel.Program, err error) { } // EvaluateProgram evaluates the given CEL program with the given variables. 
-func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, cachedWriteInputTokens, outputTokens, totalTokens uint32) (uint64, error) { +func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, cachedCreationInputTokens, outputTokens, totalTokens uint32) (uint64, error) { out, _, err := prog.Eval(map[string]any{ - celModelNameKey: modelName, - celBackendKey: backend, - celInputTokensKey: inputTokens, - celCachedInputTokensKey: cachedInputTokens, - celCachedWriteInputTokensKey: cachedWriteInputTokens, - celOutputTokensKey: outputTokens, - celTotalTokensKey: totalTokens, + celModelNameKey: modelName, + celBackendKey: backend, + celInputTokensKey: inputTokens, + celCachedInputTokensKey: cachedInputTokens, + celCachedCreationInputTokensKey: cachedCreationInputTokens, + celOutputTokensKey: outputTokens, + celTotalTokensKey: totalTokens, }) if err != nil || out == nil { return 0, fmt.Errorf("failed to evaluate CEL expression: %w", err) diff --git a/internal/llmcostcel/cel_test.go b/internal/llmcostcel/cel_test.go index 79835b7354..cee9a259a5 100644 --- a/internal/llmcostcel/cel_test.go +++ b/internal/llmcostcel/cel_test.go @@ -26,7 +26,7 @@ func TestNewProgram(t *testing.T) { require.NoError(t, err) }) t.Run("variables", func(t *testing.T) { - prog, err := NewProgram("model == 'cool_model' ? (input_tokens - cached_input_tokens - cached_write_input_tokens) * output_tokens : total_tokens") + prog, err := NewProgram("model == 'cool_model' ? 
(input_tokens - cached_input_tokens - cache_creation_input_tokens) * output_tokens : total_tokens") require.NoError(t, err) v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 200, 100, 1, 2, 3) require.NoError(t, err) diff --git a/internal/metrics/genai.go b/internal/metrics/genai.go index 3c4a3dc62f..cb45ae6051 100644 --- a/internal/metrics/genai.go +++ b/internal/metrics/genai.go @@ -39,9 +39,9 @@ const ( // https://github.com/open-telemetry/semantic-conventions/issues/1959 // // However, the spec says "a custom value MAY be used.", so we can use it now. - genaiTokenTypeCachedInput = "cached_input" - genaiTokenTypeCachedWriteInput = "cached_write_input" - genaiErrorTypeFallback = "_OTHER" + genaiTokenTypeCachedInput = "cached_input" + genaiTokenTypeCachedCreationInput = "cache_creation_input" + genaiErrorTypeFallback = "_OTHER" ) // GenAIOperation represents the type of generative AI operation i.e. the endpoint being called. diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 6c473863bd..38fe032539 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -149,10 +149,10 @@ type TokenUsage struct { totalTokens uint32 // CachedInputTokens is the total number of tokens read from cache. cachedInputTokens uint32 - // CachedWriteInputTokens is the total number of tokens written to cache. - cachedWriteInputTokens uint32 + // CachedCreationInputTokens is the total number of tokens written to cache. + cachedCreationInputTokens uint32 - inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet, cachedWriteInputTokenSet bool + inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet, cachedCreationInputTokenSet bool } // InputTokens returns the number of input tokens and whether it was set. 
@@ -175,9 +175,9 @@ func (u *TokenUsage) CachedInputTokens() (uint32, bool) { return u.cachedInputTokens, u.cachedInputTokenSet } -// CachedWriteInputTokens returns the number of cached write input tokens and whether it was set. -func (u *TokenUsage) CachedWriteInputTokens() (uint32, bool) { - return u.cachedWriteInputTokens, u.cachedWriteInputTokenSet +// CachedCreationInputTokens returns the number of cache creation input tokens and whether it was set. +func (u *TokenUsage) CachedCreationInputTokens() (uint32, bool) { + return u.cachedCreationInputTokens, u.cachedCreationInputTokenSet } // SetInputTokens sets the number of input tokens and marks the field as set. @@ -204,10 +204,10 @@ func (u *TokenUsage) SetCachedInputTokens(tokens uint32) { u.cachedInputTokenSet = true } -// SetCachedWriteInputTokens sets the number of cached write input tokens and marks the field as set. -func (u *TokenUsage) SetCachedWriteInputTokens(tokens uint32) { - u.cachedWriteInputTokens = tokens - u.cachedWriteInputTokenSet = true +// SetCachedCreationInputTokens sets the number of cache creation input tokens and marks the field as set. +func (u *TokenUsage) SetCachedCreationInputTokens(tokens uint32) { + u.cachedCreationInputTokens = tokens + u.cachedCreationInputTokenSet = true } // AddInputTokens increments the recorded input tokens and marks the field as set. @@ -228,10 +228,10 @@ func (u *TokenUsage) AddCachedInputTokens(tokens uint32) { u.cachedInputTokens += tokens } -// AddCachedWriteInputTokens increments the recorded cached write input tokens and marks the field as set. -func (u *TokenUsage) AddCachedWriteInputTokens(tokens uint32) { - u.cachedWriteInputTokenSet = true - u.cachedWriteInputTokens += tokens +// AddCachedCreationInputTokens increments the recorded cache creation input tokens and marks the field as set. 
+func (u *TokenUsage) AddCachedCreationInputTokens(tokens uint32) { + u.cachedCreationInputTokenSet = true + u.cachedCreationInputTokens += tokens } // Override updates the TokenUsage fields with values from another TokenUsage instance. @@ -253,9 +253,9 @@ func (u *TokenUsage) Override(other TokenUsage) { u.cachedInputTokens = other.cachedInputTokens u.cachedInputTokenSet = true } - if other.cachedWriteInputTokenSet { - u.cachedWriteInputTokens = other.cachedWriteInputTokens - u.cachedWriteInputTokenSet = true + if other.cachedCreationInputTokenSet { + u.cachedCreationInputTokens = other.cachedCreationInputTokens + u.cachedCreationInputTokenSet = true } } @@ -265,15 +265,15 @@ func (u *TokenUsage) Override(other TokenUsage) { // // This function works for both streaming and non-streaming responses by accepting // the common usage fields that exist in all Anthropic usage structures. -func ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cacheCreationTokens int64) TokenUsage { +func ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cachedCreationTokens int64) TokenUsage { // Calculate total input tokens as per Anthropic API documentation - totalInputTokens := inputTokens + cacheCreationTokens + cacheReadTokens + totalInputTokens := inputTokens + cachedCreationTokens + cacheReadTokens var usage TokenUsage - usage.SetInputTokens(uint32(totalInputTokens)) //nolint:gosec - usage.SetOutputTokens(uint32(outputTokens)) //nolint:gosec - usage.SetTotalTokens(uint32(totalInputTokens + outputTokens)) //nolint:gosec - usage.SetCachedInputTokens(uint32(cacheReadTokens)) //nolint:gosec - usage.SetCachedWriteInputTokens(uint32(cacheCreationTokens)) //nolint:gosec + usage.SetInputTokens(uint32(totalInputTokens)) //nolint:gosec + usage.SetOutputTokens(uint32(outputTokens)) //nolint:gosec + usage.SetTotalTokens(uint32(totalInputTokens + outputTokens)) //nolint:gosec + usage.SetCachedInputTokens(uint32(cacheReadTokens)) //nolint:gosec 
+	usage.SetCachedCreationInputTokens(uint32(cachedCreationTokens)) //nolint:gosec
 	return usage
 }
diff --git a/internal/metrics/metrics_impl.go b/internal/metrics/metrics_impl.go
index e4a85e4d23..32dbfc38d3 100644
--- a/internal/metrics/metrics_impl.go
+++ b/internal/metrics/metrics_impl.go
@@ -148,7 +148,7 @@ func (b *metricsImpl) RecordTokenUsage(ctx context.Context, usage TokenUsage, re
 			metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput)),
 		)
 	}
-	if cachedWriteInputTokens, ok := usage.CachedWriteInputTokens(); ok {
-		b.metrics.tokenUsage.Record(ctx, float64(cachedWriteInputTokens),
+	if cachedCreationInputTokens, ok := usage.CachedCreationInputTokens(); ok {
+		b.metrics.tokenUsage.Record(ctx, float64(cachedCreationInputTokens),
 			metric.WithAttributeSet(attrs),
-			metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput)),
+			metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput)),
diff --git a/internal/metrics/metrics_impl_test.go b/internal/metrics/metrics_impl_test.go
index 8bf0ff2cce..a0fce8287e 100644
--- a/internal/metrics/metrics_impl_test.go
+++ b/internal/metrics/metrics_impl_test.go
@@ -71,10 +71,10 @@ func TestRecordTokenUsage(t *testing.T) {
 		attribute.Key(genaiAttributeResponseModel).String("test-model"),
 	}
 	// gen_ai.token.type values - https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-metrics/#common-attributes
-	inputAttrs            = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...)
-	outputAttrs           = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...)
-	cachedInputAttrs      = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...)
-	cachedWriteInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput))...)
+	inputAttrs               = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...)
+ outputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...) + cachedInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...) + cachedCreationInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput))...) ) pm.SetOriginalModel("test-model") @@ -82,7 +82,7 @@ func TestRecordTokenUsage(t *testing.T) { pm.SetResponseModel("test-model") pm.SetBackend(&filterapi.Backend{Schema: filterapi.VersionedAPISchema{Name: filterapi.APISchemaOpenAI}}) pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 10, cachedInputTokens: 8, cachedWriteInputTokens: 2, outputTokens: 5, + inputTokens: 10, cachedInputTokens: 8, cachedCreationInputTokens: 2, outputTokens: 5, inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, }, nil) @@ -94,7 +94,7 @@ func TestRecordTokenUsage(t *testing.T) { assert.Equal(t, uint64(1), count) assert.Equal(t, 8.0, sum) - count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedWriteInputAttrs) + count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedCreationInputAttrs) assert.Equal(t, uint64(1), count) assert.Equal(t, 2.0, sum) diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index 1232609a37..b37f37aa1a 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -211,7 +211,7 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer int64(u.InputTokens), int64(u.OutputTokens), int64(u.CacheReadInputTokens), - int64(u.CacheCreationInputTokens), + int64(u.CachedCreationInputTokens), ) input, _ := cost.InputTokens() cacheRead, _ := cost.CachedInputTokens() diff --git a/internal/tracing/openinference/openai/response_attrs.go 
b/internal/tracing/openinference/openai/response_attrs.go index 178c22f0da..5cd3561401 100644 --- a/internal/tracing/openinference/openai/response_attrs.go +++ b/internal/tracing/openinference/openai/response_attrs.go @@ -58,7 +58,7 @@ func buildResponseAttributes(resp *openai.ChatCompletionResponse, config *openin attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptAudio, td.AudioTokens), attribute.Int(openinference.LLMTokenCountPromptCacheHit, td.CachedTokens), - attribute.Int(openinference.LLMTokenCountPromptCacheWrite, td.CachedWriteTokens), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, td.CachedCreationTokens), ) } } @@ -194,8 +194,8 @@ func buildResponsesResponseAttributes(resp *openai.Response, _ *openinference.Tr if resp.Usage.InputTokensDetails.CachedTokens > 0 { attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheHit, int(resp.Usage.InputTokensDetails.CachedTokens))) } - if resp.Usage.InputTokensDetails.CachedWriteTokens > 0 { - attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(resp.Usage.InputTokensDetails.CachedWriteTokens))) + if resp.Usage.InputTokensDetails.CachedCreationTokens > 0 { + attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(resp.Usage.InputTokensDetails.CachedCreationTokens))) } } diff --git a/internal/translator/anthropic_anthropic.go b/internal/translator/anthropic_anthropic.go index bbbde960cc..60f12cf6af 100644 --- a/internal/translator/anthropic_anthropic.go +++ b/internal/translator/anthropic_anthropic.go @@ -103,7 +103,7 @@ func (a *anthropicToAnthropicTranslator) ResponseBody(_ map[string]string, body int64(usage.InputTokens), int64(usage.OutputTokens), int64(usage.CacheReadInputTokens), - int64(usage.CacheCreationInputTokens), + int64(usage.CachedCreationInputTokens), ) if span != nil { span.RecordResponse(anthropicResp) @@ -148,7 +148,7 @@ func (a *anthropicToAnthropicTranslator) 
extractUsageFromBufferEvent(s tracing.M int64(u.InputTokens), int64(u.OutputTokens), int64(u.CacheReadInputTokens), - int64(u.CacheCreationInputTokens), + int64(u.CachedCreationInputTokens), ) // Override with message_start usage (contains input tokens and initial state) a.streamingTokenUsage.Override(messageStartUsage) @@ -181,8 +181,8 @@ func (a *anthropicToAnthropicTranslator) updateTotalTokens() { if _, cachedSet := a.streamingTokenUsage.CachedInputTokens(); !cachedSet { a.streamingTokenUsage.SetCachedInputTokens(0) } - if _, cachedSet := a.streamingTokenUsage.CachedWriteInputTokens(); !cachedSet { - a.streamingTokenUsage.SetCachedWriteInputTokens(0) + if _, cachedSet := a.streamingTokenUsage.CachedCreationInputTokens(); !cachedSet { + a.streamingTokenUsage.SetCachedCreationInputTokens(0) } } diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index 875265d94d..d788a0f2bd 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -570,7 +570,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingEdgeCases(t *te } } -func tokenUsageFrom(in, cachedInput, cachedWriteInput, out, total int32) metrics.TokenUsage { +func tokenUsageFrom(in, cachedInput, cachedCreationInput, out, total int32) metrics.TokenUsage { var usage metrics.TokenUsage if in >= 0 { usage.SetInputTokens(uint32(in)) @@ -578,8 +578,8 @@ func tokenUsageFrom(in, cachedInput, cachedWriteInput, out, total int32) metrics if cachedInput >= 0 { usage.SetCachedInputTokens(uint32(cachedInput)) } - if cachedWriteInput >= 0 { - usage.SetCachedWriteInputTokens(uint32(cachedWriteInput)) + if cachedCreationInput >= 0 { + usage.SetCachedCreationInputTokens(uint32(cachedCreationInput)) } if out >= 0 { usage.SetOutputTokens(uint32(out)) @@ -638,7 +638,7 @@ data: {"type": "message_stop"} outputTokens, outputSet := tokenUsage.OutputTokens() totalTokens, totalSet := 
tokenUsage.TotalTokens() cachedTokens, cachedSet := tokenUsage.CachedInputTokens() - cachedWriteTokens, cachedWriteSet := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, cachedWriteSet := tokenUsage.CachedWriteInputTokens() // Assertions assert.True(t, inputSet, "Input tokens should be set") @@ -653,8 +653,8 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedWriteSet, "Cached write tokens should be set") - assert.Equal(t, uint32(1), cachedWriteTokens, "No cached write tokens in this scenario") + assert.True(t, cachedWriteSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cachedWriteTokens, "No cache creation tokens in this scenario") _, _, tokenUsage, _, err = translator.ResponseBody(nil, strings.NewReader(contentBlockStartChunk), false, nil) require.NoError(t, err) @@ -686,6 +686,6 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedWriteSet, "Cached write tokens should be set") - assert.Equal(t, uint32(1), cachedWriteTokens, "No cached write tokens in this scenario") + assert.True(t, cachedWriteSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cachedWriteTokens, "No cache creation tokens in this scenario") } diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index bc1a8a063c..016a355f04 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -16,88 +16,88 @@ import ( func TestExtractLLMTokenUsage(t *testing.T) { tests := []struct { - name string - inputTokens int64 - outputTokens int64 - cacheReadTokens int64 - cacheCreationTokens int64 - expectedInputTokens uint32 - expectedOutputTokens uint32 - expectedTotalTokens uint32 - 
expectedCachedTokens uint32 - expectedCachedWriteTokens uint32 + name string + inputTokens int64 + outputTokens int64 + cacheReadTokens int64 + cachedCreationTokens int64 + expectedInputTokens uint32 + expectedOutputTokens uint32 + expectedTotalTokens uint32 + expectedCachedTokens uint32 + expectedCachedCreationTokens uint32 }{ { - name: "basic usage without cache", - inputTokens: 100, - outputTokens: 50, - cacheReadTokens: 0, - cacheCreationTokens: 0, - expectedInputTokens: 100, - expectedOutputTokens: 50, - expectedTotalTokens: 150, - expectedCachedTokens: 0, - expectedCachedWriteTokens: 0, + name: "basic usage without cache", + inputTokens: 100, + outputTokens: 50, + cacheReadTokens: 0, + cachedCreationTokens: 0, + expectedInputTokens: 100, + expectedOutputTokens: 50, + expectedTotalTokens: 150, + expectedCachedTokens: 0, + expectedCachedCreationTokens: 0, }, { - name: "usage with cache read tokens", - inputTokens: 80, - outputTokens: 30, - cacheReadTokens: 20, - cacheCreationTokens: 0, - expectedInputTokens: 100, // 80 + 0 + 20 - expectedOutputTokens: 30, - expectedTotalTokens: 130, // 100 + 30 - expectedCachedTokens: 20, // 20 - expectedCachedWriteTokens: 0, + name: "usage with cache read tokens", + inputTokens: 80, + outputTokens: 30, + cacheReadTokens: 20, + cachedCreationTokens: 0, + expectedInputTokens: 100, // 80 + 0 + 20 + expectedOutputTokens: 30, + expectedTotalTokens: 130, // 100 + 30 + expectedCachedTokens: 20, // 20 + expectedCachedCreationTokens: 0, }, { - name: "usage with cache creation tokens", - inputTokens: 60, - outputTokens: 40, - cacheReadTokens: 0, - cacheCreationTokens: 15, - expectedInputTokens: 75, // 60 + 15 + 0 - expectedOutputTokens: 40, - expectedTotalTokens: 115, // 75 + 40 - expectedCachedTokens: 0, // 0 - expectedCachedWriteTokens: 15, // 15 + name: "usage with cache creation tokens", + inputTokens: 60, + outputTokens: 40, + cacheReadTokens: 0, + cachedCreationTokens: 15, + expectedInputTokens: 75, // 60 + 15 + 0 + 
expectedOutputTokens: 40, + expectedTotalTokens: 115, // 75 + 40 + expectedCachedTokens: 0, // 0 + expectedCachedCreationTokens: 15, // 15 }, { - name: "usage with both cache types", - inputTokens: 70, - outputTokens: 25, - cacheReadTokens: 10, - cacheCreationTokens: 5, - expectedInputTokens: 85, // 70 + 5 + 10 - expectedOutputTokens: 25, - expectedTotalTokens: 110, // 85 + 25 - expectedCachedTokens: 10, // 10 - expectedCachedWriteTokens: 5, // 5 + name: "usage with both cache types", + inputTokens: 70, + outputTokens: 25, + cacheReadTokens: 10, + cachedCreationTokens: 5, + expectedInputTokens: 85, // 70 + 5 + 10 + expectedOutputTokens: 25, + expectedTotalTokens: 110, // 85 + 25 + expectedCachedTokens: 10, // 10 + expectedCachedCreationTokens: 5, // 5 }, { - name: "zero values", - inputTokens: 0, - outputTokens: 0, - cacheReadTokens: 0, - cacheCreationTokens: 0, - expectedInputTokens: 0, - expectedOutputTokens: 0, - expectedTotalTokens: 0, - expectedCachedTokens: 0, - expectedCachedWriteTokens: 0, + name: "zero values", + inputTokens: 0, + outputTokens: 0, + cacheReadTokens: 0, + cachedCreationTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 0, + expectedTotalTokens: 0, + expectedCachedTokens: 0, + expectedCachedCreationTokens: 0, }, { - name: "large values", - inputTokens: 100000, - outputTokens: 50000, - cacheReadTokens: 25000, - cacheCreationTokens: 15000, - expectedInputTokens: 140000, // 100000 + 15000 + 25000 - expectedOutputTokens: 50000, - expectedTotalTokens: 190000, // 140000 + 50000 - expectedCachedTokens: 25000, // 25000 - expectedCachedWriteTokens: 15000, + name: "large values", + inputTokens: 100000, + outputTokens: 50000, + cacheReadTokens: 25000, + cachedCreationTokens: 15000, + expectedInputTokens: 140000, // 100000 + 15000 + 25000 + expectedOutputTokens: 50000, + expectedTotalTokens: 190000, // 140000 + 50000 + expectedCachedTokens: 25000, // 25000 + expectedCachedCreationTokens: 15000, }, } @@ -107,15 +107,15 @@ func 
TestExtractLLMTokenUsage(t *testing.T) { tt.inputTokens, tt.outputTokens, tt.cacheReadTokens, - tt.cacheCreationTokens, + tt.cachedCreationTokens, ) expected := tokenUsageFrom( - int32(tt.expectedInputTokens), // nolint:gosec - int32(tt.expectedCachedTokens), // nolint:gosec - int32(tt.expectedCachedWriteTokens), // nolint:gosec - int32(tt.expectedOutputTokens), // nolint:gosec - int32(tt.expectedTotalTokens), // nolint:gosec + int32(tt.expectedInputTokens), // nolint:gosec + int32(tt.expectedCachedTokens), // nolint:gosec + int32(tt.expectedCachedCreationTokens), // nolint:gosec + int32(tt.expectedOutputTokens), // nolint:gosec + int32(tt.expectedTotalTokens), // nolint:gosec ) assert.Equal(t, expected, result) }) @@ -124,55 +124,55 @@ func TestExtractLLMTokenUsage(t *testing.T) { func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.Usage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 - expectedCachedWriteTokens uint32 + name string + usage anthropic.Usage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + expectedCachedTokens uint32 + expectedCachedCreationTokens uint32 }{ { name: "non-streaming response without cache", usage: anthropic.Usage{ - InputTokens: 150, - OutputTokens: 75, - CacheReadInputTokens: 0, - CacheCreationInputTokens: 0, + InputTokens: 150, + OutputTokens: 75, + CacheReadInputTokens: 0, + CachedCreationInputTokens: 0, }, - expectedInputTokens: 150, - expectedOutputTokens: 75, - expectedTotalTokens: 225, - expectedCachedTokens: 0, - expectedCachedWriteTokens: 0, + expectedInputTokens: 150, + expectedOutputTokens: 75, + expectedTotalTokens: 225, + expectedCachedTokens: 0, + expectedCachedCreationTokens: 0, }, { name: "non-streaming response with cache read", usage: anthropic.Usage{ - InputTokens: 100, - OutputTokens: 50, - CacheReadInputTokens: 25, - CacheCreationInputTokens: 0, + 
InputTokens: 100, + OutputTokens: 50, + CacheReadInputTokens: 25, + CachedCreationInputTokens: 0, }, - expectedInputTokens: 125, // 100 + 0 + 25 - expectedOutputTokens: 50, - expectedTotalTokens: 175, // 125 + 50 - expectedCachedTokens: 25, // 25 - expectedCachedWriteTokens: 0, // 0 + expectedInputTokens: 125, // 100 + 0 + 25 + expectedOutputTokens: 50, + expectedTotalTokens: 175, // 125 + 50 + expectedCachedTokens: 25, // 25 + expectedCachedCreationTokens: 0, // 0 }, { name: "non-streaming response with both cache types", usage: anthropic.Usage{ - InputTokens: 90, - OutputTokens: 60, - CacheReadInputTokens: 15, - CacheCreationInputTokens: 10, + InputTokens: 90, + OutputTokens: 60, + CacheReadInputTokens: 15, + CachedCreationInputTokens: 10, }, - expectedInputTokens: 115, // 90 + 10 + 15 - expectedOutputTokens: 60, - expectedTotalTokens: 175, // 115 + 60 - expectedCachedTokens: 25, // 15 - expectedCachedWriteTokens: 10, // 10 + expectedInputTokens: 115, // 90 + 10 + 15 + expectedOutputTokens: 60, + expectedTotalTokens: 175, // 115 + 60 + expectedCachedTokens: 25, // 15 + expectedCachedCreationTokens: 10, // 10 }, } @@ -181,7 +181,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { result := metrics.ExtractTokenUsageFromAnthropic(tt.usage.InputTokens, tt.usage.OutputTokens, tt.usage.CacheReadInputTokens, - tt.usage.CacheCreationInputTokens, + tt.usage.CachedCreationInputTokens, ) expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) @@ -202,10 +202,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta event with final totals", usage: anthropic.MessageDeltaUsage{ - InputTokens: 250, - OutputTokens: 120, - CacheReadInputTokens: 30, - CacheCreationInputTokens: 0, + InputTokens: 250, + OutputTokens: 120, + CacheReadInputTokens: 30, + CachedCreationInputTokens: 0, }, 
expectedInputTokens: 280, // 250 + 0 + 30 expectedOutputTokens: 120, @@ -216,10 +216,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta event with only output tokens", usage: anthropic.MessageDeltaUsage{ - InputTokens: 0, - OutputTokens: 85, - CacheReadInputTokens: 0, - CacheCreationInputTokens: 0, + InputTokens: 0, + OutputTokens: 85, + CacheReadInputTokens: 0, + CachedCreationInputTokens: 0, }, expectedInputTokens: 0, expectedOutputTokens: 85, @@ -230,10 +230,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta with cache creation tokens", usage: anthropic.MessageDeltaUsage{ - InputTokens: 150, - OutputTokens: 75, - CacheReadInputTokens: 10, - CacheCreationInputTokens: 5, + InputTokens: 150, + OutputTokens: 75, + CacheReadInputTokens: 10, + CachedCreationInputTokens: 5, }, expectedInputTokens: 165, // 150 + 5 + 10 expectedOutputTokens: 75, @@ -248,7 +248,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { result := metrics.ExtractTokenUsageFromAnthropic(tt.usage.InputTokens, tt.usage.OutputTokens, tt.usage.CacheReadInputTokens, - tt.usage.CacheCreationInputTokens, + tt.usage.CachedCreationInputTokens, ) expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) @@ -285,14 +285,14 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { // cache_creation_input_tokens, and cache_read_input_tokens". 
 	inputTokens := int64(100)
-	cacheCreationTokens := int64(20)
+	cachedCreationTokens := int64(20)
 	cacheReadTokens := int64(30)
 	outputTokens := int64(50)
-	result := metrics.ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cacheCreationTokens)
+	result := metrics.ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cachedCreationTokens)
 	// Total input should be sum of all input token types.
-	expectedTotalInputInt := inputTokens + cacheCreationTokens + cacheReadTokens
+	expectedTotalInputInt := inputTokens + cachedCreationTokens + cacheReadTokens
 	expectedTotalInput := uint32(expectedTotalInputInt) // #nosec G115 - test values are small and safe
 	inputTokensVal, ok := result.InputTokens()
 	assert.True(t, ok)
@@ -306,7 +306,7 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) {
-	cachedWriteTokens, ok := result.CachedWriteInputTokens()
+	cachedWriteTokens, ok := result.CachedCreationInputTokens()
 	assert.True(t, ok)
-	assert.Equal(t, cacheCreationTokens, cachedWriteTokens,
+	assert.Equal(t, cachedCreationTokens, cachedWriteTokens,
 		"CachedWriteInputTokens should be cache_creation_input_tokens")
 	// Total tokens should be input + output.
diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index 56b4870151..8318fe88aa 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -708,7 +708,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string tokenUsage.SetCachedInputTokens(uint32(*usage.CacheReadInputTokens)) //nolint:gosec } if usage.CacheWriteInputTokens != nil { - tokenUsage.SetCachedWriteInputTokens(uint32(*usage.CacheWriteInputTokens)) + tokenUsage.SetCachedCreationInputTokens(uint32(*usage.CacheWriteInputTokens)) } } oaiEvent, ok := o.convertEvent(event) @@ -760,8 +760,8 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string openAIResp.Usage.PromptTokensDetails.CachedTokens = *bedrockResp.Usage.CacheReadInputTokens } if bedrockResp.Usage.CacheWriteInputTokens != nil { - tokenUsage.SetCachedWriteInputTokens(uint32(*bedrockResp.Usage.CacheWriteInputTokens)) //nolint:gosec - openAIResp.Usage.PromptTokensDetails.CachedWriteTokens = *bedrockResp.Usage.CacheWriteInputTokens + tokenUsage.SetCachedCreationInputTokens(uint32(*bedrockResp.Usage.CacheWriteInputTokens)) //nolint:gosec + openAIResp.Usage.PromptTokensDetails.CachedCreationTokens = *bedrockResp.Usage.CacheWriteInputTokens } } @@ -862,7 +862,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) convertEvent(event *awsbe chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens } if event.Usage.CacheWriteInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedWriteTokens = *event.Usage.CacheWriteInputTokens + chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens } // messageStart event. 
case awsbedrock.ConverseStreamEventTypeMessageStart.String(): diff --git a/internal/translator/openai_awsbedrock_test.go b/internal/translator/openai_awsbedrock_test.go index e780b8a34c..fa4a17345e 100644 --- a/internal/translator/openai_awsbedrock_test.go +++ b/internal/translator/openai_awsbedrock_test.go @@ -1474,8 +1474,8 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) PromptTokens: 10, CompletionTokens: 20, PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: 5, - CachedWriteTokens: 7, + CachedTokens: 5, + CachedCreationTokens: 7, }, }, Choices: []openai.ChatCompletionResponseChoice{ diff --git a/internal/translator/openai_completions.go b/internal/translator/openai_completions.go index 3754dc2bff..7f5008d8dd 100644 --- a/internal/translator/openai_completions.go +++ b/internal/translator/openai_completions.go @@ -171,8 +171,8 @@ func (o *openAIToOpenAITranslatorV1Completion) extractUsageFromBufferEvent(span tokenUsage.SetOutputTokens(uint32(usage.CompletionTokens)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(usage.TotalTokens)) //nolint:gosec if usage.PromptTokensDetails != nil { - tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec - tokenUsage.SetCachedWriteInputTokens(uint32(usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec + tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCachedCreationInputTokens(uint32(usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec } // Do not mark buffering done; keep scanning to return the latest usage in this batch. 
} diff --git a/internal/translator/openai_gcpanthropic.go b/internal/translator/openai_gcpanthropic.go index 7e11451f31..6880a6f3dd 100644 --- a/internal/translator/openai_gcpanthropic.go +++ b/internal/translator/openai_gcpanthropic.go @@ -833,13 +833,13 @@ func (o *openAIToGCPAnthropicTranslatorV1ChatCompletion) ResponseBody(_ map[stri usage.InputTokens, usage.OutputTokens, usage.CacheReadInputTokens, - usage.CacheCreationInputTokens, + usage.CachedCreationInputTokens, ) inputTokens, _ := tokenUsage.InputTokens() outputTokens, _ := tokenUsage.OutputTokens() totalTokens, _ := tokenUsage.TotalTokens() cachedTokens, _ := tokenUsage.CachedInputTokens() - cacheWriteTokens, _ := tokenUsage.CachedWriteInputTokens() + cacheWriteTokens, _ := tokenUsage.CachedCreationInputTokens() openAIResp.Usage = openai.Usage{ CompletionTokens: int(outputTokens), PromptTokens: int(inputTokens), diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 191b436545..6d32041909 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -110,7 +110,7 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t p.tokenUsage.SetTotalTokens(inputTokens + outputTokens) totalTokens, _ := p.tokenUsage.TotalTokens() cachedTokens, _ := p.tokenUsage.CachedInputTokens() - cachedWriteTokens, _ := p.tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, _ := p.tokenUsage.CachedCreationInputTokens() finalChunk := openai.ChatCompletionResponseChunk{ ID: p.activeMessageID, Created: p.created, @@ -121,8 +121,8 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t CompletionTokens: int(outputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), - CachedWriteTokens: int(cachedWriteTokens), + CachedTokens: int(cachedTokens), + CachedCreationTokens: 
int(cachedCreationTokens),
 					},
 				},
 				Model: p.requestModel,
@@ -203,7 +203,7 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat
 			u.InputTokens,
 			u.OutputTokens,
 			u.CacheReadInputTokens,
-			u.CacheCreationInputTokens,
+			u.CachedCreationInputTokens,
 		)
 		// For message_start, we store the initial usage but don't add to the accumulated
 		// The message_delta event will contain the final totals
@@ -213,7 +213,7 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat
 		if cached, ok := usage.CachedInputTokens(); ok {
 			p.tokenUsage.SetCachedInputTokens(cached)
 		}
-		if cachedWrite, ok := usage.CachedWriteInputTokens(); ok {
-			p.tokenUsage.SetCachedWriteInputTokens(cachedWrite)
+		if cachedCreation, ok := usage.CachedCreationInputTokens(); ok {
+			p.tokenUsage.SetCachedCreationInputTokens(cachedCreation)
 		}
@@ -285,7 +285,7 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat
 			u.InputTokens,
 			u.OutputTokens,
 			u.CacheReadInputTokens,
-			u.CacheCreationInputTokens,
+			u.CachedCreationInputTokens,
 		)
 		// For message_delta, accumulate the incremental output tokens
 		if output, ok := usage.OutputTokens(); ok {
diff --git a/internal/translator/openai_gcpvertexai.go b/internal/translator/openai_gcpvertexai.go
index 232ab836cb..74e99c5741 100644
--- a/internal/translator/openai_gcpvertexai.go
+++ b/internal/translator/openai_gcpvertexai.go
@@ -170,8 +170,8 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) ResponseBody(_ map[strin
 		tokenUsage.SetOutputTokens(uint32(gcpResp.UsageMetadata.CandidatesTokenCount))         //nolint:gosec
 		tokenUsage.SetTotalTokens(uint32(gcpResp.UsageMetadata.TotalTokenCount))               //nolint:gosec
 		tokenUsage.SetCachedInputTokens(uint32(gcpResp.UsageMetadata.CachedContentTokenCount)) //nolint:gosec
-		// Gemini does not return cached write input tokens, set to 0.
-		tokenUsage.SetCachedWriteInputTokens(0)
+		// Gemini does not return cache creation input tokens, set to 0.
+		tokenUsage.SetCachedCreationInputTokens(0)
 	}
 	if span != nil {
diff --git a/internal/translator/openai_openai.go b/internal/translator/openai_openai.go
index 9ad3084b3b..65b33a34cd 100644
--- a/internal/translator/openai_openai.go
+++ b/internal/translator/openai_openai.go
@@ -141,8 +141,8 @@ func (o *openAIToOpenAITranslatorV1ChatCompletion) ResponseBody(_ map[string]str
 	tokenUsage.SetOutputTokens(uint32(resp.Usage.CompletionTokens)) //nolint:gosec
 	tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens))       //nolint:gosec
 	if resp.Usage.PromptTokensDetails != nil {
-		tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens))           //nolint:gosec
-		tokenUsage.SetCachedWriteInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec
+		tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens))                 //nolint:gosec
+		tokenUsage.SetCachedCreationInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec
 	}
 	// Fallback to request model for test or non-compliant OpenAI backends
 	responseModel = cmp.Or(resp.Model, o.requestModel)
diff --git a/internal/translator/openai_responses.go b/internal/translator/openai_responses.go
index 9550d0ce4f..7ed57bcb11 100644
--- a/internal/translator/openai_responses.go
+++ b/internal/translator/openai_responses.go
@@ -179,7 +179,7 @@ func (o *openAIToOpenAITranslatorV1Responses) extractUsageFromBufferEvent(span t
 		tokenUsage.SetOutputTokens(uint32(respComplEvent.Response.Usage.OutputTokens))                        // #nosec G115
 		tokenUsage.SetTotalTokens(uint32(respComplEvent.Response.Usage.TotalTokens))                          // #nosec G115
 		tokenUsage.SetCachedInputTokens(uint32(respComplEvent.Response.Usage.InputTokensDetails.CachedTokens)) // #nosec G115
-		// Openai does not support cached write response.
-		tokenUsage.SetCachedWriteInputTokens(uint32(0)) // #nosec G115
+		// Openai does not support cache creation response.
+		tokenUsage.SetCachedCreationInputTokens(uint32(0)) // #nosec G115
 	}
 	// Record streaming chunk to span if tracing is enabled.
diff --git a/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml b/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml index 4407e6ae18..38b2851a85 100644 --- a/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml +++ b/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml @@ -32,7 +32,7 @@ spec: - metadataKey: llm_input_cached_token type: CachedInputToken - metadataKey: llm_write_input_cached_token - type: CachedWriteInputToken + type: CachedCreationInputToken - metadataKey: llm_output_token type: OutputToken - metadataKey: llm_total_token From 72408dddbea5b7e6e00350263944a6e4515ef670 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:11:29 -0500 Subject: [PATCH 07/20] missed a few Signed-off-by: Aaron Choo --- internal/extproc/processor_impl.go | 6 +-- internal/extproc/processor_impl_test.go | 6 +-- internal/metrics/metrics_impl.go | 6 +-- internal/metrics/metrics_impl_test.go | 8 +-- .../openinference/anthropic/messages.go | 2 +- .../openinference/openai/responses_test.go | 6 +-- .../tracing/openinference/openinference.go | 4 +- .../translator/anthropic_gcpanthropic_test.go | 14 ++--- internal/translator/anthropic_usage_test.go | 54 +++++++++---------- internal/translator/openai_awsbedrock_test.go | 2 +- internal/translator/openai_gcpanthropic.go | 4 +- .../translator/openai_gcpanthropic_stream.go | 8 +-- internal/translator/openai_responses.go | 12 ++--- internal/translator/openai_responses_test.go | 24 ++++----- 14 files changed, 78 insertions(+), 78 deletions(-) diff --git a/internal/extproc/processor_impl.go b/internal/extproc/processor_impl.go index 800b880393..729268644c 100644 --- a/internal/extproc/processor_impl.go +++ b/internal/extproc/processor_impl.go @@ -534,7 +534,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU case filterapi.LLMRequestCostTypeCachedInputToken: cost, _ = costs.CachedInputTokens() case filterapi.LLMRequestCostTypeCachedCreationInputToken: - cost, _ = costs.CachedWriteInputTokens() 
+ cost, _ = costs.CachedCreationInputTokens() case filterapi.LLMRequestCostTypeOutputToken: cost, _ = costs.OutputTokens() case filterapi.LLMRequestCostTypeTotalToken: @@ -542,7 +542,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU case filterapi.LLMRequestCostTypeCEL: in, _ := costs.InputTokens() cachedIn, _ := costs.CachedInputTokens() - cachedWrite, _ := costs.CachedWriteInputTokens() + cachedCreation, _ := costs.CachedCreationInputTokens() out, _ := costs.OutputTokens() total, _ := costs.TotalTokens() costU64, err := llmcostcel.EvaluateProgram( @@ -551,7 +551,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU backendName, in, cachedIn, - cachedWrite, + cachedCreation, out, total, ) diff --git a/internal/extproc/processor_impl_test.go b/internal/extproc/processor_impl_test.go index e97d2fae45..7b498f12af 100644 --- a/internal/extproc/processor_impl_test.go +++ b/internal/extproc/processor_impl_test.go @@ -275,7 +275,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeOutputToken, MetadataKey: "output_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeInputToken, MetadataKey: "input_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedInputToken, MetadataKey: "cached_input_token_usage"}}, - {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedWriteInputToken, MetadataKey: "cache_creation_input_token_usage"}}, + {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedCreationInputToken, MetadataKey: "cache_creation_input_token_usage"}}, { CELProg: celProgInt, LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCEL, MetadataKey: "cel_int"}, @@ -375,7 +375,7 @@ func 
Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.expResponseBody = final mt.retUsedToken.SetInputTokens(5) mt.retUsedToken.SetCachedInputTokens(3) - mt.retUsedToken.SetCachedWriteInputTokens(21) + mt.retUsedToken.SetCachedCreationInputTokens(21) mt.retUsedToken.SetOutputTokens(138) mt.retUsedToken.SetTotalTokens(143) _, err = p.ProcessResponseBody(t.Context(), final) @@ -385,7 +385,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, 138, mm.outputTokenCount) require.Equal(t, 138, mm.streamingOutputTokens) // accumulated output tokens from stream require.Equal(t, 3, mm.cachedInputTokenCount) - require.Equal(t, 21, mm.cachedWriteInputTokenCount) + require.Equal(t, 21, mm.cachedCreationInputTokenCount) }) } diff --git a/internal/metrics/metrics_impl.go b/internal/metrics/metrics_impl.go index 32dbfc38d3..8f13a50104 100644 --- a/internal/metrics/metrics_impl.go +++ b/internal/metrics/metrics_impl.go @@ -148,10 +148,10 @@ func (b *metricsImpl) RecordTokenUsage(ctx context.Context, usage TokenUsage, re metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput)), ) } - if cachedCreationInputTokens, ok := usage.CachedWriteInputTokens(); ok { - b.metrics.tokenUsage.Record(ctx, float64(cachedWriteInputTokens), + if cachedCreationInputTokens, ok := usage.CachedCreationInputTokens(); ok { + b.metrics.tokenUsage.Record(ctx, float64(cachedCreationInputTokens), metric.WithAttributeSet(attrs), - metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput)), + metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput)), ) } if outputTokens, ok := usage.OutputTokens(); ok { diff --git a/internal/metrics/metrics_impl_test.go b/internal/metrics/metrics_impl_test.go index a0fce8287e..581815e1bc 100644 --- a/internal/metrics/metrics_impl_test.go +++ b/internal/metrics/metrics_impl_test.go 
@@ -300,7 +300,7 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { pm.SetRequestModel("req-model") pm.SetResponseModel("res-model") pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 2, cachedInputTokens: 1, cachedWriteInputTokens: 6, outputTokens: 3, + inputTokens: 2, cachedInputTokens: 1, cachedCreationInputTokens: 6, outputTokens: 3, inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, }, nil) @@ -328,15 +328,15 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { assert.Equal(t, uint64(1), count) assert.Equal(t, 1.0, sum) - cachedWriteInputAttrs := attribute.NewSet( + cachedCreationInputAttrs := attribute.NewSet( attribute.Key(genaiAttributeOperationName).String(string(GenAIOperationCompletion)), attribute.Key(genaiAttributeProviderName).String(genaiProviderOpenAI), attribute.Key(genaiAttributeOriginalModel).String("orig-model"), attribute.Key(genaiAttributeRequestModel).String("req-model"), attribute.Key(genaiAttributeResponseModel).String("res-model"), - attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput), + attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput), ) - count, sum = getHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedWriteInputAttrs) + count, sum = getHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedCreationInputAttrs) assert.Equal(t, uint64(1), count) assert.Equal(t, 6.0, sum) diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index b37f37aa1a..23a52a618c 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -215,7 +215,7 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer ) input, _ := cost.InputTokens() cacheRead, _ := cost.CachedInputTokens() - cacheCreation, _ := cost.CachedWriteInputTokens() + cacheCreation, _ := 
cost.CachedCreationInputTokens()
 	output, _ := cost.OutputTokens()
 	total, _ := cost.TotalTokens()
diff --git a/internal/tracing/openinference/openai/responses_test.go b/internal/tracing/openinference/openai/responses_test.go
index 76fbdc9eb5..3191c6e82d 100644
--- a/internal/tracing/openinference/openai/responses_test.go
+++ b/internal/tracing/openinference/openai/responses_test.go
@@ -77,8 +77,8 @@ var (
 		Usage: &openai.ResponseUsage{
 			InputTokens: 100,
 			InputTokensDetails: openai.ResponseUsageInputTokensDetails{
-				CachedTokens:      10,
-				CachedWriteTokens: 50,
+				CachedTokens:         10,
+				CachedCreationTokens: 50,
 			},
 			OutputTokens: 25,
 			TotalTokens:  125,
@@ -202,7 +202,7 @@ func TestResponsesRecorder_RecordResponse(t *testing.T) {
 			expectedStatus: trace.Status{Code: codes.Ok, Description: ""},
 		},
 		{
-			name:   "response with cache write",
+			name:   "response with cache creation",
 			resp:   responseWithCacheWrite,
 			config: &openinference.TraceConfig{},
 			expectedAttrs: []attribute.KeyValue{
diff --git a/internal/tracing/openinference/openinference.go b/internal/tracing/openinference/openinference.go
index aa50bca94c..f0f6d65ece 100644
--- a/internal/tracing/openinference/openinference.go
+++ b/internal/tracing/openinference/openinference.go
@@ -161,9 +161,9 @@ const (
 	LLMTokenCountPromptCacheHit = "llm.token_count.prompt_details.cache_read" // #nosec G101

 	// LLMTokenCountPromptCacheWrite represents the number of prompt tokens
-	// written to cache (cache writes). This enables tracking of cache efficiency
+	// written to the cache (cache creation). This enables tracking of cache efficiency
 	// and cost savings from cached prompts.
-	LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cache_write" // #nosec G101
+	LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cache_write" // #nosec G101

 	// LLMTokenCountPromptAudio represents the number of audio tokens in the prompt.
 	// Used for multimodal models that support audio input.
diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index d788a0f2bd..d2c87e4935 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -611,7 +611,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingFullScenario(t // 3. message_delta at the end provides output_tokens=5 but no input_tokens // 4. message_stop ends the stream messageStartChunk := `event: message_start -data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cache_write_input_tokens": 1, "output_tokens": 0}}} +data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cache_creation_input_tokens": 1, "output_tokens": 0}}} ` contentBlockStartChunk := `event: content_block_start data: {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}} @@ -638,7 +638,7 @@ data: {"type": "message_stop"} outputTokens, outputSet := tokenUsage.OutputTokens() totalTokens, totalSet := tokenUsage.TotalTokens() cachedTokens, cachedSet := tokenUsage.CachedInputTokens() - cachedCreationTokens, cachedWriteSet := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, cachedCreationSet := tokenUsage.CachedCreationInputTokens() // Assertions assert.True(t, inputSet, "Input tokens should be set") @@ -653,8 +653,8 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedWriteSet, "cache creation tokens should be set") - assert.Equal(t, uint32(1), cachedWriteTokens, "No cache creation tokens in this 
scenario") + assert.True(t, cachedCreationSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cachedCreationTokens, "No cache creation tokens in this scenario") _, _, tokenUsage, _, err = translator.ResponseBody(nil, strings.NewReader(contentBlockStartChunk), false, nil) require.NoError(t, err) @@ -672,7 +672,7 @@ data: {"type": "message_stop"} outputTokens, outputSet = tokenUsage.OutputTokens() totalTokens, totalSet = tokenUsage.TotalTokens() cachedTokens, cachedSet = tokenUsage.CachedInputTokens() - cachedWriteTokens, cachedWriteSet = tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, cachedCreationSet = tokenUsage.CachedCreationInputTokens() assert.True(t, inputSet, "Input tokens should be set") assert.Equal(t, uint32(20), inputTokens, "Input tokens should be preserved from message_start") @@ -686,6 +686,6 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedWriteSet, "cache creation tokens should be set") - assert.Equal(t, uint32(1), cachedWriteTokens, "No cache creation tokens in this scenario") + assert.True(t, cachedCreationSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cachedCreationTokens, "No cache creation tokens in this scenario") } diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 016a355f04..42e2b0e83f 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -183,7 +183,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CachedCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), 
int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) }) } @@ -191,13 +191,13 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.MessageDeltaUsage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 - expectedCachedWriteTokens uint32 + name string + usage anthropic.MessageDeltaUsage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + expectedCachedTokens uint32 + expectedCachedCreationTokens uint32 }{ { name: "message_delta event with final totals", @@ -207,11 +207,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 30, CachedCreationInputTokens: 0, }, - expectedInputTokens: 280, // 250 + 0 + 30 - expectedOutputTokens: 120, - expectedTotalTokens: 400, // 280 + 120 - expectedCachedTokens: 30, // 30 - expectedCachedWriteTokens: 0, + expectedInputTokens: 280, // 250 + 0 + 30 + expectedOutputTokens: 120, + expectedTotalTokens: 400, // 280 + 120 + expectedCachedTokens: 30, // 30 + expectedCachedCreationTokens: 0, }, { name: "message_delta event with only output tokens", @@ -221,11 +221,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 0, CachedCreationInputTokens: 0, }, - expectedInputTokens: 0, - expectedOutputTokens: 85, - expectedTotalTokens: 85, - expectedCachedTokens: 0, - expectedCachedWriteTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 85, + expectedTotalTokens: 85, + expectedCachedTokens: 0, + expectedCachedCreationTokens: 0, }, { name: "message_delta with cache creation tokens", @@ -235,11 +235,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 10, CachedCreationInputTokens: 5, }, - expectedInputTokens: 165, // 150 + 5 + 10 - expectedOutputTokens: 75, - 
expectedTotalTokens: 240, // 165 + 75
-			expectedCachedTokens:      10, // 10
-			expectedCachedWriteTokens: 5,  // 5
+			expectedInputTokens:          165, // 150 + 5 + 10
+			expectedOutputTokens:         75,
+			expectedTotalTokens:          240, // 165 + 75
+			expectedCachedTokens:         10,  // 10
+			expectedCachedCreationTokens: 5,   // 5
 		},
 	}
@@ -250,7 +250,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) {
 				tt.usage.CacheReadInputTokens,
 				tt.usage.CachedCreationInputTokens,
 			)
-			expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens)
+			expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens)
 			assert.Equal(t, expected, result)
 		})
 	}
@@ -304,10 +304,10 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) {
 	assert.Equal(t, cacheReadTokens, cachedTokens,
 		"CachedInputTokens should be cache_read_input_tokens")

-	cachedWriteTokens, ok := result.CachedWriteInputTokens()
+	gotCachedCreationTokens, ok := result.CachedCreationInputTokens()
 	assert.True(t, ok)
-	assert.Equal(t, cachedCreationTokens, cachedWriteTokens,
-		"CachedWriteInputTokens should be cache_creation_input_tokens")
+	assert.Equal(t, cachedCreationTokens, gotCachedCreationTokens,
+		"CachedCreationInputTokens should be cache_creation_input_tokens")

 	// Total tokens should be input + output.
expectedTotal := expectedTotalInput + uint32(outputTokens) diff --git a/internal/translator/openai_awsbedrock_test.go b/internal/translator/openai_awsbedrock_test.go index fa4a17345e..c533ec6dbf 100644 --- a/internal/translator/openai_awsbedrock_test.go +++ b/internal/translator/openai_awsbedrock_test.go @@ -1725,7 +1725,7 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) expectedUsage.SetCachedInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec } if tt.input.Usage.CacheWriteInputTokens != nil { - expectedUsage.SetCachedWriteInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec + expectedUsage.SetCachedCreationInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec } } else { expectedUsage = tokenUsageFrom(-1, -1, -1, -1, -1) diff --git a/internal/translator/openai_gcpanthropic.go b/internal/translator/openai_gcpanthropic.go index 6880a6f3dd..dce744da00 100644 --- a/internal/translator/openai_gcpanthropic.go +++ b/internal/translator/openai_gcpanthropic.go @@ -845,8 +845,8 @@ func (o *openAIToGCPAnthropicTranslatorV1ChatCompletion) ResponseBody(_ map[stri PromptTokens: int(inputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), - CachedWriteTokens: int(cacheWriteTokens), + CachedTokens: int(cachedTokens), + CachedCreationTokens: int(cacheWriteTokens), }, } diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 6d32041909..9e5cc95796 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -213,8 +213,8 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat if cached, ok := usage.CachedInputTokens(); ok { p.tokenUsage.SetCachedInputTokens(cached) } - if cachedCreation, ok := 
usage.CachedWriteInputTokens(); ok { - p.tokenUsage.SetCachedWriteInputTokens(cachedWrite) + if cachedCreation, ok := usage.CachedCreationInputTokens(); ok { + p.tokenUsage.SetCachedCreationInputTokens(cachedCreation) } // reset the toolIndex for each message @@ -298,10 +298,10 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat p.tokenUsage.AddCachedInputTokens(cached) } // Update input tokens to include write cache tokens from delta - if cached, ok := usage.CachedWriteInputTokens(); ok { + if cached, ok := usage.CachedCreationInputTokens(); ok { p.tokenUsage.AddInputTokens(cached) // Accumulate any additional cache tokens from delta - p.tokenUsage.AddCachedWriteInputTokens(cached) + p.tokenUsage.AddCachedCreationInputTokens(cached) } if event.Delta.StopReason != "" { p.stopReason = event.Delta.StopReason diff --git a/internal/translator/openai_responses.go b/internal/translator/openai_responses.go index 7ed57bcb11..30934c5c63 100644 --- a/internal/translator/openai_responses.go +++ b/internal/translator/openai_responses.go @@ -128,11 +128,11 @@ func (o *openAIToOpenAITranslatorV1Responses) handleNonStreamingResponse(body io // TODO: Add reasoning token usage if resp.Usage != nil { - tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 - tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 - tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 - tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 - tokenUsage.SetCachedWriteInputTokens(uint32(resp.Usage.InputTokensDetails.CachedWriteTokens)) // #nosec G115 + tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 + tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 + tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 + tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec 
G115 + tokenUsage.SetCachedCreationInputTokens(uint32(resp.Usage.InputTokensDetails.CachedCreationTokens)) // #nosec G115 } // Record non-streaming response to span if tracing is enabled. @@ -180,7 +180,7 @@ func (o *openAIToOpenAITranslatorV1Responses) extractUsageFromBufferEvent(span t tokenUsage.SetTotalTokens(uint32(respComplEvent.Response.Usage.TotalTokens)) // #nosec G115 tokenUsage.SetCachedInputTokens(uint32(respComplEvent.Response.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 // Openai does not support cache creation response. - tokenUsage.SetCachedWriteInputTokens(uint32(0)) // #nosec G115 + tokenUsage.SetCachedCreationInputTokens(uint32(0)) // #nosec G115 } // Record streaming chunk to span if tracing is enabled. if span != nil { diff --git a/internal/translator/openai_responses_test.go b/internal/translator/openai_responses_test.go index 80c574d51e..363933d525 100644 --- a/internal/translator/openai_responses_test.go +++ b/internal/translator/openai_responses_test.go @@ -247,9 +247,9 @@ func TestResponsesOpenAIToOpenAITranslator_ResponseBody(t *testing.T) { require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedWriteTokens) + require.Equal(t, uint32(0), cachedCreationTokens) }) t.Run("non-streaming response with fallback model", func(t *testing.T) { @@ -363,9 +363,9 @@ data: [DONE] require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedWriteTokens) + require.Equal(t, uint32(0), cachedCreationTokens) }) t.Run("streaming response with fallback model", func(t *testing.T) { @@ -462,9 +462,9 @@ data: [DONE] cachedTokens, _ := tokenUsage.CachedInputTokens() 
require.Equal(t, uint32(2), cachedTokens) - cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedWriteTokens) + require.Equal(t, uint32(0), cachedCreationTokens) }) t.Run("streaming read error", func(t *testing.T) { @@ -554,9 +554,9 @@ func TestResponses_HandleNonStreamingResponse(t *testing.T) { cachedTokens, _ := tokenUsage.CachedInputTokens() require.Equal(t, uint32(2), cachedTokens) - cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedWriteTokens) + require.Equal(t, uint32(0), cachedCreationTokens) }) t.Run("invalid JSON", func(t *testing.T) { @@ -619,9 +619,9 @@ data: [DONE] require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedWriteTokens) + require.Equal(t, uint32(0), cachedCreationTokens) }) t.Run("model extraction", func(t *testing.T) { @@ -686,11 +686,11 @@ data: [DONE] _, outputSet := tokenUsage.OutputTokens() _, totalSet := tokenUsage.TotalTokens() _, cachedSet := tokenUsage.CachedInputTokens() - _, cachedWriteSet := tokenUsage.CachedWriteInputTokens() + _, cachedCreationSet := tokenUsage.CachedCreationInputTokens() require.False(t, totalSet) require.False(t, cachedSet) - require.False(t, cachedWriteSet) + require.False(t, cachedCreationSet) require.False(t, inputSet) require.False(t, outputSet) }) From ea12f8ea4ce3e520154aeaba77291cc5e7b2adb0 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:16:17 -0500 Subject: [PATCH 08/20] fix typo Signed-off-by: Aaron Choo --- api/v1alpha1/ai_gateway_route.go | 2 +- api/v1alpha1/shared_types.go | 4 ++-- 
examples/token_ratelimit/token_ratelimit.yaml | 2 +- internal/apischema/anthropic/anthropic.go | 2 +- internal/apischema/openai/openai.go | 6 +++--- internal/apischema/openai/openai_test.go | 8 ++++---- internal/extproc/processor_impl_test.go | 4 ++-- internal/filterapi/runtime_test.go | 2 +- internal/llmcostcel/cel.go | 4 ++-- internal/llmcostcel/cel_test.go | 2 +- internal/metrics/genai.go | 2 +- internal/metrics/metrics.go | 2 +- internal/metrics/metrics_impl_test.go | 4 ++-- internal/tracing/openinference/openinference.go | 2 +- internal/translator/anthropic_anthropic_test.go | 4 ++-- internal/translator/anthropic_gcpanthropic_test.go | 2 +- internal/translator/anthropic_usage_test.go | 6 +++--- tests/data-plane/testupstream_test.go | 6 +++--- 18 files changed, 32 insertions(+), 32 deletions(-) diff --git a/api/v1alpha1/ai_gateway_route.go b/api/v1alpha1/ai_gateway_route.go index 6404f8e8a6..f99a66bfc4 100644 --- a/api/v1alpha1/ai_gateway_route.go +++ b/api/v1alpha1/ai_gateway_route.go @@ -108,7 +108,7 @@ type AIGatewayRouteSpec struct { // type: TotalToken // - metadataKey: llm_cached_input_token // type: CachedInputToken - // - metadataKey: llm_cache_creation_input_token + // - metadataKey: llm_cached_creation_input_token // type: CachedCreationInputToken // ``` // Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three diff --git a/api/v1alpha1/shared_types.go b/api/v1alpha1/shared_types.go index 2e391498dd..ed4e2c4dc3 100644 --- a/api/v1alpha1/shared_types.go +++ b/api/v1alpha1/shared_types.go @@ -114,7 +114,7 @@ type LLMRequestCost struct { // * backend: the backend name in the form of "name.namespace". Type: string. // * input_tokens: the number of input tokens. Type: unsigned integer. // * cached_input_tokens: the number of cached read input tokens. Type: unsigned integer. - // * cache_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer. 
+ // * cached_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer. // * output_tokens: the number of output tokens. Type: unsigned integer. // * total_tokens: the total number of tokens. Type: unsigned integer. // @@ -122,7 +122,7 @@ type LLMRequestCost struct { // // * "model == 'llama' ? input_tokens + output_token * 0.5 : total_tokens" // * "backend == 'foo.default' ? input_tokens + output_tokens : total_tokens" - // * "backend == 'bar.default' ? (input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cache_creation_input_tokens * 1.25 + output_tokens : total_tokens" + // * "backend == 'bar.default' ? (input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cached_creation_input_tokens * 1.25 + output_tokens : total_tokens" // * "input_tokens + output_tokens + total_tokens" // * "input_tokens * output_tokens" // diff --git a/examples/token_ratelimit/token_ratelimit.yaml b/examples/token_ratelimit/token_ratelimit.yaml index 4ebe6217c7..42b35a9a18 100644 --- a/examples/token_ratelimit/token_ratelimit.yaml +++ b/examples/token_ratelimit/token_ratelimit.yaml @@ -51,7 +51,7 @@ spec: type: InputToken - metadataKey: llm_cached_input_token type: CachedInputToken - - metadataKey: llm_cache_creation_input_token + - metadataKey: llm_cached_creation_input_token type: CachedCreationInputToken - metadataKey: llm_output_token type: OutputToken diff --git a/internal/apischema/anthropic/anthropic.go b/internal/apischema/anthropic/anthropic.go index f02e706d92..55dbaceee5 100644 --- a/internal/apischema/anthropic/anthropic.go +++ b/internal/apischema/anthropic/anthropic.go @@ -437,7 +437,7 @@ const ( // so we use float64 to be able to unmarshal both 1234 and 1234.0 without errors. type Usage struct { // The number of input tokens used to create the cache entry. 
-	CachedCreationInputTokens float64 `json:"cache_creation_input_tokens"`
+	CachedCreationInputTokens float64 `json:"cache_creation_input_tokens"`
 	// The number of input tokens read from the cache.
 	CacheReadInputTokens float64 `json:"cache_read_input_tokens"`
 	// The number of input tokens which were used.
diff --git a/internal/apischema/openai/openai.go b/internal/apischema/openai/openai.go
index 1cb6268929..02e8b9eae8 100644
--- a/internal/apischema/openai/openai.go
+++ b/internal/apischema/openai/openai.go
@@ -1383,7 +1383,7 @@ type PromptTokensDetails struct {
 	// Cached tokens present in the prompt.
 	CachedTokens int `json:"cached_tokens,omitzero"`
 	// Tokens written to the cache.
-	CachedCreationTokens int `json:"cache_creation_input_tokens,omitzero"`
+	CachedCreationTokens int `json:"cached_creation_input_tokens,omitzero"`

 // ChatCompletionResponseChunk is described in the OpenAI API documentation:
@@ -2539,7 +2539,7 @@ type ResponseUsageInputTokensDetails struct {
 	CachedTokens int64 `json:"cached_tokens"`

 	// The number of tokens that were written to the cache.
-	CachedCreationTokens int64 `json:"cache_creation_input_tokens"`
+	CachedCreationTokens int64 `json:"cached_creation_input_tokens"`

 // A detailed breakdown of the output tokens.
@@ -2554,7 +2554,7 @@ type ResponseTokensDetails struct {
 	CachedTokens int `json:"cached_tokens,omitempty"` //nolint:tagliatelle //follow openai api

 	// CachedCreationTokens: number of tokens that were written to the cache.
-	CachedCreationTokens int64 `json:"cache_creation_input_tokens"` //nolint:tagliatelle
+	CachedCreationTokens int64 `json:"cached_creation_input_tokens"` //nolint:tagliatelle

 	// ReasoningTokens: Number of reasoning tokens (for reasoning models).
ReasoningTokens int `json:"reasoning_tokens,omitempty"` //nolint:tagliatelle //follow openai api diff --git a/internal/apischema/openai/openai_test.go b/internal/apischema/openai/openai_test.go index c592a3b712..21f1fab649 100644 --- a/internal/apischema/openai/openai_test.go +++ b/internal/apischema/openai/openai_test.go @@ -1751,7 +1751,7 @@ func TestPromptTokensDetails(t *testing.T) { "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cache_creation_input_tokens": 10 + "cached_creation_input_tokens": 10 }`, }, { @@ -1765,7 +1765,7 @@ func TestPromptTokensDetails(t *testing.T) { expected: `{ "audio_tokens": 8, "cached_tokens": 384, - "cache_creation_input_tokens": 10 + "cached_creation_input_tokens": 10 }`, }, } @@ -1838,7 +1838,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "prompt_tokens_details": { "audio_tokens": 8, "cached_tokens": 384, - "cache_creation_input_tokens": 13 + "cached_creation_input_tokens": 13 } }`, }, @@ -1875,7 +1875,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cache_creation_input_tokens": 21 + "cached_creation_input_tokens": 21 } }`, }, diff --git a/internal/extproc/processor_impl_test.go b/internal/extproc/processor_impl_test.go index 7b498f12af..60f612fa17 100644 --- a/internal/extproc/processor_impl_test.go +++ b/internal/extproc/processor_impl_test.go @@ -275,7 +275,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeOutputToken, MetadataKey: "output_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeInputToken, MetadataKey: "input_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedInputToken, MetadataKey: "cached_input_token_usage"}}, - {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedCreationInputToken, 
MetadataKey: "cache_creation_input_token_usage"}}, + {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedCreationInputToken, MetadataKey: "cached_creation_input_token_usage"}}, { CELProg: celProgInt, LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCEL, MetadataKey: "cel_int"}, @@ -312,7 +312,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, float64(1), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cached_input_token_usage"].GetNumberValue()) require.Equal(t, float64(3), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. - GetStructValue().Fields["cache_creation_input_token_usage"].GetNumberValue()) + GetStructValue().Fields["cached_creation_input_token_usage"].GetNumberValue()) require.Equal(t, float64(54321), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cel_int"].GetNumberValue()) require.Equal(t, float64(9999), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. 
diff --git a/internal/filterapi/runtime_test.go b/internal/filterapi/runtime_test.go index a0ac5d6fa2..cb9f8d4af0 100644 --- a/internal/filterapi/runtime_test.go +++ b/internal/filterapi/runtime_test.go @@ -59,7 +59,7 @@ func TestServer_LoadConfig(t *testing.T) { require.Equal(t, "1 + 1", rc.RequestCosts[1].CEL) prog := rc.RequestCosts[1].CELProg require.NotNil(t, prog) - val, err := llmcostcel.EvaluateProgram(prog, "", "", 1, 1, 1, 1) + val, err := llmcostcel.EvaluateProgram(prog, "", "", 1, 1, 1, 1, 1) require.NoError(t, err) require.Equal(t, uint64(2), val) require.Equal(t, config.Models, rc.DeclaredModels) diff --git a/internal/llmcostcel/cel.go b/internal/llmcostcel/cel.go index 5bc0008d59..46d06c7130 100644 --- a/internal/llmcostcel/cel.go +++ b/internal/llmcostcel/cel.go @@ -19,8 +19,8 @@ const ( celModelNameKey = "model" celBackendKey = "backend" celInputTokensKey = "input_tokens" - celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 - celCachedCreationInputTokensKey = "cache_creation_input_tokens" // #nosec G101 + celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 + celCachedCreationInputTokensKey = "cached_creation_input_tokens" // #nosec G101 celOutputTokensKey = "output_tokens" celTotalTokensKey = "total_tokens" ) diff --git a/internal/llmcostcel/cel_test.go b/internal/llmcostcel/cel_test.go index cee9a259a5..92a323fc63 100644 --- a/internal/llmcostcel/cel_test.go +++ b/internal/llmcostcel/cel_test.go @@ -26,7 +26,7 @@ func TestNewProgram(t *testing.T) { require.NoError(t, err) }) t.Run("variables", func(t *testing.T) { - prog, err := NewProgram("model == 'cool_model' ? (input_tokens - cached_input_tokens - cache_creation_input_tokens) * output_tokens : total_tokens") + prog, err := NewProgram("model == 'cool_model' ? 
(input_tokens - cached_input_tokens - cached_creation_input_tokens) * output_tokens : total_tokens") require.NoError(t, err) v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 200, 100, 1, 2, 3) require.NoError(t, err) diff --git a/internal/metrics/genai.go b/internal/metrics/genai.go index cb45ae6051..f739bf8764 100644 --- a/internal/metrics/genai.go +++ b/internal/metrics/genai.go @@ -40,7 +40,7 @@ const ( // // However, the spec says "a custom value MAY be used.", so we can use it now. genaiTokenTypeCachedInput = "cached_input" - genaiTokenTypeCachedCreationInput = "cache_creation_input" + genaiTokenTypeCachedCreationInput = "cached_creation_input" genaiErrorTypeFallback = "_OTHER" ) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 38fe032539..a68d810dac 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -261,7 +261,7 @@ func (u *TokenUsage) Override(other TokenUsage) { // ExtractTokenUsageFromAnthropic extracts the correct token usage from Anthropic API response. // According to Claude API documentation, total input tokens is the summation of: -// input_tokens + cache_creation_input_tokens + cache_read_input_tokens +// input_tokens + cached_creation_input_tokens + cache_read_input_tokens // // This function works for both streaming and non-streaming responses by accepting // the common usage fields that exist in all Anthropic usage structures. 
diff --git a/internal/metrics/metrics_impl_test.go b/internal/metrics/metrics_impl_test.go index 581815e1bc..bcf6bc8282 100644 --- a/internal/metrics/metrics_impl_test.go +++ b/internal/metrics/metrics_impl_test.go @@ -83,7 +83,7 @@ func TestRecordTokenUsage(t *testing.T) { pm.SetBackend(&filterapi.Backend{Schema: filterapi.VersionedAPISchema{Name: filterapi.APISchemaOpenAI}}) pm.RecordTokenUsage(t.Context(), TokenUsage{ inputTokens: 10, cachedInputTokens: 8, cachedCreationInputTokens: 2, outputTokens: 5, - inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, + inputTokenSet: true, cachedInputTokenSet: true, cachedCreationInputTokenSet: true, outputTokenSet: true, }, nil) count, sum := testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, inputAttrs) @@ -301,7 +301,7 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { pm.SetResponseModel("res-model") pm.RecordTokenUsage(t.Context(), TokenUsage{ inputTokens: 2, cachedInputTokens: 1, cachedCreationInputTokens: 6, outputTokens: 3, - inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, + inputTokenSet: true, cachedInputTokenSet: true, cachedCreationInputTokenSet: true, outputTokenSet: true, }, nil) inputAttrs := attribute.NewSet( diff --git a/internal/tracing/openinference/openinference.go b/internal/tracing/openinference/openinference.go index f0f6d65ece..23fea486a9 100644 --- a/internal/tracing/openinference/openinference.go +++ b/internal/tracing/openinference/openinference.go @@ -163,7 +163,7 @@ const ( // LLMTokenCountPromptCacheWrite represents the number of prompt tokens // created to cache (cache write). This enables tracking of cache efficiency // and cost savings from cached prompts. 
- LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cache_creation" // #nosec G101 + LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cached_creation" // #nosec G101 // LLMTokenCountPromptAudio represents the number of audio tokens in the prompt. // Used for multimodal models that support audio input. diff --git a/internal/translator/anthropic_anthropic_test.go b/internal/translator/anthropic_anthropic_test.go index 236dd537d9..9b175e6723 100644 --- a/internal/translator/anthropic_anthropic_test.go +++ b/internal/translator/anthropic_anthropic_test.go @@ -89,7 +89,7 @@ func TestAnthropicToAnthropic_ResponseHeaders(t *testing.T) { func TestAnthropicToAnthropic_ResponseBody_non_streaming(t *testing.T) { translator := NewAnthropicToAnthropicTranslator("", "") require.NotNil(t, translator) - const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` + const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 
👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` headerMutation, bodyMutation, tokenUsage, responseModel, err := translator.ResponseBody(nil, strings.NewReader(responseBody), true, nil) require.NoError(t, err) @@ -108,7 +108,7 @@ func TestAnthropicToAnthropic_ResponseBody_streaming(t *testing.T) { // We split the response into two parts to simulate streaming where each part can end in the // middle of an event. const responseHead = `event: message_start -data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":1,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } +data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":1,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } event: content_block_start data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index d2c87e4935..d893dfdab5 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -611,7 +611,7 @@ func 
TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingFullScenario(t // 3. message_delta at the end provides output_tokens=5 but no input_tokens // 4. message_stop ends the stream messageStartChunk := `event: message_start -data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cache_creation_input_tokens": 1, "output_tokens": 0}}} +data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cached_creation_input_tokens": 1, "output_tokens": 0}}} ` contentBlockStartChunk := `event: content_block_start data: {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}} diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 42e2b0e83f..a267d65222 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -282,7 +282,7 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { t.Run("claude API documentation example", func(t *testing.T) { // This test verifies compliance with Claude API documentation: // "Total input tokens in a request is the summation of input_tokens, - // cache_creation_input_tokens, and cache_read_input_tokens". + // cached_creation_input_tokens, and cache_read_input_tokens". 
inputTokens := int64(100) cachedCreationTokens := int64(20) @@ -297,7 +297,7 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { inputTokensVal, ok := result.InputTokens() assert.True(t, ok) assert.Equal(t, expectedTotalInput, inputTokensVal, - "InputTokens should be sum of input_tokens + cache_creation_input_tokens + cache_read_input_tokens") + "InputTokens should be sum of input_tokens + cached_creation_input_tokens + cache_read_input_tokens") cachedTokens, ok := result.CachedInputTokens() assert.True(t, ok) @@ -307,7 +307,7 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { cachedCreationTokens, ok := result.CachedCreationInputTokens() assert.True(t, ok) assert.Equal(t, cachedCreationTokens, cachedCreationTokens, - "CachedCreationInputTokens should be cache_creation_input_tokens") + "CachedCreationInputTokens should be cached_creation_input_tokens") // Total tokens should be input + output. expectedTotal := expectedTotalInput + uint32(outputTokens) diff --git a/tests/data-plane/testupstream_test.go b/tests/data-plane/testupstream_test.go index 6cc8b1000c..d1ef2abba8 100644 --- a/tests/data-plane/testupstream_test.go +++ b/tests/data-plane/testupstream_test.go @@ -931,7 +931,7 @@ data: {"type": "message_stop"} ] }`, expPath: "/v1/messages", - responseBody: `{"model":"foo","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}`, + responseBody: `{"model":"foo","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 
👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}`, expStatus: http.StatusOK, }, { @@ -954,7 +954,7 @@ data: {"type": "message_stop"} expPath: "/v1/messages", responseBody: ` event: message_start -data: {"type":"message_start","message":{"model":"foo","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":1,"service_tier":"standard"}} } +data: {"type":"message_start","message":{"model":"foo","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":1,"service_tier":"standard"}} } event: content_block_start data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } @@ -975,7 +975,7 @@ event: content_block_stop data: {"type":"content_block_stop","index":0 } event: message_delta -data: {"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":16} } +data: {"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":16} } event: message_stop data: {"type":"message_stop" } From ff671649b60aa7deb6af5b09d13d3eca79f38942 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 
2 Jan 2026 16:23:27 -0500 Subject: [PATCH 09/20] make apigen Signed-off-by: Aaron Choo --- ...gateway.envoyproxy.io_aigatewayroutes.yaml | 45 ++++++++++--------- site/docs/api/api.mdx | 11 +++-- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml b/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml index c69175ff95..e75770a409 100644 --- a/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml +++ b/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml @@ -173,16 +173,18 @@ spec: metadataKey: llm_input_token\n\t type: InputToken\n\t- metadataKey: llm_output_token\n\t type: OutputToken\n\t- metadataKey: llm_total_token\n\t \ type: TotalToken\n\t- metadataKey: llm_cached_input_token\n\t - \ type: CachedInputToken\n```\nThen, with the following BackendTrafficPolicy - of Envoy Gateway, you can have three\nrate limit buckets for each - unique x-user-id header value. One bucket is for the input token,\nthe - other is for the output token, and the last one is for the total - token.\nEach bucket will be reduced by the corresponding token usage - captured by the AI Gateway filter.\n\n```yaml\n\tapiVersion: gateway.envoyproxy.io/v1alpha1\n\tkind: - BackendTrafficPolicy\n\tmetadata:\n\t name: some-example-token-rate-limit\n\t - \ namespace: default\n\tspec:\n\t targetRefs:\n\t - group: gateway.networking.k8s.io\n\t - \ kind: HTTPRoute\n\t name: usage-rate-limit\n\t rateLimit:\n\t - \ type: Global\n\t global:\n\t rules:\n\t - clientSelectors:\n\t + \ type: CachedInputToken\n- metadataKey: llm_cached_creation_input_token\n + \ type: CachedCreationInputToken\n```\nThen, with the following + BackendTrafficPolicy of Envoy Gateway, you can have three\nrate + limit buckets for each unique x-user-id header value. 
One bucket + is for the input token,\nthe other is for the output token, and + the last one is for the total token.\nEach bucket will be reduced + by the corresponding token usage captured by the AI Gateway filter.\n\n```yaml\n\tapiVersion: + gateway.envoyproxy.io/v1alpha1\n\tkind: BackendTrafficPolicy\n\tmetadata:\n\t + \ name: some-example-token-rate-limit\n\t namespace: default\n\tspec:\n\t + \ targetRefs:\n\t - group: gateway.networking.k8s.io\n\t kind: + HTTPRoute\n\t name: usage-rate-limit\n\t rateLimit:\n\t type: + Global\n\t global:\n\t rules:\n\t - clientSelectors:\n\t \ # Do the rate limiting based on the x-user-id header.\n\t \ - headers:\n\t - name: x-user-id\n\t \ type: Distinct\n\t limit:\n\t # @@ -227,15 +229,17 @@ spec: Type: string.\n\t* backend: the backend name in the form of \"name.namespace\". Type: string.\n\t* input_tokens: the number of input tokens. Type: unsigned integer.\n\t* cached_input_tokens: - the number of cached input tokens. Type: unsigned integer.\n\t* - output_tokens: the number of output tokens. Type: unsigned - integer.\n\t* total_tokens: the total number of tokens. Type: - unsigned integer.\n\nFor example, the following expressions - are valid:\n\n\t* \"model == 'llama' ? input_tokens + output_token - * 0.5 : total_tokens\"\n\t* \"backend == 'foo.default' ? input_tokens - + output_tokens : total_tokens\"\n\t* \"backend == 'bar.default' - ? (input_tokens - cached_input_tokens) + cached_input_tokens - * 0.1 + output_tokens : total_tokens\"\n\t* \"input_tokens + the number of cached read input tokens. Type: unsigned integer.\n\t* + cached_creation_input_tokens: the number of cache creation + input tokens. Type: unsigned integer.\n\t* output_tokens: + the number of output tokens. Type: unsigned integer.\n\t* + total_tokens: the total number of tokens. Type: unsigned integer.\n\nFor + example, the following expressions are valid:\n\n\t* \"model + == 'llama' ? 
input_tokens + output_token * 0.5 : total_tokens\"\n\t* + \"backend == 'foo.default' ? input_tokens + output_tokens + : total_tokens\"\n\t* \"backend == 'bar.default' ? (input_tokens + - cached_input_tokens) + cached_input_tokens * 0.1 + cached_creation_input_tokens + * 1.25 + output_tokens : total_tokens\"\n\t* \"input_tokens + output_tokens + total_tokens\"\n\t* \"input_tokens * output_tokens\"" type: string metadataKey: @@ -246,11 +250,12 @@ spec: description: |- Type specifies the type of the request cost. The default is "OutputToken", and it uses "output token" as the cost. The other types are "InputToken", "TotalToken", - and "CEL". + "CachedInputToken", "CachedCreationInputToken", and "CEL". enum: - OutputToken - InputToken - CachedInputToken + - CachedCreationInputToken - TotalToken - CEL type: string diff --git a/site/docs/api/api.mdx b/site/docs/api/api.mdx index 36b396c219..0516d744cb 100644 --- a/site/docs/api/api.mdx +++ b/site/docs/api/api.mdx @@ -757,7 +757,7 @@ AIGatewayRouteSpec details the AIGatewayRoute configuration. name="llmRequestCosts" type="[LLMRequestCost](#llmrequestcost) array" required="false" - description="LLMRequestCosts specifies how to capture the cost of the LLM-related request, notably the token usage.
The AI Gateway filter will capture each specified number and store it in the Envoy's dynamic
metadata per HTTP request. The namespaced key is `io.envoy.ai_gateway`,
For example, let's say we have the following LLMRequestCosts configuration:
```yaml
llmRequestCosts:
- metadataKey: llm_input_token
type: InputToken
- metadataKey: llm_output_token
type: OutputToken
- metadataKey: llm_total_token
type: TotalToken
- metadataKey: llm_cached_input_token
type: CachedInputToken
```
Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three
rate limit buckets for each unique x-user-id header value. One bucket is for the input token,
the other is for the output token, and the last one is for the total token.
Each bucket will be reduced by the corresponding token usage captured by the AI Gateway filter.
```yaml
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: BackendTrafficPolicy
metadata:
name: some-example-token-rate-limit
namespace: default
spec:
targetRefs:
- group: gateway.networking.k8s.io
kind: HTTPRoute
name: usage-rate-limit
rateLimit:
type: Global
global:
rules:
- clientSelectors:
# Do the rate limiting based on the x-user-id header.
- headers:
- name: x-user-id
type: Distinct
limit:
# Configures the number of `tokens` allowed per hour.
requests: 10000
unit: Hour
cost:
request:
from: Number
# Setting the request cost to zero allows to only check the rate limit budget,
# and not consume the budget on the request path.
number: 0
# This specifies the cost of the response retrieved from the dynamic metadata set by the AI Gateway filter.
# The extracted value will be used to consume the rate limit budget, and subsequent requests will be rate limited
# if the budget is exhausted.
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_input_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_output_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_total_token
```
Note that when multiple AIGatewayRoute resources are attached to the same Gateway, and
different costs are configured for the same metadata key, the ai-gateway will pick one of them
to configure the metadata key in the generated HTTPRoute, and ignore the rest." + description="LLMRequestCosts specifies how to capture the cost of the LLM-related request, notably the token usage.
The AI Gateway filter will capture each specified number and store it in the Envoy's dynamic
metadata per HTTP request. The namespaced key is `io.envoy.ai_gateway`,
For example, let's say we have the following LLMRequestCosts configuration:
```yaml
llmRequestCosts:
- metadataKey: llm_input_token
type: InputToken
- metadataKey: llm_output_token
type: OutputToken
- metadataKey: llm_total_token
type: TotalToken
- metadataKey: llm_cached_input_token
type: CachedInputToken
- metadataKey: llm_cached_creation_input_token
type: CachedCreationInputToken
```
Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three
rate limit buckets for each unique x-user-id header value. One bucket is for the input token,
the other is for the output token, and the last one is for the total token.
Each bucket will be reduced by the corresponding token usage captured by the AI Gateway filter.
```yaml
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: BackendTrafficPolicy
metadata:
name: some-example-token-rate-limit
namespace: default
spec:
targetRefs:
- group: gateway.networking.k8s.io
kind: HTTPRoute
name: usage-rate-limit
rateLimit:
type: Global
global:
rules:
- clientSelectors:
# Do the rate limiting based on the x-user-id header.
- headers:
- name: x-user-id
type: Distinct
limit:
# Configures the number of `tokens` allowed per hour.
requests: 10000
unit: Hour
cost:
request:
from: Number
# Setting the request cost to zero allows to only check the rate limit budget,
# and not consume the budget on the request path.
number: 0
# This specifies the cost of the response retrieved from the dynamic metadata set by the AI Gateway filter.
# The extracted value will be used to consume the rate limit budget, and subsequent requests will be rate limited
# if the budget is exhausted.
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_input_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_output_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_total_token
```
Note that when multiple AIGatewayRoute resources are attached to the same Gateway, and
different costs are configured for the same metadata key, the ai-gateway will pick one of them
to configure the metadata key in the generated HTTPRoute, and ignore the rest." /> @@ -1664,12 +1664,12 @@ LLMRequestCost configures each request cost. name="type" type="[LLMRequestCostType](#llmrequestcosttype)" required="true" - description="Type specifies the type of the request cost. The default is `OutputToken`,
and it uses `output token` as the cost. The other types are `InputToken`, `TotalToken`,
and `CEL`." + description="Type specifies the type of the request cost. The default is `OutputToken`,
and it uses `output token` as the cost. The other types are `InputToken`, `TotalToken`,
`CachedInputToken`, `CachedCreationInputToken`, and `CEL`." /> @@ -1696,6 +1696,11 @@ LLMRequestCostType specifies the type of the LLMRequestCost. type="enum" required="false" description="LLMRequestCostTypeCachedInputToken is the cost type of the cached input token.
" +/> Date: Fri, 2 Jan 2026 16:36:57 -0500 Subject: [PATCH 10/20] fix cached -> cache for anthropic Signed-off-by: Aaron Choo --- internal/apischema/anthropic/anthropic.go | 2 +- internal/translator/anthropic_anthropic_test.go | 4 ++-- internal/translator/anthropic_gcpanthropic_test.go | 2 +- internal/translator/anthropic_usage_test.go | 12 ++++++------ internal/translator/openai_gcpanthropic_stream.go | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/internal/apischema/anthropic/anthropic.go b/internal/apischema/anthropic/anthropic.go index 55dbaceee5..f65d102761 100644 --- a/internal/apischema/anthropic/anthropic.go +++ b/internal/apischema/anthropic/anthropic.go @@ -437,7 +437,7 @@ const ( // so we use float64 to be able to unmarshal both 1234 and 1234.0 without errors. type Usage struct { // The number of input tokens used to create the cache entry. - CachedCreationInputTokens float64 `json:"cached_creation_input_tokens"` + CacheCreationInputTokens float64 `json:"cache_creation_input_tokens"` // The number of input tokens read from the cache. CacheReadInputTokens float64 `json:"cache_read_input_tokens"` // The number of input tokens which were used. diff --git a/internal/translator/anthropic_anthropic_test.go b/internal/translator/anthropic_anthropic_test.go index 9b175e6723..6d7e12f7ab 100644 --- a/internal/translator/anthropic_anthropic_test.go +++ b/internal/translator/anthropic_anthropic_test.go @@ -89,7 +89,7 @@ func TestAnthropicToAnthropic_ResponseHeaders(t *testing.T) { func TestAnthropicToAnthropic_ResponseBody_non_streaming(t *testing.T) { translator := NewAnthropicToAnthropicTranslator("", "") require.NotNil(t, translator) - const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 
👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` + const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` headerMutation, bodyMutation, tokenUsage, responseModel, err := translator.ResponseBody(nil, strings.NewReader(responseBody), true, nil) require.NoError(t, err) @@ -108,7 +108,7 @@ func TestAnthropicToAnthropic_ResponseBody_streaming(t *testing.T) { // We split the response into two parts to simulate streaming where each part can end in the // middle of an event. 
const responseHead = `event: message_start -data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":1,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } +data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":1,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } event: content_block_start data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index d893dfdab5..d2c87e4935 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -611,7 +611,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingFullScenario(t // 3. message_delta at the end provides output_tokens=5 but no input_tokens // 4. 
message_stop ends the stream messageStartChunk := `event: message_start -data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cached_creation_input_tokens": 1, "output_tokens": 0}}} +data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cache_creation_input_tokens": 1, "output_tokens": 0}}} ` contentBlockStartChunk := `event: content_block_start data: {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}} diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index a267d65222..01ab735f63 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -282,22 +282,22 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { t.Run("claude API documentation example", func(t *testing.T) { // This test verifies compliance with Claude API documentation: // "Total input tokens in a request is the summation of input_tokens, - // cached_creation_input_tokens, and cache_read_input_tokens". + // cache_creation_input_tokens, and cache_read_input_tokens". inputTokens := int64(100) - cachedCreationTokens := int64(20) + cachedWriteTokens := int64(20) cacheReadTokens := int64(30) outputTokens := int64(50) - result := metrics.ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cachedCreationTokens) + result := metrics.ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cachedWriteTokens) // Total input should be sum of all input token types. 
- expectedTotalInputInt := inputTokens + cachedCreationTokens + cacheReadTokens + expectedTotalInputInt := inputTokens + cachedWriteTokens + cacheReadTokens expectedTotalInput := uint32(expectedTotalInputInt) // #nosec G115 - test values are small and safe inputTokensVal, ok := result.InputTokens() assert.True(t, ok) assert.Equal(t, expectedTotalInput, inputTokensVal, - "InputTokens should be sum of input_tokens + cached_creation_input_tokens + cache_read_input_tokens") + "InputTokens should be sum of input_tokens + cache_creation_input_tokens + cache_read_input_tokens") cachedTokens, ok := result.CachedInputTokens() assert.True(t, ok) @@ -307,7 +307,7 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { cachedCreationTokens, ok := result.CachedCreationInputTokens() assert.True(t, ok) assert.Equal(t, cachedCreationTokens, cachedCreationTokens, - "CachedCreationInputTokens should be cached_creation_input_tokens") + "CachedCreationInputTokens should be cache_creation_input_tokens") // Total tokens should be input + output. 
expectedTotal := expectedTotalInput + uint32(outputTokens) diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 9e5cc95796..9840af1650 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -203,7 +203,7 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat u.InputTokens, u.OutputTokens, u.CacheReadInputTokens, - u.CachedCreationInputTokens, + u.CacheCreationInputTokens, ) // For message_start, we store the initial usage but don't add to the accumulated // The message_delta event will contain the final totals From d8a318fea7a1be2b6e8d6cf040b33df36564195d Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:42:12 -0500 Subject: [PATCH 11/20] missing a few typos Signed-off-by: Aaron Choo --- internal/tracing/openinference/anthropic/messages.go | 2 +- internal/translator/anthropic_anthropic.go | 2 +- internal/translator/openai_gcpanthropic_stream.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index 23a52a618c..ec6940056f 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -211,7 +211,7 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer int64(u.InputTokens), int64(u.OutputTokens), int64(u.CacheReadInputTokens), - int64(u.CachedCreationInputTokens), + int64(u.CacheCreationInputTokens), ) input, _ := cost.InputTokens() cacheRead, _ := cost.CachedInputTokens() diff --git a/internal/translator/anthropic_anthropic.go b/internal/translator/anthropic_anthropic.go index 60f12cf6af..86356d03a3 100644 --- a/internal/translator/anthropic_anthropic.go +++ b/internal/translator/anthropic_anthropic.go @@ -148,7 +148,7 @@ func (a *anthropicToAnthropicTranslator) 
extractUsageFromBufferEvent(s tracing.M int64(u.InputTokens), int64(u.OutputTokens), int64(u.CacheReadInputTokens), - int64(u.CachedCreationInputTokens), + int64(u.CacheCreationInputTokens), ) // Override with message_start usage (contains input tokens and initial state) a.streamingTokenUsage.Override(messageStartUsage) diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 9840af1650..5d9e12c4a1 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -285,7 +285,7 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat u.InputTokens, u.OutputTokens, u.CacheReadInputTokens, - u.CachedCreationInputTokens, + u.CacheCreationInputTokens, ) // For message_delta, accumulate the incremental output tokens if output, ok := usage.OutputTokens(); ok { From f2a3cbbba49894d491338b79e509a7b826dd0fbf Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:45:28 -0500 Subject: [PATCH 12/20] update typo Signed-off-by: Aaron Choo --- internal/translator/anthropic_anthropic.go | 2 +- internal/translator/anthropic_usage_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/translator/anthropic_anthropic.go b/internal/translator/anthropic_anthropic.go index 86356d03a3..1f5c8d5bf0 100644 --- a/internal/translator/anthropic_anthropic.go +++ b/internal/translator/anthropic_anthropic.go @@ -103,7 +103,7 @@ func (a *anthropicToAnthropicTranslator) ResponseBody(_ map[string]string, body int64(usage.InputTokens), int64(usage.OutputTokens), int64(usage.CacheReadInputTokens), - int64(usage.CachedCreationInputTokens), + int64(usage.CacheCreationInputTokens), ) if span != nil { span.RecordResponse(anthropicResp) diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 01ab735f63..4058038688 100644 --- a/internal/translator/anthropic_usage_test.go 
+++ b/internal/translator/anthropic_usage_test.go @@ -248,7 +248,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { result := metrics.ExtractTokenUsageFromAnthropic(tt.usage.InputTokens, tt.usage.OutputTokens, tt.usage.CacheReadInputTokens, - tt.usage.CachedCreationInputTokens, + tt.usage.CacheCreationInputTokens, ) expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) From d027658142c62fd79cc765478b4a5de74d664da9 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:50:19 -0500 Subject: [PATCH 13/20] last try fixing typos Signed-off-by: Aaron Choo --- internal/translator/anthropic_usage_test.go | 2 +- internal/translator/openai_gcpanthropic.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 4058038688..4ff13ab56d 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -181,7 +181,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { result := metrics.ExtractTokenUsageFromAnthropic(tt.usage.InputTokens, tt.usage.OutputTokens, tt.usage.CacheReadInputTokens, - tt.usage.CachedCreationInputTokens, + tt.usage.CacheCreationInputTokens, ) expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) diff --git a/internal/translator/openai_gcpanthropic.go b/internal/translator/openai_gcpanthropic.go index dce744da00..6a916cc0e2 100644 --- a/internal/translator/openai_gcpanthropic.go +++ b/internal/translator/openai_gcpanthropic.go @@ -833,7 +833,7 @@ func (o *openAIToGCPAnthropicTranslatorV1ChatCompletion) ResponseBody(_ map[stri usage.InputTokens, usage.OutputTokens, 
usage.CacheReadInputTokens, - usage.CachedCreationInputTokens, + usage.CacheCreationInputTokens, ) inputTokens, _ := tokenUsage.InputTokens() outputTokens, _ := tokenUsage.OutputTokens() From 35b06ed9186d3db9b601c195503a9c5a06e8ffba Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:55:03 -0500 Subject: [PATCH 14/20] update anthropic Signed-off-by: Aaron Choo --- internal/translator/anthropic_usage_test.go | 48 +++++++++---------- .../translator/openai_gcpanthropic_test.go | 10 ++-- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 4ff13ab56d..bfce108d37 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -135,10 +135,10 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { { name: "non-streaming response without cache", usage: anthropic.Usage{ - InputTokens: 150, - OutputTokens: 75, - CacheReadInputTokens: 0, - CachedCreationInputTokens: 0, + InputTokens: 150, + OutputTokens: 75, + CacheReadInputTokens: 0, + CacheCreationInputTokens: 0, }, expectedInputTokens: 150, expectedOutputTokens: 75, @@ -149,10 +149,10 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { { name: "non-streaming response with cache read", usage: anthropic.Usage{ - InputTokens: 100, - OutputTokens: 50, - CacheReadInputTokens: 25, - CachedCreationInputTokens: 0, + InputTokens: 100, + OutputTokens: 50, + CacheReadInputTokens: 25, + CacheCreationInputTokens: 0, }, expectedInputTokens: 125, // 100 + 0 + 25 expectedOutputTokens: 50, @@ -163,10 +163,10 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { { name: "non-streaming response with both cache types", usage: anthropic.Usage{ - InputTokens: 90, - OutputTokens: 60, - CacheReadInputTokens: 15, - CachedCreationInputTokens: 10, + InputTokens: 90, + OutputTokens: 60, + CacheReadInputTokens: 15, + CacheCreationInputTokens: 10, }, expectedInputTokens: 
115, // 90 + 10 + 15 expectedOutputTokens: 60, @@ -202,10 +202,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta event with final totals", usage: anthropic.MessageDeltaUsage{ - InputTokens: 250, - OutputTokens: 120, - CacheReadInputTokens: 30, - CachedCreationInputTokens: 0, + InputTokens: 250, + OutputTokens: 120, + CacheReadInputTokens: 30, + CacheCreationInputTokens: 0, }, expectedInputTokens: 280, // 250 + 0 + 30 expectedOutputTokens: 120, @@ -216,10 +216,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta event with only output tokens", usage: anthropic.MessageDeltaUsage{ - InputTokens: 0, - OutputTokens: 85, - CacheReadInputTokens: 0, - CachedCreationInputTokens: 0, + InputTokens: 0, + OutputTokens: 85, + CacheReadInputTokens: 0, + CacheCreationInputTokens: 0, }, expectedInputTokens: 0, expectedOutputTokens: 85, @@ -230,10 +230,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta with cache creation tokens", usage: anthropic.MessageDeltaUsage{ - InputTokens: 150, - OutputTokens: 75, - CacheReadInputTokens: 10, - CachedCreationInputTokens: 5, + InputTokens: 150, + OutputTokens: 75, + CacheReadInputTokens: 10, + CacheCreationInputTokens: 5, }, expectedInputTokens: 165, // 150 + 5 + 10 expectedOutputTokens: 75, diff --git a/internal/translator/openai_gcpanthropic_test.go b/internal/translator/openai_gcpanthropic_test.go index ea0a11d979..e48cd64242 100644 --- a/internal/translator/openai_gcpanthropic_test.go +++ b/internal/translator/openai_gcpanthropic_test.go @@ -598,11 +598,11 @@ func TestOpenAIToGCPAnthropicTranslatorV1ChatCompletion_ResponseBody(t *testing. 
require.NoError(t, err) expectedTokenUsage := tokenUsageFrom( - int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec - uint32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedCreationTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec ) require.Equal(t, expectedTokenUsage, usedToken) From bfd00542fd11eb48c8981dc4ec6e32fad942382a Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 17:30:02 -0500 Subject: [PATCH 15/20] fix some tests Signed-off-by: Aaron Choo --- internal/apischema/openai/openai.go | 2 +- internal/translator/anthropic_usage_test.go | 4 ++-- internal/translator/openai_awsbedrock.go | 22 ++++++++++--------- .../translator/openai_gcpvertexai_test.go | 12 +++++----- .../data-plane/vcr/prometheus_metrics_test.go | 8 +++++-- 5 files changed, 27 insertions(+), 21 deletions(-) diff --git a/internal/apischema/openai/openai.go b/internal/apischema/openai/openai.go index 02e8b9eae8..e5810218b4 100644 --- a/internal/apischema/openai/openai.go +++ b/internal/apischema/openai/openai.go @@ -2554,7 +2554,7 @@ type ResponseTokensDetails struct { CachedTokens int `json:"cached_tokens,omitempty"` //nolint:tagliatelle //follow openai api // CachedCreationTokens: number of tokens that were written to the cache. 
- CachedCreationTokens int64 `json:"cached_creation_input_tokens"` //nolint:tagliatelle + CachedCreationTokens int64 `json:"cache_creation_input_tokens"` //nolint:tagliatelle // ReasoningTokens: Number of reasoning tokens (for reasoning models). ReasoningTokens int `json:"reasoning_tokens,omitempty"` //nolint:tagliatelle //follow openai api diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index bfce108d37..aa84680b1b 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -183,7 +183,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // nolint:gosec assert.Equal(t, expected, result) }) } @@ -250,7 +250,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // nolint:gosec assert.Equal(t, expected, result) }) } diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index 8318fe88aa..531e9b3c03 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -853,16 +853,18 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) 
convertEvent(event *awsbe return chunk, false } chunk.Usage = &openai.Usage{ - TotalTokens: event.Usage.TotalTokens, - PromptTokens: event.Usage.InputTokens, - CompletionTokens: event.Usage.OutputTokens, - PromptTokensDetails: &openai.PromptTokensDetails{}, - } - if event.Usage.CacheReadInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens - } - if event.Usage.CacheWriteInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens + TotalTokens: event.Usage.TotalTokens, + PromptTokens: event.Usage.InputTokens, + CompletionTokens: event.Usage.OutputTokens, + } + if event.Usage.CacheReadInputTokens != nil || event.Usage.CacheWriteInputTokens != nil { + chunk.Usage.PromptTokensDetails = &openai.PromptTokensDetails{} + if event.Usage.CacheReadInputTokens != nil { + chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens + } + if event.Usage.CacheWriteInputTokens != nil { + chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens + } } // messageStart event. 
case awsbedrock.ConverseStreamEventTypeMessageStart.String(): diff --git a/internal/translator/openai_gcpvertexai_test.go b/internal/translator/openai_gcpvertexai_test.go index 93740102ce..88cf30c0e4 100644 --- a/internal/translator/openai_gcpvertexai_test.go +++ b/internal/translator/openai_gcpvertexai_test.go @@ -913,7 +913,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 25 } }`), - wantTokenUsage: tokenUsageFrom(10, 10, -1, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, 0, 15, 25), }, { name: "response with safety ratings", @@ -993,7 +993,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 20 } }`), - wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), + wantTokenUsage: tokenUsageFrom(8, 0, 0, 12, 20), }, { name: "empty response", @@ -1025,7 +1025,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, 0, 3, 8), }, { name: "response with model version field", @@ -1080,7 +1080,7 @@ data: [DONE] "total_tokens": 14 } }`), - wantTokenUsage: tokenUsageFrom(6, 0, -1, 8, 14), + wantTokenUsage: tokenUsageFrom(6, 0, 0, 8, 14), }, { @@ -1214,7 +1214,7 @@ data: [DONE] } }`), - wantTokenUsage: tokenUsageFrom(10, 10, -1, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, 0, 15, 25), }, { name: "stream chunks with thought summary", @@ -1236,7 +1236,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, 0, 3, 8), }, } diff --git a/tests/data-plane/vcr/prometheus_metrics_test.go b/tests/data-plane/vcr/prometheus_metrics_test.go index be17251fec..4951b4994d 100644 --- a/tests/data-plane/vcr/prometheus_metrics_test.go +++ b/tests/data-plane/vcr/prometheus_metrics_test.go @@ -106,8 +106,8 @@ func 
verifyPrometheusRequestDuration(t *testing.T, metric *dto.MetricFamily, exp func verifyPrometheusTokenUsage(t *testing.T, metric *dto.MetricFamily, expectedModel string) { t.Helper() require.NotNil(t, metric) - require.Len(t, metric.Metric, 3) - var inputMetric, cachedInputMetric, outputMetric *dto.Metric + require.Len(t, metric.Metric, 4) + var inputMetric, cachedInputMetric, cachedCreationInputMetric, outputMetric *dto.Metric for _, m := range metric.Metric { for _, label := range m.Label { if *label.Name == "gen_ai_token_type" { @@ -116,6 +116,8 @@ func verifyPrometheusTokenUsage(t *testing.T, metric *dto.MetricFamily, expected inputMetric = m case "cached_input": cachedInputMetric = m + case "cached_creation_input": + cachedCreationInputMetric = m case "output": outputMetric = m } @@ -125,6 +127,7 @@ func verifyPrometheusTokenUsage(t *testing.T, metric *dto.MetricFamily, expected } require.NotNil(t, inputMetric, "Input metric not found") require.NotNil(t, cachedInputMetric, "Cached Input metric not found") + require.NotNil(t, cachedCreationInputMetric, "Cached Creation Input metric not found") require.NotNil(t, outputMetric, "Output metric not found") type testCase struct { @@ -136,6 +139,7 @@ func verifyPrometheusTokenUsage(t *testing.T, metric *dto.MetricFamily, expected cases := []testCase{ {inputMetric, "input", 8}, {cachedInputMetric, "cached_input", 0}, + {cachedCreationInputMetric, "cached_creation_input", 0}, {outputMetric, "output", 377}, } From ac126dfb0a319b5d35d7d562ccc4f3d0dd92e6dd Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 18:06:08 -0500 Subject: [PATCH 16/20] fix more tests Signed-off-by: Aaron Choo --- internal/translator/anthropic_gcpanthropic_test.go | 8 ++++---- internal/translator/anthropic_usage_test.go | 6 +++--- internal/translator/openai_awsbedrock.go | 2 +- internal/translator/openai_gcpvertexai_test.go | 10 +++++----- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git 
a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index d2c87e4935..187ed4bbc5 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -642,13 +642,13 @@ data: {"type": "message_stop"} // Assertions assert.True(t, inputSet, "Input tokens should be set") - assert.Equal(t, uint32(20), inputTokens, "Input tokens should be preserved from message_start") + assert.Equal(t, uint32(21), inputTokens, "Input tokens should be preserved from message_start") assert.True(t, outputSet, "Output tokens should be set") assert.Equal(t, uint32(0), outputTokens, "Output tokens should come from message_delta") assert.True(t, totalSet, "Total tokens should be calculated") - assert.Equal(t, uint32(20), totalTokens, "Total tokens should be input + output") + assert.Equal(t, uint32(21), totalTokens, "Total tokens should be input + output") assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") @@ -675,13 +675,13 @@ data: {"type": "message_stop"} cachedCreationTokens, cachedCreationSet = tokenUsage.CachedCreationInputTokens() assert.True(t, inputSet, "Input tokens should be set") - assert.Equal(t, uint32(20), inputTokens, "Input tokens should be preserved from message_start") + assert.Equal(t, uint32(21), inputTokens, "Input tokens should be preserved from message_start") assert.True(t, outputSet, "Output tokens should be set") assert.Equal(t, uint32(5), outputTokens, "Output tokens should come from message_delta") assert.True(t, totalSet, "Total tokens should be calculated") - assert.Equal(t, uint32(25), totalTokens, "Total tokens should be input + output") + assert.Equal(t, uint32(26), totalTokens, "Total tokens should be input + output") assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") diff --git 
a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index aa84680b1b..86e7a50772 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -171,7 +171,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { expectedInputTokens: 115, // 90 + 10 + 15 expectedOutputTokens: 60, expectedTotalTokens: 175, // 115 + 60 - expectedCachedTokens: 25, // 15 + expectedCachedTokens: 15, // 15 expectedCachedCreationTokens: 10, // 10 }, } @@ -301,12 +301,12 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { cachedTokens, ok := result.CachedInputTokens() assert.True(t, ok) - assert.Equal(t, cacheReadTokens, cachedTokens, + assert.Equal(t, uint32(cacheReadTokens), cachedTokens, "CachedInputTokens should be cache_read_input_tokens") cachedCreationTokens, ok := result.CachedCreationInputTokens() assert.True(t, ok) - assert.Equal(t, cachedCreationTokens, cachedCreationTokens, + assert.Equal(t, uint32(cachedWriteTokens), cachedCreationTokens, "CachedCreationInputTokens should be cache_creation_input_tokens") // Total tokens should be input + output. 
diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index 531e9b3c03..fad5dbb2cd 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -708,7 +708,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string tokenUsage.SetCachedInputTokens(uint32(*usage.CacheReadInputTokens)) //nolint:gosec } if usage.CacheWriteInputTokens != nil { - tokenUsage.SetCachedCreationInputTokens(uint32(*usage.CacheWriteInputTokens)) + tokenUsage.SetCachedCreationInputTokens(uint32(*usage.CacheWriteInputTokens)) //nolint:gosec } } oaiEvent, ok := o.convertEvent(event) diff --git a/internal/translator/openai_gcpvertexai_test.go b/internal/translator/openai_gcpvertexai_test.go index 88cf30c0e4..279580d2fb 100644 --- a/internal/translator/openai_gcpvertexai_test.go +++ b/internal/translator/openai_gcpvertexai_test.go @@ -1025,7 +1025,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, 0, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), // Does not support cache creation. }, { name: "response with model version field", @@ -1080,7 +1080,7 @@ data: [DONE] "total_tokens": 14 } }`), - wantTokenUsage: tokenUsageFrom(6, 0, 0, 8, 14), + wantTokenUsage: tokenUsageFrom(6, 0, -1, 8, 14), // Does not support Cache Creation. }, { @@ -1149,7 +1149,7 @@ data: [DONE] "total_tokens": 20 } }`), - wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), + wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), // Does not support Cache Creation. }, { name: "response with thought summary", @@ -1214,7 +1214,7 @@ data: [DONE] } }`), - wantTokenUsage: tokenUsageFrom(10, 10, 0, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, -1, 15, 25), // Does not support Cache Creation. 
}, { name: "stream chunks with thought summary", @@ -1236,7 +1236,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, 0, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), // Does not support Cache Creation. }, } From a1c4f48a2eb638022d9b4b82ed232ded82c0ee43 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 18:20:26 -0500 Subject: [PATCH 17/20] fixed Signed-off-by: Aaron Choo --- internal/translator/openai_awsbedrock.go | 14 +++++++------- internal/translator/openai_gcpvertexai.go | 3 +-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index fad5dbb2cd..6efaea9db7 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -752,7 +752,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string PromptTokens: bedrockResp.Usage.InputTokens, CompletionTokens: bedrockResp.Usage.OutputTokens, } - if openAIResp.Usage.PromptTokensDetails == nil { + if bedrockResp.Usage.CacheReadInputTokens != nil || bedrockResp.Usage.CacheWriteInputTokens != nil { openAIResp.Usage.PromptTokensDetails = &openai.PromptTokensDetails{} } if bedrockResp.Usage.CacheReadInputTokens != nil { @@ -859,12 +859,12 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) convertEvent(event *awsbe } if event.Usage.CacheReadInputTokens != nil || event.Usage.CacheWriteInputTokens != nil { chunk.Usage.PromptTokensDetails = &openai.PromptTokensDetails{} - if event.Usage.CacheReadInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens - } - if event.Usage.CacheWriteInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens - } + } + if event.Usage.CacheReadInputTokens != nil { + chunk.Usage.PromptTokensDetails.CachedTokens = 
*event.Usage.CacheReadInputTokens + } + if event.Usage.CacheWriteInputTokens != nil { + chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens } // messageStart event. case awsbedrock.ConverseStreamEventTypeMessageStart.String(): diff --git a/internal/translator/openai_gcpvertexai.go b/internal/translator/openai_gcpvertexai.go index 74e99c5741..7aca029b4f 100644 --- a/internal/translator/openai_gcpvertexai.go +++ b/internal/translator/openai_gcpvertexai.go @@ -170,8 +170,7 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) ResponseBody(_ map[strin tokenUsage.SetOutputTokens(uint32(gcpResp.UsageMetadata.CandidatesTokenCount)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(gcpResp.UsageMetadata.TotalTokenCount)) //nolint:gosec tokenUsage.SetCachedInputTokens(uint32(gcpResp.UsageMetadata.CachedContentTokenCount)) //nolint:gosec - // Gemini does not return cache creation input tokens, set to 0. - tokenUsage.SetCachedCreationInputTokens(0) + // Gemini does not return cache creation input tokens; Skipping setCachedCreationInputTokens. 
} if span != nil { From 337bd11f957e8de684f2736e2fc01938b728efce Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 18:33:06 -0500 Subject: [PATCH 18/20] negative Signed-off-by: Aaron Choo --- internal/translator/openai_gcpvertexai_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/translator/openai_gcpvertexai_test.go b/internal/translator/openai_gcpvertexai_test.go index 279580d2fb..ef9051bb7c 100644 --- a/internal/translator/openai_gcpvertexai_test.go +++ b/internal/translator/openai_gcpvertexai_test.go @@ -913,7 +913,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 25 } }`), - wantTokenUsage: tokenUsageFrom(10, 10, 0, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, -1, 15, 25), }, { name: "response with safety ratings", @@ -993,7 +993,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 20 } }`), - wantTokenUsage: tokenUsageFrom(8, 0, 0, 12, 20), + wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), }, { name: "empty response", From da31c228f967d99a3987f4f6436fafe72323d7d8 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 19:16:13 -0500 Subject: [PATCH 19/20] updated cached creation -> cache creation Signed-off-by: Aaron Choo --- api/v1alpha1/ai_gateway_route.go | 4 +- api/v1alpha1/shared_types.go | 12 +- examples/token_ratelimit/token_ratelimit.yaml | 4 +- internal/apischema/openai/openai.go | 8 +- internal/apischema/openai/openai_test.go | 38 +-- internal/controller/gateway.go | 4 +- internal/controller/gateway_test.go | 4 +- internal/extproc/mocks_test.go | 28 +- internal/extproc/processor_impl.go | 8 +- internal/extproc/processor_impl_test.go | 10 +- internal/filterapi/filterconfig.go | 4 +- internal/llmcostcel/cel.go | 32 +-- internal/llmcostcel/cel_test.go | 2 +- internal/metrics/genai.go | 6 +- internal/metrics/metrics.go | 50 ++-- internal/metrics/metrics_impl.go | 6 +- 
internal/metrics/metrics_impl_test.go | 24 +- .../openinference/anthropic/messages.go | 2 +- .../openinference/openai/response_attrs.go | 6 +- .../openinference/openai/responses_test.go | 4 +- .../tracing/openinference/openinference.go | 2 +- internal/translator/anthropic_anthropic.go | 4 +- .../translator/anthropic_anthropic_test.go | 4 +- .../translator/anthropic_gcpanthropic_test.go | 18 +- internal/translator/anthropic_usage_test.go | 250 +++++++++--------- internal/translator/openai_awsbedrock.go | 8 +- internal/translator/openai_awsbedrock_test.go | 6 +- internal/translator/openai_completions.go | 4 +- internal/translator/openai_gcpanthropic.go | 6 +- .../translator/openai_gcpanthropic_stream.go | 14 +- .../translator/openai_gcpanthropic_test.go | 10 +- internal/translator/openai_gcpvertexai.go | 2 +- internal/translator/openai_openai.go | 4 +- internal/translator/openai_responses.go | 12 +- internal/translator/openai_responses_test.go | 24 +- ...gateway.envoyproxy.io_aigatewayroutes.yaml | 12 +- site/docs/api/api.mdx | 10 +- .../testdata/aigatewayroutes/llmcosts.yaml | 2 +- tests/data-plane/testupstream_test.go | 6 +- .../data-plane/vcr/prometheus_metrics_test.go | 10 +- 40 files changed, 332 insertions(+), 332 deletions(-) diff --git a/api/v1alpha1/ai_gateway_route.go b/api/v1alpha1/ai_gateway_route.go index f99a66bfc4..2e8a109090 100644 --- a/api/v1alpha1/ai_gateway_route.go +++ b/api/v1alpha1/ai_gateway_route.go @@ -108,8 +108,8 @@ type AIGatewayRouteSpec struct { // type: TotalToken // - metadataKey: llm_cached_input_token // type: CachedInputToken - // - metadataKey: llm_cached_creation_input_token - // type: CachedCreationInputToken + // - metadataKey: llm_cache_creation_input_token + // type: CacheCreationInputToken // ``` // Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three // rate limit buckets for each unique x-user-id header value. 
One bucket is for the input token, diff --git a/api/v1alpha1/shared_types.go b/api/v1alpha1/shared_types.go index ed4e2c4dc3..09fad0431a 100644 --- a/api/v1alpha1/shared_types.go +++ b/api/v1alpha1/shared_types.go @@ -100,9 +100,9 @@ type LLMRequestCost struct { MetadataKey string `json:"metadataKey"` // Type specifies the type of the request cost. The default is "OutputToken", // and it uses "output token" as the cost. The other types are "InputToken", "TotalToken", - // "CachedInputToken", "CachedCreationInputToken", and "CEL". + // "CachedInputToken", "CacheCreationInputToken", and "CEL". // - // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;CachedCreationInputToken;TotalToken;CEL + // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;CacheCreationInputToken;TotalToken;CEL Type LLMRequestCostType `json:"type"` // CEL is the CEL expression to calculate the cost of the request. // The CEL expression must return a signed or unsigned integer. If the @@ -114,7 +114,7 @@ type LLMRequestCost struct { // * backend: the backend name in the form of "name.namespace". Type: string. // * input_tokens: the number of input tokens. Type: unsigned integer. // * cached_input_tokens: the number of cached read input tokens. Type: unsigned integer. - // * cached_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer. + // * cache_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer. // * output_tokens: the number of output tokens. Type: unsigned integer. // * total_tokens: the total number of tokens. Type: unsigned integer. // @@ -122,7 +122,7 @@ type LLMRequestCost struct { // // * "model == 'llama' ? input_tokens + output_token * 0.5 : total_tokens" // * "backend == 'foo.default' ? input_tokens + output_tokens : total_tokens" - // * "backend == 'bar.default' ? 
(input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cached_creation_input_tokens * 1.25 + output_tokens : total_tokens" + // * "backend == 'bar.default' ? (input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cache_creation_input_tokens * 1.25 + output_tokens : total_tokens" + // * "input_tokens + output_tokens + total_tokens" + // * "input_tokens * output_tokens" + // @@ -138,8 +138,8 @@ const ( LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" // LLMRequestCostTypeCachedInputToken is the cost type of the cached input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" - // LLMRequestCostTypeCachedCreationInputToken is the cost type of the cached input token. - LLMRequestCostTypeCachedCreationInputToken LLMRequestCostType = "CachedCreationInputToken" + // LLMRequestCostTypeCacheCreationInputToken is the cost type of the cache creation input token. + LLMRequestCostTypeCacheCreationInputToken LLMRequestCostType = "CacheCreationInputToken" // LLMRequestCostTypeOutputToken is the cost type of the output token. LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken" // LLMRequestCostTypeTotalToken is the cost type of the total token. 
diff --git a/examples/token_ratelimit/token_ratelimit.yaml b/examples/token_ratelimit/token_ratelimit.yaml index 42b35a9a18..3743b0dacd 100644 --- a/examples/token_ratelimit/token_ratelimit.yaml +++ b/examples/token_ratelimit/token_ratelimit.yaml @@ -51,8 +51,8 @@ spec: type: InputToken - metadataKey: llm_cached_input_token type: CachedInputToken - - metadataKey: llm_cached_creation_input_token - type: CachedCreationInputToken + - metadataKey: llm_cache_creation_input_token + type: CacheCreationInputToken - metadataKey: llm_output_token type: OutputToken - metadataKey: llm_total_token diff --git a/internal/apischema/openai/openai.go b/internal/apischema/openai/openai.go index e5810218b4..56277e3563 100644 --- a/internal/apischema/openai/openai.go +++ b/internal/apischema/openai/openai.go @@ -1383,7 +1383,7 @@ type PromptTokensDetails struct { // Cached tokens present in the prompt. CachedTokens int `json:"cached_tokens,omitzero"` // Tokens written to the cache. - CachedCreationTokens int `json:"cached_creation_input_tokens,omitzero"` + CacheCreationTokens int `json:"cache_creation_input_tokens,omitzero"` } // ChatCompletionResponseChunk is described in the OpenAI API documentation: @@ -2539,7 +2539,7 @@ type ResponseUsageInputTokensDetails struct { CachedTokens int64 `json:"cached_tokens"` // The number of tokens that were written to the cache. - CachedCreationTokens int64 `json:"cached_creation_input_tokens"` + CacheCreationTokens int64 `json:"cache_creation_input_tokens"` } // A detailed breakdown of the output tokens. @@ -2553,8 +2553,8 @@ type ResponseTokensDetails struct { // CachedTokens: Number of cached tokens. CachedTokens int `json:"cached_tokens,omitempty"` //nolint:tagliatelle //follow openai api - // CachedCreationTokens: number of tokens that were written to the cache. - CachedCreationTokens int64 `json:"cache_creation_input_tokens"` //nolint:tagliatelle + // CacheCreationTokens: number of tokens that were written to the cache. 
+ CacheCreationTokens int64 `json:"cache_creation_input_tokens"` //nolint:tagliatelle // ReasoningTokens: Number of reasoning tokens (for reasoning models). ReasoningTokens int `json:"reasoning_tokens,omitempty"` //nolint:tagliatelle //follow openai api diff --git a/internal/apischema/openai/openai_test.go b/internal/apischema/openai/openai_test.go index 21f1fab649..44a1f2aba5 100644 --- a/internal/apischema/openai/openai_test.go +++ b/internal/apischema/openai/openai_test.go @@ -1742,30 +1742,30 @@ func TestPromptTokensDetails(t *testing.T) { { name: "with text tokens", details: PromptTokensDetails{ - TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, - CachedCreationTokens: 10, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CacheCreationTokens: 10, }, expected: `{ "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cached_creation_input_tokens": 10 + "cache_creation_input_tokens": 10 }`, }, { name: "with zero text tokens omitted", details: PromptTokensDetails{ - TextTokens: 0, - AudioTokens: 8, - CachedTokens: 384, - CachedCreationTokens: 10, + TextTokens: 0, + AudioTokens: 8, + CachedTokens: 384, + CacheCreationTokens: 10, }, expected: `{ "audio_tokens": 8, "cached_tokens": 384, - "cached_creation_input_tokens": 10 + "cache_creation_input_tokens": 10 }`, }, } @@ -1822,9 +1822,9 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - AudioTokens: 8, - CachedTokens: 384, - CachedCreationTokens: 13, + AudioTokens: 8, + CachedTokens: 384, + CacheCreationTokens: 13, }, }, expected: `{ @@ -1838,7 +1838,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "prompt_tokens_details": { "audio_tokens": 8, "cached_tokens": 384, - "cached_creation_input_tokens": 13 + "cache_creation_input_tokens": 13 } }`, }, @@ -1856,10 +1856,10 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - 
TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, - CachedCreationTokens: 21, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CacheCreationTokens: 21, }, }, expected: `{ @@ -1875,7 +1875,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cached_creation_input_tokens": 21 + "cache_creation_input_tokens": 21 } }`, }, diff --git a/internal/controller/gateway.go b/internal/controller/gateway.go index b2e577114f..7469dc711e 100644 --- a/internal/controller/gateway.go +++ b/internal/controller/gateway.go @@ -404,8 +404,8 @@ func (c *GatewayController) reconcileFilterConfigSecret( fc.Type = filterapi.LLMRequestCostTypeInputToken case aigv1a1.LLMRequestCostTypeCachedInputToken: fc.Type = filterapi.LLMRequestCostTypeCachedInputToken - case aigv1a1.LLMRequestCostTypeCachedCreationInputToken: - fc.Type = filterapi.LLMRequestCostTypeCachedCreationInputToken + case aigv1a1.LLMRequestCostTypeCacheCreationInputToken: + fc.Type = filterapi.LLMRequestCostTypeCacheCreationInputToken case aigv1a1.LLMRequestCostTypeOutputToken: fc.Type = filterapi.LLMRequestCostTypeOutputToken case aigv1a1.LLMRequestCostTypeTotalToken: diff --git a/internal/controller/gateway_test.go b/internal/controller/gateway_test.go index fb26991883..e2f98b88c6 100644 --- a/internal/controller/gateway_test.go +++ b/internal/controller/gateway_test.go @@ -197,7 +197,7 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { {MetadataKey: "bar", Type: aigv1a1.LLMRequestCostTypeOutputToken}, {MetadataKey: "baz", Type: aigv1a1.LLMRequestCostTypeTotalToken}, {MetadataKey: "qux", Type: aigv1a1.LLMRequestCostTypeCachedInputToken}, - {MetadataKey: "zoo", Type: aigv1a1.LLMRequestCostTypeCachedCreationInputToken}, + {MetadataKey: "zoo", Type: aigv1a1.LLMRequestCostTypeCacheCreationInputToken}, }, }, }, @@ -280,7 +280,7 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { require.Equal(t, 
filterapi.LLMRequestCostTypeOutputToken, fc.LLMRequestCosts[1].Type) require.Equal(t, filterapi.LLMRequestCostTypeTotalToken, fc.LLMRequestCosts[2].Type) require.Equal(t, filterapi.LLMRequestCostTypeCachedInputToken, fc.LLMRequestCosts[3].Type) - require.Equal(t, filterapi.LLMRequestCostTypeCachedCreationInputToken, fc.LLMRequestCosts[4].Type) + require.Equal(t, filterapi.LLMRequestCostTypeCacheCreationInputToken, fc.LLMRequestCosts[4].Type) require.Equal(t, filterapi.LLMRequestCostTypeCEL, fc.LLMRequestCosts[5].Type) require.Equal(t, `backend == 'foo.default' ? input_tokens + output_tokens : total_tokens`, fc.LLMRequestCosts[5].CEL) require.Len(t, fc.Models, 1) diff --git a/internal/extproc/mocks_test.go b/internal/extproc/mocks_test.go index 6d5087eb9d..fd375c5192 100644 --- a/internal/extproc/mocks_test.go +++ b/internal/extproc/mocks_test.go @@ -171,17 +171,17 @@ func (m *mockMetricsFactory) NewMetrics() metrics.Metrics { // mockMetrics implements [metrics.Metrics] for testing. type mockMetrics struct { - requestStart time.Time - originalModel string - requestModel string - responseModel string - backend string - requestSuccessCount int - requestErrorCount int - inputTokenCount int - cachedInputTokenCount int - cachedCreationInputTokenCount int - outputTokenCount int + requestStart time.Time + originalModel string + requestModel string + responseModel string + backend string + requestSuccessCount int + requestErrorCount int + inputTokenCount int + cachedInputTokenCount int + cacheCreationInputTokenCount int + outputTokenCount int // streamingOutputTokens tracks the cumulative output tokens recorded via RecordTokenLatency. 
streamingOutputTokens int timeToFirstToken float64 @@ -219,8 +219,8 @@ func (m *mockMetrics) RecordTokenUsage(_ context.Context, usage metrics.TokenUsa if cachedInput, ok := usage.CachedInputTokens(); ok { m.cachedInputTokenCount += int(cachedInput) } - if cachedCreationInput, ok := usage.CachedCreationInputTokens(); ok { - m.cachedCreationInputTokenCount += int(cachedCreationInput) + if cacheCreationInput, ok := usage.CacheCreationInputTokens(); ok { + m.cacheCreationInputTokenCount += int(cacheCreationInput) } if output, ok := usage.OutputTokens(); ok { m.outputTokenCount += int(output) @@ -285,7 +285,7 @@ func (m *mockMetrics) RequireRequestFailure(t *testing.T) { func (m *mockMetrics) RequireTokensRecorded(t *testing.T, expectedInput, expectedCachedInput, expectedWriteCachedInput, expectedOutput int) { require.Equal(t, expectedInput, m.inputTokenCount) require.Equal(t, expectedCachedInput, m.cachedInputTokenCount) - require.Equal(t, expectedWriteCachedInput, m.cachedCreationInputTokenCount) + require.Equal(t, expectedWriteCachedInput, m.cacheCreationInputTokenCount) require.Equal(t, expectedOutput, m.outputTokenCount) } diff --git a/internal/extproc/processor_impl.go b/internal/extproc/processor_impl.go index 729268644c..c7cf651fb9 100644 --- a/internal/extproc/processor_impl.go +++ b/internal/extproc/processor_impl.go @@ -533,8 +533,8 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU cost, _ = costs.InputTokens() case filterapi.LLMRequestCostTypeCachedInputToken: cost, _ = costs.CachedInputTokens() - case filterapi.LLMRequestCostTypeCachedCreationInputToken: - cost, _ = costs.CachedCreationInputTokens() + case filterapi.LLMRequestCostTypeCacheCreationInputToken: + cost, _ = costs.CacheCreationInputTokens() case filterapi.LLMRequestCostTypeOutputToken: cost, _ = costs.OutputTokens() case filterapi.LLMRequestCostTypeTotalToken: @@ -542,7 +542,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU 
case filterapi.LLMRequestCostTypeCEL: in, _ := costs.InputTokens() cachedIn, _ := costs.CachedInputTokens() - cachedCreation, _ := costs.CachedCreationInputTokens() + cacheCreation, _ := costs.CacheCreationInputTokens() out, _ := costs.OutputTokens() total, _ := costs.TotalTokens() costU64, err := llmcostcel.EvaluateProgram( @@ -551,7 +551,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU backendName, in, cachedIn, - cachedCreation, + cacheCreation, out, total, ) diff --git a/internal/extproc/processor_impl_test.go b/internal/extproc/processor_impl_test.go index 60f612fa17..26855743ff 100644 --- a/internal/extproc/processor_impl_test.go +++ b/internal/extproc/processor_impl_test.go @@ -259,7 +259,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.retUsedToken.SetOutputTokens(123) mt.retUsedToken.SetInputTokens(1) mt.retUsedToken.SetCachedInputTokens(1) - mt.retUsedToken.SetCachedCreationInputTokens(3) + mt.retUsedToken.SetCacheCreationInputTokens(3) celProgInt, err := llmcostcel.NewProgram("54321") require.NoError(t, err) @@ -275,7 +275,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeOutputToken, MetadataKey: "output_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeInputToken, MetadataKey: "input_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedInputToken, MetadataKey: "cached_input_token_usage"}}, - {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedCreationInputToken, MetadataKey: "cached_creation_input_token_usage"}}, + {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCacheCreationInputToken, MetadataKey: "cache_creation_input_token_usage"}}, { CELProg: celProgInt, LLMRequestCost: &filterapi.LLMRequestCost{Type: 
filterapi.LLMRequestCostTypeCEL, MetadataKey: "cel_int"}, @@ -312,7 +312,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, float64(1), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cached_input_token_usage"].GetNumberValue()) require.Equal(t, float64(3), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. - GetStructValue().Fields["cached_creation_input_token_usage"].GetNumberValue()) + GetStructValue().Fields["cache_creation_input_token_usage"].GetNumberValue()) require.Equal(t, float64(54321), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cel_int"].GetNumberValue()) require.Equal(t, float64(9999), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. @@ -375,7 +375,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.expResponseBody = final mt.retUsedToken.SetInputTokens(5) mt.retUsedToken.SetCachedInputTokens(3) - mt.retUsedToken.SetCachedCreationInputTokens(21) + mt.retUsedToken.SetCacheCreationInputTokens(21) mt.retUsedToken.SetOutputTokens(138) mt.retUsedToken.SetTotalTokens(143) _, err = p.ProcessResponseBody(t.Context(), final) @@ -385,7 +385,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, 138, mm.outputTokenCount) require.Equal(t, 138, mm.streamingOutputTokens) // accumulated output tokens from stream require.Equal(t, 3, mm.cachedInputTokenCount) - require.Equal(t, 21, mm.cachedCreationInputTokenCount) + require.Equal(t, 21, mm.cacheCreationInputTokenCount) }) } diff --git a/internal/filterapi/filterconfig.go b/internal/filterapi/filterconfig.go index b2f10de51f..947cce5f4f 100644 --- a/internal/filterapi/filterconfig.go +++ b/internal/filterapi/filterconfig.go @@ -81,8 +81,8 @@ const ( LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" // LLMRequestCostTypeCachedInputToken specifies that the request cost is calculated from 
the cached read input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" - // LLMRequestCostTypeCachedCreationInputToken specifies that the request cost is calculated from the cache creation input token. - LLMRequestCostTypeCachedCreationInputToken LLMRequestCostType = "CachedCreationInputToken" + // LLMRequestCostTypeCacheCreationInputToken specifies that the request cost is calculated from the cache creation input token. + LLMRequestCostTypeCacheCreationInputToken LLMRequestCostType = "CacheCreationInputToken" // LLMRequestCostTypeTotalToken specifies that the request cost is calculated from the total token. LLMRequestCostTypeTotalToken LLMRequestCostType = "TotalToken" // LLMRequestCostTypeCEL specifies that the request cost is calculated from the CEL expression. diff --git a/internal/llmcostcel/cel.go b/internal/llmcostcel/cel.go index 46d06c7130..c2ad384268 100644 --- a/internal/llmcostcel/cel.go +++ b/internal/llmcostcel/cel.go @@ -16,13 +16,13 @@ import ( ) const ( - celModelNameKey = "model" - celBackendKey = "backend" - celInputTokensKey = "input_tokens" - celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 - celCachedCreationInputTokensKey = "cached_creation_input_tokens" // #nosec G101 - celOutputTokensKey = "output_tokens" - celTotalTokensKey = "total_tokens" + celModelNameKey = "model" + celBackendKey = "backend" + celInputTokensKey = "input_tokens" + celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 + celCacheCreationInputTokensKey = "cache_creation_input_tokens" // #nosec G101 + celOutputTokensKey = "output_tokens" + celTotalTokensKey = "total_tokens" ) var env *cel.Env @@ -34,7 +34,7 @@ func init() { cel.Variable(celBackendKey, cel.StringType), cel.Variable(celInputTokensKey, cel.UintType), cel.Variable(celCachedInputTokensKey, cel.UintType), - cel.Variable(celCachedCreationInputTokensKey, cel.UintType), + cel.Variable(celCacheCreationInputTokensKey, cel.UintType), 
cel.Variable(celOutputTokensKey, cel.UintType), cel.Variable(celTotalTokensKey, cel.UintType), ) @@ -64,15 +64,15 @@ func NewProgram(expr string) (prog cel.Program, err error) { } // EvaluateProgram evaluates the given CEL program with the given variables. -func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, cachedCreationInputTokens, outputTokens, totalTokens uint32) (uint64, error) { +func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, cacheCreationInputTokens, outputTokens, totalTokens uint32) (uint64, error) { out, _, err := prog.Eval(map[string]any{ - celModelNameKey: modelName, - celBackendKey: backend, - celInputTokensKey: inputTokens, - celCachedInputTokensKey: cachedInputTokens, - celCachedCreationInputTokensKey: cachedCreationInputTokens, - celOutputTokensKey: outputTokens, - celTotalTokensKey: totalTokens, + celModelNameKey: modelName, + celBackendKey: backend, + celInputTokensKey: inputTokens, + celCachedInputTokensKey: cachedInputTokens, + celCacheCreationInputTokensKey: cacheCreationInputTokens, + celOutputTokensKey: outputTokens, + celTotalTokensKey: totalTokens, }) if err != nil || out == nil { return 0, fmt.Errorf("failed to evaluate CEL expression: %w", err) diff --git a/internal/llmcostcel/cel_test.go b/internal/llmcostcel/cel_test.go index 92a323fc63..cee9a259a5 100644 --- a/internal/llmcostcel/cel_test.go +++ b/internal/llmcostcel/cel_test.go @@ -26,7 +26,7 @@ func TestNewProgram(t *testing.T) { require.NoError(t, err) }) t.Run("variables", func(t *testing.T) { - prog, err := NewProgram("model == 'cool_model' ? (input_tokens - cached_input_tokens - cached_creation_input_tokens) * output_tokens : total_tokens") + prog, err := NewProgram("model == 'cool_model' ? 
(input_tokens - cached_input_tokens - cache_creation_input_tokens) * output_tokens : total_tokens") require.NoError(t, err) v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 200, 100, 1, 2, 3) require.NoError(t, err) diff --git a/internal/metrics/genai.go b/internal/metrics/genai.go index f739bf8764..de560f77bb 100644 --- a/internal/metrics/genai.go +++ b/internal/metrics/genai.go @@ -39,9 +39,9 @@ const ( // https://github.com/open-telemetry/semantic-conventions/issues/1959 // // However, the spec says "a custom value MAY be used.", so we can use it now. - genaiTokenTypeCachedInput = "cached_input" - genaiTokenTypeCachedCreationInput = "cached_creation_input" - genaiErrorTypeFallback = "_OTHER" + genaiTokenTypeCachedInput = "cached_input" + genaiTokenTypeCacheCreationInput = "cache_creation_input" + genaiErrorTypeFallback = "_OTHER" ) // GenAIOperation represents the type of generative AI operation i.e. the endpoint being called. diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index a68d810dac..e9929df812 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -149,10 +149,10 @@ type TokenUsage struct { totalTokens uint32 // CachedInputTokens is the total number of tokens read from cache. cachedInputTokens uint32 - // CachedCreationInputTokens is the total number of tokens written to cache. - cachedCreationInputTokens uint32 + // CacheCreationInputTokens is the total number of tokens written to cache. + cacheCreationInputTokens uint32 - inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet, cachedCreationInputTokenSet bool + inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet, cacheCreationInputTokenSet bool } // InputTokens returns the number of input tokens and whether it was set. 
@@ -175,9 +175,9 @@ func (u *TokenUsage) CachedInputTokens() (uint32, bool) { return u.cachedInputTokens, u.cachedInputTokenSet } -// CachedCreationInputTokens returns the number of cache creation input tokens and whether it was set. -func (u *TokenUsage) CachedCreationInputTokens() (uint32, bool) { - return u.cachedCreationInputTokens, u.cachedCreationInputTokenSet +// CacheCreationInputTokens returns the number of cache creation input tokens and whether it was set. +func (u *TokenUsage) CacheCreationInputTokens() (uint32, bool) { + return u.cacheCreationInputTokens, u.cacheCreationInputTokenSet } // SetInputTokens sets the number of input tokens and marks the field as set. @@ -204,10 +204,10 @@ func (u *TokenUsage) SetCachedInputTokens(tokens uint32) { u.cachedInputTokenSet = true } -// SetCachedCreationInputTokens sets the number of cache creation input tokens and marks the field as set. -func (u *TokenUsage) SetCachedCreationInputTokens(tokens uint32) { - u.cachedCreationInputTokens = tokens - u.cachedCreationInputTokenSet = true +// SetCacheCreationInputTokens sets the number of cache creation input tokens and marks the field as set. +func (u *TokenUsage) SetCacheCreationInputTokens(tokens uint32) { + u.cacheCreationInputTokens = tokens + u.cacheCreationInputTokenSet = true } // AddInputTokens increments the recorded input tokens and marks the field as set. @@ -228,10 +228,10 @@ func (u *TokenUsage) AddCachedInputTokens(tokens uint32) { u.cachedInputTokens += tokens } -// AddCachedCreationInputTokens increments the recorded cache creation input tokens and marks the field as set. -func (u *TokenUsage) AddCachedCreationInputTokens(tokens uint32) { - u.cachedCreationInputTokenSet = true - u.cachedCreationInputTokens += tokens +// AddCacheCreationInputTokens increments the recorded cache creation input tokens and marks the field as set. 
+func (u *TokenUsage) AddCacheCreationInputTokens(tokens uint32) { + u.cacheCreationInputTokenSet = true + u.cacheCreationInputTokens += tokens } // Override updates the TokenUsage fields with values from another TokenUsage instance. @@ -253,27 +253,27 @@ func (u *TokenUsage) Override(other TokenUsage) { u.cachedInputTokens = other.cachedInputTokens u.cachedInputTokenSet = true } - if other.cachedCreationInputTokenSet { - u.cachedCreationInputTokens = other.cachedCreationInputTokens - u.cachedCreationInputTokenSet = true + if other.cacheCreationInputTokenSet { + u.cacheCreationInputTokens = other.cacheCreationInputTokens + u.cacheCreationInputTokenSet = true } } // ExtractTokenUsageFromAnthropic extracts the correct token usage from Anthropic API response. // According to Claude API documentation, total input tokens is the summation of: -// input_tokens + cached_creation_input_tokens + cache_read_input_tokens +// input_tokens + cache_creation_input_tokens + cache_read_input_tokens // // This function works for both streaming and non-streaming responses by accepting // the common usage fields that exist in all Anthropic usage structures. 
-func ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cachedCreationTokens int64) TokenUsage { +func ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cacheCreationTokens int64) TokenUsage { // Calculate total input tokens as per Anthropic API documentation - totalInputTokens := inputTokens + cachedCreationTokens + cacheReadTokens + totalInputTokens := inputTokens + cacheCreationTokens + cacheReadTokens var usage TokenUsage - usage.SetInputTokens(uint32(totalInputTokens)) //nolint:gosec - usage.SetOutputTokens(uint32(outputTokens)) //nolint:gosec - usage.SetTotalTokens(uint32(totalInputTokens + outputTokens)) //nolint:gosec - usage.SetCachedInputTokens(uint32(cacheReadTokens)) //nolint:gosec - usage.SetCachedCreationInputTokens(uint32(cachedCreationTokens)) //nolint:gosec + usage.SetInputTokens(uint32(totalInputTokens)) //nolint:gosec + usage.SetOutputTokens(uint32(outputTokens)) //nolint:gosec + usage.SetTotalTokens(uint32(totalInputTokens + outputTokens)) //nolint:gosec + usage.SetCachedInputTokens(uint32(cacheReadTokens)) //nolint:gosec + usage.SetCacheCreationInputTokens(uint32(cacheCreationTokens)) //nolint:gosec return usage } diff --git a/internal/metrics/metrics_impl.go b/internal/metrics/metrics_impl.go index 8f13a50104..cbf8748467 100644 --- a/internal/metrics/metrics_impl.go +++ b/internal/metrics/metrics_impl.go @@ -148,10 +148,10 @@ func (b *metricsImpl) RecordTokenUsage(ctx context.Context, usage TokenUsage, re metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput)), ) } - if cachedCreationInputTokens, ok := usage.CachedCreationInputTokens(); ok { - b.metrics.tokenUsage.Record(ctx, float64(cachedCreationInputTokens), + if cacheCreationInputTokens, ok := usage.CacheCreationInputTokens(); ok { + b.metrics.tokenUsage.Record(ctx, float64(cacheCreationInputTokens), metric.WithAttributeSet(attrs), - 
metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput)), + metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCacheCreationInput)), ) } if outputTokens, ok := usage.OutputTokens(); ok { diff --git a/internal/metrics/metrics_impl_test.go b/internal/metrics/metrics_impl_test.go index bcf6bc8282..720fa5899d 100644 --- a/internal/metrics/metrics_impl_test.go +++ b/internal/metrics/metrics_impl_test.go @@ -71,10 +71,10 @@ func TestRecordTokenUsage(t *testing.T) { attribute.Key(genaiAttributeResponseModel).String("test-model"), } // gen_ai.token.type values - https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-metrics/#common-attributes - inputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...) - outputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...) - cachedInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...) - cachedCreationInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput))...) + inputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...) + outputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...) + cachedInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...) + cacheCreationInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCacheCreationInput))...) 
) pm.SetOriginalModel("test-model") @@ -82,8 +82,8 @@ func TestRecordTokenUsage(t *testing.T) { pm.SetResponseModel("test-model") pm.SetBackend(&filterapi.Backend{Schema: filterapi.VersionedAPISchema{Name: filterapi.APISchemaOpenAI}}) pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 10, cachedInputTokens: 8, cachedCreationInputTokens: 2, outputTokens: 5, - inputTokenSet: true, cachedInputTokenSet: true, cachedCreationInputTokenSet: true, outputTokenSet: true, + inputTokens: 10, cachedInputTokens: 8, cacheCreationInputTokens: 2, outputTokens: 5, + inputTokenSet: true, cachedInputTokenSet: true, cacheCreationInputTokenSet: true, outputTokenSet: true, }, nil) count, sum := testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, inputAttrs) @@ -94,7 +94,7 @@ func TestRecordTokenUsage(t *testing.T) { assert.Equal(t, uint64(1), count) assert.Equal(t, 8.0, sum) - count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedCreationInputAttrs) + count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, cacheCreationInputAttrs) assert.Equal(t, uint64(1), count) assert.Equal(t, 2.0, sum) @@ -300,8 +300,8 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { pm.SetRequestModel("req-model") pm.SetResponseModel("res-model") pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 2, cachedInputTokens: 1, cachedCreationInputTokens: 6, outputTokens: 3, - inputTokenSet: true, cachedInputTokenSet: true, cachedCreationInputTokenSet: true, outputTokenSet: true, + inputTokens: 2, cachedInputTokens: 1, cacheCreationInputTokens: 6, outputTokens: 3, + inputTokenSet: true, cachedInputTokenSet: true, cacheCreationInputTokenSet: true, outputTokenSet: true, }, nil) inputAttrs := attribute.NewSet( @@ -328,15 +328,15 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { assert.Equal(t, uint64(1), count) assert.Equal(t, 1.0, sum) - cachedCreationInputAttrs := attribute.NewSet( + cacheCreationInputAttrs 
:= attribute.NewSet( attribute.Key(genaiAttributeOperationName).String(string(GenAIOperationCompletion)), attribute.Key(genaiAttributeProviderName).String(genaiProviderOpenAI), attribute.Key(genaiAttributeOriginalModel).String("orig-model"), attribute.Key(genaiAttributeRequestModel).String("req-model"), attribute.Key(genaiAttributeResponseModel).String("res-model"), - attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput), + attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCacheCreationInput), ) - count, sum = getHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedCreationInputAttrs) + count, sum = getHistogramValues(t, mr, genaiMetricClientTokenUsage, cacheCreationInputAttrs) assert.Equal(t, uint64(1), count) assert.Equal(t, 6.0, sum) diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index ec6940056f..c513611b46 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -215,7 +215,7 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer ) input, _ := cost.InputTokens() cacheRead, _ := cost.CachedInputTokens() - cacheCreation, _ := cost.CachedCreationInputTokens() + cacheCreation, _ := cost.CacheCreationInputTokens() output, _ := cost.OutputTokens() total, _ := cost.TotalTokens() diff --git a/internal/tracing/openinference/openai/response_attrs.go b/internal/tracing/openinference/openai/response_attrs.go index 5cd3561401..c629430c7a 100644 --- a/internal/tracing/openinference/openai/response_attrs.go +++ b/internal/tracing/openinference/openai/response_attrs.go @@ -58,7 +58,7 @@ func buildResponseAttributes(resp *openai.ChatCompletionResponse, config *openin attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptAudio, td.AudioTokens), attribute.Int(openinference.LLMTokenCountPromptCacheHit, td.CachedTokens), - 
attribute.Int(openinference.LLMTokenCountPromptCacheWrite, td.CachedCreationTokens), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, td.CacheCreationTokens), ) } } @@ -194,8 +194,8 @@ func buildResponsesResponseAttributes(resp *openai.Response, _ *openinference.Tr if resp.Usage.InputTokensDetails.CachedTokens > 0 { attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheHit, int(resp.Usage.InputTokensDetails.CachedTokens))) } - if resp.Usage.InputTokensDetails.CachedCreationTokens > 0 { - attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(resp.Usage.InputTokensDetails.CachedCreationTokens))) + if resp.Usage.InputTokensDetails.CacheCreationTokens > 0 { + attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(resp.Usage.InputTokensDetails.CacheCreationTokens))) } } diff --git a/internal/tracing/openinference/openai/responses_test.go b/internal/tracing/openinference/openai/responses_test.go index 3191c6e82d..3d15ceaabc 100644 --- a/internal/tracing/openinference/openai/responses_test.go +++ b/internal/tracing/openinference/openai/responses_test.go @@ -77,8 +77,8 @@ var ( Usage: &openai.ResponseUsage{ InputTokens: 100, InputTokensDetails: openai.ResponseUsageInputTokensDetails{ - CachedTokens: 10, - CachedCreationTokens: 50, + CachedTokens: 10, + CacheCreationTokens: 50, }, OutputTokens: 25, TotalTokens: 125, diff --git a/internal/tracing/openinference/openinference.go b/internal/tracing/openinference/openinference.go index 23fea486a9..f0f6d65ece 100644 --- a/internal/tracing/openinference/openinference.go +++ b/internal/tracing/openinference/openinference.go @@ -163,7 +163,7 @@ const ( // LLMTokenCountPromptCacheWrite represents the number of prompt tokens // created to cache (cache write). This enables tracking of cache efficiency // and cost savings from cached prompts. 
- LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cached_creation" // #nosec G101 + LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cache_creation" // #nosec G101 // LLMTokenCountPromptAudio represents the number of audio tokens in the prompt. // Used for multimodal models that support audio input. diff --git a/internal/translator/anthropic_anthropic.go b/internal/translator/anthropic_anthropic.go index 1f5c8d5bf0..0a5294a955 100644 --- a/internal/translator/anthropic_anthropic.go +++ b/internal/translator/anthropic_anthropic.go @@ -181,8 +181,8 @@ func (a *anthropicToAnthropicTranslator) updateTotalTokens() { if _, cachedSet := a.streamingTokenUsage.CachedInputTokens(); !cachedSet { a.streamingTokenUsage.SetCachedInputTokens(0) } - if _, cachedSet := a.streamingTokenUsage.CachedCreationInputTokens(); !cachedSet { - a.streamingTokenUsage.SetCachedCreationInputTokens(0) + if _, cachedSet := a.streamingTokenUsage.CacheCreationInputTokens(); !cachedSet { + a.streamingTokenUsage.SetCacheCreationInputTokens(0) } } diff --git a/internal/translator/anthropic_anthropic_test.go b/internal/translator/anthropic_anthropic_test.go index 6d7e12f7ab..236dd537d9 100644 --- a/internal/translator/anthropic_anthropic_test.go +++ b/internal/translator/anthropic_anthropic_test.go @@ -89,7 +89,7 @@ func TestAnthropicToAnthropic_ResponseHeaders(t *testing.T) { func TestAnthropicToAnthropic_ResponseBody_non_streaming(t *testing.T) { translator := NewAnthropicToAnthropicTranslator("", "") require.NotNil(t, translator) - const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 
👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` + const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` headerMutation, bodyMutation, tokenUsage, responseModel, err := translator.ResponseBody(nil, strings.NewReader(responseBody), true, nil) require.NoError(t, err) @@ -108,7 +108,7 @@ func TestAnthropicToAnthropic_ResponseBody_streaming(t *testing.T) { // We split the response into two parts to simulate streaming where each part can end in the // middle of an event. 
const responseHead = `event: message_start -data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":1,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } +data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":1,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } event: content_block_start data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index 187ed4bbc5..ad6a249af8 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -570,7 +570,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingEdgeCases(t *te } } -func tokenUsageFrom(in, cachedInput, cachedCreationInput, out, total int32) metrics.TokenUsage { +func tokenUsageFrom(in, cachedInput, cacheCreationInput, out, total int32) metrics.TokenUsage { var usage metrics.TokenUsage if in >= 0 { usage.SetInputTokens(uint32(in)) @@ -578,8 +578,8 @@ func tokenUsageFrom(in, cachedInput, cachedCreationInput, out, total int32) metr if cachedInput >= 0 { usage.SetCachedInputTokens(uint32(cachedInput)) } - if cachedCreationInput >= 0 { - usage.SetCachedCreationInputTokens(uint32(cachedCreationInput)) + if cacheCreationInput >= 0 { + 
usage.SetCacheCreationInputTokens(uint32(cacheCreationInput)) } if out >= 0 { usage.SetOutputTokens(uint32(out)) @@ -638,7 +638,7 @@ data: {"type": "message_stop"} outputTokens, outputSet := tokenUsage.OutputTokens() totalTokens, totalSet := tokenUsage.TotalTokens() cachedTokens, cachedSet := tokenUsage.CachedInputTokens() - cachedCreationTokens, cachedCreationSet := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, cacheCreationSet := tokenUsage.CacheCreationInputTokens() // Assertions assert.True(t, inputSet, "Input tokens should be set") @@ -653,8 +653,8 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedCreationSet, "cache creation tokens should be set") - assert.Equal(t, uint32(1), cachedCreationTokens, "No cache creation tokens in this scenario") + assert.True(t, cacheCreationSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cacheCreationTokens, "No cache creation tokens in this scenario") _, _, tokenUsage, _, err = translator.ResponseBody(nil, strings.NewReader(contentBlockStartChunk), false, nil) require.NoError(t, err) @@ -672,7 +672,7 @@ data: {"type": "message_stop"} outputTokens, outputSet = tokenUsage.OutputTokens() totalTokens, totalSet = tokenUsage.TotalTokens() cachedTokens, cachedSet = tokenUsage.CachedInputTokens() - cachedCreationTokens, cachedCreationSet = tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, cacheCreationSet = tokenUsage.CacheCreationInputTokens() assert.True(t, inputSet, "Input tokens should be set") assert.Equal(t, uint32(21), inputTokens, "Input tokens should be preserved from message_start") @@ -686,6 +686,6 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedCreationSet, "cache creation tokens should be 
set") - assert.Equal(t, uint32(1), cachedCreationTokens, "No cache creation tokens in this scenario") + assert.True(t, cacheCreationSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cacheCreationTokens, "No cache creation tokens in this scenario") } diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 86e7a50772..cb75e8a7d0 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -16,88 +16,88 @@ import ( func TestExtractLLMTokenUsage(t *testing.T) { tests := []struct { - name string - inputTokens int64 - outputTokens int64 - cacheReadTokens int64 - cachedCreationTokens int64 - expectedInputTokens uint32 - expectedOutputTokens uint32 - expectedTotalTokens uint32 - expectedCachedTokens uint32 - expectedCachedCreationTokens uint32 + name string + inputTokens int64 + outputTokens int64 + cacheReadTokens int64 + cacheCreationTokens int64 + expectedInputTokens uint32 + expectedOutputTokens uint32 + expectedTotalTokens uint32 + expectedCachedTokens uint32 + expectedCacheCreationTokens uint32 }{ { - name: "basic usage without cache", - inputTokens: 100, - outputTokens: 50, - cacheReadTokens: 0, - cachedCreationTokens: 0, - expectedInputTokens: 100, - expectedOutputTokens: 50, - expectedTotalTokens: 150, - expectedCachedTokens: 0, - expectedCachedCreationTokens: 0, + name: "basic usage without cache", + inputTokens: 100, + outputTokens: 50, + cacheReadTokens: 0, + cacheCreationTokens: 0, + expectedInputTokens: 100, + expectedOutputTokens: 50, + expectedTotalTokens: 150, + expectedCachedTokens: 0, + expectedCacheCreationTokens: 0, }, { - name: "usage with cache read tokens", - inputTokens: 80, - outputTokens: 30, - cacheReadTokens: 20, - cachedCreationTokens: 0, - expectedInputTokens: 100, // 80 + 0 + 20 - expectedOutputTokens: 30, - expectedTotalTokens: 130, // 100 + 30 - expectedCachedTokens: 20, // 20 - expectedCachedCreationTokens: 0, + 
name: "usage with cache read tokens", + inputTokens: 80, + outputTokens: 30, + cacheReadTokens: 20, + cacheCreationTokens: 0, + expectedInputTokens: 100, // 80 + 0 + 20 + expectedOutputTokens: 30, + expectedTotalTokens: 130, // 100 + 30 + expectedCachedTokens: 20, // 20 + expectedCacheCreationTokens: 0, }, { - name: "usage with cache creation tokens", - inputTokens: 60, - outputTokens: 40, - cacheReadTokens: 0, - cachedCreationTokens: 15, - expectedInputTokens: 75, // 60 + 15 + 0 - expectedOutputTokens: 40, - expectedTotalTokens: 115, // 75 + 40 - expectedCachedTokens: 0, // 0 - expectedCachedCreationTokens: 15, // 15 + name: "usage with cache creation tokens", + inputTokens: 60, + outputTokens: 40, + cacheReadTokens: 0, + cacheCreationTokens: 15, + expectedInputTokens: 75, // 60 + 15 + 0 + expectedOutputTokens: 40, + expectedTotalTokens: 115, // 75 + 40 + expectedCachedTokens: 0, // 0 + expectedCacheCreationTokens: 15, // 15 }, { - name: "usage with both cache types", - inputTokens: 70, - outputTokens: 25, - cacheReadTokens: 10, - cachedCreationTokens: 5, - expectedInputTokens: 85, // 70 + 5 + 10 - expectedOutputTokens: 25, - expectedTotalTokens: 110, // 85 + 25 - expectedCachedTokens: 10, // 10 - expectedCachedCreationTokens: 5, // 5 + name: "usage with both cache types", + inputTokens: 70, + outputTokens: 25, + cacheReadTokens: 10, + cacheCreationTokens: 5, + expectedInputTokens: 85, // 70 + 5 + 10 + expectedOutputTokens: 25, + expectedTotalTokens: 110, // 85 + 25 + expectedCachedTokens: 10, // 10 + expectedCacheCreationTokens: 5, // 5 }, { - name: "zero values", - inputTokens: 0, - outputTokens: 0, - cacheReadTokens: 0, - cachedCreationTokens: 0, - expectedInputTokens: 0, - expectedOutputTokens: 0, - expectedTotalTokens: 0, - expectedCachedTokens: 0, - expectedCachedCreationTokens: 0, + name: "zero values", + inputTokens: 0, + outputTokens: 0, + cacheReadTokens: 0, + cacheCreationTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 0, + 
expectedTotalTokens: 0, + expectedCachedTokens: 0, + expectedCacheCreationTokens: 0, }, { - name: "large values", - inputTokens: 100000, - outputTokens: 50000, - cacheReadTokens: 25000, - cachedCreationTokens: 15000, - expectedInputTokens: 140000, // 100000 + 15000 + 25000 - expectedOutputTokens: 50000, - expectedTotalTokens: 190000, // 140000 + 50000 - expectedCachedTokens: 25000, // 25000 - expectedCachedCreationTokens: 15000, + name: "large values", + inputTokens: 100000, + outputTokens: 50000, + cacheReadTokens: 25000, + cacheCreationTokens: 15000, + expectedInputTokens: 140000, // 100000 + 15000 + 25000 + expectedOutputTokens: 50000, + expectedTotalTokens: 190000, // 140000 + 50000 + expectedCachedTokens: 25000, // 25000 + expectedCacheCreationTokens: 15000, }, } @@ -107,15 +107,15 @@ func TestExtractLLMTokenUsage(t *testing.T) { tt.inputTokens, tt.outputTokens, tt.cacheReadTokens, - tt.cachedCreationTokens, + tt.cacheCreationTokens, ) expected := tokenUsageFrom( - int32(tt.expectedInputTokens), // nolint:gosec - int32(tt.expectedCachedTokens), // nolint:gosec - int32(tt.expectedCachedCreationTokens), // nolint:gosec - int32(tt.expectedOutputTokens), // nolint:gosec - int32(tt.expectedTotalTokens), // nolint:gosec + int32(tt.expectedInputTokens), // nolint:gosec + int32(tt.expectedCachedTokens), // nolint:gosec + int32(tt.expectedCacheCreationTokens), // nolint:gosec + int32(tt.expectedOutputTokens), // nolint:gosec + int32(tt.expectedTotalTokens), // nolint:gosec ) assert.Equal(t, expected, result) }) @@ -124,13 +124,13 @@ func TestExtractLLMTokenUsage(t *testing.T) { func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.Usage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 - expectedCachedCreationTokens uint32 + name string + usage anthropic.Usage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + 
expectedCachedTokens uint32 + expectedCacheCreationTokens uint32 }{ { name: "non-streaming response without cache", @@ -140,11 +140,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 0, CacheCreationInputTokens: 0, }, - expectedInputTokens: 150, - expectedOutputTokens: 75, - expectedTotalTokens: 225, - expectedCachedTokens: 0, - expectedCachedCreationTokens: 0, + expectedInputTokens: 150, + expectedOutputTokens: 75, + expectedTotalTokens: 225, + expectedCachedTokens: 0, + expectedCacheCreationTokens: 0, }, { name: "non-streaming response with cache read", @@ -154,11 +154,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 25, CacheCreationInputTokens: 0, }, - expectedInputTokens: 125, // 100 + 0 + 25 - expectedOutputTokens: 50, - expectedTotalTokens: 175, // 125 + 50 - expectedCachedTokens: 25, // 25 - expectedCachedCreationTokens: 0, // 0 + expectedInputTokens: 125, // 100 + 0 + 25 + expectedOutputTokens: 50, + expectedTotalTokens: 175, // 125 + 50 + expectedCachedTokens: 25, // 25 + expectedCacheCreationTokens: 0, // 0 }, { name: "non-streaming response with both cache types", @@ -168,11 +168,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 15, CacheCreationInputTokens: 10, }, - expectedInputTokens: 115, // 90 + 10 + 15 - expectedOutputTokens: 60, - expectedTotalTokens: 175, // 115 + 60 - expectedCachedTokens: 15, // 15 - expectedCachedCreationTokens: 10, // 10 + expectedInputTokens: 115, // 90 + 10 + 15 + expectedOutputTokens: 60, + expectedTotalTokens: 175, // 115 + 60 + expectedCachedTokens: 15, // 15 + expectedCacheCreationTokens: 10, // 10 }, } @@ -183,7 +183,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // 
nolint:gosec + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCacheCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // nolint:gosec assert.Equal(t, expected, result) }) } @@ -191,13 +191,13 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.MessageDeltaUsage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 - expectedCachedCreationTokens uint32 + name string + usage anthropic.MessageDeltaUsage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + expectedCachedTokens uint32 + expectedCacheCreationTokens uint32 }{ { name: "message_delta event with final totals", @@ -207,11 +207,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 30, CacheCreationInputTokens: 0, }, - expectedInputTokens: 280, // 250 + 0 + 30 - expectedOutputTokens: 120, - expectedTotalTokens: 400, // 280 + 120 - expectedCachedTokens: 30, // 30 - expectedCachedCreationTokens: 0, + expectedInputTokens: 280, // 250 + 0 + 30 + expectedOutputTokens: 120, + expectedTotalTokens: 400, // 280 + 120 + expectedCachedTokens: 30, // 30 + expectedCacheCreationTokens: 0, }, { name: "message_delta event with only output tokens", @@ -221,11 +221,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 0, CacheCreationInputTokens: 0, }, - expectedInputTokens: 0, - expectedOutputTokens: 85, - expectedTotalTokens: 85, - expectedCachedTokens: 0, - expectedCachedCreationTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 85, + expectedTotalTokens: 85, + expectedCachedTokens: 0, + expectedCacheCreationTokens: 0, }, { name: "message_delta with cache creation tokens", @@ -235,11 +235,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { 
CacheReadInputTokens: 10, CacheCreationInputTokens: 5, }, - expectedInputTokens: 165, // 150 + 5 + 10 - expectedOutputTokens: 75, - expectedTotalTokens: 240, // 165 + 75 - expectedCachedTokens: 10, // 10 - expectedCachedCreationTokens: 5, // 5 + expectedInputTokens: 165, // 150 + 5 + 10 + expectedOutputTokens: 75, + expectedTotalTokens: 240, // 165 + 75 + expectedCachedTokens: 10, // 10 + expectedCacheCreationTokens: 5, // 5 }, } @@ -250,7 +250,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // nolint:gosec + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCacheCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // nolint:gosec assert.Equal(t, expected, result) }) } @@ -304,10 +304,10 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { assert.Equal(t, uint32(cacheReadTokens), cachedTokens, "CachedInputTokens should be cache_read_input_tokens") - cachedCreationTokens, ok := result.CachedCreationInputTokens() + cacheCreationTokens, ok := result.CacheCreationInputTokens() assert.True(t, ok) - assert.Equal(t, uint32(cachedWriteTokens), cachedCreationTokens, - "CachedCreationInputTokens should be cache_creation_input_tokens") + assert.Equal(t, uint32(cachedWriteTokens), cacheCreationTokens, + "CacheCreationInputTokens should be cache_creation_input_tokens") // Total tokens should be input + output. 
expectedTotal := expectedTotalInput + uint32(outputTokens) diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index 6efaea9db7..f0ecf6a69e 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -708,7 +708,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string tokenUsage.SetCachedInputTokens(uint32(*usage.CacheReadInputTokens)) //nolint:gosec } if usage.CacheWriteInputTokens != nil { - tokenUsage.SetCachedCreationInputTokens(uint32(*usage.CacheWriteInputTokens)) //nolint:gosec + tokenUsage.SetCacheCreationInputTokens(uint32(*usage.CacheWriteInputTokens)) //nolint:gosec } } oaiEvent, ok := o.convertEvent(event) @@ -760,8 +760,8 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string openAIResp.Usage.PromptTokensDetails.CachedTokens = *bedrockResp.Usage.CacheReadInputTokens } if bedrockResp.Usage.CacheWriteInputTokens != nil { - tokenUsage.SetCachedCreationInputTokens(uint32(*bedrockResp.Usage.CacheWriteInputTokens)) //nolint:gosec - openAIResp.Usage.PromptTokensDetails.CachedCreationTokens = *bedrockResp.Usage.CacheWriteInputTokens + tokenUsage.SetCacheCreationInputTokens(uint32(*bedrockResp.Usage.CacheWriteInputTokens)) //nolint:gosec + openAIResp.Usage.PromptTokensDetails.CacheCreationTokens = *bedrockResp.Usage.CacheWriteInputTokens } } @@ -864,7 +864,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) convertEvent(event *awsbe chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens } if event.Usage.CacheWriteInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens + chunk.Usage.PromptTokensDetails.CacheCreationTokens = *event.Usage.CacheWriteInputTokens } // messageStart event. 
case awsbedrock.ConverseStreamEventTypeMessageStart.String(): diff --git a/internal/translator/openai_awsbedrock_test.go b/internal/translator/openai_awsbedrock_test.go index c533ec6dbf..fd9ef5c6f8 100644 --- a/internal/translator/openai_awsbedrock_test.go +++ b/internal/translator/openai_awsbedrock_test.go @@ -1474,8 +1474,8 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) PromptTokens: 10, CompletionTokens: 20, PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: 5, - CachedCreationTokens: 7, + CachedTokens: 5, + CacheCreationTokens: 7, }, }, Choices: []openai.ChatCompletionResponseChoice{ @@ -1725,7 +1725,7 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) expectedUsage.SetCachedInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec } if tt.input.Usage.CacheWriteInputTokens != nil { - expectedUsage.SetCachedCreationInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec + expectedUsage.SetCacheCreationInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CacheCreationTokens)) //nolint:gosec } } else { expectedUsage = tokenUsageFrom(-1, -1, -1, -1, -1) diff --git a/internal/translator/openai_completions.go b/internal/translator/openai_completions.go index 7f5008d8dd..8bd894214d 100644 --- a/internal/translator/openai_completions.go +++ b/internal/translator/openai_completions.go @@ -171,8 +171,8 @@ func (o *openAIToOpenAITranslatorV1Completion) extractUsageFromBufferEvent(span tokenUsage.SetOutputTokens(uint32(usage.CompletionTokens)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(usage.TotalTokens)) //nolint:gosec if usage.PromptTokensDetails != nil { - tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec - tokenUsage.SetCachedCreationInputTokens(uint32(usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec + 
tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCacheCreationInputTokens(uint32(usage.PromptTokensDetails.CacheCreationTokens)) //nolint:gosec } // Do not mark buffering done; keep scanning to return the latest usage in this batch. } diff --git a/internal/translator/openai_gcpanthropic.go b/internal/translator/openai_gcpanthropic.go index 6a916cc0e2..716053cf4b 100644 --- a/internal/translator/openai_gcpanthropic.go +++ b/internal/translator/openai_gcpanthropic.go @@ -839,14 +839,14 @@ func (o *openAIToGCPAnthropicTranslatorV1ChatCompletion) ResponseBody(_ map[stri outputTokens, _ := tokenUsage.OutputTokens() totalTokens, _ := tokenUsage.TotalTokens() cachedTokens, _ := tokenUsage.CachedInputTokens() - cacheWriteTokens, _ := tokenUsage.CachedCreationInputTokens() + cacheWriteTokens, _ := tokenUsage.CacheCreationInputTokens() openAIResp.Usage = openai.Usage{ CompletionTokens: int(outputTokens), PromptTokens: int(inputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), - CachedCreationTokens: int(cacheWriteTokens), + CachedTokens: int(cachedTokens), + CacheCreationTokens: int(cacheWriteTokens), }, } diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 5d9e12c4a1..1846ec7358 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -110,7 +110,7 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t p.tokenUsage.SetTotalTokens(inputTokens + outputTokens) totalTokens, _ := p.tokenUsage.TotalTokens() cachedTokens, _ := p.tokenUsage.CachedInputTokens() - cachedCreationTokens, _ := p.tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, _ := p.tokenUsage.CacheCreationInputTokens() finalChunk := openai.ChatCompletionResponseChunk{ ID: p.activeMessageID, Created: p.created, 
@@ -121,8 +121,8 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t CompletionTokens: int(outputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), - CachedCreationTokens: int(cachedCreationTokens), + CachedTokens: int(cachedTokens), + CacheCreationTokens: int(cacheCreationTokens), }, }, Model: p.requestModel, @@ -213,8 +213,8 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat if cached, ok := usage.CachedInputTokens(); ok { p.tokenUsage.SetCachedInputTokens(cached) } - if cachedCreation, ok := usage.CachedCreationInputTokens(); ok { - p.tokenUsage.SetCachedCreationInputTokens(cachedCreation) + if cacheCreation, ok := usage.CacheCreationInputTokens(); ok { + p.tokenUsage.SetCacheCreationInputTokens(cacheCreation) } // reset the toolIndex for each message @@ -298,10 +298,10 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat p.tokenUsage.AddCachedInputTokens(cached) } // Update input tokens to include write cache tokens from delta - if cached, ok := usage.CachedCreationInputTokens(); ok { + if cached, ok := usage.CacheCreationInputTokens(); ok { p.tokenUsage.AddInputTokens(cached) // Accumulate any additional cache tokens from delta - p.tokenUsage.AddCachedCreationInputTokens(cached) + p.tokenUsage.AddCacheCreationInputTokens(cached) } if event.Delta.StopReason != "" { p.stopReason = event.Delta.StopReason diff --git a/internal/translator/openai_gcpanthropic_test.go b/internal/translator/openai_gcpanthropic_test.go index e48cd64242..1e36924adc 100644 --- a/internal/translator/openai_gcpanthropic_test.go +++ b/internal/translator/openai_gcpanthropic_test.go @@ -598,11 +598,11 @@ func TestOpenAIToGCPAnthropicTranslatorV1ChatCompletion_ResponseBody(t *testing. 
require.NoError(t, err) expectedTokenUsage := tokenUsageFrom( - int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedCreationTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CacheCreationTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec ) require.Equal(t, expectedTokenUsage, usedToken) diff --git a/internal/translator/openai_gcpvertexai.go b/internal/translator/openai_gcpvertexai.go index 7aca029b4f..1fa59c2f9a 100644 --- a/internal/translator/openai_gcpvertexai.go +++ b/internal/translator/openai_gcpvertexai.go @@ -170,7 +170,7 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) ResponseBody(_ map[strin tokenUsage.SetOutputTokens(uint32(gcpResp.UsageMetadata.CandidatesTokenCount)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(gcpResp.UsageMetadata.TotalTokenCount)) //nolint:gosec tokenUsage.SetCachedInputTokens(uint32(gcpResp.UsageMetadata.CachedContentTokenCount)) //nolint:gosec - // Gemini does not return cache creation input tokens; Skipping setCachedCreationInputTokens. + // Gemini does not return cache creation input tokens; Skipping setCacheCreationInputTokens. 
} if span != nil { diff --git a/internal/translator/openai_openai.go b/internal/translator/openai_openai.go index 65b33a34cd..01b300f17c 100644 --- a/internal/translator/openai_openai.go +++ b/internal/translator/openai_openai.go @@ -141,8 +141,8 @@ func (o *openAIToOpenAITranslatorV1ChatCompletion) ResponseBody(_ map[string]str tokenUsage.SetOutputTokens(uint32(resp.Usage.CompletionTokens)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) //nolint:gosec if resp.Usage.PromptTokensDetails != nil { - tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec - tokenUsage.SetCachedCreationInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec + tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCacheCreationInputTokens(uint32(resp.Usage.PromptTokensDetails.CacheCreationTokens)) //nolint:gosec } // Fallback to request model for test or non-compliant OpenAI backends responseModel = cmp.Or(resp.Model, o.requestModel) diff --git a/internal/translator/openai_responses.go b/internal/translator/openai_responses.go index 30934c5c63..3f61e7a8e9 100644 --- a/internal/translator/openai_responses.go +++ b/internal/translator/openai_responses.go @@ -128,11 +128,11 @@ func (o *openAIToOpenAITranslatorV1Responses) handleNonStreamingResponse(body io // TODO: Add reasoning token usage if resp.Usage != nil { - tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 - tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 - tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 - tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 - tokenUsage.SetCachedCreationInputTokens(uint32(resp.Usage.InputTokensDetails.CachedCreationTokens)) // #nosec G115 + tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 + 
tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 + tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 + tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 + tokenUsage.SetCacheCreationInputTokens(uint32(resp.Usage.InputTokensDetails.CacheCreationTokens)) // #nosec G115 } // Record non-streaming response to span if tracing is enabled. @@ -180,7 +180,7 @@ func (o *openAIToOpenAITranslatorV1Responses) extractUsageFromBufferEvent(span t tokenUsage.SetTotalTokens(uint32(respComplEvent.Response.Usage.TotalTokens)) // #nosec G115 tokenUsage.SetCachedInputTokens(uint32(respComplEvent.Response.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 // Openai does not support cache creation response. - tokenUsage.SetCachedCreationInputTokens(uint32(0)) // #nosec G115 + tokenUsage.SetCacheCreationInputTokens(uint32(0)) // #nosec G115 } // Record streaming chunk to span if tracing is enabled. if span != nil { diff --git a/internal/translator/openai_responses_test.go b/internal/translator/openai_responses_test.go index 363933d525..eb1a757218 100644 --- a/internal/translator/openai_responses_test.go +++ b/internal/translator/openai_responses_test.go @@ -247,9 +247,9 @@ func TestResponsesOpenAIToOpenAITranslator_ResponseBody(t *testing.T) { require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, ok := tokenUsage.CacheCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedCreationTokens) + require.Equal(t, uint32(0), cacheCreationTokens) }) t.Run("non-streaming response with fallback model", func(t *testing.T) { @@ -363,9 +363,9 @@ data: [DONE] require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, ok := tokenUsage.CacheCreationInputTokens() require.True(t, ok) - 
require.Equal(t, uint32(0), cachedCreationTokens) + require.Equal(t, uint32(0), cacheCreationTokens) }) t.Run("streaming response with fallback model", func(t *testing.T) { @@ -462,9 +462,9 @@ data: [DONE] cachedTokens, _ := tokenUsage.CachedInputTokens() require.Equal(t, uint32(2), cachedTokens) - cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, ok := tokenUsage.CacheCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedCreationTokens) + require.Equal(t, uint32(0), cacheCreationTokens) }) t.Run("streaming read error", func(t *testing.T) { @@ -554,9 +554,9 @@ func TestResponses_HandleNonStreamingResponse(t *testing.T) { cachedTokens, _ := tokenUsage.CachedInputTokens() require.Equal(t, uint32(2), cachedTokens) - cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, ok := tokenUsage.CacheCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedCreationTokens) + require.Equal(t, uint32(0), cacheCreationTokens) }) t.Run("invalid JSON", func(t *testing.T) { @@ -619,9 +619,9 @@ data: [DONE] require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, ok := tokenUsage.CacheCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedCreationTokens) + require.Equal(t, uint32(0), cacheCreationTokens) }) t.Run("model extraction", func(t *testing.T) { @@ -686,11 +686,11 @@ data: [DONE] _, outputSet := tokenUsage.OutputTokens() _, totalSet := tokenUsage.TotalTokens() _, cachedSet := tokenUsage.CachedInputTokens() - _, cachedCreationSet := tokenUsage.CachedCreationInputTokens() + _, cacheCreationSet := tokenUsage.CacheCreationInputTokens() require.False(t, totalSet) require.False(t, cachedSet) - require.False(t, cachedCreationSet) + require.False(t, cacheCreationSet) require.False(t, inputSet) require.False(t, outputSet) }) diff --git 
a/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml b/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml index e75770a409..f256789b72 100644 --- a/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml +++ b/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml @@ -173,8 +173,8 @@ spec: metadataKey: llm_input_token\n\t type: InputToken\n\t- metadataKey: llm_output_token\n\t type: OutputToken\n\t- metadataKey: llm_total_token\n\t \ type: TotalToken\n\t- metadataKey: llm_cached_input_token\n\t - \ type: CachedInputToken\n- metadataKey: llm_cached_creation_input_token\n - \ type: CachedCreationInputToken\n```\nThen, with the following + \ type: CachedInputToken\n- metadataKey: llm_cache_creation_input_token\n + \ type: CacheCreationInputToken\n```\nThen, with the following BackendTrafficPolicy of Envoy Gateway, you can have three\nrate limit buckets for each unique x-user-id header value. One bucket is for the input token,\nthe other is for the output token, and @@ -230,7 +230,7 @@ spec: \"name.namespace\". Type: string.\n\t* input_tokens: the number of input tokens. Type: unsigned integer.\n\t* cached_input_tokens: the number of cached read input tokens. Type: unsigned integer.\n\t* - cached_creation_input_tokens: the number of cache creation + cache_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer.\n\t* output_tokens: the number of output tokens. Type: unsigned integer.\n\t* total_tokens: the total number of tokens. Type: unsigned integer.\n\nFor @@ -238,7 +238,7 @@ spec: == 'llama' ? input_tokens + output_token * 0.5 : total_tokens\"\n\t* \"backend == 'foo.default' ? input_tokens + output_tokens : total_tokens\"\n\t* \"backend == 'bar.default' ? 
(input_tokens - - cached_input_tokens) + cached_input_tokens * 0.1 + cached_creation_input_tokens + - cached_input_tokens) + cached_input_tokens * 0.1 + cache_creation_input_tokens * 1.25 + output_tokens : total_tokens\"\n\t* \"input_tokens + output_tokens + total_tokens\"\n\t* \"input_tokens * output_tokens\"" type: string @@ -250,12 +250,12 @@ spec: description: |- Type specifies the type of the request cost. The default is "OutputToken", and it uses "output token" as the cost. The other types are "InputToken", "TotalToken", - "CachedInputToken", "CachedCreationInputToken", and "CEL". + "CachedInputToken", "CacheCreationInputToken", and "CEL". enum: - OutputToken - InputToken - CachedInputToken - - CachedCreationInputToken + - CacheCreationInputToken - TotalToken - CEL type: string diff --git a/site/docs/api/api.mdx b/site/docs/api/api.mdx index 0516d744cb..bac8159967 100644 --- a/site/docs/api/api.mdx +++ b/site/docs/api/api.mdx @@ -757,7 +757,7 @@ AIGatewayRouteSpec details the AIGatewayRoute configuration. name="llmRequestCosts" type="[LLMRequestCost](#llmrequestcost) array" required="false" - description="LLMRequestCosts specifies how to capture the cost of the LLM-related request, notably the token usage.
The AI Gateway filter will capture each specified number and store it in the Envoy's dynamic
metadata per HTTP request. The namespaced key is `io.envoy.ai_gateway`,
For example, let's say we have the following LLMRequestCosts configuration:
```yaml
llmRequestCosts:
- metadataKey: llm_input_token
type: InputToken
- metadataKey: llm_output_token
type: OutputToken
- metadataKey: llm_total_token
type: TotalToken
- metadataKey: llm_cached_input_token
type: CachedInputToken
- metadataKey: llm_cached_creation_input_token
type: CachedCreationInputToken
```
Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three
rate limit buckets for each unique x-user-id header value. One bucket is for the input token,
the other is for the output token, and the last one is for the total token.
Each bucket will be reduced by the corresponding token usage captured by the AI Gateway filter.
```yaml
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: BackendTrafficPolicy
metadata:
name: some-example-token-rate-limit
namespace: default
spec:
targetRefs:
- group: gateway.networking.k8s.io
kind: HTTPRoute
name: usage-rate-limit
rateLimit:
type: Global
global:
rules:
- clientSelectors:
# Do the rate limiting based on the x-user-id header.
- headers:
- name: x-user-id
type: Distinct
limit:
# Configures the number of `tokens` allowed per hour.
requests: 10000
unit: Hour
cost:
request:
from: Number
# Setting the request cost to zero allows to only check the rate limit budget,
# and not consume the budget on the request path.
number: 0
# This specifies the cost of the response retrieved from the dynamic metadata set by the AI Gateway filter.
# The extracted value will be used to consume the rate limit budget, and subsequent requests will be rate limited
# if the budget is exhausted.
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_input_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_output_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_total_token
```
Note that when multiple AIGatewayRoute resources are attached to the same Gateway, and
different costs are configured for the same metadata key, the ai-gateway will pick one of them
to configure the metadata key in the generated HTTPRoute, and ignore the rest." + description="LLMRequestCosts specifies how to capture the cost of the LLM-related request, notably the token usage.
The AI Gateway filter will capture each specified number and store it in Envoy's dynamic
metadata per HTTP request. The namespaced key is `io.envoy.ai_gateway`.
For example, let's say we have the following LLMRequestCosts configuration:
```yaml
llmRequestCosts:
- metadataKey: llm_input_token
type: InputToken
- metadataKey: llm_output_token
type: OutputToken
- metadataKey: llm_total_token
type: TotalToken
- metadataKey: llm_cached_input_token
type: CachedInputToken
- metadataKey: llm_cache_creation_input_token
type: CacheCreationInputToken
```
Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three
rate limit buckets for each unique x-user-id header value. One bucket is for the input token,
another is for the output token, and the last one is for the total token.
Each bucket will be reduced by the corresponding token usage captured by the AI Gateway filter.
```yaml
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: BackendTrafficPolicy
metadata:
name: some-example-token-rate-limit
namespace: default
spec:
targetRefs:
- group: gateway.networking.k8s.io
kind: HTTPRoute
name: usage-rate-limit
rateLimit:
type: Global
global:
rules:
- clientSelectors:
# Do the rate limiting based on the x-user-id header.
- headers:
- name: x-user-id
type: Distinct
limit:
# Configures the number of `tokens` allowed per hour.
requests: 10000
unit: Hour
cost:
request:
from: Number
# Setting the request cost to zero allows to only check the rate limit budget,
# and not consume the budget on the request path.
number: 0
# This specifies the cost of the response retrieved from the dynamic metadata set by the AI Gateway filter.
# The extracted value will be used to consume the rate limit budget, and subsequent requests will be rate limited
# if the budget is exhausted.
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_input_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_output_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_total_token
```
Note that when multiple AIGatewayRoute resources are attached to the same Gateway, and
different costs are configured for the same metadata key, the ai-gateway will pick one of them
to configure the metadata key in the generated HTTPRoute, and ignore the rest." /> @@ -1664,12 +1664,12 @@ LLMRequestCost configures each request cost. name="type" type="[LLMRequestCostType](#llmrequestcosttype)" required="true" - description="Type specifies the type of the request cost. The default is `OutputToken`,
and it uses `output token` as the cost. The other types are `InputToken`, `TotalToken`,
`CachedInputToken`, `CachedCreationInputToken`, and `CEL`." + description="Type specifies the type of the request cost. The default is `OutputToken`,
and it uses `output token` as the cost. The other types are `InputToken`, `TotalToken`,
`CachedInputToken`, `CacheCreationInputToken`, and `CEL`." /> @@ -1697,10 +1697,10 @@ LLMRequestCostType specifies the type of the LLMRequestCost. required="false" description="LLMRequestCostTypeCachedInputToken is the cost type of the cached input token.
" /> Date: Fri, 2 Jan 2026 19:22:35 -0500 Subject: [PATCH 20/20] update missing Signed-off-by: Aaron Choo --- tests/data-plane/vcr/prometheus_metrics_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data-plane/vcr/prometheus_metrics_test.go b/tests/data-plane/vcr/prometheus_metrics_test.go index 0e365e956b..5a9161f441 100644 --- a/tests/data-plane/vcr/prometheus_metrics_test.go +++ b/tests/data-plane/vcr/prometheus_metrics_test.go @@ -127,7 +127,7 @@ func verifyPrometheusTokenUsage(t *testing.T, metric *dto.MetricFamily, expected } require.NotNil(t, inputMetric, "Input metric not found") require.NotNil(t, cachedInputMetric, "Cached Input metric not found") - require.NotNil(t, cacheCreationInputMetric, "Cached Creation Input metric not found") + require.NotNil(t, cacheCreationInputMetric, "Cache Creation Input metric not found") require.NotNil(t, outputMetric, "Output metric not found") type testCase struct {