From 0209b06a3dfddc90312ec92ac52188169ebcaa90 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 13:46:56 -0500 Subject: [PATCH 01/20] add cache writes Signed-off-by: Aaron Choo --- api/v1alpha1/ai_gateway_route.go | 2 + api/v1alpha1/shared_types.go | 11 +++-- examples/token_ratelimit/token_ratelimit.yaml | 2 + internal/apischema/openai/openai.go | 8 ++++ internal/apischema/openai/openai_test.go | 38 +++++++++------- internal/controller/gateway.go | 2 + internal/extproc/processor_impl.go | 4 ++ internal/filterapi/filterconfig.go | 4 +- internal/metrics/metrics.go | 31 ++++++++++--- .../openinference/anthropic/messages.go | 6 ++- .../openinference/anthropic/messages_test.go | 1 + .../openinference/openai/response_attrs.go | 4 ++ .../openai/response_attrs_test.go | 1 + .../openinference/openai/responses_test.go | 43 +++++++++++++++++++ .../tracing/openinference/openinference.go | 5 +++ .../translator/openai_gcpanthropic_stream.go | 8 ++++ 16 files changed, 143 insertions(+), 27 deletions(-) diff --git a/api/v1alpha1/ai_gateway_route.go b/api/v1alpha1/ai_gateway_route.go index d77f7d1f24..b4dac38660 100644 --- a/api/v1alpha1/ai_gateway_route.go +++ b/api/v1alpha1/ai_gateway_route.go @@ -108,6 +108,8 @@ type AIGatewayRouteSpec struct { // type: TotalToken // - metadataKey: llm_cached_input_token // type: CachedInputToken + // - metadataKey: llm_cached_write_input_token + // type: CachedWriteInputToken // ``` // Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three // rate limit buckets for each unique x-user-id header value. One bucket is for the input token, diff --git a/api/v1alpha1/shared_types.go b/api/v1alpha1/shared_types.go index 2241beb051..7a34cf4128 100644 --- a/api/v1alpha1/shared_types.go +++ b/api/v1alpha1/shared_types.go @@ -100,9 +100,9 @@ type LLMRequestCost struct { MetadataKey string `json:"metadataKey"` // Type specifies the type of the request cost. 
The default is "OutputToken", // and it uses "output token" as the cost. The other types are "InputToken", "TotalToken", - // and "CEL". + // "CachedInputToken", "CachedWriteInputToken", and "CEL". // - // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;TotalToken;CEL + // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;CachedWriteInputToken;TotalToken;CEL Type LLMRequestCostType `json:"type"` // CEL is the CEL expression to calculate the cost of the request. // The CEL expression must return a signed or unsigned integer. If the @@ -113,7 +113,8 @@ type LLMRequestCost struct { // * model: the model name extracted from the request content. Type: string. // * backend: the backend name in the form of "name.namespace". Type: string. // * input_tokens: the number of input tokens. Type: unsigned integer. - // * cached_input_tokens: the number of cached input tokens. Type: unsigned integer. + // * cached_input_tokens: the number of cached read input tokens. Type: unsigned integer. + // * cached_write_input_tokens: the number of cached write input tokens. Type: unsigned integer. // * output_tokens: the number of output tokens. Type: unsigned integer. // * total_tokens: the total number of tokens. Type: unsigned integer. // @@ -121,7 +122,7 @@ type LLMRequestCost struct { // // * "model == 'llama' ? input_tokens + output_token * 0.5 : total_tokens" // * "backend == 'foo.default' ? input_tokens + output_tokens : total_tokens" - // * "backend == 'bar.default' ? (input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + output_tokens : total_tokens" + // * "backend == 'bar.default' ? 
(input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cached_write_input_tokens * 1.25 + output_tokens : total_tokens" // * "input_tokens + output_tokens + total_tokens" // * "input_tokens * output_tokens" // @@ -137,6 +138,8 @@ const ( LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" // LLMRequestCostTypeCachedInputToken is the cost type of the cached input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" + // LLMRequestCostTypeCachedWriteInputToken is the cost type of the cached write input token. + LLMRequestCostTypeCachedWriteInputToken LLMRequestCostType = "CachedWriteInputToken" // LLMRequestCostTypeOutputToken is the cost type of the output token. LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken" // LLMRequestCostTypeTotalToken is the cost type of the total token. diff --git a/examples/token_ratelimit/token_ratelimit.yaml b/examples/token_ratelimit/token_ratelimit.yaml index 8c803f21d2..2224ed724a 100644 --- a/examples/token_ratelimit/token_ratelimit.yaml +++ b/examples/token_ratelimit/token_ratelimit.yaml @@ -51,6 +51,8 @@ spec: type: InputToken - metadataKey: llm_cached_input_token type: CachedInputToken + - metadataKey: llm_cached_write_input_token + type: CachedWriteInputToken - metadataKey: llm_output_token type: OutputToken - metadataKey: llm_total_token diff --git a/internal/apischema/openai/openai.go b/internal/apischema/openai/openai.go index bfe6584b29..0c74a3e244 100644 --- a/internal/apischema/openai/openai.go +++ b/internal/apischema/openai/openai.go @@ -1382,6 +1382,8 @@ type PromptTokensDetails struct { AudioTokens int `json:"audio_tokens,omitzero"` // Cached tokens present in the prompt. CachedTokens int `json:"cached_tokens,omitzero"` + // Tokens written to the cache. 
+ CachedWriteTokens int `json:"cached_write_tokens,omitzero"` } // ChatCompletionResponseChunk is described in the OpenAI API documentation: @@ -2535,6 +2537,9 @@ type ResponseUsageInputTokensDetails struct { // The number of tokens that were retrieved from the cache. // [More on prompt caching](https://platform.openai.com/docs/guides/prompt-caching). CachedTokens int64 `json:"cached_tokens"` + + // The number of tokens that were written to the cache. + CachedWriteTokens int64 `json:"cached_write_tokens"` } // A detailed breakdown of the output tokens. @@ -2548,6 +2553,9 @@ type ResponseTokensDetails struct { // CachedTokens: Number of cached tokens. CachedTokens int `json:"cached_tokens,omitempty"` //nolint:tagliatelle //follow openai api + // CachedWriteTokens: number of tokens that were written to the cache. + CachedWriteTokens int64 `json:"cached_write_tokens"` //nolint:tagliatelle + // ReasoningTokens: Number of reasoning tokens (for reasoning models). ReasoningTokens int `json:"reasoning_tokens,omitempty"` //nolint:tagliatelle //follow openai api diff --git a/internal/apischema/openai/openai_test.go b/internal/apischema/openai/openai_test.go index 974efe11ee..d9df99e6a4 100644 --- a/internal/apischema/openai/openai_test.go +++ b/internal/apischema/openai/openai_test.go @@ -1742,26 +1742,30 @@ func TestPromptTokensDetails(t *testing.T) { { name: "with text tokens", details: PromptTokensDetails{ - TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CachedWriteTokens: 10, }, expected: `{ "text_tokens": 15, "audio_tokens": 8, - "cached_tokens": 384 + "cached_tokens": 384, + "cached_write_tokens": 10 }`, }, { name: "with zero text tokens omitted", details: PromptTokensDetails{ - TextTokens: 0, - AudioTokens: 8, - CachedTokens: 384, + TextTokens: 0, + AudioTokens: 8, + CachedTokens: 384, + CachedWriteTokens: 10, }, expected: `{ "audio_tokens": 8, - "cached_tokens": 384 + "cached_tokens": 384, + 
"cached_write_tokens": 10 }`, }, } @@ -1818,8 +1822,9 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - AudioTokens: 8, - CachedTokens: 384, + AudioTokens: 8, + CachedTokens: 384, + CachedWriteTokens: 13, }, }, expected: `{ @@ -1832,7 +1837,8 @@ func TestChatCompletionResponseUsage(t *testing.T) { }, "prompt_tokens_details": { "audio_tokens": 8, - "cached_tokens": 384 + "cached_tokens": 384, + "cached_write_tokens": 13 } }`, }, @@ -1850,9 +1856,10 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CachedWriteTokens: 21, }, }, expected: `{ @@ -1867,7 +1874,8 @@ func TestChatCompletionResponseUsage(t *testing.T) { "prompt_tokens_details": { "text_tokens": 15, "audio_tokens": 8, - "cached_tokens": 384 + "cached_tokens": 384, + "cached_write_tokens": 21 } }`, }, diff --git a/internal/controller/gateway.go b/internal/controller/gateway.go index 2ae6286f6d..db952afb21 100644 --- a/internal/controller/gateway.go +++ b/internal/controller/gateway.go @@ -404,6 +404,8 @@ func (c *GatewayController) reconcileFilterConfigSecret( fc.Type = filterapi.LLMRequestCostTypeInputToken case aigv1a1.LLMRequestCostTypeCachedInputToken: fc.Type = filterapi.LLMRequestCostTypeCachedInputToken + case aigv1a1.LLMRequestCostTypeCachedWriteInputToken: + fc.Type = filterapi.LLMRequestCostTypeCachedWriteInputToken case aigv1a1.LLMRequestCostTypeOutputToken: fc.Type = filterapi.LLMRequestCostTypeOutputToken case aigv1a1.LLMRequestCostTypeTotalToken: diff --git a/internal/extproc/processor_impl.go b/internal/extproc/processor_impl.go index 0c9e859934..887fa89e0c 100644 --- a/internal/extproc/processor_impl.go +++ b/internal/extproc/processor_impl.go @@ -533,6 +533,8 @@ func buildDynamicMetadata(config 
*filterapi.RuntimeConfig, costs *metrics.TokenU cost, _ = costs.InputTokens() case filterapi.LLMRequestCostTypeCachedInputToken: cost, _ = costs.CachedInputTokens() + case filterapi.LLMRequestCostTypeCachedWriteInputToken: + cost, _ = costs.CachedWriteInputTokens() case filterapi.LLMRequestCostTypeOutputToken: cost, _ = costs.OutputTokens() case filterapi.LLMRequestCostTypeTotalToken: @@ -540,6 +542,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU case filterapi.LLMRequestCostTypeCEL: in, _ := costs.InputTokens() cachedIn, _ := costs.CachedInputTokens() + cachedWrite, _ := costs.CachedInputWriteTokens() out, _ := costs.OutputTokens() total, _ := costs.TotalTokens() costU64, err := llmcostcel.EvaluateProgram( @@ -548,6 +551,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU backendName, in, cachedIn, + cachedWrite, out, total, ) diff --git a/internal/filterapi/filterconfig.go b/internal/filterapi/filterconfig.go index 226cf95446..f191d6eb01 100644 --- a/internal/filterapi/filterconfig.go +++ b/internal/filterapi/filterconfig.go @@ -79,8 +79,10 @@ const ( LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken" // LLMRequestCostTypeInputToken specifies that the request cost is calculated from the input token. LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" - // LLMRequestCostTypeCachedInputToken specifies that the request cost is calculated from the cached input token. + // LLMRequestCostTypeCachedInputToken specifies that the request cost is calculated from the cached read input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" + // LLMRequestCostTypeCachedWriteInputToken specifies that the request cost is calculated from the cached write input token. + LLMRequestCostTypeCachedWriteInputToken LLMRequestCostType = "CachedWriteInputToken" // LLMRequestCostTypeTotalToken specifies that the request cost is calculated from the total token. 
LLMRequestCostTypeTotalToken LLMRequestCostType = "TotalToken" // LLMRequestCostTypeCEL specifies that the request cost is calculated from the CEL expression. diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 4a4fab54db..6c473863bd 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -149,8 +149,10 @@ type TokenUsage struct { totalTokens uint32 // CachedInputTokens is the total number of tokens read from cache. cachedInputTokens uint32 + // CachedWriteInputTokens is the total number of tokens written to cache. + cachedWriteInputTokens uint32 - inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet bool + inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet, cachedWriteInputTokenSet bool } // InputTokens returns the number of input tokens and whether it was set. @@ -173,6 +175,11 @@ func (u *TokenUsage) CachedInputTokens() (uint32, bool) { return u.cachedInputTokens, u.cachedInputTokenSet } +// CachedWriteInputTokens returns the number of cached write input tokens and whether it was set. +func (u *TokenUsage) CachedWriteInputTokens() (uint32, bool) { + return u.cachedWriteInputTokens, u.cachedWriteInputTokenSet +} + // SetInputTokens sets the number of input tokens and marks the field as set. func (u *TokenUsage) SetInputTokens(tokens uint32) { u.inputTokens = tokens @@ -197,6 +204,12 @@ func (u *TokenUsage) SetCachedInputTokens(tokens uint32) { u.cachedInputTokenSet = true } +// SetCachedWriteInputTokens sets the number of cached write input tokens and marks the field as set. +func (u *TokenUsage) SetCachedWriteInputTokens(tokens uint32) { + u.cachedWriteInputTokens = tokens + u.cachedWriteInputTokenSet = true +} + // AddInputTokens increments the recorded input tokens and marks the field as set. 
func (u *TokenUsage) AddInputTokens(tokens uint32) { u.inputTokenSet = true @@ -215,6 +228,12 @@ func (u *TokenUsage) AddCachedInputTokens(tokens uint32) { u.cachedInputTokens += tokens } +// AddCachedWriteInputTokens increments the recorded cached write input tokens and marks the field as set. +func (u *TokenUsage) AddCachedWriteInputTokens(tokens uint32) { + u.cachedWriteInputTokenSet = true + u.cachedWriteInputTokens += tokens +} + // Override updates the TokenUsage fields with values from another TokenUsage instance. // Only fields that are marked as set in the other instance will override the current values. func (u *TokenUsage) Override(other TokenUsage) { @@ -234,6 +253,10 @@ func (u *TokenUsage) Override(other TokenUsage) { u.cachedInputTokens = other.cachedInputTokens u.cachedInputTokenSet = true } + if other.cachedWriteInputTokenSet { + u.cachedWriteInputTokens = other.cachedWriteInputTokens + u.cachedWriteInputTokenSet = true + } } // ExtractTokenUsageFromAnthropic extracts the correct token usage from Anthropic API response. 
@@ -246,13 +269,11 @@ func ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, // Calculate total input tokens as per Anthropic API documentation totalInputTokens := inputTokens + cacheCreationTokens + cacheReadTokens - // Cache tokens include both read and creation tokens - totalCachedTokens := cacheReadTokens + cacheCreationTokens - var usage TokenUsage usage.SetInputTokens(uint32(totalInputTokens)) //nolint:gosec usage.SetOutputTokens(uint32(outputTokens)) //nolint:gosec usage.SetTotalTokens(uint32(totalInputTokens + outputTokens)) //nolint:gosec - usage.SetCachedInputTokens(uint32(totalCachedTokens)) //nolint:gosec + usage.SetCachedInputTokens(uint32(cacheReadTokens)) //nolint:gosec + usage.SetCachedWriteInputTokens(uint32(cacheCreationTokens)) //nolint:gosec return usage } diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index 4b6fe1f771..340e37c459 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -214,13 +214,15 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer int64(u.CacheCreationInputTokens), ) input, _ := cost.InputTokens() - cache, _ := cost.CachedInputTokens() + cacheRead, _ := cost.CachedInputTokens() + cacheCreation, _ := cost.CachedIWritenputTokens() output, _ := cost.OutputTokens() total, _ := cost.TotalTokens() attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPrompt, int(input)), - attribute.Int(openinference.LLMTokenCountPromptCacheHit, int(cache)), + attribute.Int(openinference.LLMTokenCountPromptCacheHit, int(cacheRead)), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(cacheCreation)), attribute.Int(openinference.LLMTokenCountCompletion, int(output)), attribute.Int(openinference.LLMTokenCountTotal, int(total)), ) diff --git a/internal/tracing/openinference/anthropic/messages_test.go 
b/internal/tracing/openinference/anthropic/messages_test.go index 7f6384a624..6a6cf3bbff 100644 --- a/internal/tracing/openinference/anthropic/messages_test.go +++ b/internal/tracing/openinference/anthropic/messages_test.go @@ -326,6 +326,7 @@ func TestMessageRecorder_RecordResponse(t *testing.T) { attribute.String(openinference.OutputMessageToolCallAttribute(1, 0, openinference.ToolCallFunctionArguments), `{"timezone":"UTC"}`), attribute.Int(openinference.LLMTokenCountPrompt, 10), attribute.Int(openinference.LLMTokenCountPromptCacheHit, 0), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, 0), attribute.Int(openinference.LLMTokenCountCompletion, 5), attribute.Int(openinference.LLMTokenCountTotal, 15), }, diff --git a/internal/tracing/openinference/openai/response_attrs.go b/internal/tracing/openinference/openai/response_attrs.go index a771014b0b..178c22f0da 100644 --- a/internal/tracing/openinference/openai/response_attrs.go +++ b/internal/tracing/openinference/openai/response_attrs.go @@ -58,6 +58,7 @@ func buildResponseAttributes(resp *openai.ChatCompletionResponse, config *openin attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptAudio, td.AudioTokens), attribute.Int(openinference.LLMTokenCountPromptCacheHit, td.CachedTokens), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, td.CachedWriteTokens), ) } } @@ -193,6 +194,9 @@ func buildResponsesResponseAttributes(resp *openai.Response, _ *openinference.Tr if resp.Usage.InputTokensDetails.CachedTokens > 0 { attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheHit, int(resp.Usage.InputTokensDetails.CachedTokens))) } + if resp.Usage.InputTokensDetails.CachedWriteTokens > 0 { + attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(resp.Usage.InputTokensDetails.CachedWriteTokens))) + } } return attrs diff --git a/internal/tracing/openinference/openai/response_attrs_test.go 
b/internal/tracing/openinference/openai/response_attrs_test.go index c5de581f30..f147b891fe 100644 --- a/internal/tracing/openinference/openai/response_attrs_test.go +++ b/internal/tracing/openinference/openai/response_attrs_test.go @@ -182,6 +182,7 @@ func TestBuildResponseAttributes(t *testing.T) { attribute.Int(openinference.LLMTokenCountPrompt, 9), attribute.Int(openinference.LLMTokenCountPromptAudio, 0), attribute.Int(openinference.LLMTokenCountPromptCacheHit, 0), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, 0), attribute.Int(openinference.LLMTokenCountCompletion, 9), attribute.Int(openinference.LLMTokenCountCompletionAudio, 0), attribute.Int(openinference.LLMTokenCountCompletionReasoning, 0), diff --git a/internal/tracing/openinference/openai/responses_test.go b/internal/tracing/openinference/openai/responses_test.go index d623b124f7..76fbdc9eb5 100644 --- a/internal/tracing/openinference/openai/responses_test.go +++ b/internal/tracing/openinference/openai/responses_test.go @@ -58,6 +58,34 @@ var ( } basicResponseRespBody = mustJSON(basicResponseResp) + responseWithCacheWrite = &openai.Response{ + ID: "resp-456", + Model: openai.ModelGPT5Nano, + Output: []responses.ResponseOutputItemUnion{ + { + ID: "msg_02", + Type: "message", + Role: "assistant", + Content: []responses.ResponseOutputMessageContentUnion{ + { + Type: "output_text", + Text: "This response includes cache write tokens.", + }, + }, + }, + }, + Usage: &openai.ResponseUsage{ + InputTokens: 100, + InputTokensDetails: openai.ResponseUsageInputTokensDetails{ + CachedTokens: 10, + CachedWriteTokens: 50, + }, + OutputTokens: 25, + TotalTokens: 125, + }, + } + responseWithCacheWriteBody = mustJSON(responseWithCacheWrite) + responseReqWithStreaming = &openai.ResponseRequest{ Model: openai.ModelGPT5Nano, Input: responses.ResponseNewParamsInputUnion{ @@ -173,6 +201,21 @@ func TestResponsesRecorder_RecordResponse(t *testing.T) { }, expectedStatus: trace.Status{Code: codes.Ok, Description: 
""}, }, + { + name: "response with cache write", + resp: responseWithCacheWrite, + config: &openinference.TraceConfig{}, + expectedAttrs: []attribute.KeyValue{ + attribute.String(openinference.LLMModelName, openai.ModelGPT5Nano), + attribute.Int(openinference.LLMTokenCountPrompt, 100), + attribute.Int(openinference.LLMTokenCountCompletion, 25), + attribute.Int(openinference.LLMTokenCountTotal, 125), + attribute.Int(openinference.LLMTokenCountPromptCacheHit, 10), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, 50), + attribute.String(openinference.OutputValue, string(responseWithCacheWriteBody)), + }, + expectedStatus: trace.Status{Code: codes.Ok, Description: ""}, + }, } for _, tt := range tests { diff --git a/internal/tracing/openinference/openinference.go b/internal/tracing/openinference/openinference.go index 7913b69f57..aa50bca94c 100644 --- a/internal/tracing/openinference/openinference.go +++ b/internal/tracing/openinference/openinference.go @@ -160,6 +160,11 @@ const ( // and cost savings from cached prompts. LLMTokenCountPromptCacheHit = "llm.token_count.prompt_details.cache_read" // #nosec G101 + // LLMTokenCountPromptCacheWrite represents the number of prompt tokens + // written to cache (cache writes). This enables tracking of cache efficiency + // and cost savings from cached prompts. + LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cache_write" // #nosec G101 + // LLMTokenCountPromptAudio represents the number of audio tokens in the prompt. // Used for multimodal models that support audio input. 
LLMTokenCountPromptAudio = "llm.token_count.prompt_details.audio" // #nosec G101 diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 60f4a60b05..17b6f11f75 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -211,6 +211,9 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat if cached, ok := usage.CachedInputTokens(); ok { p.tokenUsage.SetCachedInputTokens(cached) } + if cachedWrite, ok := usage.CachedWriteInputTokens(); ok { + p.tokenUsage.SetCachedWriteInputTokens(cachedWrite) + } // reset the toolIndex for each message p.toolIndex = -1 @@ -292,6 +295,11 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat // Accumulate any additional cache tokens from delta p.tokenUsage.AddCachedInputTokens(cached) } + if cached, ok := usage.CachedWriteInputTokens(); ok { + p.tokenUsage.AddInputTokens(cached) + // Accumulate any additional cache tokens from delta + p.tokenUsage.AddCachedWriteInputTokens(cached) + } if event.Delta.StopReason != "" { p.stopReason = event.Delta.StopReason } From 985d4a90bd7424ee0379405fc60b3c88fc421b85 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 14:59:50 -0500 Subject: [PATCH 02/20] found more cache needing update Signed-off-by: Aaron Choo --- internal/controller/gateway_test.go | 8 +- internal/extproc/mocks_test.go | 27 +- internal/extproc/processor_impl.go | 2 +- internal/extproc/processor_impl_test.go | 7 + internal/llmcostcel/cel.go | 29 ++- internal/metrics/genai.go | 5 +- internal/metrics/metrics_impl.go | 6 + internal/metrics/metrics_impl_test.go | 27 +- .../openinference/anthropic/messages.go | 2 +- internal/translator/anthropic_anthropic.go | 3 + .../translator/anthropic_anthropic_test.go | 6 +- .../translator/anthropic_gcpanthropic_test.go | 31 ++- internal/translator/anthropic_usage_test.go | 241 ++++++++++-------- 
internal/translator/cohere_rerank_v2_test.go | 2 +- .../imagegeneration_openai_openai_test.go | 4 +- internal/translator/openai_awsbedrock.go | 14 +- internal/translator/openai_awsbedrock_test.go | 6 +- .../openai_azureopenai_embeddings_test.go | 6 +- .../translator/openai_azureopenai_test.go | 4 +- internal/translator/openai_completions.go | 3 +- .../translator/openai_completions_test.go | 12 +- internal/translator/openai_embeddings_test.go | 6 +- internal/translator/openai_gcpanthropic.go | 4 +- .../translator/openai_gcpanthropic_stream.go | 7 +- .../translator/openai_gcpanthropic_test.go | 10 +- internal/translator/openai_gcpvertexai.go | 2 + .../translator/openai_gcpvertexai_test.go | 18 +- internal/translator/openai_openai.go | 3 +- internal/translator/openai_openai_test.go | 28 +- internal/translator/openai_responses.go | 11 +- internal/translator/openai_responses_test.go | 22 ++ .../testdata/aigatewayroutes/llmcosts.yaml | 2 + 32 files changed, 338 insertions(+), 220 deletions(-) diff --git a/internal/controller/gateway_test.go b/internal/controller/gateway_test.go index 3a45512524..6c3be8df2f 100644 --- a/internal/controller/gateway_test.go +++ b/internal/controller/gateway_test.go @@ -197,6 +197,7 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { {MetadataKey: "bar", Type: aigv1a1.LLMRequestCostTypeOutputToken}, {MetadataKey: "baz", Type: aigv1a1.LLMRequestCostTypeTotalToken}, {MetadataKey: "qux", Type: aigv1a1.LLMRequestCostTypeCachedInputToken}, + {MetadataKey: "zoo", Type: aigv1a1.LLMRequestCostTypeCachedWriteInputToken}, }, }, }, @@ -274,13 +275,14 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { var fc filterapi.Config require.NoError(t, yaml.Unmarshal([]byte(configStr), &fc)) require.Equal(t, "dev", fc.Version) - require.Len(t, fc.LLMRequestCosts, 5) + require.Len(t, fc.LLMRequestCosts, 6) require.Equal(t, filterapi.LLMRequestCostTypeInputToken, fc.LLMRequestCosts[0].Type) require.Equal(t, 
filterapi.LLMRequestCostTypeOutputToken, fc.LLMRequestCosts[1].Type) require.Equal(t, filterapi.LLMRequestCostTypeTotalToken, fc.LLMRequestCosts[2].Type) require.Equal(t, filterapi.LLMRequestCostTypeCachedInputToken, fc.LLMRequestCosts[3].Type) - require.Equal(t, filterapi.LLMRequestCostTypeCEL, fc.LLMRequestCosts[4].Type) - require.Equal(t, `backend == 'foo.default' ? input_tokens + output_tokens : total_tokens`, fc.LLMRequestCosts[4].CEL) + require.Equal(t, filterapi.LLMRequestCostTypeCachedWriteInputToken, fc.LLMRequestCosts[4].Type) + require.Equal(t, filterapi.LLMRequestCostTypeCEL, fc.LLMRequestCosts[5].Type) + require.Equal(t, `backend == 'foo.default' ? input_tokens + output_tokens : total_tokens`, fc.LLMRequestCosts[5].CEL) require.Len(t, fc.Models, 1) require.Equal(t, "mymodel", fc.Models[0].Name) diff --git a/internal/extproc/mocks_test.go b/internal/extproc/mocks_test.go index 0536762878..a2997f14f9 100644 --- a/internal/extproc/mocks_test.go +++ b/internal/extproc/mocks_test.go @@ -171,16 +171,17 @@ func (m *mockMetricsFactory) NewMetrics() metrics.Metrics { // mockMetrics implements [metrics.Metrics] for testing. type mockMetrics struct { - requestStart time.Time - originalModel string - requestModel string - responseModel string - backend string - requestSuccessCount int - requestErrorCount int - inputTokenCount int - cachedInputTokenCount int - outputTokenCount int + requestStart time.Time + originalModel string + requestModel string + responseModel string + backend string + requestSuccessCount int + requestErrorCount int + inputTokenCount int + cachedInputTokenCount int + cachedWriteInputTokenCount int + outputTokenCount int // streamingOutputTokens tracks the cumulative output tokens recorded via RecordTokenLatency. 
streamingOutputTokens int timeToFirstToken float64 @@ -218,6 +219,9 @@ func (m *mockMetrics) RecordTokenUsage(_ context.Context, usage metrics.TokenUsa if cachedInput, ok := usage.CachedInputTokens(); ok { m.cachedInputTokenCount += int(cachedInput) } + if cachedWriteInput, ok := usage.CachedWriteInputTokens(); ok { + m.cachedWriteInputTokenCount += int(cachedWriteInput) + } if output, ok := usage.OutputTokens(); ok { m.outputTokenCount += int(output) } @@ -278,9 +282,10 @@ func (m *mockMetrics) RequireRequestFailure(t *testing.T) { require.Equal(t, 1, m.requestErrorCount) } -func (m *mockMetrics) RequireTokensRecorded(t *testing.T, expectedInput, expectedCachedInput, expectedOutput int) { +func (m *mockMetrics) RequireTokensRecorded(t *testing.T, expectedInput, expectedCachedInput, expectedWriteCachedInput, expectedOutput int) { require.Equal(t, expectedInput, m.inputTokenCount) require.Equal(t, expectedCachedInput, m.cachedInputTokenCount) + require.Equal(t, expectedWriteCachedInput, m.cachedWriteInputTokenCount) require.Equal(t, expectedOutput, m.outputTokenCount) } diff --git a/internal/extproc/processor_impl.go b/internal/extproc/processor_impl.go index 887fa89e0c..93c35aa7cc 100644 --- a/internal/extproc/processor_impl.go +++ b/internal/extproc/processor_impl.go @@ -542,7 +542,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU case filterapi.LLMRequestCostTypeCEL: in, _ := costs.InputTokens() cachedIn, _ := costs.CachedInputTokens() - cachedWrite, _ := costs.CachedInputWriteTokens() + cachedWrite, _ := costs.CachedWriteInputTokens() out, _ := costs.OutputTokens() total, _ := costs.TotalTokens() costU64, err := llmcostcel.EvaluateProgram( diff --git a/internal/extproc/processor_impl_test.go b/internal/extproc/processor_impl_test.go index e8006f0632..a590a38d24 100644 --- a/internal/extproc/processor_impl_test.go +++ b/internal/extproc/processor_impl_test.go @@ -259,6 +259,7 @@ func 
Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.retUsedToken.SetOutputTokens(123) mt.retUsedToken.SetInputTokens(1) mt.retUsedToken.SetCachedInputTokens(1) + mt.retUsedToken.SetCachedWriteInputTokens(3) celProgInt, err := llmcostcel.NewProgram("54321") require.NoError(t, err) @@ -274,6 +275,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeOutputToken, MetadataKey: "output_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeInputToken, MetadataKey: "input_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedInputToken, MetadataKey: "cached_input_token_usage"}}, + {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedWriteInputToken, MetadataKey: "cached_write_input_token_usage"}}, { CELProg: celProgInt, LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCEL, MetadataKey: "cel_int"}, @@ -309,6 +311,8 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T GetStructValue().Fields["input_token_usage"].GetNumberValue()) require.Equal(t, float64(1), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cached_input_token_usage"].GetNumberValue()) + require.Equal(t, float64(3), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. + GetStructValue().Fields["cached_write_input_token_usage"].GetNumberValue()) require.Equal(t, float64(54321), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cel_int"].GetNumberValue()) require.Equal(t, float64(9999), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. 
@@ -371,6 +375,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.expResponseBody = final mt.retUsedToken.SetInputTokens(5) mt.retUsedToken.SetCachedInputTokens(3) + mt.retUsedToken.SetCachedWriteInputTokens(21) mt.retUsedToken.SetOutputTokens(138) mt.retUsedToken.SetTotalTokens(143) _, err = p.ProcessResponseBody(t.Context(), final) @@ -379,6 +384,8 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, 5, mm.inputTokenCount) require.Equal(t, 138, mm.outputTokenCount) require.Equal(t, 138, mm.streamingOutputTokens) // accumulated output tokens from stream + require.Equal(t, 3, mm.cachedInputTokenCount) + require.Equal(t, 21, mm.cachedWriteInputTokenCount) }) } diff --git a/internal/llmcostcel/cel.go b/internal/llmcostcel/cel.go index 071b9b13ab..0c2f19f913 100644 --- a/internal/llmcostcel/cel.go +++ b/internal/llmcostcel/cel.go @@ -16,12 +16,13 @@ import ( ) const ( - celModelNameKey = "model" - celBackendKey = "backend" - celInputTokensKey = "input_tokens" - celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 - celOutputTokensKey = "output_tokens" - celTotalTokensKey = "total_tokens" + celModelNameKey = "model" + celBackendKey = "backend" + celInputTokensKey = "input_tokens" + celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 + celCachedWriteInputTokensKey = "cached_write_input_tokens" // #nosec G101 + celOutputTokensKey = "output_tokens" + celTotalTokensKey = "total_tokens" ) var env *cel.Env @@ -33,6 +34,7 @@ func init() { cel.Variable(celBackendKey, cel.StringType), cel.Variable(celInputTokensKey, cel.UintType), cel.Variable(celCachedInputTokensKey, cel.UintType), + cel.Variable(celCachedWriteInputTokensKey, cel.UintType), cel.Variable(celOutputTokensKey, cel.UintType), cel.Variable(celTotalTokensKey, cel.UintType), ) @@ -62,14 +64,15 @@ func NewProgram(expr string) (prog cel.Program, err error) { } // EvaluateProgram evaluates the given CEL program with 
the given variables. -func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, outputTokens, totalTokens uint32) (uint64, error) { +func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, cachedWriteInputTokens, outputTokens, totalTokens uint32) (uint64, error) { out, _, err := prog.Eval(map[string]any{ - celModelNameKey: modelName, - celBackendKey: backend, - celInputTokensKey: inputTokens, - celCachedInputTokensKey: cachedInputTokens, - celOutputTokensKey: outputTokens, - celTotalTokensKey: totalTokens, + celModelNameKey: modelName, + celBackendKey: backend, + celInputTokensKey: inputTokens, + celCachedInputTokensKey: cachedInputTokens, + celCachedWriteInputTokensKey: cachedWriteInputTokens, + celOutputTokensKey: outputTokens, + celTotalTokensKey: totalTokens, }) if err != nil || out == nil { return 0, fmt.Errorf("failed to evaluate CEL expression: %w", err) diff --git a/internal/metrics/genai.go b/internal/metrics/genai.go index 1aaee2dd97..3c4a3dc62f 100644 --- a/internal/metrics/genai.go +++ b/internal/metrics/genai.go @@ -39,8 +39,9 @@ const ( // https://github.com/open-telemetry/semantic-conventions/issues/1959 // // However, the spec says "a custom value MAY be used.", so we can use it now. - genaiTokenTypeCachedInput = "cached_input" - genaiErrorTypeFallback = "_OTHER" + genaiTokenTypeCachedInput = "cached_input" + genaiTokenTypeCachedWriteInput = "cached_write_input" + genaiErrorTypeFallback = "_OTHER" ) // GenAIOperation represents the type of generative AI operation i.e. the endpoint being called. 
diff --git a/internal/metrics/metrics_impl.go b/internal/metrics/metrics_impl.go index c946c541db..e4a85e4d23 100644 --- a/internal/metrics/metrics_impl.go +++ b/internal/metrics/metrics_impl.go @@ -148,6 +148,12 @@ func (b *metricsImpl) RecordTokenUsage(ctx context.Context, usage TokenUsage, re metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput)), ) } + if cachedWriteInputTokens, ok := usage.CachedWriteInputTokens(); ok { + b.metrics.tokenUsage.Record(ctx, float64(cachedWriteInputTokens), + metric.WithAttributeSet(attrs), + metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput)), + ) + } if outputTokens, ok := usage.OutputTokens(); ok { b.metrics.tokenUsage.Record(ctx, float64(outputTokens), metric.WithAttributeSet(attrs), diff --git a/internal/metrics/metrics_impl_test.go b/internal/metrics/metrics_impl_test.go index 7f697c7218..8bf0ff2cce 100644 --- a/internal/metrics/metrics_impl_test.go +++ b/internal/metrics/metrics_impl_test.go @@ -71,9 +71,10 @@ func TestRecordTokenUsage(t *testing.T) { attribute.Key(genaiAttributeResponseModel).String("test-model"), } // gen_ai.token.type values - https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-metrics/#common-attributes - inputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...) - outputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...) - cachedInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...) + inputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...) + outputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...) 
+ cachedInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...) + cachedWriteInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput))...) ) pm.SetOriginalModel("test-model") @@ -81,7 +82,7 @@ pm.SetResponseModel("test-model") pm.SetBackend(&filterapi.Backend{Schema: filterapi.VersionedAPISchema{Name: filterapi.APISchemaOpenAI}}) pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 10, cachedInputTokens: 8, outputTokens: 5, - inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, + inputTokens: 10, cachedInputTokens: 8, cachedWriteInputTokens: 2, outputTokens: 5, + inputTokenSet: true, cachedInputTokenSet: true, cachedWriteInputTokenSet: true, outputTokenSet: true, }, nil) @@ -93,6 +94,10 @@ assert.Equal(t, uint64(1), count) assert.Equal(t, 8.0, sum) + count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedWriteInputAttrs) + assert.Equal(t, uint64(1), count) + assert.Equal(t, 2.0, sum) + count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, outputAttrs) assert.Equal(t, uint64(1), count) assert.Equal(t, 5.0, sum) @@ -295,7 +300,7 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { pm.SetRequestModel("req-model") pm.SetResponseModel("res-model") pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 2, cachedInputTokens: 1, outputTokens: 3, - inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, + inputTokens: 2, cachedInputTokens: 1, cachedWriteInputTokens: 6, outputTokens: 3, + inputTokenSet: true, cachedInputTokenSet: true, cachedWriteInputTokenSet: true, outputTokenSet: true, }, nil) @@ -323,6 +328,18 @@ assert.Equal(t, uint64(1), count) assert.Equal(t, 1.0, sum) + cachedWriteInputAttrs := attribute.NewSet( + attribute.Key(genaiAttributeOperationName).String(string(GenAIOperationCompletion)), + attribute.Key(genaiAttributeProviderName).String(genaiProviderOpenAI), +
attribute.Key(genaiAttributeOriginalModel).String("orig-model"), + attribute.Key(genaiAttributeRequestModel).String("req-model"), + attribute.Key(genaiAttributeResponseModel).String("res-model"), + attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput), + ) + count, sum = getHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedWriteInputAttrs) + assert.Equal(t, uint64(1), count) + assert.Equal(t, 6.0, sum) + outputAttrs := attribute.NewSet( attribute.Key(genaiAttributeOperationName).String(string(GenAIOperationCompletion)), attribute.Key(genaiAttributeProviderName).String(genaiProviderOpenAI), diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index 340e37c459..1232609a37 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -215,7 +215,7 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer ) input, _ := cost.InputTokens() cacheRead, _ := cost.CachedInputTokens() - cacheCreation, _ := cost.CachedIWritenputTokens() + cacheCreation, _ := cost.CachedWriteInputTokens() output, _ := cost.OutputTokens() total, _ := cost.TotalTokens() diff --git a/internal/translator/anthropic_anthropic.go b/internal/translator/anthropic_anthropic.go index e1462a32f8..bbbde960cc 100644 --- a/internal/translator/anthropic_anthropic.go +++ b/internal/translator/anthropic_anthropic.go @@ -181,6 +181,9 @@ func (a *anthropicToAnthropicTranslator) updateTotalTokens() { if _, cachedSet := a.streamingTokenUsage.CachedInputTokens(); !cachedSet { a.streamingTokenUsage.SetCachedInputTokens(0) } + if _, cachedSet := a.streamingTokenUsage.CachedWriteInputTokens(); !cachedSet { + a.streamingTokenUsage.SetCachedWriteInputTokens(0) + } } if inputSet && outputSet { diff --git a/internal/translator/anthropic_anthropic_test.go b/internal/translator/anthropic_anthropic_test.go index da8f220e6d..20fc793ab5 
100644 --- a/internal/translator/anthropic_anthropic_test.go +++ b/internal/translator/anthropic_anthropic_test.go @@ -95,7 +95,7 @@ func TestAnthropicToAnthropic_ResponseBody_non_streaming(t *testing.T) { require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - expected := tokenUsageFrom(9, 0, 16, 25) + expected := tokenUsageFrom(9, 0, -1, 16, 25) require.Equal(t, expected, tokenUsage) require.Equal(t, "claude-sonnet-4-5-20250929", responseModel) } @@ -141,7 +141,7 @@ data: {"type":"message_stop" }` require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - expected := tokenUsageFrom(10, 1, 0, 10) + expected := tokenUsageFrom(10, 1, 0, 0, 10) require.Equal(t, expected, tokenUsage) require.Equal(t, "claude-sonnet-4-5-20250929", responseModel) @@ -149,7 +149,7 @@ data: {"type":"message_stop" }` require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - expected = tokenUsageFrom(10, 1, 16, 26) + expected = tokenUsageFrom(10, 1, 0, 16, 26) require.Equal(t, expected, tokenUsage) require.Equal(t, "claude-sonnet-4-5-20250929", responseModel) } diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index 7116fd0261..875265d94d 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -467,7 +467,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_ZeroTokenUsage(t *testin _, _, tokenUsage, _, err := translator.ResponseBody(respHeaders, bodyReader, true, nil) require.NoError(t, err) - expected := tokenUsageFrom(0, 0, 0, 0) + expected := tokenUsageFrom(0, 0, 0, 0, 0) assert.Equal(t, expected, tokenUsage) } @@ -482,31 +482,31 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingTokenUsage(t *t name: "regular streaming chunk without usage", chunk: "event: content_block_delta\ndata: 
{\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" to me.\"}}\n\n", endOfStream: false, - expectedUsage: tokenUsageFrom(-1, -1, -1, -1), + expectedUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "message_delta chunk with token usage", chunk: "event: message_delta\ndata: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"end_turn\",\"stop_sequence\":null},\"usage\":{\"output_tokens\":84}}\n\n", endOfStream: false, - expectedUsage: tokenUsageFrom(0, 0, 84, 84), + expectedUsage: tokenUsageFrom(0, 0, 0, 84, 84), }, { name: "message_stop chunk without usage", chunk: "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n", endOfStream: false, - expectedUsage: tokenUsageFrom(-1, -1, -1, -1), + expectedUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "invalid json chunk", chunk: "event: invalid\ndata: {\"invalid\": \"json\"}\n\n", endOfStream: false, - expectedUsage: tokenUsageFrom(-1, -1, -1, -1), + expectedUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "message_delta with decimal output_tokens", chunk: "event: message_delta\ndata: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"tool_use\"},\"usage\":{\"output_tokens\":42.0}}\n\n", endOfStream: false, - expectedUsage: tokenUsageFrom(0, 0, 42, 42), + expectedUsage: tokenUsageFrom(0, 0, 0, 42, 42), }, } @@ -545,12 +545,12 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingEdgeCases(t *te { name: "message_delta without usage field", chunk: "event: message_delta\ndata: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"end_turn\"}}\n\n", - expectedUsage: tokenUsageFrom(0, 0, 0, 0), + expectedUsage: tokenUsageFrom(0, 0, 0, 0, 0), }, { name: "invalid json in data", chunk: "event: message_start\ndata: {invalid json}\n\n", - expectedUsage: tokenUsageFrom(-1, -1, -1, -1), + expectedUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, } @@ -570,7 +570,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingEdgeCases(t *te } 
} -func tokenUsageFrom(in, cachedInput, out, total int32) metrics.TokenUsage { +func tokenUsageFrom(in, cachedInput, cachedWriteInput, out, total int32) metrics.TokenUsage { var usage metrics.TokenUsage if in >= 0 { usage.SetInputTokens(uint32(in)) @@ -578,6 +578,9 @@ func tokenUsageFrom(in, cachedInput, out, total int32) metrics.TokenUsage { if cachedInput >= 0 { usage.SetCachedInputTokens(uint32(cachedInput)) } + if cachedWriteInput >= 0 { + usage.SetCachedWriteInputTokens(uint32(cachedWriteInput)) + } if out >= 0 { usage.SetOutputTokens(uint32(out)) } @@ -608,7 +611,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingFullScenario(t // 3. message_delta at the end provides output_tokens=5 but no input_tokens // 4. message_stop ends the stream messageStartChunk := `event: message_start -data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "output_tokens": 0}}} +data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cache_write_input_tokens": 1, "output_tokens": 0}}} ` contentBlockStartChunk := `event: content_block_start data: {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}} @@ -635,6 +638,7 @@ data: {"type": "message_stop"} outputTokens, outputSet := tokenUsage.OutputTokens() totalTokens, totalSet := tokenUsage.TotalTokens() cachedTokens, cachedSet := tokenUsage.CachedInputTokens() + cachedWriteTokens, cachedWriteSet := tokenUsage.CachedWriteInputTokens() // Assertions assert.True(t, inputSet, "Input tokens should be set") @@ -649,6 +653,9 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in 
this scenario") + assert.True(t, cachedWriteSet, "Cached write tokens should be set") + assert.Equal(t, uint32(1), cachedWriteTokens, "No cached write tokens in this scenario") + _, _, tokenUsage, _, err = translator.ResponseBody(nil, strings.NewReader(contentBlockStartChunk), false, nil) require.NoError(t, err) _, _, tokenUsage, _, err = translator.ResponseBody(nil, strings.NewReader(contentBlockDeltaChunk), false, nil) @@ -665,6 +672,7 @@ data: {"type": "message_stop"} outputTokens, outputSet = tokenUsage.OutputTokens() totalTokens, totalSet = tokenUsage.TotalTokens() cachedTokens, cachedSet = tokenUsage.CachedInputTokens() + cachedWriteTokens, cachedWriteSet = tokenUsage.CachedWriteInputTokens() assert.True(t, inputSet, "Input tokens should be set") assert.Equal(t, uint32(20), inputTokens, "Input tokens should be preserved from message_start") @@ -677,4 +685,7 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") + + assert.True(t, cachedWriteSet, "Cached write tokens should be set") + assert.Equal(t, uint32(1), cachedWriteTokens, "No cached write tokens in this scenario") } diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 427eff0c5a..bc1a8a063c 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -16,81 +16,88 @@ import ( func TestExtractLLMTokenUsage(t *testing.T) { tests := []struct { - name string - inputTokens int64 - outputTokens int64 - cacheReadTokens int64 - cacheCreationTokens int64 - expectedInputTokens uint32 - expectedOutputTokens uint32 - expectedTotalTokens uint32 - expectedCachedTokens uint32 + name string + inputTokens int64 + outputTokens int64 + cacheReadTokens int64 + cacheCreationTokens int64 + expectedInputTokens uint32 + expectedOutputTokens uint32 + expectedTotalTokens uint32 + expectedCachedTokens uint32 + 
expectedCachedWriteTokens uint32 }{ { - name: "basic usage without cache", - inputTokens: 100, - outputTokens: 50, - cacheReadTokens: 0, - cacheCreationTokens: 0, - expectedInputTokens: 100, - expectedOutputTokens: 50, - expectedTotalTokens: 150, - expectedCachedTokens: 0, + name: "basic usage without cache", + inputTokens: 100, + outputTokens: 50, + cacheReadTokens: 0, + cacheCreationTokens: 0, + expectedInputTokens: 100, + expectedOutputTokens: 50, + expectedTotalTokens: 150, + expectedCachedTokens: 0, + expectedCachedWriteTokens: 0, }, { - name: "usage with cache read tokens", - inputTokens: 80, - outputTokens: 30, - cacheReadTokens: 20, - cacheCreationTokens: 0, - expectedInputTokens: 100, // 80 + 0 + 20 - expectedOutputTokens: 30, - expectedTotalTokens: 130, // 100 + 30 - expectedCachedTokens: 20, // 20 + 0 + name: "usage with cache read tokens", + inputTokens: 80, + outputTokens: 30, + cacheReadTokens: 20, + cacheCreationTokens: 0, + expectedInputTokens: 100, // 80 + 0 + 20 + expectedOutputTokens: 30, + expectedTotalTokens: 130, // 100 + 30 + expectedCachedTokens: 20, // 20 + expectedCachedWriteTokens: 0, }, { - name: "usage with cache creation tokens", - inputTokens: 60, - outputTokens: 40, - cacheReadTokens: 0, - cacheCreationTokens: 15, - expectedInputTokens: 75, // 60 + 15 + 0 - expectedOutputTokens: 40, - expectedTotalTokens: 115, // 75 + 40 - expectedCachedTokens: 15, // 0 + 15 + name: "usage with cache creation tokens", + inputTokens: 60, + outputTokens: 40, + cacheReadTokens: 0, + cacheCreationTokens: 15, + expectedInputTokens: 75, // 60 + 15 + 0 + expectedOutputTokens: 40, + expectedTotalTokens: 115, // 75 + 40 + expectedCachedTokens: 0, // 0 + expectedCachedWriteTokens: 15, // 15 }, { - name: "usage with both cache types", - inputTokens: 70, - outputTokens: 25, - cacheReadTokens: 10, - cacheCreationTokens: 5, - expectedInputTokens: 85, // 70 + 5 + 10 - expectedOutputTokens: 25, - expectedTotalTokens: 110, // 85 + 25 - expectedCachedTokens: 15, // 10 
+ 5 + name: "usage with both cache types", + inputTokens: 70, + outputTokens: 25, + cacheReadTokens: 10, + cacheCreationTokens: 5, + expectedInputTokens: 85, // 70 + 5 + 10 + expectedOutputTokens: 25, + expectedTotalTokens: 110, // 85 + 25 + expectedCachedTokens: 10, // 10 + expectedCachedWriteTokens: 5, // 5 }, { - name: "zero values", - inputTokens: 0, - outputTokens: 0, - cacheReadTokens: 0, - cacheCreationTokens: 0, - expectedInputTokens: 0, - expectedOutputTokens: 0, - expectedTotalTokens: 0, - expectedCachedTokens: 0, + name: "zero values", + inputTokens: 0, + outputTokens: 0, + cacheReadTokens: 0, + cacheCreationTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 0, + expectedTotalTokens: 0, + expectedCachedTokens: 0, + expectedCachedWriteTokens: 0, }, { - name: "large values", - inputTokens: 100000, - outputTokens: 50000, - cacheReadTokens: 25000, - cacheCreationTokens: 15000, - expectedInputTokens: 140000, // 100000 + 15000 + 25000 - expectedOutputTokens: 50000, - expectedTotalTokens: 190000, // 140000 + 50000 - expectedCachedTokens: 40000, // 25000 + 15000 + name: "large values", + inputTokens: 100000, + outputTokens: 50000, + cacheReadTokens: 25000, + cacheCreationTokens: 15000, + expectedInputTokens: 140000, // 100000 + 15000 + 25000 + expectedOutputTokens: 50000, + expectedTotalTokens: 190000, // 140000 + 50000 + expectedCachedTokens: 25000, // 25000 + expectedCachedWriteTokens: 15000, }, } @@ -104,12 +111,12 @@ func TestExtractLLMTokenUsage(t *testing.T) { ) expected := tokenUsageFrom( - int32(tt.expectedInputTokens), // nolint:gosec - -1, - int32(tt.expectedOutputTokens), // nolint:gosec - int32(tt.expectedTotalTokens), // nolint:gosec + int32(tt.expectedInputTokens), // nolint:gosec + int32(tt.expectedCachedTokens), // nolint:gosec + int32(tt.expectedCachedWriteTokens), // nolint:gosec + int32(tt.expectedOutputTokens), // nolint:gosec + int32(tt.expectedTotalTokens), // nolint:gosec ) - expected.SetCachedInputTokens(tt.expectedCachedTokens) 
assert.Equal(t, expected, result) }) } @@ -117,12 +124,13 @@ func TestExtractLLMTokenUsage(t *testing.T) { func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.Usage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 + name string + usage anthropic.Usage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + expectedCachedTokens uint32 + expectedCachedWriteTokens uint32 }{ { name: "non-streaming response without cache", @@ -132,10 +140,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 0, CacheCreationInputTokens: 0, }, - expectedInputTokens: 150, - expectedOutputTokens: 75, - expectedTotalTokens: 225, - expectedCachedTokens: 0, + expectedInputTokens: 150, + expectedOutputTokens: 75, + expectedTotalTokens: 225, + expectedCachedTokens: 0, + expectedCachedWriteTokens: 0, }, { name: "non-streaming response with cache read", @@ -145,10 +154,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 25, CacheCreationInputTokens: 0, }, - expectedInputTokens: 125, // 100 + 0 + 25 - expectedOutputTokens: 50, - expectedTotalTokens: 175, // 125 + 50 - expectedCachedTokens: 25, // 25 + 0 + expectedInputTokens: 125, // 100 + 0 + 25 + expectedOutputTokens: 50, + expectedTotalTokens: 175, // 125 + 50 + expectedCachedTokens: 25, // 25 + expectedCachedWriteTokens: 0, // 0 }, { name: "non-streaming response with both cache types", @@ -158,10 +168,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 15, CacheCreationInputTokens: 10, }, - expectedInputTokens: 115, // 90 + 10 + 15 - expectedOutputTokens: 60, - expectedTotalTokens: 175, // 115 + 60 - expectedCachedTokens: 25, // 15 + 10 + expectedInputTokens: 115, // 90 + 10 + 15 + expectedOutputTokens: 60, + expectedTotalTokens: 175, // 115 + 60 + expectedCachedTokens: 25, // 15 + 
expectedCachedWriteTokens: 10, // 10 }, } @@ -172,8 +183,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, 0, tt.expectedOutputTokens, tt.expectedTotalTokens) - expected.SetCachedInputTokens(tt.expectedCachedTokens) + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) }) } @@ -181,12 +191,13 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.MessageDeltaUsage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 + name string + usage anthropic.MessageDeltaUsage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + expectedCachedTokens uint32 + expectedCachedWriteTokens uint32 }{ { name: "message_delta event with final totals", @@ -196,10 +207,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 30, CacheCreationInputTokens: 0, }, - expectedInputTokens: 280, // 250 + 0 + 30 - expectedOutputTokens: 120, - expectedTotalTokens: 400, // 280 + 120 - expectedCachedTokens: 30, // 30 + 0 + expectedInputTokens: 280, // 250 + 0 + 30 + expectedOutputTokens: 120, + expectedTotalTokens: 400, // 280 + 120 + expectedCachedTokens: 30, // 30 + expectedCachedWriteTokens: 0, }, { name: "message_delta event with only output tokens", @@ -209,10 +221,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 0, CacheCreationInputTokens: 0, }, - expectedInputTokens: 0, - expectedOutputTokens: 85, - expectedTotalTokens: 85, - expectedCachedTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 85, + expectedTotalTokens: 85, + 
expectedCachedTokens: 0, + expectedCachedWriteTokens: 0, }, { name: "message_delta with cache creation tokens", @@ -222,10 +235,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 10, CacheCreationInputTokens: 5, }, - expectedInputTokens: 165, // 150 + 5 + 10 - expectedOutputTokens: 75, - expectedTotalTokens: 240, // 165 + 75 - expectedCachedTokens: 15, // 10 + 5 + expectedInputTokens: 165, // 150 + 5 + 10 + expectedOutputTokens: 75, + expectedTotalTokens: 240, // 165 + 75 + expectedCachedTokens: 10, // 10 + expectedCachedWriteTokens: 5, // 5 }, } @@ -236,8 +250,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, 0, tt.expectedOutputTokens, tt.expectedTotalTokens) - expected.SetCachedInputTokens(tt.expectedCachedTokens) + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) }) } @@ -286,13 +299,15 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { assert.Equal(t, expectedTotalInput, inputTokensVal, "InputTokens should be sum of input_tokens + cache_creation_input_tokens + cache_read_input_tokens") - // Total cache should be sum of cache token types. 
- expectedCacheTokensInt := cacheCreationTokens + cacheReadTokens - expectedCacheTokens := uint32(expectedCacheTokensInt) // #nosec G115 - test values are small and safe cachedTokens, ok := result.CachedInputTokens() assert.True(t, ok) - assert.Equal(t, expectedCacheTokens, cachedTokens, - "CachedInputTokens should be sum of cache_creation_input_tokens + cache_read_input_tokens") + assert.Equal(t, cacheReadTokens, cachedTokens, + "CachedInputTokens should be cache_read_input_tokens") + + cachedWriteTokens, ok := result.CachedWriteInputTokens() + assert.True(t, ok) + assert.Equal(t, cacheCreationTokens, cachedWriteTokens, + "CachedWriteInputTokens should be cache_creation_input_tokens") // Total tokens should be input + output. expectedTotal := expectedTotalInput + uint32(outputTokens) diff --git a/internal/translator/cohere_rerank_v2_test.go b/internal/translator/cohere_rerank_v2_test.go index a1d64fa504..c3b93b818f 100644 --- a/internal/translator/cohere_rerank_v2_test.go +++ b/internal/translator/cohere_rerank_v2_test.go @@ -180,7 +180,7 @@ func TestCohereToCohereTranslatorV2Rerank_ResponseBody(t *testing.T) { } require.NoError(t, err) - expected := tokenUsageFrom(tc.expectedInput, -1, tc.expectedOutput, tc.expectedTotal) + expected := tokenUsageFrom(tc.expectedInput, -1, -1, tc.expectedOutput, tc.expectedTotal) require.Equal(t, expected, tokenUsage) require.Equal(t, "rerank-english-v3", responseModel) require.Nil(t, headerMutation) diff --git a/internal/translator/imagegeneration_openai_openai_test.go b/internal/translator/imagegeneration_openai_openai_test.go index d30190cd0e..c5b80da903 100644 --- a/internal/translator/imagegeneration_openai_openai_test.go +++ b/internal/translator/imagegeneration_openai_openai_test.go @@ -80,7 +80,7 @@ func TestOpenAIToOpenAIImageTranslator_ResponseBody_OK(t *testing.T) { require.NoError(t, err) require.Nil(t, hm) require.Nil(t, bm) - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), usage) + require.Equal(t, 
tokenUsageFrom(-1, -1, -1, -1, -1), usage) require.Empty(t, responseModel) } @@ -192,5 +192,5 @@ func TestOpenAIToOpenAIImageTranslator_ResponseBody_Usage(t *testing.T) { buf, _ := json.Marshal(resp) _, _, usage, _, err := tr.ResponseBody(map[string]string{}, bytes.NewReader(buf), true, nil) require.NoError(t, err) - require.Equal(t, tokenUsageFrom(40, -1, 60, 100), usage) + require.Equal(t, tokenUsageFrom(40, -1, -1, 60, 100), usage) } diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index cfc419171b..b566766eb9 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -707,6 +707,9 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string if usage.CacheReadInputTokens != nil { tokenUsage.SetCachedInputTokens(uint32(*usage.CacheReadInputTokens)) //nolint:gosec } + if usage.CacheWriteInputTokens != nil { + tokenUsage.SetCachedWriteInputTokens(uint32(*usage.CacheWriteInputTokens)) //nolint:gosec + } } oaiEvent, ok := o.convertEvent(event) if !ok { @@ -749,11 +752,16 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string PromptTokens: bedrockResp.Usage.InputTokens, CompletionTokens: bedrockResp.Usage.OutputTokens, } + if bedrockResp.Usage.CacheReadInputTokens != nil || bedrockResp.Usage.CacheWriteInputTokens != nil { + openAIResp.Usage.PromptTokensDetails = &openai.PromptTokensDetails{} + } if bedrockResp.Usage.CacheReadInputTokens != nil { tokenUsage.SetCachedInputTokens(uint32(*bedrockResp.Usage.CacheReadInputTokens)) //nolint:gosec - openAIResp.Usage.PromptTokensDetails = &openai.PromptTokensDetails{ - CachedTokens: *bedrockResp.Usage.CacheReadInputTokens, - } + openAIResp.Usage.PromptTokensDetails.CachedTokens = *bedrockResp.Usage.CacheReadInputTokens + } + if bedrockResp.Usage.CacheWriteInputTokens != nil { + tokenUsage.SetCachedWriteInputTokens(uint32(*bedrockResp.Usage.CacheWriteInputTokens)) //nolint:gosec + openAIResp.Usage.PromptTokensDetails.CachedWriteTokens =
*bedrockResp.Usage.CacheWriteInputTokens } } diff --git a/internal/translator/openai_awsbedrock_test.go b/internal/translator/openai_awsbedrock_test.go index f469d9fa11..e97e39219a 100644 --- a/internal/translator/openai_awsbedrock_test.go +++ b/internal/translator/openai_awsbedrock_test.go @@ -1715,14 +1715,18 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) expectedUsage = tokenUsageFrom( int32(tt.output.Usage.PromptTokens), // nolint:gosec -1, + -1, int32(tt.output.Usage.CompletionTokens), // nolint:gosec int32(tt.output.Usage.TotalTokens), // nolint:gosec ) if tt.input.Usage.CacheReadInputTokens != nil { expectedUsage.SetCachedInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec } + if tt.input.Usage.CacheWriteInputTokens != nil { + expectedUsage.SetCachedWriteInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec + } } else { - expectedUsage = tokenUsageFrom(-1, -1, -1, -1) + expectedUsage = tokenUsageFrom(-1, -1, -1, -1, -1) } require.Equal(t, expectedUsage, usedToken) }) diff --git a/internal/translator/openai_azureopenai_embeddings_test.go b/internal/translator/openai_azureopenai_embeddings_test.go index 4deffa3bde..7ee6dcae59 100644 --- a/internal/translator/openai_azureopenai_embeddings_test.go +++ b/internal/translator/openai_azureopenai_embeddings_test.go @@ -111,19 +111,19 @@ func TestOpenAIToAzureOpenAITranslatorV1EmbeddingResponseBody(t *testing.T) { "total_tokens": 8 } }`, - expTokenUsage: tokenUsageFrom(8, -1, -1, 8), + expTokenUsage: tokenUsageFrom(8, -1, -1, -1, 8), }, { name: "invalid_json", responseBody: `invalid json`, expError: true, - expTokenUsage: tokenUsageFrom(-1, -1, -1, -1), + expTokenUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "error_response", responseBody: `{"error": {"message": "Invalid input", "type": "BadRequestError"}}`, responseStatus: "400", - expTokenUsage: tokenUsageFrom(0, -1, -1, 0), + expTokenUsage: 
tokenUsageFrom(0, -1, -1, -1, 0), }, } { t.Run(tc.name, func(t *testing.T) { diff --git a/internal/translator/openai_azureopenai_test.go b/internal/translator/openai_azureopenai_test.go index 972b3647d1..c95597f541 100644 --- a/internal/translator/openai_azureopenai_test.go +++ b/internal/translator/openai_azureopenai_test.go @@ -72,7 +72,7 @@ func TestResponseModel_AzureOpenAI(t *testing.T) { _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewBuffer(body), true, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-2024-11-20", responseModel) // Uses response field as authoritative - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) } // TestResponseModel_AzureOpenAIStreaming tests Azure OpenAI streaming returns actual model version @@ -103,5 +103,5 @@ data: [DONE] _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewReader([]byte(sseChunks)), true, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-2024-11-20", responseModel) // Returns actual versioned model from response - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) } diff --git a/internal/translator/openai_completions.go b/internal/translator/openai_completions.go index 30069ba921..3754dc2bff 100644 --- a/internal/translator/openai_completions.go +++ b/internal/translator/openai_completions.go @@ -171,7 +171,8 @@ func (o *openAIToOpenAITranslatorV1Completion) extractUsageFromBufferEvent(span tokenUsage.SetOutputTokens(uint32(usage.CompletionTokens)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(usage.TotalTokens)) //nolint:gosec if usage.PromptTokensDetails != nil { - tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + 
tokenUsage.SetCachedWriteInputTokens(uint32(usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec } // Do not mark buffering done; keep scanning to return the latest usage in this batch. } diff --git a/internal/translator/openai_completions_test.go b/internal/translator/openai_completions_test.go index 3bc6b351a6..cecc894a66 100644 --- a/internal/translator/openai_completions_test.go +++ b/internal/translator/openai_completions_test.go @@ -133,14 +133,14 @@ func TestOpenAIToOpenAITranslatorV1CompletionResponseBody(t *testing.T) { "total_tokens": 13 } }`, - expTokenUsage: tokenUsageFrom(5, -1, 8, 13), + expTokenUsage: tokenUsageFrom(5, -1, -1, 8, 13), expModel: "gpt-3.5-turbo-instruct", }, { name: "invalid_json", responseBody: `invalid json`, expError: true, - expTokenUsage: tokenUsageFrom(-1, -1, -1, -1), + expTokenUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "response_without_usage", @@ -157,7 +157,7 @@ func TestOpenAIToOpenAITranslatorV1CompletionResponseBody(t *testing.T) { } ] }`, - expTokenUsage: tokenUsageFrom(-1, -1, -1, -1), + expTokenUsage: tokenUsageFrom(-1, -1, -1, -1, -1), expModel: "gpt-3.5-turbo-instruct", }, } { @@ -225,7 +225,7 @@ data: [DONE] require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), tokenUsage) + require.Equal(t, tokenUsageFrom(-1, -1, -1, -1, -1), tokenUsage) require.Equal(t, "gpt-3.5-turbo-instruct", responseModel) // Process chunk2. @@ -238,7 +238,7 @@ data: [DONE] require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), tokenUsage) + require.Equal(t, tokenUsageFrom(-1, -1, -1, -1, -1), tokenUsage) require.Equal(t, "gpt-3.5-turbo-instruct", responseModel) // Process chunk3 with usage. 
@@ -251,7 +251,7 @@ data: [DONE] require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - require.Equal(t, tokenUsageFrom(5, -1, 3, 8), tokenUsage) + require.Equal(t, tokenUsageFrom(5, -1, -1, 3, 8), tokenUsage) require.Equal(t, "gpt-3.5-turbo-instruct", responseModel) } diff --git a/internal/translator/openai_embeddings_test.go b/internal/translator/openai_embeddings_test.go index f1750befc2..8f4c9517d1 100644 --- a/internal/translator/openai_embeddings_test.go +++ b/internal/translator/openai_embeddings_test.go @@ -113,19 +113,19 @@ func TestOpenAIToOpenAITranslatorV1EmbeddingResponseBody(t *testing.T) { "total_tokens": 8 } }`, - expTokenUsage: tokenUsageFrom(8, -1, -1, 8), + expTokenUsage: tokenUsageFrom(8, -1, -1, -1, 8), }, { name: "invalid_json", responseBody: `invalid json`, expError: true, - expTokenUsage: tokenUsageFrom(-1, -1, -1, -1), + expTokenUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "error_response", responseBody: `{"error": {"message": "Invalid input", "type": "BadRequestError"}}`, responseStatus: "400", - expTokenUsage: tokenUsageFrom(0, -1, -1, 0), + expTokenUsage: tokenUsageFrom(0, -1, -1, -1, 0), }, } { t.Run(tc.name, func(t *testing.T) { diff --git a/internal/translator/openai_gcpanthropic.go b/internal/translator/openai_gcpanthropic.go index 2127d64f9c..7e11451f31 100644 --- a/internal/translator/openai_gcpanthropic.go +++ b/internal/translator/openai_gcpanthropic.go @@ -839,12 +839,14 @@ func (o *openAIToGCPAnthropicTranslatorV1ChatCompletion) ResponseBody(_ map[stri outputTokens, _ := tokenUsage.OutputTokens() totalTokens, _ := tokenUsage.TotalTokens() cachedTokens, _ := tokenUsage.CachedInputTokens() + cacheWriteTokens, _ := tokenUsage.CachedWriteInputTokens() openAIResp.Usage = openai.Usage{ CompletionTokens: int(outputTokens), PromptTokens: int(inputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), + CachedTokens: 
int(cachedTokens), + CachedWriteTokens: int(cacheWriteTokens), }, } diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 17b6f11f75..191b436545 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -110,6 +110,7 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t p.tokenUsage.SetTotalTokens(inputTokens + outputTokens) totalTokens, _ := p.tokenUsage.TotalTokens() cachedTokens, _ := p.tokenUsage.CachedInputTokens() + cachedWriteTokens, _ := p.tokenUsage.CachedWriteInputTokens() finalChunk := openai.ChatCompletionResponseChunk{ ID: p.activeMessageID, Created: p.created, @@ -120,7 +121,8 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t CompletionTokens: int(outputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), + CachedTokens: int(cachedTokens), + CachedWriteTokens: int(cachedWriteTokens), }, }, Model: p.requestModel, @@ -289,12 +291,13 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat if output, ok := usage.OutputTokens(); ok { p.tokenUsage.AddOutputTokens(output) } - // Update input tokens to include any cache tokens from delta + // Update input tokens to include read cache tokens from delta if cached, ok := usage.CachedInputTokens(); ok { p.tokenUsage.AddInputTokens(cached) // Accumulate any additional cache tokens from delta p.tokenUsage.AddCachedInputTokens(cached) } + // Update input tokens to include write cache tokens from delta if cached, ok := usage.CachedWriteInputTokens(); ok { p.tokenUsage.AddInputTokens(cached) // Accumulate any additional cache tokens from delta diff --git a/internal/translator/openai_gcpanthropic_test.go b/internal/translator/openai_gcpanthropic_test.go index adaaa03fa0..ea0a11d979 100644 --- 
a/internal/translator/openai_gcpanthropic_test.go +++ b/internal/translator/openai_gcpanthropic_test.go @@ -598,12 +598,12 @@ func TestOpenAIToGCPAnthropicTranslatorV1ChatCompletion_ResponseBody(t *testing. require.NoError(t, err) expectedTokenUsage := tokenUsageFrom( - int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec - -1, - int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec + uint32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec ) - expectedTokenUsage.SetCachedInputTokens(uint32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec require.Equal(t, expectedTokenUsage, usedToken) if diff := cmp.Diff(tt.expectedOpenAIResponse, gotResp, cmpopts.IgnoreFields(openai.ChatCompletionResponse{}, "Created")); diff != "" { diff --git a/internal/translator/openai_gcpvertexai.go b/internal/translator/openai_gcpvertexai.go index 716ced8cdd..232ab836cb 100644 --- a/internal/translator/openai_gcpvertexai.go +++ b/internal/translator/openai_gcpvertexai.go @@ -170,6 +170,8 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) ResponseBody(_ map[strin tokenUsage.SetOutputTokens(uint32(gcpResp.UsageMetadata.CandidatesTokenCount)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(gcpResp.UsageMetadata.TotalTokenCount)) //nolint:gosec tokenUsage.SetCachedInputTokens(uint32(gcpResp.UsageMetadata.CachedContentTokenCount)) //nolint:gosec + // Gemini does not return cached write input tokens, set to 0. 
+ tokenUsage.SetCachedWriteInputTokens(0) } if span != nil { diff --git a/internal/translator/openai_gcpvertexai_test.go b/internal/translator/openai_gcpvertexai_test.go index 033b8782b9..93740102ce 100644 --- a/internal/translator/openai_gcpvertexai_test.go +++ b/internal/translator/openai_gcpvertexai_test.go @@ -913,7 +913,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 25 } }`), - wantTokenUsage: tokenUsageFrom(10, 10, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, -1, 15, 25), }, { name: "response with safety ratings", @@ -993,7 +993,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 20 } }`), - wantTokenUsage: tokenUsageFrom(8, 0, 12, 20), + wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), }, { name: "empty response", @@ -1005,7 +1005,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T wantError: false, wantHeaderMut: []internalapi.Header{{contentLengthHeaderName, "28"}}, wantBodyMut: []byte(`{"object":"chat.completion"}`), - wantTokenUsage: tokenUsageFrom(-1, -1, -1, -1), + wantTokenUsage: tokenUsageFrom(-1, -1, -1, -1, -1), }, { name: "single stream chunk response", @@ -1025,7 +1025,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), }, { name: "response with model version field", @@ -1080,7 +1080,7 @@ data: [DONE] "total_tokens": 14 } }`), - wantTokenUsage: tokenUsageFrom(6, 0, 8, 14), + wantTokenUsage: tokenUsageFrom(6, 0, -1, 8, 14), }, { @@ -1149,7 +1149,7 @@ data: [DONE] "total_tokens": 20 } }`), - wantTokenUsage: tokenUsageFrom(8, 0, 12, 20), + wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), }, { name: "response with thought summary", @@ -1214,7 +1214,7 @@ data: [DONE] } }`), - wantTokenUsage: tokenUsageFrom(10, 10, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, 
-1, 15, 25), }, { name: "stream chunks with thought summary", @@ -1236,7 +1236,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), }, } @@ -1355,7 +1355,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_StreamingResponseBody(t * print(bodyStr) require.Contains(t, bodyStr, "data: ") require.Contains(t, bodyStr, "chat.completion.chunk") - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), tokenUsage) // No usage in this test chunk. + require.Equal(t, tokenUsageFrom(-1, -1, -1, -1, -1), tokenUsage) // No usage in this test chunk. }) } } diff --git a/internal/translator/openai_openai.go b/internal/translator/openai_openai.go index 277f2e982f..9ad3084b3b 100644 --- a/internal/translator/openai_openai.go +++ b/internal/translator/openai_openai.go @@ -141,7 +141,8 @@ func (o *openAIToOpenAITranslatorV1ChatCompletion) ResponseBody(_ map[string]str tokenUsage.SetOutputTokens(uint32(resp.Usage.CompletionTokens)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) //nolint:gosec if resp.Usage.PromptTokensDetails != nil { - tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCachedWriteInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec } // Fallback to request model for test or non-compliant OpenAI backends responseModel = cmp.Or(resp.Model, o.requestModel) diff --git a/internal/translator/openai_openai_test.go b/internal/translator/openai_openai_test.go index a90160fb22..8d95cd503b 100644 --- a/internal/translator/openai_openai_test.go +++ b/internal/translator/openai_openai_test.go @@ -49,7 +49,7 @@ data: [DONE] _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, 
bytes.NewReader([]byte(sseChunks)), true, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-2024-11-20", responseModel) // Returns actual versioned model - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) } // TestResponseModel_EmptyFallback tests the fallback to request model when response model is empty @@ -83,7 +83,7 @@ func TestResponseModel_EmptyFallback(t *testing.T) { _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewReader([]byte(responseJSON)), false, nil) require.NoError(t, err) require.Equal(t, "gpt-4o", responseModel) // Falls back to request model - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) }) t.Run("streaming", func(t *testing.T) { @@ -112,7 +112,7 @@ data: [DONE] _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewReader([]byte(sseChunks)), true, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-mini", responseModel) // Falls back to request model - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) }) t.Run("with model override", func(t *testing.T) { @@ -148,7 +148,7 @@ data: [DONE] _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewReader([]byte(responseJSON)), false, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-2024-11-20", responseModel) // Falls back to overridden model - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) }) } @@ -358,7 +358,7 @@ data: [DONE] o := &openAIToOpenAITranslatorV1ChatCompletion{} _, _, usedToken, _, err := o.ResponseBody(nil, bytes.NewBuffer(body), false, s) require.NoError(t, err) - require.Equal(t, tokenUsageFrom(0, -1, 0, 42), usedToken) + require.Equal(t, tokenUsageFrom(0, -1, -1, 0, 42), usedToken) 
require.Equal(t, &resp, s.Resp) }) t.Run("valid body with different response model", func(t *testing.T) { @@ -373,7 +373,7 @@ data: [DONE] o := &openAIToOpenAITranslatorV1ChatCompletion{} _, _, usedToken, _, err := o.ResponseBody(nil, bytes.NewBuffer(body), false, s) require.NoError(t, err) - require.Equal(t, tokenUsageFrom(10, -1, 20, 30), usedToken) + require.Equal(t, tokenUsageFrom(10, -1, -1, 20, 30), usedToken) require.Equal(t, &resp, s.Resp) }) }) @@ -397,7 +397,7 @@ data: [DONE] o := &openAIToOpenAITranslatorV1ChatCompletion{} _, _, usedToken, _, err := o.ResponseBody(nil, bytes.NewBuffer(body), false, s) require.NoError(t, err) - require.Equal(t, tokenUsageFrom(0, -1, 0, 42), usedToken) + require.Equal(t, tokenUsageFrom(0, -1, -1, 0, 42), usedToken) require.Equal(t, &resp, s.Resp) }) }) @@ -409,7 +409,7 @@ func TestExtractUsageFromBufferEvent(t *testing.T) { o := &openAIToOpenAITranslatorV1ChatCompletion{} o.buffered = []byte("data: {\"usage\": {\"total_tokens\": 42}}\n") usedToken := o.extractUsageFromBufferEvent(s) - require.Equal(t, tokenUsageFrom(0, -1, 0, 42), usedToken) + require.Equal(t, tokenUsageFrom(0, -1, -1, 0, 42), usedToken) require.Empty(t, o.buffered) require.Len(t, s.RespChunks, 1) }) @@ -418,7 +418,7 @@ func TestExtractUsageFromBufferEvent(t *testing.T) { o := &openAIToOpenAITranslatorV1ChatCompletion{} o.buffered = []byte("data: invalid\ndata: {\"usage\": {\"total_tokens\": 42}}\n") usedToken := o.extractUsageFromBufferEvent(nil) - require.Equal(t, tokenUsageFrom(0, -1, 0, 42), usedToken) + require.Equal(t, tokenUsageFrom(0, -1, -1, 0, 42), usedToken) require.Empty(t, o.buffered) }) @@ -426,12 +426,12 @@ func TestExtractUsageFromBufferEvent(t *testing.T) { o := &openAIToOpenAITranslatorV1ChatCompletion{} o.buffered = []byte("data: {}\n\ndata: ") usedToken := o.extractUsageFromBufferEvent(nil) - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), usedToken) + require.Equal(t, tokenUsageFrom(-1, -1, -1, -1, -1), usedToken) 
require.GreaterOrEqual(t, len(o.buffered), 1) o.buffered = append(o.buffered, []byte("{\"usage\": {\"total_tokens\": 42}}\n")...) usedToken = o.extractUsageFromBufferEvent(nil) - require.Equal(t, tokenUsageFrom(0, -1, 0, 42), usedToken) + require.Equal(t, tokenUsageFrom(0, -1, -1, 0, 42), usedToken) require.Empty(t, o.buffered) }) @@ -439,7 +439,7 @@ func TestExtractUsageFromBufferEvent(t *testing.T) { o := &openAIToOpenAITranslatorV1ChatCompletion{} o.buffered = []byte("data: invalid\n") usedToken := o.extractUsageFromBufferEvent(nil) - require.Equal(t, tokenUsageFrom(-1, -1, -1, -1), usedToken) + require.Equal(t, tokenUsageFrom(-1, -1, -1, -1, -1), usedToken) require.Empty(t, o.buffered) }) } @@ -461,7 +461,7 @@ func TestResponseModel_OpenAI(t *testing.T) { _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewBuffer(body), true, nil) require.NoError(t, err) require.Equal(t, "gpt-4o-2024-08-06", responseModel) - require.Equal(t, tokenUsageFrom(10, -1, 5, 15), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, 5, 15), tokenUsage) } // TestResponseModel_OpenAIEmbeddings tests OpenAI embeddings (not virtualized but has response field) @@ -480,5 +480,5 @@ func TestResponseModel_OpenAIEmbeddings(t *testing.T) { _, _, tokenUsage, responseModel, err := translator.ResponseBody(nil, bytes.NewReader(body), true, nil) require.NoError(t, err) require.Equal(t, "text-embedding-ada-002", responseModel) // Uses response field as authoritative - require.Equal(t, tokenUsageFrom(10, -1, -1, 10), tokenUsage) + require.Equal(t, tokenUsageFrom(10, -1, -1, -1, 10), tokenUsage) } diff --git a/internal/translator/openai_responses.go b/internal/translator/openai_responses.go index 0accb13b45..9550d0ce4f 100644 --- a/internal/translator/openai_responses.go +++ b/internal/translator/openai_responses.go @@ -128,10 +128,11 @@ func (o *openAIToOpenAITranslatorV1Responses) handleNonStreamingResponse(body io // TODO: Add reasoning token usage if resp.Usage != nil { 
- tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 - tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 - tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 - tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 + tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 + tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 + tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 + tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 + tokenUsage.SetCachedWriteInputTokens(uint32(resp.Usage.InputTokensDetails.CachedWriteTokens)) // #nosec G115 } // Record non-streaming response to span if tracing is enabled. @@ -178,6 +179,8 @@ func (o *openAIToOpenAITranslatorV1Responses) extractUsageFromBufferEvent(span t tokenUsage.SetOutputTokens(uint32(respComplEvent.Response.Usage.OutputTokens)) // #nosec G115 tokenUsage.SetTotalTokens(uint32(respComplEvent.Response.Usage.TotalTokens)) // #nosec G115 tokenUsage.SetCachedInputTokens(uint32(respComplEvent.Response.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 + // OpenAI does not report cache write tokens in responses, so set to 0. + tokenUsage.SetCachedWriteInputTokens(uint32(0)) // #nosec G115 // Record streaming chunk to span if tracing is enabled. 
if span != nil { diff --git a/internal/translator/openai_responses_test.go b/internal/translator/openai_responses_test.go index f136d74ebe..80c574d51e 100644 --- a/internal/translator/openai_responses_test.go +++ b/internal/translator/openai_responses_test.go @@ -246,6 +246,10 @@ func TestResponsesOpenAIToOpenAITranslator_ResponseBody(t *testing.T) { cachedTokens, ok := tokenUsage.CachedInputTokens() require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) + + cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + require.True(t, ok) + require.Equal(t, uint32(0), cachedWriteTokens) }) t.Run("non-streaming response with fallback model", func(t *testing.T) { @@ -358,6 +362,10 @@ data: [DONE] cachedTokens, ok := tokenUsage.CachedInputTokens() require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) + + cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + require.True(t, ok) + require.Equal(t, uint32(0), cachedWriteTokens) }) t.Run("streaming response with fallback model", func(t *testing.T) { @@ -453,6 +461,10 @@ data: [DONE] cachedTokens, _ := tokenUsage.CachedInputTokens() require.Equal(t, uint32(2), cachedTokens) + + cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + require.True(t, ok) + require.Equal(t, uint32(0), cachedWriteTokens) }) t.Run("streaming read error", func(t *testing.T) { @@ -541,6 +553,10 @@ func TestResponses_HandleNonStreamingResponse(t *testing.T) { cachedTokens, _ := tokenUsage.CachedInputTokens() require.Equal(t, uint32(2), cachedTokens) + + cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + require.True(t, ok) + require.Equal(t, uint32(0), cachedWriteTokens) }) t.Run("invalid JSON", func(t *testing.T) { @@ -602,6 +618,10 @@ data: [DONE] cachedTokens, ok := tokenUsage.CachedInputTokens() require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) + + cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + require.True(t, ok) + require.Equal(t, uint32(0), cachedWriteTokens) }) 
t.Run("model extraction", func(t *testing.T) { @@ -666,9 +686,11 @@ data: [DONE] _, outputSet := tokenUsage.OutputTokens() _, totalSet := tokenUsage.TotalTokens() _, cachedSet := tokenUsage.CachedInputTokens() + _, cachedWriteSet := tokenUsage.CachedWriteInputTokens() require.False(t, totalSet) require.False(t, cachedSet) + require.False(t, cachedWriteSet) require.False(t, inputSet) require.False(t, outputSet) }) diff --git a/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml b/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml index b6f1733910..4407e6ae18 100644 --- a/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml +++ b/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml @@ -31,6 +31,8 @@ spec: type: InputToken - metadataKey: llm_input_cached_token type: CachedInputToken + - metadataKey: llm_write_input_cached_token + type: CachedWriteInputToken - metadataKey: llm_output_token type: OutputToken - metadataKey: llm_total_token From 6ed2f434137c76b86bafaa179254057299e1d5cf Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 15:16:21 -0500 Subject: [PATCH 03/20] cache for aws; Signed-off-by: Aaron Choo --- internal/translator/openai_awsbedrock.go | 14 ++++++++------ internal/translator/openai_awsbedrock_test.go | 12 +++++++----- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index b566766eb9..56b4870151 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -853,14 +853,16 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) convertEvent(event *awsbe return chunk, false } chunk.Usage = &openai.Usage{ - TotalTokens: event.Usage.TotalTokens, - PromptTokens: event.Usage.InputTokens, - CompletionTokens: event.Usage.OutputTokens, + TotalTokens: event.Usage.TotalTokens, + PromptTokens: event.Usage.InputTokens, + CompletionTokens: event.Usage.OutputTokens, + PromptTokensDetails: 
&openai.PromptTokensDetails{}, } if event.Usage.CacheReadInputTokens != nil { - chunk.Usage.PromptTokensDetails = &openai.PromptTokensDetails{ - CachedTokens: *event.Usage.CacheReadInputTokens, - } + chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens + } + if event.Usage.CacheWriteInputTokens != nil { + chunk.Usage.PromptTokensDetails.CachedWriteTokens = *event.Usage.CacheWriteInputTokens } // messageStart event. case awsbedrock.ConverseStreamEventTypeMessageStart.String(): diff --git a/internal/translator/openai_awsbedrock_test.go b/internal/translator/openai_awsbedrock_test.go index e97e39219a..e780b8a34c 100644 --- a/internal/translator/openai_awsbedrock_test.go +++ b/internal/translator/openai_awsbedrock_test.go @@ -1447,10 +1447,11 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) name: "basic_testing", input: awsbedrock.ConverseResponse{ Usage: &awsbedrock.TokenUsage{ - InputTokens: 10, - OutputTokens: 20, - TotalTokens: 30, - CacheReadInputTokens: ptr.To(5), + InputTokens: 10, + OutputTokens: 20, + TotalTokens: 30, + CacheReadInputTokens: ptr.To(5), + CacheWriteInputTokens: ptr.To(7), }, Output: &awsbedrock.ConverseOutput{ Message: awsbedrock.Message{ @@ -1473,7 +1474,8 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) PromptTokens: 10, CompletionTokens: 20, PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: 5, + CachedTokens: 5, + CachedWriteTokens: 7, }, }, Choices: []openai.ChatCompletionResponseChoice{ From 21dc66c5dbb12262028af200e7cb27d1a730200f Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 15:26:09 -0500 Subject: [PATCH 04/20] fix cel Signed-off-by: Aaron Choo --- internal/llmcostcel/cel.go | 2 +- internal/llmcostcel/cel_test.go | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/internal/llmcostcel/cel.go b/internal/llmcostcel/cel.go index 0c2f19f913..2d2f4ad834 100644 --- 
a/internal/llmcostcel/cel.go +++ b/internal/llmcostcel/cel.go @@ -56,7 +56,7 @@ func NewProgram(expr string) (prog cel.Program, err error) { } // Sanity check by evaluating the expression with some dummy values. - _, err = EvaluateProgram(prog, "dummy", "dummy", 0, 0, 0, 0) + _, err = EvaluateProgram(prog, "dummy", "dummy", 0, 0, 0, 0, 0) if err != nil { return nil, fmt.Errorf("failed to evaluate CEL expression: %w", err) } diff --git a/internal/llmcostcel/cel_test.go b/internal/llmcostcel/cel_test.go index 7730b181fb..79835b7354 100644 --- a/internal/llmcostcel/cel_test.go +++ b/internal/llmcostcel/cel_test.go @@ -26,13 +26,13 @@ func TestNewProgram(t *testing.T) { require.NoError(t, err) }) t.Run("variables", func(t *testing.T) { - prog, err := NewProgram("model == 'cool_model' ? (input_tokens - cached_input_tokens) * output_tokens : total_tokens") + prog, err := NewProgram("model == 'cool_model' ? (input_tokens - cached_input_tokens - cached_write_input_tokens) * output_tokens : total_tokens") require.NoError(t, err) - v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 200, 100, 2, 3) + v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 200, 100, 1, 2, 3) require.NoError(t, err) - require.Equal(t, uint64(200), v) + require.Equal(t, uint64(198), v) - v, err = EvaluateProgram(prog, "not_cool_model", "cool_backend", 200, 100, 2, 3) + v, err = EvaluateProgram(prog, "not_cool_model", "cool_backend", 200, 100, 1, 2, 3) require.NoError(t, err) require.Equal(t, uint64(3), v) }) @@ -59,13 +59,13 @@ func TestEvaluateProgram(t *testing.T) { t.Run("signed integer negative", func(t *testing.T) { prog, err := NewProgram("int(input_tokens) - int(output_tokens)") require.NoError(t, err) - _, err = EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 2000, 3) + _, err = EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 0, 2000, 3) require.ErrorContains(t, err, "CEL expression result is negative (-1900)") }) t.Run("unsigned integer 
overflow", func(t *testing.T) { prog, err := NewProgram("input_tokens - output_tokens") require.NoError(t, err) - _, err = EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 2000, 3) + _, err = EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 0, 2000, 3) require.ErrorContains(t, err, "failed to evaluate CEL expression: unsigned integer overflow") }) t.Run("ensure concurrency safety", func(t *testing.T) { @@ -76,7 +76,7 @@ func TestEvaluateProgram(t *testing.T) { synctest.Test(t, func(t *testing.T) { for range 100 { go func() { - v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 2, 3) + v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 100, 0, 0, 2, 3) require.NoError(t, err) require.Equal(t, uint64(200), v) }() From 4c1dc960a51e40cb10101f897ec0b8e8991162cb Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 15:27:07 -0500 Subject: [PATCH 05/20] fix -1 Signed-off-by: Aaron Choo --- internal/translator/anthropic_anthropic_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/translator/anthropic_anthropic_test.go b/internal/translator/anthropic_anthropic_test.go index 20fc793ab5..236dd537d9 100644 --- a/internal/translator/anthropic_anthropic_test.go +++ b/internal/translator/anthropic_anthropic_test.go @@ -95,7 +95,7 @@ func TestAnthropicToAnthropic_ResponseBody_non_streaming(t *testing.T) { require.NoError(t, err) require.Nil(t, headerMutation) require.Nil(t, bodyMutation) - expected := tokenUsageFrom(9, 0, -1, 16, 25) + expected := tokenUsageFrom(9, 0, 0, 16, 25) require.Equal(t, expected, tokenUsage) require.Equal(t, "claude-sonnet-4-5-20250929", responseModel) } From c58a29a4f574eaad5060d01f4fd7cdbdaffb4242 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:09:22 -0500 Subject: [PATCH 06/20] find+replace cache write with cache creation Signed-off-by: Aaron Choo --- api/v1alpha1/ai_gateway_route.go | 4 +- api/v1alpha1/shared_types.go | 12 +- 
examples/token_ratelimit/token_ratelimit.yaml | 4 +- internal/apischema/anthropic/anthropic.go | 2 +- internal/apischema/openai/openai.go | 8 +- internal/apischema/openai/openai_test.go | 38 +-- internal/controller/gateway.go | 4 +- internal/controller/gateway_test.go | 4 +- internal/extproc/mocks_test.go | 28 +- internal/extproc/processor_impl.go | 2 +- internal/extproc/processor_impl_test.go | 6 +- internal/filterapi/filterconfig.go | 4 +- internal/llmcostcel/cel.go | 32 +-- internal/llmcostcel/cel_test.go | 2 +- internal/metrics/genai.go | 6 +- internal/metrics/metrics.go | 48 ++-- internal/metrics/metrics_impl.go | 2 +- internal/metrics/metrics_impl_test.go | 12 +- .../openinference/anthropic/messages.go | 2 +- .../openinference/openai/response_attrs.go | 6 +- internal/translator/anthropic_anthropic.go | 8 +- .../translator/anthropic_gcpanthropic_test.go | 16 +- internal/translator/anthropic_usage_test.go | 256 +++++++++--------- internal/translator/openai_awsbedrock.go | 8 +- internal/translator/openai_awsbedrock_test.go | 4 +- internal/translator/openai_completions.go | 4 +- internal/translator/openai_gcpanthropic.go | 4 +- .../translator/openai_gcpanthropic_stream.go | 12 +- internal/translator/openai_gcpvertexai.go | 4 +- internal/translator/openai_openai.go | 4 +- internal/translator/openai_responses.go | 2 +- .../testdata/aigatewayroutes/llmcosts.yaml | 2 +- 32 files changed, 275 insertions(+), 275 deletions(-) diff --git a/api/v1alpha1/ai_gateway_route.go b/api/v1alpha1/ai_gateway_route.go index b4dac38660..6404f8e8a6 100644 --- a/api/v1alpha1/ai_gateway_route.go +++ b/api/v1alpha1/ai_gateway_route.go @@ -108,8 +108,8 @@ type AIGatewayRouteSpec struct { // type: TotalToken // - metadataKey: llm_cached_input_token // type: CachedInputToken - // - metadataKey: llm_cached_write_input_token - // type: CachedWriteInputToken + // - metadataKey: llm_cache_creation_input_token + // type: CachedCreationInputToken // ``` // Then, with the following 
BackendTrafficPolicy of Envoy Gateway, you can have three // rate limit buckets for each unique x-user-id header value. One bucket is for the input token, diff --git a/api/v1alpha1/shared_types.go b/api/v1alpha1/shared_types.go index 7a34cf4128..2e391498dd 100644 --- a/api/v1alpha1/shared_types.go +++ b/api/v1alpha1/shared_types.go @@ -100,9 +100,9 @@ type LLMRequestCost struct { MetadataKey string `json:"metadataKey"` // Type specifies the type of the request cost. The default is "OutputToken", // and it uses "output token" as the cost. The other types are "InputToken", "TotalToken", - // "CachedInputToken", "CachedWriteInputToken", and "CEL". + // "CachedInputToken", "CachedCreationInputToken", and "CEL". // - // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;CachedWriteInputToken;TotalToken;CEL + // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;CachedCreationInputToken;TotalToken;CEL Type LLMRequestCostType `json:"type"` // CEL is the CEL expression to calculate the cost of the request. // The CEL expression must return a signed or unsigned integer. If the @@ -114,7 +114,7 @@ type LLMRequestCost struct { // * backend: the backend name in the form of "name.namespace". Type: string. // * input_tokens: the number of input tokens. Type: unsigned integer. // * cached_input_tokens: the number of cached read input tokens. Type: unsigned integer. - // * cached_write_input_tokens: the number of cached write input tokens. Type: unsigned integer. + // * cache_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer. // * output_tokens: the number of output tokens. Type: unsigned integer. // * total_tokens: the total number of tokens. Type: unsigned integer. // @@ -122,7 +122,7 @@ type LLMRequestCost struct { // // * "model == 'llama' ? input_tokens + output_token * 0.5 : total_tokens" // * "backend == 'foo.default' ? input_tokens + output_tokens : total_tokens" - // * "backend == 'bar.default' ? 
(input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cached_write_input_tokens * 1.25 + output_tokens : total_tokens" + // * "backend == 'bar.default' ? (input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cache_creation_input_tokens * 1.25 + output_tokens : total_tokens" // * "input_tokens + output_tokens + total_tokens" // * "input_tokens * output_tokens" // @@ -138,8 +138,8 @@ const ( LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" // LLMRequestCostTypeCachedInputToken is the cost type of the cached input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" - // LLMRequestCostTypeCachedWriteInputToken is the cost type of the cached input token. - LLMRequestCostTypeCachedWriteInputToken LLMRequestCostType = "CachedWriteInputToken" + // LLMRequestCostTypeCachedCreationInputToken is the cost type of the cache creation input token. + LLMRequestCostTypeCachedCreationInputToken LLMRequestCostType = "CachedCreationInputToken" // LLMRequestCostTypeOutputToken is the cost type of the output token. LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken" // LLMRequestCostTypeTotalToken is the cost type of the total token. 
diff --git a/examples/token_ratelimit/token_ratelimit.yaml b/examples/token_ratelimit/token_ratelimit.yaml index 2224ed724a..4ebe6217c7 100644 --- a/examples/token_ratelimit/token_ratelimit.yaml +++ b/examples/token_ratelimit/token_ratelimit.yaml @@ -51,8 +51,8 @@ spec: type: InputToken - metadataKey: llm_cached_input_token type: CachedInputToken - - metadataKey: llm_cached_write_input_token - type: CachedWriteInputToken + - metadataKey: llm_cache_creation_input_token + type: CachedCreationInputToken - metadataKey: llm_output_token type: OutputToken - metadataKey: llm_total_token diff --git a/internal/apischema/anthropic/anthropic.go b/internal/apischema/anthropic/anthropic.go index f65d102761..f02e706d92 100644 --- a/internal/apischema/anthropic/anthropic.go +++ b/internal/apischema/anthropic/anthropic.go @@ -437,7 +437,7 @@ const ( // so we use float64 to be able to unmarshal both 1234 and 1234.0 without errors. type Usage struct { // The number of input tokens used to create the cache entry. - CacheCreationInputTokens float64 `json:"cache_creation_input_tokens"` + CachedCreationInputTokens float64 `json:"cache_creation_input_tokens"` // The number of input tokens read from the cache. CacheReadInputTokens float64 `json:"cache_read_input_tokens"` // The number of input tokens which were used. diff --git a/internal/apischema/openai/openai.go b/internal/apischema/openai/openai.go index 0c74a3e244..1cb6268929 100644 --- a/internal/apischema/openai/openai.go +++ b/internal/apischema/openai/openai.go @@ -1383,7 +1383,7 @@ type PromptTokensDetails struct { // Cached tokens present in the prompt. CachedTokens int `json:"cached_tokens,omitzero"` // Tokens written to the cache. 
- CachedWriteTokens int `json:"cached_write_tokens,omitzero"` + CachedCreationTokens int `json:"cache_creation_input_tokens,omitzero"` } // ChatCompletionResponseChunk is described in the OpenAI API documentation: @@ -2539,7 +2539,7 @@ type ResponseUsageInputTokensDetails struct { CachedTokens int64 `json:"cached_tokens"` // The number of tokens that were written to the cache. - CachedWriteTokens int64 `json:"cached_write_tokens"` + CachedCreationTokens int64 `json:"cache_creation_input_tokens"` } // A detailed breakdown of the output tokens. @@ -2553,8 +2553,8 @@ type ResponseTokensDetails struct { // CachedTokens: Number of cached tokens. CachedTokens int `json:"cached_tokens,omitempty"` //nolint:tagliatelle //follow openai api - // CachedWriteTokens: number of tokens that were written to the cache. - CachedWriteTokens int64 `json:"cached_write_tokens"` //nolint:tagliatelle + // CachedCreationTokens: number of tokens that were written to the cache. + CachedCreationTokens int64 `json:"cache_creation_input_tokens"` //nolint:tagliatelle // ReasoningTokens: Number of reasoning tokens (for reasoning models). 
ReasoningTokens int `json:"reasoning_tokens,omitempty"` //nolint:tagliatelle //follow openai api diff --git a/internal/apischema/openai/openai_test.go b/internal/apischema/openai/openai_test.go index d9df99e6a4..c592a3b712 100644 --- a/internal/apischema/openai/openai_test.go +++ b/internal/apischema/openai/openai_test.go @@ -1742,30 +1742,30 @@ func TestPromptTokensDetails(t *testing.T) { { name: "with text tokens", details: PromptTokensDetails{ - TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, - CachedWriteTokens: 10, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CachedCreationTokens: 10, }, expected: `{ "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cached_write_tokens": 10 + "cache_creation_input_tokens": 10 }`, }, { name: "with zero text tokens omitted", details: PromptTokensDetails{ - TextTokens: 0, - AudioTokens: 8, - CachedTokens: 384, - CachedWriteTokens: 10, + TextTokens: 0, + AudioTokens: 8, + CachedTokens: 384, + CachedCreationTokens: 10, }, expected: `{ "audio_tokens": 8, "cached_tokens": 384, - "cached_write_tokens": 10 + "cache_creation_input_tokens": 10 }`, }, } @@ -1822,9 +1822,9 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - AudioTokens: 8, - CachedTokens: 384, - CachedWriteTokens: 13, + AudioTokens: 8, + CachedTokens: 384, + CachedCreationTokens: 13, }, }, expected: `{ @@ -1838,7 +1838,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "prompt_tokens_details": { "audio_tokens": 8, "cached_tokens": 384, - "cached_write_tokens": 13 + "cache_creation_input_tokens": 13 } }`, }, @@ -1856,10 +1856,10 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, - CachedWriteTokens: 21, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CachedCreationTokens: 21, }, }, expected: `{ @@ -1875,7 
+1875,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cached_write_tokens": 21 + "cache_creation_input_tokens": 21 } }`, }, diff --git a/internal/controller/gateway.go b/internal/controller/gateway.go index db952afb21..b2e577114f 100644 --- a/internal/controller/gateway.go +++ b/internal/controller/gateway.go @@ -404,8 +404,8 @@ func (c *GatewayController) reconcileFilterConfigSecret( fc.Type = filterapi.LLMRequestCostTypeInputToken case aigv1a1.LLMRequestCostTypeCachedInputToken: fc.Type = filterapi.LLMRequestCostTypeCachedInputToken - case aigv1a1.LLMRequestCostTypeCachedWriteInputToken: - fc.Type = filterapi.LLMRequestCostTypeCachedWriteInputToken + case aigv1a1.LLMRequestCostTypeCachedCreationInputToken: + fc.Type = filterapi.LLMRequestCostTypeCachedCreationInputToken case aigv1a1.LLMRequestCostTypeOutputToken: fc.Type = filterapi.LLMRequestCostTypeOutputToken case aigv1a1.LLMRequestCostTypeTotalToken: diff --git a/internal/controller/gateway_test.go b/internal/controller/gateway_test.go index 6c3be8df2f..fb26991883 100644 --- a/internal/controller/gateway_test.go +++ b/internal/controller/gateway_test.go @@ -197,7 +197,7 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { {MetadataKey: "bar", Type: aigv1a1.LLMRequestCostTypeOutputToken}, {MetadataKey: "baz", Type: aigv1a1.LLMRequestCostTypeTotalToken}, {MetadataKey: "qux", Type: aigv1a1.LLMRequestCostTypeCachedInputToken}, - {MetadataKey: "zoo", Type: aigv1a1.LLMRequestCostTypeCachedWriteInputToken}, + {MetadataKey: "zoo", Type: aigv1a1.LLMRequestCostTypeCachedCreationInputToken}, }, }, }, @@ -280,7 +280,7 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { require.Equal(t, filterapi.LLMRequestCostTypeOutputToken, fc.LLMRequestCosts[1].Type) require.Equal(t, filterapi.LLMRequestCostTypeTotalToken, fc.LLMRequestCosts[2].Type) require.Equal(t, filterapi.LLMRequestCostTypeCachedInputToken, 
fc.LLMRequestCosts[3].Type) - require.Equal(t, filterapi.LLMRequestCostTypeCachedWriteInputToken, fc.LLMRequestCosts[4].Type) + require.Equal(t, filterapi.LLMRequestCostTypeCachedCreationInputToken, fc.LLMRequestCosts[4].Type) require.Equal(t, filterapi.LLMRequestCostTypeCEL, fc.LLMRequestCosts[5].Type) require.Equal(t, `backend == 'foo.default' ? input_tokens + output_tokens : total_tokens`, fc.LLMRequestCosts[5].CEL) require.Len(t, fc.Models, 1) diff --git a/internal/extproc/mocks_test.go b/internal/extproc/mocks_test.go index a2997f14f9..6d5087eb9d 100644 --- a/internal/extproc/mocks_test.go +++ b/internal/extproc/mocks_test.go @@ -171,17 +171,17 @@ func (m *mockMetricsFactory) NewMetrics() metrics.Metrics { // mockMetrics implements [metrics.Metrics] for testing. type mockMetrics struct { - requestStart time.Time - originalModel string - requestModel string - responseModel string - backend string - requestSuccessCount int - requestErrorCount int - inputTokenCount int - cachedInputTokenCount int - cachedWriteInputTokenCount int - outputTokenCount int + requestStart time.Time + originalModel string + requestModel string + responseModel string + backend string + requestSuccessCount int + requestErrorCount int + inputTokenCount int + cachedInputTokenCount int + cachedCreationInputTokenCount int + outputTokenCount int // streamingOutputTokens tracks the cumulative output tokens recorded via RecordTokenLatency. 
streamingOutputTokens int timeToFirstToken float64 @@ -219,8 +219,8 @@ func (m *mockMetrics) RecordTokenUsage(_ context.Context, usage metrics.TokenUsa if cachedInput, ok := usage.CachedInputTokens(); ok { m.cachedInputTokenCount += int(cachedInput) } - if cachedWriteInput, ok := usage.CachedWriteInputTokens(); ok { - m.cachedWriteInputTokenCount += int(cachedWriteInput) + if cachedCreationInput, ok := usage.CachedCreationInputTokens(); ok { + m.cachedCreationInputTokenCount += int(cachedCreationInput) } if output, ok := usage.OutputTokens(); ok { m.outputTokenCount += int(output) @@ -285,7 +285,7 @@ func (m *mockMetrics) RequireRequestFailure(t *testing.T) { func (m *mockMetrics) RequireTokensRecorded(t *testing.T, expectedInput, expectedCachedInput, expectedWriteCachedInput, expectedOutput int) { require.Equal(t, expectedInput, m.inputTokenCount) require.Equal(t, expectedCachedInput, m.cachedInputTokenCount) - require.Equal(t, expectedWriteCachedInput, m.cachedWriteInputTokenCount) + require.Equal(t, expectedWriteCachedInput, m.cachedCreationInputTokenCount) require.Equal(t, expectedOutput, m.outputTokenCount) } diff --git a/internal/extproc/processor_impl.go b/internal/extproc/processor_impl.go index 93c35aa7cc..800b880393 100644 --- a/internal/extproc/processor_impl.go +++ b/internal/extproc/processor_impl.go @@ -533,7 +533,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU cost, _ = costs.InputTokens() case filterapi.LLMRequestCostTypeCachedInputToken: cost, _ = costs.CachedInputTokens() - case filterapi.LLMRequestCostTypeCachedWriteInputToken: - cost, _ = costs.CachedWriteInputTokens() + case filterapi.LLMRequestCostTypeCachedCreationInputToken: + cost, _ = costs.CachedCreationInputTokens() case filterapi.LLMRequestCostTypeOutputToken: cost, _ = costs.OutputTokens() diff --git a/internal/extproc/processor_impl_test.go b/internal/extproc/processor_impl_test.go index a590a38d24..e97d2fae45 100644 --- a/internal/extproc/processor_impl_test.go +++ 
b/internal/extproc/processor_impl_test.go @@ -259,7 +259,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.retUsedToken.SetOutputTokens(123) mt.retUsedToken.SetInputTokens(1) mt.retUsedToken.SetCachedInputTokens(1) - mt.retUsedToken.SetCachedWriteInputTokens(3) + mt.retUsedToken.SetCachedCreationInputTokens(3) celProgInt, err := llmcostcel.NewProgram("54321") require.NoError(t, err) @@ -275,7 +275,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeOutputToken, MetadataKey: "output_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeInputToken, MetadataKey: "input_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedInputToken, MetadataKey: "cached_input_token_usage"}}, - {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedWriteInputToken, MetadataKey: "cached_write_input_token_usage"}}, + {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedCreationInputToken, MetadataKey: "cache_creation_input_token_usage"}}, { CELProg: celProgInt, LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCEL, MetadataKey: "cel_int"}, @@ -312,7 +312,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, float64(1), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cached_input_token_usage"].GetNumberValue()) require.Equal(t, float64(3), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. - GetStructValue().Fields["cached_write_input_token_usage"].GetNumberValue()) + GetStructValue().Fields["cache_creation_input_token_usage"].GetNumberValue()) require.Equal(t, float64(54321), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. 
GetStructValue().Fields["cel_int"].GetNumberValue()) require.Equal(t, float64(9999), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. diff --git a/internal/filterapi/filterconfig.go b/internal/filterapi/filterconfig.go index f191d6eb01..b2f10de51f 100644 --- a/internal/filterapi/filterconfig.go +++ b/internal/filterapi/filterconfig.go @@ -81,8 +81,8 @@ const ( LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" // LLMRequestCostTypeCachedInputToken specifies that the request cost is calculated from the cached read input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" - // LLMRequestCostTypeCachedWriteInputToken specifies that the request cost is calculated from the cached write input token. - LLMRequestCostTypeCachedWriteInputToken LLMRequestCostType = "CachedWriteInputToken" + // LLMRequestCostTypeCachedCreationInputToken specifies that the request cost is calculated from the cache creation input token. + LLMRequestCostTypeCachedCreationInputToken LLMRequestCostType = "CachedCreationInputToken" // LLMRequestCostTypeTotalToken specifies that the request cost is calculated from the total token. LLMRequestCostTypeTotalToken LLMRequestCostType = "TotalToken" // LLMRequestCostTypeCEL specifies that the request cost is calculated from the CEL expression. 
diff --git a/internal/llmcostcel/cel.go b/internal/llmcostcel/cel.go index 2d2f4ad834..5bc0008d59 100644 --- a/internal/llmcostcel/cel.go +++ b/internal/llmcostcel/cel.go @@ -16,13 +16,13 @@ import ( ) const ( - celModelNameKey = "model" - celBackendKey = "backend" - celInputTokensKey = "input_tokens" - celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 - celCachedWriteInputTokensKey = "cached_write_input_tokens" // #nosec G101 - celOutputTokensKey = "output_tokens" - celTotalTokensKey = "total_tokens" + celModelNameKey = "model" + celBackendKey = "backend" + celInputTokensKey = "input_tokens" + celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 + celCachedCreationInputTokensKey = "cache_creation_input_tokens" // #nosec G101 + celOutputTokensKey = "output_tokens" + celTotalTokensKey = "total_tokens" ) var env *cel.Env @@ -34,7 +34,7 @@ func init() { cel.Variable(celBackendKey, cel.StringType), cel.Variable(celInputTokensKey, cel.UintType), cel.Variable(celCachedInputTokensKey, cel.UintType), - cel.Variable(celCachedWriteInputTokensKey, cel.UintType), + cel.Variable(celCachedCreationInputTokensKey, cel.UintType), cel.Variable(celOutputTokensKey, cel.UintType), cel.Variable(celTotalTokensKey, cel.UintType), ) @@ -64,15 +64,15 @@ func NewProgram(expr string) (prog cel.Program, err error) { } // EvaluateProgram evaluates the given CEL program with the given variables. 
-func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, cachedWriteInputTokens, outputTokens, totalTokens uint32) (uint64, error) { +func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, cachedCreationInputTokens, outputTokens, totalTokens uint32) (uint64, error) { out, _, err := prog.Eval(map[string]any{ - celModelNameKey: modelName, - celBackendKey: backend, - celInputTokensKey: inputTokens, - celCachedInputTokensKey: cachedInputTokens, - celCachedWriteInputTokensKey: cachedWriteInputTokens, - celOutputTokensKey: outputTokens, - celTotalTokensKey: totalTokens, + celModelNameKey: modelName, + celBackendKey: backend, + celInputTokensKey: inputTokens, + celCachedInputTokensKey: cachedInputTokens, + celCachedCreationInputTokensKey: cachedCreationInputTokens, + celOutputTokensKey: outputTokens, + celTotalTokensKey: totalTokens, }) if err != nil || out == nil { return 0, fmt.Errorf("failed to evaluate CEL expression: %w", err) diff --git a/internal/llmcostcel/cel_test.go b/internal/llmcostcel/cel_test.go index 79835b7354..cee9a259a5 100644 --- a/internal/llmcostcel/cel_test.go +++ b/internal/llmcostcel/cel_test.go @@ -26,7 +26,7 @@ func TestNewProgram(t *testing.T) { require.NoError(t, err) }) t.Run("variables", func(t *testing.T) { - prog, err := NewProgram("model == 'cool_model' ? (input_tokens - cached_input_tokens - cached_write_input_tokens) * output_tokens : total_tokens") + prog, err := NewProgram("model == 'cool_model' ? 
(input_tokens - cached_input_tokens - cache_creation_input_tokens) * output_tokens : total_tokens") require.NoError(t, err) v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 200, 100, 1, 2, 3) require.NoError(t, err) diff --git a/internal/metrics/genai.go b/internal/metrics/genai.go index 3c4a3dc62f..cb45ae6051 100644 --- a/internal/metrics/genai.go +++ b/internal/metrics/genai.go @@ -39,9 +39,9 @@ const ( // https://github.com/open-telemetry/semantic-conventions/issues/1959 // // However, the spec says "a custom value MAY be used.", so we can use it now. - genaiTokenTypeCachedInput = "cached_input" - genaiTokenTypeCachedWriteInput = "cached_write_input" - genaiErrorTypeFallback = "_OTHER" + genaiTokenTypeCachedInput = "cached_input" + genaiTokenTypeCachedCreationInput = "cache_creation_input" + genaiErrorTypeFallback = "_OTHER" ) // GenAIOperation represents the type of generative AI operation i.e. the endpoint being called. diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 6c473863bd..38fe032539 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -149,10 +149,10 @@ type TokenUsage struct { totalTokens uint32 // CachedInputTokens is the total number of tokens read from cache. cachedInputTokens uint32 - // CachedWriteInputTokens is the total number of tokens written to cache. - cachedWriteInputTokens uint32 + // CachedCreationInputTokens is the total number of tokens written to cache. + cachedCreationInputTokens uint32 - inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet, cachedWriteInputTokenSet bool + inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet, cachedCreationInputTokenSet bool } // InputTokens returns the number of input tokens and whether it was set. 
@@ -175,9 +175,9 @@ func (u *TokenUsage) CachedInputTokens() (uint32, bool) { return u.cachedInputTokens, u.cachedInputTokenSet } -// CachedWriteInputTokens returns the number of cached write input tokens and whether it was set. -func (u *TokenUsage) CachedWriteInputTokens() (uint32, bool) { - return u.cachedWriteInputTokens, u.cachedWriteInputTokenSet +// CachedCreationInputTokens returns the number of cache creation input tokens and whether it was set. +func (u *TokenUsage) CachedCreationInputTokens() (uint32, bool) { + return u.cachedCreationInputTokens, u.cachedCreationInputTokenSet } // SetInputTokens sets the number of input tokens and marks the field as set. @@ -204,10 +204,10 @@ func (u *TokenUsage) SetCachedInputTokens(tokens uint32) { u.cachedInputTokenSet = true } -// SetCachedWriteInputTokens sets the number of cached write input tokens and marks the field as set. -func (u *TokenUsage) SetCachedWriteInputTokens(tokens uint32) { - u.cachedWriteInputTokens = tokens - u.cachedWriteInputTokenSet = true +// SetCachedCreationInputTokens sets the number of cache creation input tokens and marks the field as set. +func (u *TokenUsage) SetCachedCreationInputTokens(tokens uint32) { + u.cachedCreationInputTokens = tokens + u.cachedCreationInputTokenSet = true } // AddInputTokens increments the recorded input tokens and marks the field as set. @@ -228,10 +228,10 @@ func (u *TokenUsage) AddCachedInputTokens(tokens uint32) { u.cachedInputTokens += tokens } -// AddCachedWriteInputTokens increments the recorded cached write input tokens and marks the field as set. -func (u *TokenUsage) AddCachedWriteInputTokens(tokens uint32) { - u.cachedWriteInputTokenSet = true - u.cachedWriteInputTokens += tokens +// AddCachedCreationInputTokens increments the recorded cache creation input tokens and marks the field as set. 
+func (u *TokenUsage) AddCachedCreationInputTokens(tokens uint32) { + u.cachedCreationInputTokenSet = true + u.cachedCreationInputTokens += tokens } // Override updates the TokenUsage fields with values from another TokenUsage instance. @@ -253,9 +253,9 @@ func (u *TokenUsage) Override(other TokenUsage) { u.cachedInputTokens = other.cachedInputTokens u.cachedInputTokenSet = true } - if other.cachedWriteInputTokenSet { - u.cachedWriteInputTokens = other.cachedWriteInputTokens - u.cachedWriteInputTokenSet = true + if other.cachedCreationInputTokenSet { + u.cachedCreationInputTokens = other.cachedCreationInputTokens + u.cachedCreationInputTokenSet = true } } @@ -265,15 +265,15 @@ func (u *TokenUsage) Override(other TokenUsage) { // // This function works for both streaming and non-streaming responses by accepting // the common usage fields that exist in all Anthropic usage structures. -func ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cacheCreationTokens int64) TokenUsage { +func ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cachedCreationTokens int64) TokenUsage { // Calculate total input tokens as per Anthropic API documentation - totalInputTokens := inputTokens + cacheCreationTokens + cacheReadTokens + totalInputTokens := inputTokens + cachedCreationTokens + cacheReadTokens var usage TokenUsage - usage.SetInputTokens(uint32(totalInputTokens)) //nolint:gosec - usage.SetOutputTokens(uint32(outputTokens)) //nolint:gosec - usage.SetTotalTokens(uint32(totalInputTokens + outputTokens)) //nolint:gosec - usage.SetCachedInputTokens(uint32(cacheReadTokens)) //nolint:gosec - usage.SetCachedWriteInputTokens(uint32(cacheCreationTokens)) //nolint:gosec + usage.SetInputTokens(uint32(totalInputTokens)) //nolint:gosec + usage.SetOutputTokens(uint32(outputTokens)) //nolint:gosec + usage.SetTotalTokens(uint32(totalInputTokens + outputTokens)) //nolint:gosec + usage.SetCachedInputTokens(uint32(cacheReadTokens)) //nolint:gosec 
+	usage.SetCachedCreationInputTokens(uint32(cachedCreationTokens)) //nolint:gosec
 	return usage
 }
diff --git a/internal/metrics/metrics_impl.go b/internal/metrics/metrics_impl.go
index e4a85e4d23..32dbfc38d3 100644
--- a/internal/metrics/metrics_impl.go
+++ b/internal/metrics/metrics_impl.go
@@ -148,7 +148,7 @@ func (b *metricsImpl) RecordTokenUsage(ctx context.Context, usage TokenUsage, re
 			metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput)),
 		)
 	}
-	if cachedWriteInputTokens, ok := usage.CachedWriteInputTokens(); ok {
-		b.metrics.tokenUsage.Record(ctx, float64(cachedWriteInputTokens),
+	if cachedCreationInputTokens, ok := usage.CachedCreationInputTokens(); ok {
+		b.metrics.tokenUsage.Record(ctx, float64(cachedCreationInputTokens),
 			metric.WithAttributeSet(attrs),
-			metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput)),
+			metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput)),
diff --git a/internal/metrics/metrics_impl_test.go b/internal/metrics/metrics_impl_test.go
index 8bf0ff2cce..a0fce8287e 100644
--- a/internal/metrics/metrics_impl_test.go
+++ b/internal/metrics/metrics_impl_test.go
@@ -71,10 +71,10 @@ func TestRecordTokenUsage(t *testing.T) {
 		attribute.Key(genaiAttributeResponseModel).String("test-model"),
 	}
 	// gen_ai.token.type values - https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-metrics/#common-attributes
-	inputAttrs            = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...)
-	outputAttrs           = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...)
-	cachedInputAttrs      = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...)
-	cachedWriteInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput))...)
+	inputAttrs               = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...)
+ outputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...) + cachedInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...) + cachedCreationInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput))...) ) pm.SetOriginalModel("test-model") @@ -82,7 +82,7 @@ func TestRecordTokenUsage(t *testing.T) { pm.SetResponseModel("test-model") pm.SetBackend(&filterapi.Backend{Schema: filterapi.VersionedAPISchema{Name: filterapi.APISchemaOpenAI}}) pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 10, cachedInputTokens: 8, cachedWriteInputTokens: 2, outputTokens: 5, + inputTokens: 10, cachedInputTokens: 8, cachedCreationInputTokens: 2, outputTokens: 5, inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, }, nil) @@ -94,7 +94,7 @@ func TestRecordTokenUsage(t *testing.T) { assert.Equal(t, uint64(1), count) assert.Equal(t, 8.0, sum) - count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedWriteInputAttrs) + count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedCreationInputAttrs) assert.Equal(t, uint64(1), count) assert.Equal(t, 2.0, sum) diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index 1232609a37..b37f37aa1a 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -211,7 +211,7 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer int64(u.InputTokens), int64(u.OutputTokens), int64(u.CacheReadInputTokens), - int64(u.CacheCreationInputTokens), + int64(u.CachedCreationInputTokens), ) input, _ := cost.InputTokens() cacheRead, _ := cost.CachedInputTokens() diff --git a/internal/tracing/openinference/openai/response_attrs.go 
b/internal/tracing/openinference/openai/response_attrs.go index 178c22f0da..5cd3561401 100644 --- a/internal/tracing/openinference/openai/response_attrs.go +++ b/internal/tracing/openinference/openai/response_attrs.go @@ -58,7 +58,7 @@ func buildResponseAttributes(resp *openai.ChatCompletionResponse, config *openin attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptAudio, td.AudioTokens), attribute.Int(openinference.LLMTokenCountPromptCacheHit, td.CachedTokens), - attribute.Int(openinference.LLMTokenCountPromptCacheWrite, td.CachedWriteTokens), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, td.CachedCreationTokens), ) } } @@ -194,8 +194,8 @@ func buildResponsesResponseAttributes(resp *openai.Response, _ *openinference.Tr if resp.Usage.InputTokensDetails.CachedTokens > 0 { attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheHit, int(resp.Usage.InputTokensDetails.CachedTokens))) } - if resp.Usage.InputTokensDetails.CachedWriteTokens > 0 { - attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(resp.Usage.InputTokensDetails.CachedWriteTokens))) + if resp.Usage.InputTokensDetails.CachedCreationTokens > 0 { + attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(resp.Usage.InputTokensDetails.CachedCreationTokens))) } } diff --git a/internal/translator/anthropic_anthropic.go b/internal/translator/anthropic_anthropic.go index bbbde960cc..60f12cf6af 100644 --- a/internal/translator/anthropic_anthropic.go +++ b/internal/translator/anthropic_anthropic.go @@ -103,7 +103,7 @@ func (a *anthropicToAnthropicTranslator) ResponseBody(_ map[string]string, body int64(usage.InputTokens), int64(usage.OutputTokens), int64(usage.CacheReadInputTokens), - int64(usage.CacheCreationInputTokens), + int64(usage.CachedCreationInputTokens), ) if span != nil { span.RecordResponse(anthropicResp) @@ -148,7 +148,7 @@ func (a *anthropicToAnthropicTranslator) 
extractUsageFromBufferEvent(s tracing.M int64(u.InputTokens), int64(u.OutputTokens), int64(u.CacheReadInputTokens), - int64(u.CacheCreationInputTokens), + int64(u.CachedCreationInputTokens), ) // Override with message_start usage (contains input tokens and initial state) a.streamingTokenUsage.Override(messageStartUsage) @@ -181,8 +181,8 @@ func (a *anthropicToAnthropicTranslator) updateTotalTokens() { if _, cachedSet := a.streamingTokenUsage.CachedInputTokens(); !cachedSet { a.streamingTokenUsage.SetCachedInputTokens(0) } - if _, cachedSet := a.streamingTokenUsage.CachedWriteInputTokens(); !cachedSet { - a.streamingTokenUsage.SetCachedWriteInputTokens(0) + if _, cachedSet := a.streamingTokenUsage.CachedCreationInputTokens(); !cachedSet { + a.streamingTokenUsage.SetCachedCreationInputTokens(0) } } diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index 875265d94d..d788a0f2bd 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -570,7 +570,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingEdgeCases(t *te } } -func tokenUsageFrom(in, cachedInput, cachedWriteInput, out, total int32) metrics.TokenUsage { +func tokenUsageFrom(in, cachedInput, cachedCreationInput, out, total int32) metrics.TokenUsage { var usage metrics.TokenUsage if in >= 0 { usage.SetInputTokens(uint32(in)) @@ -578,8 +578,8 @@ func tokenUsageFrom(in, cachedInput, cachedWriteInput, out, total int32) metrics if cachedInput >= 0 { usage.SetCachedInputTokens(uint32(cachedInput)) } - if cachedWriteInput >= 0 { - usage.SetCachedWriteInputTokens(uint32(cachedWriteInput)) + if cachedCreationInput >= 0 { + usage.SetCachedCreationInputTokens(uint32(cachedCreationInput)) } if out >= 0 { usage.SetOutputTokens(uint32(out)) @@ -638,7 +638,7 @@ data: {"type": "message_stop"} outputTokens, outputSet := tokenUsage.OutputTokens() totalTokens, totalSet := 
tokenUsage.TotalTokens() cachedTokens, cachedSet := tokenUsage.CachedInputTokens() - cachedWriteTokens, cachedWriteSet := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, cachedWriteSet := tokenUsage.CachedWriteInputTokens() // Assertions assert.True(t, inputSet, "Input tokens should be set") @@ -653,8 +653,8 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedWriteSet, "Cached write tokens should be set") - assert.Equal(t, uint32(1), cachedWriteTokens, "No cached write tokens in this scenario") + assert.True(t, cachedWriteSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cachedWriteTokens, "No cache creation tokens in this scenario") _, _, tokenUsage, _, err = translator.ResponseBody(nil, strings.NewReader(contentBlockStartChunk), false, nil) require.NoError(t, err) @@ -686,6 +686,6 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedWriteSet, "Cached write tokens should be set") - assert.Equal(t, uint32(1), cachedWriteTokens, "No cached write tokens in this scenario") + assert.True(t, cachedWriteSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cachedWriteTokens, "No cache creation tokens in this scenario") } diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index bc1a8a063c..016a355f04 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -16,88 +16,88 @@ import ( func TestExtractLLMTokenUsage(t *testing.T) { tests := []struct { - name string - inputTokens int64 - outputTokens int64 - cacheReadTokens int64 - cacheCreationTokens int64 - expectedInputTokens uint32 - expectedOutputTokens uint32 - expectedTotalTokens uint32 - 
expectedCachedTokens uint32 - expectedCachedWriteTokens uint32 + name string + inputTokens int64 + outputTokens int64 + cacheReadTokens int64 + cachedCreationTokens int64 + expectedInputTokens uint32 + expectedOutputTokens uint32 + expectedTotalTokens uint32 + expectedCachedTokens uint32 + expectedCachedCreationTokens uint32 }{ { - name: "basic usage without cache", - inputTokens: 100, - outputTokens: 50, - cacheReadTokens: 0, - cacheCreationTokens: 0, - expectedInputTokens: 100, - expectedOutputTokens: 50, - expectedTotalTokens: 150, - expectedCachedTokens: 0, - expectedCachedWriteTokens: 0, + name: "basic usage without cache", + inputTokens: 100, + outputTokens: 50, + cacheReadTokens: 0, + cachedCreationTokens: 0, + expectedInputTokens: 100, + expectedOutputTokens: 50, + expectedTotalTokens: 150, + expectedCachedTokens: 0, + expectedCachedCreationTokens: 0, }, { - name: "usage with cache read tokens", - inputTokens: 80, - outputTokens: 30, - cacheReadTokens: 20, - cacheCreationTokens: 0, - expectedInputTokens: 100, // 80 + 0 + 20 - expectedOutputTokens: 30, - expectedTotalTokens: 130, // 100 + 30 - expectedCachedTokens: 20, // 20 - expectedCachedWriteTokens: 0, + name: "usage with cache read tokens", + inputTokens: 80, + outputTokens: 30, + cacheReadTokens: 20, + cachedCreationTokens: 0, + expectedInputTokens: 100, // 80 + 0 + 20 + expectedOutputTokens: 30, + expectedTotalTokens: 130, // 100 + 30 + expectedCachedTokens: 20, // 20 + expectedCachedCreationTokens: 0, }, { - name: "usage with cache creation tokens", - inputTokens: 60, - outputTokens: 40, - cacheReadTokens: 0, - cacheCreationTokens: 15, - expectedInputTokens: 75, // 60 + 15 + 0 - expectedOutputTokens: 40, - expectedTotalTokens: 115, // 75 + 40 - expectedCachedTokens: 0, // 0 - expectedCachedWriteTokens: 15, // 15 + name: "usage with cache creation tokens", + inputTokens: 60, + outputTokens: 40, + cacheReadTokens: 0, + cachedCreationTokens: 15, + expectedInputTokens: 75, // 60 + 15 + 0 + 
expectedOutputTokens: 40, + expectedTotalTokens: 115, // 75 + 40 + expectedCachedTokens: 0, // 0 + expectedCachedCreationTokens: 15, // 15 }, { - name: "usage with both cache types", - inputTokens: 70, - outputTokens: 25, - cacheReadTokens: 10, - cacheCreationTokens: 5, - expectedInputTokens: 85, // 70 + 5 + 10 - expectedOutputTokens: 25, - expectedTotalTokens: 110, // 85 + 25 - expectedCachedTokens: 10, // 10 - expectedCachedWriteTokens: 5, // 5 + name: "usage with both cache types", + inputTokens: 70, + outputTokens: 25, + cacheReadTokens: 10, + cachedCreationTokens: 5, + expectedInputTokens: 85, // 70 + 5 + 10 + expectedOutputTokens: 25, + expectedTotalTokens: 110, // 85 + 25 + expectedCachedTokens: 10, // 10 + expectedCachedCreationTokens: 5, // 5 }, { - name: "zero values", - inputTokens: 0, - outputTokens: 0, - cacheReadTokens: 0, - cacheCreationTokens: 0, - expectedInputTokens: 0, - expectedOutputTokens: 0, - expectedTotalTokens: 0, - expectedCachedTokens: 0, - expectedCachedWriteTokens: 0, + name: "zero values", + inputTokens: 0, + outputTokens: 0, + cacheReadTokens: 0, + cachedCreationTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 0, + expectedTotalTokens: 0, + expectedCachedTokens: 0, + expectedCachedCreationTokens: 0, }, { - name: "large values", - inputTokens: 100000, - outputTokens: 50000, - cacheReadTokens: 25000, - cacheCreationTokens: 15000, - expectedInputTokens: 140000, // 100000 + 15000 + 25000 - expectedOutputTokens: 50000, - expectedTotalTokens: 190000, // 140000 + 50000 - expectedCachedTokens: 25000, // 25000 - expectedCachedWriteTokens: 15000, + name: "large values", + inputTokens: 100000, + outputTokens: 50000, + cacheReadTokens: 25000, + cachedCreationTokens: 15000, + expectedInputTokens: 140000, // 100000 + 15000 + 25000 + expectedOutputTokens: 50000, + expectedTotalTokens: 190000, // 140000 + 50000 + expectedCachedTokens: 25000, // 25000 + expectedCachedCreationTokens: 15000, }, } @@ -107,15 +107,15 @@ func 
TestExtractLLMTokenUsage(t *testing.T) { tt.inputTokens, tt.outputTokens, tt.cacheReadTokens, - tt.cacheCreationTokens, + tt.cachedCreationTokens, ) expected := tokenUsageFrom( - int32(tt.expectedInputTokens), // nolint:gosec - int32(tt.expectedCachedTokens), // nolint:gosec - int32(tt.expectedCachedWriteTokens), // nolint:gosec - int32(tt.expectedOutputTokens), // nolint:gosec - int32(tt.expectedTotalTokens), // nolint:gosec + int32(tt.expectedInputTokens), // nolint:gosec + int32(tt.expectedCachedTokens), // nolint:gosec + int32(tt.expectedCachedCreationTokens), // nolint:gosec + int32(tt.expectedOutputTokens), // nolint:gosec + int32(tt.expectedTotalTokens), // nolint:gosec ) assert.Equal(t, expected, result) }) @@ -124,55 +124,55 @@ func TestExtractLLMTokenUsage(t *testing.T) { func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.Usage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 - expectedCachedWriteTokens uint32 + name string + usage anthropic.Usage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + expectedCachedTokens uint32 + expectedCachedCreationTokens uint32 }{ { name: "non-streaming response without cache", usage: anthropic.Usage{ - InputTokens: 150, - OutputTokens: 75, - CacheReadInputTokens: 0, - CacheCreationInputTokens: 0, + InputTokens: 150, + OutputTokens: 75, + CacheReadInputTokens: 0, + CachedCreationInputTokens: 0, }, - expectedInputTokens: 150, - expectedOutputTokens: 75, - expectedTotalTokens: 225, - expectedCachedTokens: 0, - expectedCachedWriteTokens: 0, + expectedInputTokens: 150, + expectedOutputTokens: 75, + expectedTotalTokens: 225, + expectedCachedTokens: 0, + expectedCachedCreationTokens: 0, }, { name: "non-streaming response with cache read", usage: anthropic.Usage{ - InputTokens: 100, - OutputTokens: 50, - CacheReadInputTokens: 25, - CacheCreationInputTokens: 0, + 
InputTokens: 100, + OutputTokens: 50, + CacheReadInputTokens: 25, + CachedCreationInputTokens: 0, }, - expectedInputTokens: 125, // 100 + 0 + 25 - expectedOutputTokens: 50, - expectedTotalTokens: 175, // 125 + 50 - expectedCachedTokens: 25, // 25 - expectedCachedWriteTokens: 0, // 0 + expectedInputTokens: 125, // 100 + 0 + 25 + expectedOutputTokens: 50, + expectedTotalTokens: 175, // 125 + 50 + expectedCachedTokens: 25, // 25 + expectedCachedCreationTokens: 0, // 0 }, { name: "non-streaming response with both cache types", usage: anthropic.Usage{ - InputTokens: 90, - OutputTokens: 60, - CacheReadInputTokens: 15, - CacheCreationInputTokens: 10, + InputTokens: 90, + OutputTokens: 60, + CacheReadInputTokens: 15, + CachedCreationInputTokens: 10, }, - expectedInputTokens: 115, // 90 + 10 + 15 - expectedOutputTokens: 60, - expectedTotalTokens: 175, // 115 + 60 - expectedCachedTokens: 25, // 15 - expectedCachedWriteTokens: 10, // 10 + expectedInputTokens: 115, // 90 + 10 + 15 + expectedOutputTokens: 60, + expectedTotalTokens: 175, // 115 + 60 + expectedCachedTokens: 25, // 15 + expectedCachedCreationTokens: 10, // 10 }, } @@ -181,7 +181,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { result := metrics.ExtractTokenUsageFromAnthropic(tt.usage.InputTokens, tt.usage.OutputTokens, tt.usage.CacheReadInputTokens, - tt.usage.CacheCreationInputTokens, + tt.usage.CachedCreationInputTokens, ) expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) @@ -202,10 +202,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta event with final totals", usage: anthropic.MessageDeltaUsage{ - InputTokens: 250, - OutputTokens: 120, - CacheReadInputTokens: 30, - CacheCreationInputTokens: 0, + InputTokens: 250, + OutputTokens: 120, + CacheReadInputTokens: 30, + CachedCreationInputTokens: 0, }, 
expectedInputTokens: 280, // 250 + 0 + 30 expectedOutputTokens: 120, @@ -216,10 +216,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta event with only output tokens", usage: anthropic.MessageDeltaUsage{ - InputTokens: 0, - OutputTokens: 85, - CacheReadInputTokens: 0, - CacheCreationInputTokens: 0, + InputTokens: 0, + OutputTokens: 85, + CacheReadInputTokens: 0, + CachedCreationInputTokens: 0, }, expectedInputTokens: 0, expectedOutputTokens: 85, @@ -230,10 +230,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta with cache creation tokens", usage: anthropic.MessageDeltaUsage{ - InputTokens: 150, - OutputTokens: 75, - CacheReadInputTokens: 10, - CacheCreationInputTokens: 5, + InputTokens: 150, + OutputTokens: 75, + CacheReadInputTokens: 10, + CachedCreationInputTokens: 5, }, expectedInputTokens: 165, // 150 + 5 + 10 expectedOutputTokens: 75, @@ -248,7 +248,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { result := metrics.ExtractTokenUsageFromAnthropic(tt.usage.InputTokens, tt.usage.OutputTokens, tt.usage.CacheReadInputTokens, - tt.usage.CacheCreationInputTokens, + tt.usage.CachedCreationInputTokens, ) expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) @@ -285,14 +285,14 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { // cache_creation_input_tokens, and cache_read_input_tokens". 
 	inputTokens := int64(100)
-	cacheCreationTokens := int64(20)
+	cachedCreationTokens := int64(20)
 	cacheReadTokens := int64(30)
 	outputTokens := int64(50)
-	result := metrics.ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cacheCreationTokens)
+	result := metrics.ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cachedCreationTokens)
 	// Total input should be sum of all input token types.
-	expectedTotalInputInt := inputTokens + cacheCreationTokens + cacheReadTokens
+	expectedTotalInputInt := inputTokens + cachedCreationTokens + cacheReadTokens
 	expectedTotalInput := uint32(expectedTotalInputInt) // #nosec G115 - test values are small and safe
 	inputTokensVal, ok := result.InputTokens()
 	assert.True(t, ok)
@@ -306,7 +306,7 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) {
-	cachedWriteTokens, ok := result.CachedWriteInputTokens()
+	cachedWriteTokens, ok := result.CachedCreationInputTokens()
 	assert.True(t, ok)
-	assert.Equal(t, cacheCreationTokens, cachedWriteTokens,
+	assert.Equal(t, cachedCreationTokens, cachedWriteTokens,
 		"CachedWriteInputTokens should be cache_creation_input_tokens")
 	// Total tokens should be input + output.
diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index 56b4870151..8318fe88aa 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -708,7 +708,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string tokenUsage.SetCachedInputTokens(uint32(*usage.CacheReadInputTokens)) //nolint:gosec } if usage.CacheWriteInputTokens != nil { - tokenUsage.SetCachedWriteInputTokens(uint32(*usage.CacheWriteInputTokens)) + tokenUsage.SetCachedCreationInputTokens(uint32(*usage.CacheWriteInputTokens)) } } oaiEvent, ok := o.convertEvent(event) @@ -760,8 +760,8 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string openAIResp.Usage.PromptTokensDetails.CachedTokens = *bedrockResp.Usage.CacheReadInputTokens } if bedrockResp.Usage.CacheWriteInputTokens != nil { - tokenUsage.SetCachedWriteInputTokens(uint32(*bedrockResp.Usage.CacheWriteInputTokens)) //nolint:gosec - openAIResp.Usage.PromptTokensDetails.CachedWriteTokens = *bedrockResp.Usage.CacheWriteInputTokens + tokenUsage.SetCachedCreationInputTokens(uint32(*bedrockResp.Usage.CacheWriteInputTokens)) //nolint:gosec + openAIResp.Usage.PromptTokensDetails.CachedCreationTokens = *bedrockResp.Usage.CacheWriteInputTokens } } @@ -862,7 +862,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) convertEvent(event *awsbe chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens } if event.Usage.CacheWriteInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedWriteTokens = *event.Usage.CacheWriteInputTokens + chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens } // messageStart event. 
case awsbedrock.ConverseStreamEventTypeMessageStart.String(): diff --git a/internal/translator/openai_awsbedrock_test.go b/internal/translator/openai_awsbedrock_test.go index e780b8a34c..fa4a17345e 100644 --- a/internal/translator/openai_awsbedrock_test.go +++ b/internal/translator/openai_awsbedrock_test.go @@ -1474,8 +1474,8 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) PromptTokens: 10, CompletionTokens: 20, PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: 5, - CachedWriteTokens: 7, + CachedTokens: 5, + CachedCreationTokens: 7, }, }, Choices: []openai.ChatCompletionResponseChoice{ diff --git a/internal/translator/openai_completions.go b/internal/translator/openai_completions.go index 3754dc2bff..7f5008d8dd 100644 --- a/internal/translator/openai_completions.go +++ b/internal/translator/openai_completions.go @@ -171,8 +171,8 @@ func (o *openAIToOpenAITranslatorV1Completion) extractUsageFromBufferEvent(span tokenUsage.SetOutputTokens(uint32(usage.CompletionTokens)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(usage.TotalTokens)) //nolint:gosec if usage.PromptTokensDetails != nil { - tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec - tokenUsage.SetCachedWriteInputTokens(uint32(usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec + tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCachedCreationInputTokens(uint32(usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec } // Do not mark buffering done; keep scanning to return the latest usage in this batch. 
} diff --git a/internal/translator/openai_gcpanthropic.go b/internal/translator/openai_gcpanthropic.go index 7e11451f31..6880a6f3dd 100644 --- a/internal/translator/openai_gcpanthropic.go +++ b/internal/translator/openai_gcpanthropic.go @@ -833,13 +833,13 @@ func (o *openAIToGCPAnthropicTranslatorV1ChatCompletion) ResponseBody(_ map[stri usage.InputTokens, usage.OutputTokens, usage.CacheReadInputTokens, - usage.CacheCreationInputTokens, + usage.CachedCreationInputTokens, ) inputTokens, _ := tokenUsage.InputTokens() outputTokens, _ := tokenUsage.OutputTokens() totalTokens, _ := tokenUsage.TotalTokens() cachedTokens, _ := tokenUsage.CachedInputTokens() - cacheWriteTokens, _ := tokenUsage.CachedWriteInputTokens() + cacheWriteTokens, _ := tokenUsage.CachedCreationInputTokens() openAIResp.Usage = openai.Usage{ CompletionTokens: int(outputTokens), PromptTokens: int(inputTokens), diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 191b436545..6d32041909 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -110,7 +110,7 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t p.tokenUsage.SetTotalTokens(inputTokens + outputTokens) totalTokens, _ := p.tokenUsage.TotalTokens() cachedTokens, _ := p.tokenUsage.CachedInputTokens() - cachedWriteTokens, _ := p.tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, _ := p.tokenUsage.CachedCreationInputTokens() finalChunk := openai.ChatCompletionResponseChunk{ ID: p.activeMessageID, Created: p.created, @@ -121,8 +121,8 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t CompletionTokens: int(outputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), - CachedWriteTokens: int(cachedWriteTokens), + CachedTokens: int(cachedTokens), + CachedCreationTokens: 
int(cachedCreationTokens),
 					},
 				},
 				Model: p.requestModel,
@@ -203,7 +203,7 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat
 			u.InputTokens,
 			u.OutputTokens,
 			u.CacheReadInputTokens,
-			u.CacheCreationInputTokens,
+			u.CachedCreationInputTokens,
 		)
 		// For message_start, we store the initial usage but don't add to the accumulated
 		// The message_delta event will contain the final totals
@@ -213,7 +213,7 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat
 		if cached, ok := usage.CachedInputTokens(); ok {
 			p.tokenUsage.SetCachedInputTokens(cached)
 		}
-		if cachedWrite, ok := usage.CachedWriteInputTokens(); ok {
-			p.tokenUsage.SetCachedWriteInputTokens(cachedWrite)
+		if cachedCreation, ok := usage.CachedCreationInputTokens(); ok {
+			p.tokenUsage.SetCachedCreationInputTokens(cachedCreation)
 		}
@@ -285,7 +285,7 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat
 			u.InputTokens,
 			u.OutputTokens,
 			u.CacheReadInputTokens,
-			u.CacheCreationInputTokens,
+			u.CachedCreationInputTokens,
 		)
 		// For message_delta, accumulate the incremental output tokens
 		if output, ok := usage.OutputTokens(); ok {
diff --git a/internal/translator/openai_gcpvertexai.go b/internal/translator/openai_gcpvertexai.go
index 232ab836cb..74e99c5741 100644
--- a/internal/translator/openai_gcpvertexai.go
+++ b/internal/translator/openai_gcpvertexai.go
@@ -170,8 +170,8 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) ResponseBody(_ map[strin
 		tokenUsage.SetOutputTokens(uint32(gcpResp.UsageMetadata.CandidatesTokenCount))         //nolint:gosec
 		tokenUsage.SetTotalTokens(uint32(gcpResp.UsageMetadata.TotalTokenCount))               //nolint:gosec
 		tokenUsage.SetCachedInputTokens(uint32(gcpResp.UsageMetadata.CachedContentTokenCount)) //nolint:gosec
-		// Gemini does not return cached write input tokens, set to 0.
-		tokenUsage.SetCachedWriteInputTokens(0)
+		// Gemini does not return cache creation input tokens, set to 0.
+		tokenUsage.SetCachedCreationInputTokens(0)
 	}
 	if span != nil {
diff --git a/internal/translator/openai_openai.go b/internal/translator/openai_openai.go
index 9ad3084b3b..65b33a34cd 100644
--- a/internal/translator/openai_openai.go
+++ b/internal/translator/openai_openai.go
@@ -141,8 +141,8 @@ func (o *openAIToOpenAITranslatorV1ChatCompletion) ResponseBody(_ map[string]str
 	tokenUsage.SetOutputTokens(uint32(resp.Usage.CompletionTokens)) //nolint:gosec
 	tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens))       //nolint:gosec
 	if resp.Usage.PromptTokensDetails != nil {
-		tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens))           //nolint:gosec
-		tokenUsage.SetCachedWriteInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec
+		tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens))                 //nolint:gosec
+		tokenUsage.SetCachedCreationInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec
 	}
 	// Fallback to request model for test or non-compliant OpenAI backends
 	responseModel = cmp.Or(resp.Model, o.requestModel)
diff --git a/internal/translator/openai_responses.go b/internal/translator/openai_responses.go
index 9550d0ce4f..7ed57bcb11 100644
--- a/internal/translator/openai_responses.go
+++ b/internal/translator/openai_responses.go
@@ -179,7 +179,7 @@ func (o *openAIToOpenAITranslatorV1Responses) extractUsageFromBufferEvent(span t
 		tokenUsage.SetOutputTokens(uint32(respComplEvent.Response.Usage.OutputTokens))                        // #nosec G115
 		tokenUsage.SetTotalTokens(uint32(respComplEvent.Response.Usage.TotalTokens))                          // #nosec G115
 		tokenUsage.SetCachedInputTokens(uint32(respComplEvent.Response.Usage.InputTokensDetails.CachedTokens)) // #nosec G115
-		// Openai does not support cached write response.
-		tokenUsage.SetCachedWriteInputTokens(uint32(0)) // #nosec G115
+		// Openai does not support cache creation response.
+		tokenUsage.SetCachedCreationInputTokens(uint32(0)) // #nosec G115
 	}
 	// Record streaming chunk to span if tracing is enabled.
diff --git a/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml b/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml index 4407e6ae18..38b2851a85 100644 --- a/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml +++ b/tests/crdcel/testdata/aigatewayroutes/llmcosts.yaml @@ -32,7 +32,7 @@ spec: - metadataKey: llm_input_cached_token type: CachedInputToken - metadataKey: llm_write_input_cached_token - type: CachedWriteInputToken + type: CachedCreationInputToken - metadataKey: llm_output_token type: OutputToken - metadataKey: llm_total_token From 72408dddbea5b7e6e00350263944a6e4515ef670 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:11:29 -0500 Subject: [PATCH 07/20] missed a few Signed-off-by: Aaron Choo --- internal/extproc/processor_impl.go | 6 +-- internal/extproc/processor_impl_test.go | 6 +-- internal/metrics/metrics_impl.go | 6 +-- internal/metrics/metrics_impl_test.go | 8 +-- .../openinference/anthropic/messages.go | 2 +- .../openinference/openai/responses_test.go | 6 +-- .../tracing/openinference/openinference.go | 4 +- .../translator/anthropic_gcpanthropic_test.go | 14 ++--- internal/translator/anthropic_usage_test.go | 54 +++++++++---------- internal/translator/openai_awsbedrock_test.go | 2 +- internal/translator/openai_gcpanthropic.go | 4 +- .../translator/openai_gcpanthropic_stream.go | 8 +-- internal/translator/openai_responses.go | 12 ++--- internal/translator/openai_responses_test.go | 24 ++++----- 14 files changed, 78 insertions(+), 78 deletions(-) diff --git a/internal/extproc/processor_impl.go b/internal/extproc/processor_impl.go index 800b880393..729268644c 100644 --- a/internal/extproc/processor_impl.go +++ b/internal/extproc/processor_impl.go @@ -534,7 +534,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU case filterapi.LLMRequestCostTypeCachedInputToken: cost, _ = costs.CachedInputTokens() case filterapi.LLMRequestCostTypeCachedCreationInputToken: - cost, _ = costs.CachedWriteInputTokens() 
+ cost, _ = costs.CachedCreationInputTokens() case filterapi.LLMRequestCostTypeOutputToken: cost, _ = costs.OutputTokens() case filterapi.LLMRequestCostTypeTotalToken: @@ -542,7 +542,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU case filterapi.LLMRequestCostTypeCEL: in, _ := costs.InputTokens() cachedIn, _ := costs.CachedInputTokens() - cachedWrite, _ := costs.CachedWriteInputTokens() + cachedCreation, _ := costs.CachedCreationInputTokens() out, _ := costs.OutputTokens() total, _ := costs.TotalTokens() costU64, err := llmcostcel.EvaluateProgram( @@ -551,7 +551,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU backendName, in, cachedIn, - cachedWrite, + cachedCreation, out, total, ) diff --git a/internal/extproc/processor_impl_test.go b/internal/extproc/processor_impl_test.go index e97d2fae45..7b498f12af 100644 --- a/internal/extproc/processor_impl_test.go +++ b/internal/extproc/processor_impl_test.go @@ -275,7 +275,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeOutputToken, MetadataKey: "output_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeInputToken, MetadataKey: "input_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedInputToken, MetadataKey: "cached_input_token_usage"}}, - {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedWriteInputToken, MetadataKey: "cache_creation_input_token_usage"}}, + {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedCreationInputToken, MetadataKey: "cache_creation_input_token_usage"}}, { CELProg: celProgInt, LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCEL, MetadataKey: "cel_int"}, @@ -375,7 +375,7 @@ func 
Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.expResponseBody = final mt.retUsedToken.SetInputTokens(5) mt.retUsedToken.SetCachedInputTokens(3) - mt.retUsedToken.SetCachedWriteInputTokens(21) + mt.retUsedToken.SetCachedCreationInputTokens(21) mt.retUsedToken.SetOutputTokens(138) mt.retUsedToken.SetTotalTokens(143) _, err = p.ProcessResponseBody(t.Context(), final) @@ -385,7 +385,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, 138, mm.outputTokenCount) require.Equal(t, 138, mm.streamingOutputTokens) // accumulated output tokens from stream require.Equal(t, 3, mm.cachedInputTokenCount) - require.Equal(t, 21, mm.cachedWriteInputTokenCount) + require.Equal(t, 21, mm.cachedCreationInputTokenCount) }) } diff --git a/internal/metrics/metrics_impl.go b/internal/metrics/metrics_impl.go index 32dbfc38d3..8f13a50104 100644 --- a/internal/metrics/metrics_impl.go +++ b/internal/metrics/metrics_impl.go @@ -148,10 +148,10 @@ func (b *metricsImpl) RecordTokenUsage(ctx context.Context, usage TokenUsage, re metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput)), ) } - if cachedCreationInputTokens, ok := usage.CachedWriteInputTokens(); ok { - b.metrics.tokenUsage.Record(ctx, float64(cachedWriteInputTokens), + if cachedCreationInputTokens, ok := usage.CachedCreationInputTokens(); ok { + b.metrics.tokenUsage.Record(ctx, float64(cachedCreationInputTokens), metric.WithAttributeSet(attrs), - metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput)), + metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput)), ) } if outputTokens, ok := usage.OutputTokens(); ok { diff --git a/internal/metrics/metrics_impl_test.go b/internal/metrics/metrics_impl_test.go index a0fce8287e..581815e1bc 100644 --- a/internal/metrics/metrics_impl_test.go +++ b/internal/metrics/metrics_impl_test.go 
@@ -300,7 +300,7 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { pm.SetRequestModel("req-model") pm.SetResponseModel("res-model") pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 2, cachedInputTokens: 1, cachedWriteInputTokens: 6, outputTokens: 3, + inputTokens: 2, cachedInputTokens: 1, cachedCreationInputTokens: 6, outputTokens: 3, inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, }, nil) @@ -328,15 +328,15 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { assert.Equal(t, uint64(1), count) assert.Equal(t, 1.0, sum) - cachedWriteInputAttrs := attribute.NewSet( + cachedCreationInputAttrs := attribute.NewSet( attribute.Key(genaiAttributeOperationName).String(string(GenAIOperationCompletion)), attribute.Key(genaiAttributeProviderName).String(genaiProviderOpenAI), attribute.Key(genaiAttributeOriginalModel).String("orig-model"), attribute.Key(genaiAttributeRequestModel).String("req-model"), attribute.Key(genaiAttributeResponseModel).String("res-model"), - attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedWriteInput), + attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput), ) - count, sum = getHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedWriteInputAttrs) + count, sum = getHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedCreationInputAttrs) assert.Equal(t, uint64(1), count) assert.Equal(t, 6.0, sum) diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index b37f37aa1a..23a52a618c 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -215,7 +215,7 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer ) input, _ := cost.InputTokens() cacheRead, _ := cost.CachedInputTokens() - cacheCreation, _ := cost.CachedWriteInputTokens() + cacheCreation, _ := 
cost.CachedCreationInputTokens()
 	output, _ := cost.OutputTokens()
 	total, _ := cost.TotalTokens()
diff --git a/internal/tracing/openinference/openai/responses_test.go b/internal/tracing/openinference/openai/responses_test.go
index 76fbdc9eb5..3191c6e82d 100644
--- a/internal/tracing/openinference/openai/responses_test.go
+++ b/internal/tracing/openinference/openai/responses_test.go
@@ -77,8 +77,8 @@ var (
 		Usage: &openai.ResponseUsage{
 			InputTokens: 100,
 			InputTokensDetails: openai.ResponseUsageInputTokensDetails{
-				CachedTokens:      10,
-				CachedWriteTokens: 50,
+				CachedTokens:         10,
+				CachedCreationTokens: 50,
 			},
 			OutputTokens: 25,
 			TotalTokens:  125,
@@ -202,7 +202,7 @@ func TestResponsesRecorder_RecordResponse(t *testing.T) {
 			expectedStatus: trace.Status{Code: codes.Ok, Description: ""},
 		},
 		{
-			name:   "response with cache write",
+			name:   "response with cache creation",
 			resp:   responseWithCacheWrite,
 			config: &openinference.TraceConfig{},
 			expectedAttrs: []attribute.KeyValue{
diff --git a/internal/tracing/openinference/openinference.go b/internal/tracing/openinference/openinference.go
index aa50bca94c..f0f6d65ece 100644
--- a/internal/tracing/openinference/openinference.go
+++ b/internal/tracing/openinference/openinference.go
@@ -161,9 +161,9 @@ const (
 	LLMTokenCountPromptCacheHit = "llm.token_count.prompt_details.cache_read" // #nosec G101

 	// LLMTokenCountPromptCacheWrite represents the number of prompt tokens
-	// written to cache (cache writes). This enables tracking of cache efficiency
+	// written to the cache (cache creation). This enables tracking of cache efficiency
 	// and cost savings from cached prompts.
-	LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cache_write" // #nosec G101
+	LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cache_write" // #nosec G101

 	// LLMTokenCountPromptAudio represents the number of audio tokens in the prompt.
 	// Used for multimodal models that support audio input.
diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index d788a0f2bd..d2c87e4935 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -611,7 +611,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingFullScenario(t // 3. message_delta at the end provides output_tokens=5 but no input_tokens // 4. message_stop ends the stream messageStartChunk := `event: message_start -data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cache_write_input_tokens": 1, "output_tokens": 0}}} +data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cache_creation_input_tokens": 1, "output_tokens": 0}}} ` contentBlockStartChunk := `event: content_block_start data: {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}} @@ -638,7 +638,7 @@ data: {"type": "message_stop"} outputTokens, outputSet := tokenUsage.OutputTokens() totalTokens, totalSet := tokenUsage.TotalTokens() cachedTokens, cachedSet := tokenUsage.CachedInputTokens() - cachedCreationTokens, cachedWriteSet := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, cachedCreationSet := tokenUsage.CachedCreationInputTokens() // Assertions assert.True(t, inputSet, "Input tokens should be set") @@ -653,8 +653,8 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedWriteSet, "cache creation tokens should be set") - assert.Equal(t, uint32(1), cachedWriteTokens, "No cache creation tokens in this 
scenario") + assert.True(t, cachedCreationSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cachedCreationTokens, "No cache creation tokens in this scenario") _, _, tokenUsage, _, err = translator.ResponseBody(nil, strings.NewReader(contentBlockStartChunk), false, nil) require.NoError(t, err) @@ -672,7 +672,7 @@ data: {"type": "message_stop"} outputTokens, outputSet = tokenUsage.OutputTokens() totalTokens, totalSet = tokenUsage.TotalTokens() cachedTokens, cachedSet = tokenUsage.CachedInputTokens() - cachedWriteTokens, cachedWriteSet = tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, cachedCreationSet = tokenUsage.CachedCreationInputTokens() assert.True(t, inputSet, "Input tokens should be set") assert.Equal(t, uint32(20), inputTokens, "Input tokens should be preserved from message_start") @@ -686,6 +686,6 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedWriteSet, "cache creation tokens should be set") - assert.Equal(t, uint32(1), cachedWriteTokens, "No cache creation tokens in this scenario") + assert.True(t, cachedCreationSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cachedCreationTokens, "No cache creation tokens in this scenario") } diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 016a355f04..42e2b0e83f 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -183,7 +183,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CachedCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), 
int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) }) } @@ -191,13 +191,13 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.MessageDeltaUsage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 - expectedCachedWriteTokens uint32 + name string + usage anthropic.MessageDeltaUsage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + expectedCachedTokens uint32 + expectedCachedCreationTokens uint32 }{ { name: "message_delta event with final totals", @@ -207,11 +207,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 30, CachedCreationInputTokens: 0, }, - expectedInputTokens: 280, // 250 + 0 + 30 - expectedOutputTokens: 120, - expectedTotalTokens: 400, // 280 + 120 - expectedCachedTokens: 30, // 30 - expectedCachedWriteTokens: 0, + expectedInputTokens: 280, // 250 + 0 + 30 + expectedOutputTokens: 120, + expectedTotalTokens: 400, // 280 + 120 + expectedCachedTokens: 30, // 30 + expectedCachedCreationTokens: 0, }, { name: "message_delta event with only output tokens", @@ -221,11 +221,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 0, CachedCreationInputTokens: 0, }, - expectedInputTokens: 0, - expectedOutputTokens: 85, - expectedTotalTokens: 85, - expectedCachedTokens: 0, - expectedCachedWriteTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 85, + expectedTotalTokens: 85, + expectedCachedTokens: 0, + expectedCachedCreationTokens: 0, }, { name: "message_delta with cache creation tokens", @@ -235,11 +235,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 10, CachedCreationInputTokens: 5, }, - expectedInputTokens: 165, // 150 + 5 + 10 - expectedOutputTokens: 75, - 
expectedTotalTokens: 240, // 165 + 75
-			expectedCachedTokens:      10, // 10
-			expectedCachedWriteTokens: 5,  // 5
+			expectedInputTokens:          165, // 150 + 5 + 10
+			expectedOutputTokens:         75,
+			expectedTotalTokens:          240, // 165 + 75
+			expectedCachedTokens:         10,  // 10
+			expectedCachedCreationTokens: 5,   // 5
 		},
 	}
@@ -250,7 +250,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) {
 				tt.usage.CacheReadInputTokens,
 				tt.usage.CachedCreationInputTokens,
 			)
-			expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedWriteTokens), tt.expectedOutputTokens, tt.expectedTotalTokens)
+			expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens)
 			assert.Equal(t, expected, result)
 		})
 	}
@@ -304,10 +304,10 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) {
 	assert.Equal(t, cacheReadTokens, cachedTokens,
 		"CachedInputTokens should be cache_read_input_tokens")

-	cachedWriteTokens, ok := result.CachedWriteInputTokens()
+	gotCachedCreationTokens, ok := result.CachedCreationInputTokens()
 	assert.True(t, ok)
-	assert.Equal(t, cachedCreationTokens, cachedWriteTokens,
-		"CachedWriteInputTokens should be cache_creation_input_tokens")
+	assert.Equal(t, cachedCreationTokens, gotCachedCreationTokens,
+		"CachedCreationInputTokens should be cache_creation_input_tokens")

 	// Total tokens should be input + output.
expectedTotal := expectedTotalInput + uint32(outputTokens) diff --git a/internal/translator/openai_awsbedrock_test.go b/internal/translator/openai_awsbedrock_test.go index fa4a17345e..c533ec6dbf 100644 --- a/internal/translator/openai_awsbedrock_test.go +++ b/internal/translator/openai_awsbedrock_test.go @@ -1725,7 +1725,7 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) expectedUsage.SetCachedInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec } if tt.input.Usage.CacheWriteInputTokens != nil { - expectedUsage.SetCachedWriteInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedWriteTokens)) //nolint:gosec + expectedUsage.SetCachedCreationInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec } } else { expectedUsage = tokenUsageFrom(-1, -1, -1, -1, -1) diff --git a/internal/translator/openai_gcpanthropic.go b/internal/translator/openai_gcpanthropic.go index 6880a6f3dd..dce744da00 100644 --- a/internal/translator/openai_gcpanthropic.go +++ b/internal/translator/openai_gcpanthropic.go @@ -845,8 +845,8 @@ func (o *openAIToGCPAnthropicTranslatorV1ChatCompletion) ResponseBody(_ map[stri PromptTokens: int(inputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), - CachedWriteTokens: int(cacheWriteTokens), + CachedTokens: int(cachedTokens), + CachedCreationTokens: int(cacheWriteTokens), }, } diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 6d32041909..9e5cc95796 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -213,8 +213,8 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat if cached, ok := usage.CachedInputTokens(); ok { p.tokenUsage.SetCachedInputTokens(cached) } - if cachedCreation, ok := 
usage.CachedWriteInputTokens(); ok { - p.tokenUsage.SetCachedWriteInputTokens(cachedWrite) + if cachedCreation, ok := usage.CachedCreationInputTokens(); ok { + p.tokenUsage.SetCachedCreationInputTokens(cachedCreation) } // reset the toolIndex for each message @@ -298,10 +298,10 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat p.tokenUsage.AddCachedInputTokens(cached) } // Update input tokens to include write cache tokens from delta - if cached, ok := usage.CachedWriteInputTokens(); ok { + if cached, ok := usage.CachedCreationInputTokens(); ok { p.tokenUsage.AddInputTokens(cached) // Accumulate any additional cache tokens from delta - p.tokenUsage.AddCachedWriteInputTokens(cached) + p.tokenUsage.AddCachedCreationInputTokens(cached) } if event.Delta.StopReason != "" { p.stopReason = event.Delta.StopReason diff --git a/internal/translator/openai_responses.go b/internal/translator/openai_responses.go index 7ed57bcb11..30934c5c63 100644 --- a/internal/translator/openai_responses.go +++ b/internal/translator/openai_responses.go @@ -128,11 +128,11 @@ func (o *openAIToOpenAITranslatorV1Responses) handleNonStreamingResponse(body io // TODO: Add reasoning token usage if resp.Usage != nil { - tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 - tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 - tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 - tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 - tokenUsage.SetCachedWriteInputTokens(uint32(resp.Usage.InputTokensDetails.CachedWriteTokens)) // #nosec G115 + tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 + tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 + tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 + tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec 
G115 + tokenUsage.SetCachedCreationInputTokens(uint32(resp.Usage.InputTokensDetails.CachedCreationTokens)) // #nosec G115 } // Record non-streaming response to span if tracing is enabled. @@ -180,7 +180,7 @@ func (o *openAIToOpenAITranslatorV1Responses) extractUsageFromBufferEvent(span t tokenUsage.SetTotalTokens(uint32(respComplEvent.Response.Usage.TotalTokens)) // #nosec G115 tokenUsage.SetCachedInputTokens(uint32(respComplEvent.Response.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 // Openai does not support cache creation response. - tokenUsage.SetCachedWriteInputTokens(uint32(0)) // #nosec G115 + tokenUsage.SetCachedCreationInputTokens(uint32(0)) // #nosec G115 } // Record streaming chunk to span if tracing is enabled. if span != nil { diff --git a/internal/translator/openai_responses_test.go b/internal/translator/openai_responses_test.go index 80c574d51e..363933d525 100644 --- a/internal/translator/openai_responses_test.go +++ b/internal/translator/openai_responses_test.go @@ -247,9 +247,9 @@ func TestResponsesOpenAIToOpenAITranslator_ResponseBody(t *testing.T) { require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedWriteTokens) + require.Equal(t, uint32(0), cachedCreationTokens) }) t.Run("non-streaming response with fallback model", func(t *testing.T) { @@ -363,9 +363,9 @@ data: [DONE] require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedWriteTokens) + require.Equal(t, uint32(0), cachedCreationTokens) }) t.Run("streaming response with fallback model", func(t *testing.T) { @@ -462,9 +462,9 @@ data: [DONE] cachedTokens, _ := tokenUsage.CachedInputTokens() 
require.Equal(t, uint32(2), cachedTokens) - cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedWriteTokens) + require.Equal(t, uint32(0), cachedCreationTokens) }) t.Run("streaming read error", func(t *testing.T) { @@ -554,9 +554,9 @@ func TestResponses_HandleNonStreamingResponse(t *testing.T) { cachedTokens, _ := tokenUsage.CachedInputTokens() require.Equal(t, uint32(2), cachedTokens) - cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedWriteTokens) + require.Equal(t, uint32(0), cachedCreationTokens) }) t.Run("invalid JSON", func(t *testing.T) { @@ -619,9 +619,9 @@ data: [DONE] require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedWriteTokens, ok := tokenUsage.CachedWriteInputTokens() + cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedWriteTokens) + require.Equal(t, uint32(0), cachedCreationTokens) }) t.Run("model extraction", func(t *testing.T) { @@ -686,11 +686,11 @@ data: [DONE] _, outputSet := tokenUsage.OutputTokens() _, totalSet := tokenUsage.TotalTokens() _, cachedSet := tokenUsage.CachedInputTokens() - _, cachedWriteSet := tokenUsage.CachedWriteInputTokens() + _, cachedCreationSet := tokenUsage.CachedCreationInputTokens() require.False(t, totalSet) require.False(t, cachedSet) - require.False(t, cachedWriteSet) + require.False(t, cachedCreationSet) require.False(t, inputSet) require.False(t, outputSet) }) From ea12f8ea4ce3e520154aeaba77291cc5e7b2adb0 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:16:17 -0500 Subject: [PATCH 08/20] fix typo Signed-off-by: Aaron Choo --- api/v1alpha1/ai_gateway_route.go | 2 +- api/v1alpha1/shared_types.go | 4 ++-- 
examples/token_ratelimit/token_ratelimit.yaml | 2 +- internal/apischema/anthropic/anthropic.go | 2 +- internal/apischema/openai/openai.go | 6 +++--- internal/apischema/openai/openai_test.go | 8 ++++---- internal/extproc/processor_impl_test.go | 4 ++-- internal/filterapi/runtime_test.go | 2 +- internal/llmcostcel/cel.go | 4 ++-- internal/llmcostcel/cel_test.go | 2 +- internal/metrics/genai.go | 2 +- internal/metrics/metrics.go | 2 +- internal/metrics/metrics_impl_test.go | 4 ++-- internal/tracing/openinference/openinference.go | 2 +- internal/translator/anthropic_anthropic_test.go | 4 ++-- internal/translator/anthropic_gcpanthropic_test.go | 2 +- internal/translator/anthropic_usage_test.go | 6 +++--- tests/data-plane/testupstream_test.go | 6 +++--- 18 files changed, 32 insertions(+), 32 deletions(-) diff --git a/api/v1alpha1/ai_gateway_route.go b/api/v1alpha1/ai_gateway_route.go index 6404f8e8a6..f99a66bfc4 100644 --- a/api/v1alpha1/ai_gateway_route.go +++ b/api/v1alpha1/ai_gateway_route.go @@ -108,7 +108,7 @@ type AIGatewayRouteSpec struct { // type: TotalToken // - metadataKey: llm_cached_input_token // type: CachedInputToken - // - metadataKey: llm_cache_creation_input_token + // - metadataKey: llm_cached_creation_input_token // type: CachedCreationInputToken // ``` // Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three diff --git a/api/v1alpha1/shared_types.go b/api/v1alpha1/shared_types.go index 2e391498dd..ed4e2c4dc3 100644 --- a/api/v1alpha1/shared_types.go +++ b/api/v1alpha1/shared_types.go @@ -114,7 +114,7 @@ type LLMRequestCost struct { // * backend: the backend name in the form of "name.namespace". Type: string. // * input_tokens: the number of input tokens. Type: unsigned integer. // * cached_input_tokens: the number of cached read input tokens. Type: unsigned integer. - // * cache_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer. 
+ // * cached_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer. // * output_tokens: the number of output tokens. Type: unsigned integer. // * total_tokens: the total number of tokens. Type: unsigned integer. // @@ -122,7 +122,7 @@ type LLMRequestCost struct { // // * "model == 'llama' ? input_tokens + output_token * 0.5 : total_tokens" // * "backend == 'foo.default' ? input_tokens + output_tokens : total_tokens" - // * "backend == 'bar.default' ? (input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cache_creation_input_tokens * 1.25 + output_tokens : total_tokens" + // * "backend == 'bar.default' ? (input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cached_creation_input_tokens * 1.25 + output_tokens : total_tokens" // * "input_tokens + output_tokens + total_tokens" // * "input_tokens * output_tokens" // diff --git a/examples/token_ratelimit/token_ratelimit.yaml b/examples/token_ratelimit/token_ratelimit.yaml index 4ebe6217c7..42b35a9a18 100644 --- a/examples/token_ratelimit/token_ratelimit.yaml +++ b/examples/token_ratelimit/token_ratelimit.yaml @@ -51,7 +51,7 @@ spec: type: InputToken - metadataKey: llm_cached_input_token type: CachedInputToken - - metadataKey: llm_cache_creation_input_token + - metadataKey: llm_cached_creation_input_token type: CachedCreationInputToken - metadataKey: llm_output_token type: OutputToken diff --git a/internal/apischema/anthropic/anthropic.go b/internal/apischema/anthropic/anthropic.go index f02e706d92..55dbaceee5 100644 --- a/internal/apischema/anthropic/anthropic.go +++ b/internal/apischema/anthropic/anthropic.go @@ -437,7 +437,7 @@ const ( // so we use float64 to be able to unmarshal both 1234 and 1234.0 without errors. type Usage struct { // The number of input tokens used to create the cache entry. 
-	CachedCreationInputTokens float64 `json:"cache_creation_input_tokens"`
+	CachedCreationInputTokens float64 `json:"cache_creation_input_tokens"`
 	// The number of input tokens read from the cache.
 	CacheReadInputTokens float64 `json:"cache_read_input_tokens"`
 	// The number of input tokens which were used.
diff --git a/internal/apischema/openai/openai.go b/internal/apischema/openai/openai.go
index 1cb6268929..02e8b9eae8 100644
--- a/internal/apischema/openai/openai.go
+++ b/internal/apischema/openai/openai.go
@@ -1383,7 +1383,7 @@ type PromptTokensDetails struct {
 	// Cached tokens present in the prompt.
 	CachedTokens int `json:"cached_tokens,omitzero"`
 	// Tokens written to the cache.
-	CachedCreationTokens int `json:"cache_creation_input_tokens,omitzero"`
+	CachedCreationTokens int `json:"cached_creation_input_tokens,omitzero"`

 // ChatCompletionResponseChunk is described in the OpenAI API documentation:
@@ -2539,7 +2539,7 @@ type ResponseUsageInputTokensDetails struct {
 	CachedTokens int64 `json:"cached_tokens"`

 	// The number of tokens that were written to the cache.
-	CachedCreationTokens int64 `json:"cache_creation_input_tokens"`
+	CachedCreationTokens int64 `json:"cached_creation_input_tokens"`

 // A detailed breakdown of the output tokens.
@@ -2554,7 +2554,7 @@ type ResponseTokensDetails struct {
 	CachedTokens int `json:"cached_tokens,omitempty"` //nolint:tagliatelle //follow openai api

 	// CachedCreationTokens: number of tokens that were written to the cache.
-	CachedCreationTokens int64 `json:"cache_creation_input_tokens"` //nolint:tagliatelle
+	CachedCreationTokens int64 `json:"cached_creation_input_tokens"` //nolint:tagliatelle

 	// ReasoningTokens: Number of reasoning tokens (for reasoning models).
ReasoningTokens int `json:"reasoning_tokens,omitempty"` //nolint:tagliatelle //follow openai api diff --git a/internal/apischema/openai/openai_test.go b/internal/apischema/openai/openai_test.go index c592a3b712..21f1fab649 100644 --- a/internal/apischema/openai/openai_test.go +++ b/internal/apischema/openai/openai_test.go @@ -1751,7 +1751,7 @@ func TestPromptTokensDetails(t *testing.T) { "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cache_creation_input_tokens": 10 + "cached_creation_input_tokens": 10 }`, }, { @@ -1765,7 +1765,7 @@ func TestPromptTokensDetails(t *testing.T) { expected: `{ "audio_tokens": 8, "cached_tokens": 384, - "cache_creation_input_tokens": 10 + "cached_creation_input_tokens": 10 }`, }, } @@ -1838,7 +1838,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "prompt_tokens_details": { "audio_tokens": 8, "cached_tokens": 384, - "cache_creation_input_tokens": 13 + "cached_creation_input_tokens": 13 } }`, }, @@ -1875,7 +1875,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cache_creation_input_tokens": 21 + "cached_creation_input_tokens": 21 } }`, }, diff --git a/internal/extproc/processor_impl_test.go b/internal/extproc/processor_impl_test.go index 7b498f12af..60f612fa17 100644 --- a/internal/extproc/processor_impl_test.go +++ b/internal/extproc/processor_impl_test.go @@ -275,7 +275,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeOutputToken, MetadataKey: "output_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeInputToken, MetadataKey: "input_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedInputToken, MetadataKey: "cached_input_token_usage"}}, - {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedCreationInputToken, 
MetadataKey: "cache_creation_input_token_usage"}}, + {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedCreationInputToken, MetadataKey: "cached_creation_input_token_usage"}}, { CELProg: celProgInt, LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCEL, MetadataKey: "cel_int"}, @@ -312,7 +312,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, float64(1), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cached_input_token_usage"].GetNumberValue()) require.Equal(t, float64(3), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. - GetStructValue().Fields["cache_creation_input_token_usage"].GetNumberValue()) + GetStructValue().Fields["cached_creation_input_token_usage"].GetNumberValue()) require.Equal(t, float64(54321), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cel_int"].GetNumberValue()) require.Equal(t, float64(9999), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. 
diff --git a/internal/filterapi/runtime_test.go b/internal/filterapi/runtime_test.go index a0ac5d6fa2..cb9f8d4af0 100644 --- a/internal/filterapi/runtime_test.go +++ b/internal/filterapi/runtime_test.go @@ -59,7 +59,7 @@ func TestServer_LoadConfig(t *testing.T) { require.Equal(t, "1 + 1", rc.RequestCosts[1].CEL) prog := rc.RequestCosts[1].CELProg require.NotNil(t, prog) - val, err := llmcostcel.EvaluateProgram(prog, "", "", 1, 1, 1, 1) + val, err := llmcostcel.EvaluateProgram(prog, "", "", 1, 1, 1, 1, 1) require.NoError(t, err) require.Equal(t, uint64(2), val) require.Equal(t, config.Models, rc.DeclaredModels) diff --git a/internal/llmcostcel/cel.go b/internal/llmcostcel/cel.go index 5bc0008d59..46d06c7130 100644 --- a/internal/llmcostcel/cel.go +++ b/internal/llmcostcel/cel.go @@ -19,8 +19,8 @@ const ( celModelNameKey = "model" celBackendKey = "backend" celInputTokensKey = "input_tokens" - celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 - celCachedCreationInputTokensKey = "cache_creation_input_tokens" // #nosec G101 + celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 + celCachedCreationInputTokensKey = "cached_creation_input_tokens" // #nosec G101 celOutputTokensKey = "output_tokens" celTotalTokensKey = "total_tokens" ) diff --git a/internal/llmcostcel/cel_test.go b/internal/llmcostcel/cel_test.go index cee9a259a5..92a323fc63 100644 --- a/internal/llmcostcel/cel_test.go +++ b/internal/llmcostcel/cel_test.go @@ -26,7 +26,7 @@ func TestNewProgram(t *testing.T) { require.NoError(t, err) }) t.Run("variables", func(t *testing.T) { - prog, err := NewProgram("model == 'cool_model' ? (input_tokens - cached_input_tokens - cache_creation_input_tokens) * output_tokens : total_tokens") + prog, err := NewProgram("model == 'cool_model' ? 
(input_tokens - cached_input_tokens - cached_creation_input_tokens) * output_tokens : total_tokens") require.NoError(t, err) v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 200, 100, 1, 2, 3) require.NoError(t, err) diff --git a/internal/metrics/genai.go b/internal/metrics/genai.go index cb45ae6051..f739bf8764 100644 --- a/internal/metrics/genai.go +++ b/internal/metrics/genai.go @@ -40,7 +40,7 @@ const ( // // However, the spec says "a custom value MAY be used.", so we can use it now. genaiTokenTypeCachedInput = "cached_input" - genaiTokenTypeCachedCreationInput = "cache_creation_input" + genaiTokenTypeCachedCreationInput = "cached_creation_input" genaiErrorTypeFallback = "_OTHER" ) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 38fe032539..a68d810dac 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -261,7 +261,7 @@ func (u *TokenUsage) Override(other TokenUsage) { // ExtractTokenUsageFromAnthropic extracts the correct token usage from Anthropic API response. // According to Claude API documentation, total input tokens is the summation of: -// input_tokens + cache_creation_input_tokens + cache_read_input_tokens +// input_tokens + cached_creation_input_tokens + cache_read_input_tokens // // This function works for both streaming and non-streaming responses by accepting // the common usage fields that exist in all Anthropic usage structures. 
diff --git a/internal/metrics/metrics_impl_test.go b/internal/metrics/metrics_impl_test.go index 581815e1bc..bcf6bc8282 100644 --- a/internal/metrics/metrics_impl_test.go +++ b/internal/metrics/metrics_impl_test.go @@ -83,7 +83,7 @@ func TestRecordTokenUsage(t *testing.T) { pm.SetBackend(&filterapi.Backend{Schema: filterapi.VersionedAPISchema{Name: filterapi.APISchemaOpenAI}}) pm.RecordTokenUsage(t.Context(), TokenUsage{ inputTokens: 10, cachedInputTokens: 8, cachedCreationInputTokens: 2, outputTokens: 5, - inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, + inputTokenSet: true, cachedInputTokenSet: true, cachedCreationInputTokenSet: true, outputTokenSet: true, }, nil) count, sum := testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, inputAttrs) @@ -301,7 +301,7 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { pm.SetResponseModel("res-model") pm.RecordTokenUsage(t.Context(), TokenUsage{ inputTokens: 2, cachedInputTokens: 1, cachedCreationInputTokens: 6, outputTokens: 3, - inputTokenSet: true, cachedInputTokenSet: true, outputTokenSet: true, + inputTokenSet: true, cachedInputTokenSet: true, cachedCreationInputTokenSet: true, outputTokenSet: true, }, nil) inputAttrs := attribute.NewSet( diff --git a/internal/tracing/openinference/openinference.go b/internal/tracing/openinference/openinference.go index f0f6d65ece..23fea486a9 100644 --- a/internal/tracing/openinference/openinference.go +++ b/internal/tracing/openinference/openinference.go @@ -163,7 +163,7 @@ const ( // LLMTokenCountPromptCacheWrite represents the number of prompt tokens // created to cache (cache write). This enables tracking of cache efficiency // and cost savings from cached prompts. 
- LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cache_creation" // #nosec G101 + LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cached_creation" // #nosec G101 // LLMTokenCountPromptAudio represents the number of audio tokens in the prompt. // Used for multimodal models that support audio input. diff --git a/internal/translator/anthropic_anthropic_test.go b/internal/translator/anthropic_anthropic_test.go index 236dd537d9..9b175e6723 100644 --- a/internal/translator/anthropic_anthropic_test.go +++ b/internal/translator/anthropic_anthropic_test.go @@ -89,7 +89,7 @@ func TestAnthropicToAnthropic_ResponseHeaders(t *testing.T) { func TestAnthropicToAnthropic_ResponseBody_non_streaming(t *testing.T) { translator := NewAnthropicToAnthropicTranslator("", "") require.NotNil(t, translator) - const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` + const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 
👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` headerMutation, bodyMutation, tokenUsage, responseModel, err := translator.ResponseBody(nil, strings.NewReader(responseBody), true, nil) require.NoError(t, err) @@ -108,7 +108,7 @@ func TestAnthropicToAnthropic_ResponseBody_streaming(t *testing.T) { // We split the response into two parts to simulate streaming where each part can end in the // middle of an event. const responseHead = `event: message_start -data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":1,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } +data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":1,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } event: content_block_start data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index d2c87e4935..d893dfdab5 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -611,7 +611,7 @@ func 
TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingFullScenario(t // 3. message_delta at the end provides output_tokens=5 but no input_tokens // 4. message_stop ends the stream messageStartChunk := `event: message_start -data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cache_creation_input_tokens": 1, "output_tokens": 0}}} +data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cached_creation_input_tokens": 1, "output_tokens": 0}}} ` contentBlockStartChunk := `event: content_block_start data: {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}} diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 42e2b0e83f..a267d65222 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -282,7 +282,7 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { t.Run("claude API documentation example", func(t *testing.T) { // This test verifies compliance with Claude API documentation: // "Total input tokens in a request is the summation of input_tokens, - // cache_creation_input_tokens, and cache_read_input_tokens". + // cached_creation_input_tokens, and cache_read_input_tokens". 
inputTokens := int64(100) cachedCreationTokens := int64(20) @@ -297,7 +297,7 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { inputTokensVal, ok := result.InputTokens() assert.True(t, ok) assert.Equal(t, expectedTotalInput, inputTokensVal, - "InputTokens should be sum of input_tokens + cache_creation_input_tokens + cache_read_input_tokens") + "InputTokens should be sum of input_tokens + cached_creation_input_tokens + cache_read_input_tokens") cachedTokens, ok := result.CachedInputTokens() assert.True(t, ok) @@ -307,7 +307,7 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { cachedCreationTokens, ok := result.CachedCreationInputTokens() assert.True(t, ok) assert.Equal(t, cachedCreationTokens, cachedCreationTokens, - "CachedCreationInputTokens should be cache_creation_input_tokens") + "CachedCreationInputTokens should be cached_creation_input_tokens") // Total tokens should be input + output. expectedTotal := expectedTotalInput + uint32(outputTokens) diff --git a/tests/data-plane/testupstream_test.go b/tests/data-plane/testupstream_test.go index 6cc8b1000c..d1ef2abba8 100644 --- a/tests/data-plane/testupstream_test.go +++ b/tests/data-plane/testupstream_test.go @@ -931,7 +931,7 @@ data: {"type": "message_stop"} ] }`, expPath: "/v1/messages", - responseBody: `{"model":"foo","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}`, + responseBody: `{"model":"foo","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 
👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}`, expStatus: http.StatusOK, }, { @@ -954,7 +954,7 @@ data: {"type": "message_stop"} expPath: "/v1/messages", responseBody: ` event: message_start -data: {"type":"message_start","message":{"model":"foo","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":1,"service_tier":"standard"}} } +data: {"type":"message_start","message":{"model":"foo","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":1,"service_tier":"standard"}} } event: content_block_start data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } @@ -975,7 +975,7 @@ event: content_block_stop data: {"type":"content_block_stop","index":0 } event: message_delta -data: {"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":16} } +data: {"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":16} } event: message_stop data: {"type":"message_stop" } From ff671649b60aa7deb6af5b09d13d3eca79f38942 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 
2 Jan 2026 16:23:27 -0500 Subject: [PATCH 09/20] make apigen Signed-off-by: Aaron Choo --- ...gateway.envoyproxy.io_aigatewayroutes.yaml | 45 ++++++++++--------- site/docs/api/api.mdx | 11 +++-- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml b/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml index c69175ff95..e75770a409 100644 --- a/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml +++ b/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml @@ -173,16 +173,18 @@ spec: metadataKey: llm_input_token\n\t type: InputToken\n\t- metadataKey: llm_output_token\n\t type: OutputToken\n\t- metadataKey: llm_total_token\n\t \ type: TotalToken\n\t- metadataKey: llm_cached_input_token\n\t - \ type: CachedInputToken\n```\nThen, with the following BackendTrafficPolicy - of Envoy Gateway, you can have three\nrate limit buckets for each - unique x-user-id header value. One bucket is for the input token,\nthe - other is for the output token, and the last one is for the total - token.\nEach bucket will be reduced by the corresponding token usage - captured by the AI Gateway filter.\n\n```yaml\n\tapiVersion: gateway.envoyproxy.io/v1alpha1\n\tkind: - BackendTrafficPolicy\n\tmetadata:\n\t name: some-example-token-rate-limit\n\t - \ namespace: default\n\tspec:\n\t targetRefs:\n\t - group: gateway.networking.k8s.io\n\t - \ kind: HTTPRoute\n\t name: usage-rate-limit\n\t rateLimit:\n\t - \ type: Global\n\t global:\n\t rules:\n\t - clientSelectors:\n\t + \ type: CachedInputToken\n- metadataKey: llm_cached_creation_input_token\n + \ type: CachedCreationInputToken\n```\nThen, with the following + BackendTrafficPolicy of Envoy Gateway, you can have three\nrate + limit buckets for each unique x-user-id header value. 
One bucket + is for the input token,\nthe other is for the output token, and + the last one is for the total token.\nEach bucket will be reduced + by the corresponding token usage captured by the AI Gateway filter.\n\n```yaml\n\tapiVersion: + gateway.envoyproxy.io/v1alpha1\n\tkind: BackendTrafficPolicy\n\tmetadata:\n\t + \ name: some-example-token-rate-limit\n\t namespace: default\n\tspec:\n\t + \ targetRefs:\n\t - group: gateway.networking.k8s.io\n\t kind: + HTTPRoute\n\t name: usage-rate-limit\n\t rateLimit:\n\t type: + Global\n\t global:\n\t rules:\n\t - clientSelectors:\n\t \ # Do the rate limiting based on the x-user-id header.\n\t \ - headers:\n\t - name: x-user-id\n\t \ type: Distinct\n\t limit:\n\t # @@ -227,15 +229,17 @@ spec: Type: string.\n\t* backend: the backend name in the form of \"name.namespace\". Type: string.\n\t* input_tokens: the number of input tokens. Type: unsigned integer.\n\t* cached_input_tokens: - the number of cached input tokens. Type: unsigned integer.\n\t* - output_tokens: the number of output tokens. Type: unsigned - integer.\n\t* total_tokens: the total number of tokens. Type: - unsigned integer.\n\nFor example, the following expressions - are valid:\n\n\t* \"model == 'llama' ? input_tokens + output_token - * 0.5 : total_tokens\"\n\t* \"backend == 'foo.default' ? input_tokens - + output_tokens : total_tokens\"\n\t* \"backend == 'bar.default' - ? (input_tokens - cached_input_tokens) + cached_input_tokens - * 0.1 + output_tokens : total_tokens\"\n\t* \"input_tokens + the number of cached read input tokens. Type: unsigned integer.\n\t* + cached_creation_input_tokens: the number of cache creation + input tokens. Type: unsigned integer.\n\t* output_tokens: + the number of output tokens. Type: unsigned integer.\n\t* + total_tokens: the total number of tokens. Type: unsigned integer.\n\nFor + example, the following expressions are valid:\n\n\t* \"model + == 'llama' ? 
input_tokens + output_token * 0.5 : total_tokens\"\n\t* + \"backend == 'foo.default' ? input_tokens + output_tokens + : total_tokens\"\n\t* \"backend == 'bar.default' ? (input_tokens + - cached_input_tokens) + cached_input_tokens * 0.1 + cached_creation_input_tokens + * 1.25 + output_tokens : total_tokens\"\n\t* \"input_tokens + output_tokens + total_tokens\"\n\t* \"input_tokens * output_tokens\"" type: string metadataKey: @@ -246,11 +250,12 @@ spec: description: |- Type specifies the type of the request cost. The default is "OutputToken", and it uses "output token" as the cost. The other types are "InputToken", "TotalToken", - and "CEL". + "CachedInputToken", "CachedCreationInputToken", and "CEL". enum: - OutputToken - InputToken - CachedInputToken + - CachedCreationInputToken - TotalToken - CEL type: string diff --git a/site/docs/api/api.mdx b/site/docs/api/api.mdx index 36b396c219..0516d744cb 100644 --- a/site/docs/api/api.mdx +++ b/site/docs/api/api.mdx @@ -757,7 +757,7 @@ AIGatewayRouteSpec details the AIGatewayRoute configuration. name="llmRequestCosts" type="[LLMRequestCost](#llmrequestcost) array" required="false" - description="LLMRequestCosts specifies how to capture the cost of the LLM-related request, notably the token usage.
The AI Gateway filter will capture each specified number and store it in the Envoy's dynamic
metadata per HTTP request. The namespaced key is `io.envoy.ai_gateway`,
For example, let's say we have the following LLMRequestCosts configuration:
```yaml
llmRequestCosts:
- metadataKey: llm_input_token
type: InputToken
- metadataKey: llm_output_token
type: OutputToken
- metadataKey: llm_total_token
type: TotalToken
- metadataKey: llm_cached_input_token
type: CachedInputToken
```
Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three
rate limit buckets for each unique x-user-id header value. One bucket is for the input token,
the other is for the output token, and the last one is for the total token.
Each bucket will be reduced by the corresponding token usage captured by the AI Gateway filter.
```yaml
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: BackendTrafficPolicy
metadata:
name: some-example-token-rate-limit
namespace: default
spec:
targetRefs:
- group: gateway.networking.k8s.io
kind: HTTPRoute
name: usage-rate-limit
rateLimit:
type: Global
global:
rules:
- clientSelectors:
# Do the rate limiting based on the x-user-id header.
- headers:
- name: x-user-id
type: Distinct
limit:
# Configures the number of `tokens` allowed per hour.
requests: 10000
unit: Hour
cost:
request:
from: Number
# Setting the request cost to zero allows to only check the rate limit budget,
# and not consume the budget on the request path.
number: 0
# This specifies the cost of the response retrieved from the dynamic metadata set by the AI Gateway filter.
# The extracted value will be used to consume the rate limit budget, and subsequent requests will be rate limited
# if the budget is exhausted.
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_input_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_output_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_total_token
```
Note that when multiple AIGatewayRoute resources are attached to the same Gateway, and
different costs are configured for the same metadata key, the ai-gateway will pick one of them
to configure the metadata key in the generated HTTPRoute, and ignore the rest." + description="LLMRequestCosts specifies how to capture the cost of the LLM-related request, notably the token usage.
The AI Gateway filter will capture each specified number and store it in the Envoy's dynamic
metadata per HTTP request. The namespaced key is `io.envoy.ai_gateway`,
For example, let's say we have the following LLMRequestCosts configuration:
```yaml
llmRequestCosts:
- metadataKey: llm_input_token
type: InputToken
- metadataKey: llm_output_token
type: OutputToken
- metadataKey: llm_total_token
type: TotalToken
- metadataKey: llm_cached_input_token
type: CachedInputToken
- metadataKey: llm_cached_creation_input_token
type: CachedCreationInputToken
```
Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three
rate limit buckets for each unique x-user-id header value. One bucket is for the input token,
the other is for the output token, and the last one is for the total token.
Each bucket will be reduced by the corresponding token usage captured by the AI Gateway filter.
```yaml
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: BackendTrafficPolicy
metadata:
name: some-example-token-rate-limit
namespace: default
spec:
targetRefs:
- group: gateway.networking.k8s.io
kind: HTTPRoute
name: usage-rate-limit
rateLimit:
type: Global
global:
rules:
- clientSelectors:
# Do the rate limiting based on the x-user-id header.
- headers:
- name: x-user-id
type: Distinct
limit:
# Configures the number of `tokens` allowed per hour.
requests: 10000
unit: Hour
cost:
request:
from: Number
# Setting the request cost to zero allows to only check the rate limit budget,
# and not consume the budget on the request path.
number: 0
# This specifies the cost of the response retrieved from the dynamic metadata set by the AI Gateway filter.
# The extracted value will be used to consume the rate limit budget, and subsequent requests will be rate limited
# if the budget is exhausted.
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_input_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_output_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_total_token
```
Note that when multiple AIGatewayRoute resources are attached to the same Gateway, and
different costs are configured for the same metadata key, the ai-gateway will pick one of them
to configure the metadata key in the generated HTTPRoute, and ignore the rest." /> @@ -1664,12 +1664,12 @@ LLMRequestCost configures each request cost. name="type" type="[LLMRequestCostType](#llmrequestcosttype)" required="true" - description="Type specifies the type of the request cost. The default is `OutputToken`,
and it uses `output token` as the cost. The other types are `InputToken`, `TotalToken`,
and `CEL`." + description="Type specifies the type of the request cost. The default is `OutputToken`,
and it uses `output token` as the cost. The other types are `InputToken`, `TotalToken`,
`CachedInputToken`, `CachedCreationInputToken`, and `CEL`." /> @@ -1696,6 +1696,11 @@ LLMRequestCostType specifies the type of the LLMRequestCost. type="enum" required="false" description="LLMRequestCostTypeCachedInputToken is the cost type of the cached input token.
" +/> Date: Fri, 2 Jan 2026 16:36:57 -0500 Subject: [PATCH 10/20] fix cached -> cache for anthropic Signed-off-by: Aaron Choo --- internal/apischema/anthropic/anthropic.go | 2 +- internal/translator/anthropic_anthropic_test.go | 4 ++-- internal/translator/anthropic_gcpanthropic_test.go | 2 +- internal/translator/anthropic_usage_test.go | 12 ++++++------ internal/translator/openai_gcpanthropic_stream.go | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/internal/apischema/anthropic/anthropic.go b/internal/apischema/anthropic/anthropic.go index 55dbaceee5..f65d102761 100644 --- a/internal/apischema/anthropic/anthropic.go +++ b/internal/apischema/anthropic/anthropic.go @@ -437,7 +437,7 @@ const ( // so we use float64 to be able to unmarshal both 1234 and 1234.0 without errors. type Usage struct { // The number of input tokens used to create the cache entry. - CachedCreationInputTokens float64 `json:"cached_creation_input_tokens"` + CacheCreationInputTokens float64 `json:"cache_creation_input_tokens"` // The number of input tokens read from the cache. CacheReadInputTokens float64 `json:"cache_read_input_tokens"` // The number of input tokens which were used. diff --git a/internal/translator/anthropic_anthropic_test.go b/internal/translator/anthropic_anthropic_test.go index 9b175e6723..6d7e12f7ab 100644 --- a/internal/translator/anthropic_anthropic_test.go +++ b/internal/translator/anthropic_anthropic_test.go @@ -89,7 +89,7 @@ func TestAnthropicToAnthropic_ResponseHeaders(t *testing.T) { func TestAnthropicToAnthropic_ResponseBody_non_streaming(t *testing.T) { translator := NewAnthropicToAnthropicTranslator("", "") require.NotNil(t, translator) - const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 
👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` + const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` headerMutation, bodyMutation, tokenUsage, responseModel, err := translator.ResponseBody(nil, strings.NewReader(responseBody), true, nil) require.NoError(t, err) @@ -108,7 +108,7 @@ func TestAnthropicToAnthropic_ResponseBody_streaming(t *testing.T) { // We split the response into two parts to simulate streaming where each part can end in the // middle of an event. 
const responseHead = `event: message_start -data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cached_creation_input_tokens":0,"cache_read_input_tokens":1,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } +data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":1,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } event: content_block_start data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index d893dfdab5..d2c87e4935 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -611,7 +611,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingFullScenario(t // 3. message_delta at the end provides output_tokens=5 but no input_tokens // 4. 
message_stop ends the stream messageStartChunk := `event: message_start -data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cached_creation_input_tokens": 1, "output_tokens": 0}}} +data: {"type": "message_start", "message": {"id": "msg_123", "type": "message", "role": "assistant", "content": [], "model": "claude-3-sonnet-20240229", "usage": {"input_tokens": 15, "cache_read_input_tokens": 5, "cache_creation_input_tokens": 1, "output_tokens": 0}}} ` contentBlockStartChunk := `event: content_block_start data: {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}} diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index a267d65222..01ab735f63 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -282,22 +282,22 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { t.Run("claude API documentation example", func(t *testing.T) { // This test verifies compliance with Claude API documentation: // "Total input tokens in a request is the summation of input_tokens, - // cached_creation_input_tokens, and cache_read_input_tokens". + // cache_creation_input_tokens, and cache_read_input_tokens". inputTokens := int64(100) - cachedCreationTokens := int64(20) + cachedWriteTokens := int64(20) cacheReadTokens := int64(30) outputTokens := int64(50) - result := metrics.ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cachedCreationTokens) + result := metrics.ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cachedWriteTokens) // Total input should be sum of all input token types. 
- expectedTotalInputInt := inputTokens + cachedCreationTokens + cacheReadTokens + expectedTotalInputInt := inputTokens + cachedWriteTokens + cacheReadTokens expectedTotalInput := uint32(expectedTotalInputInt) // #nosec G115 - test values are small and safe inputTokensVal, ok := result.InputTokens() assert.True(t, ok) assert.Equal(t, expectedTotalInput, inputTokensVal, - "InputTokens should be sum of input_tokens + cached_creation_input_tokens + cache_read_input_tokens") + "InputTokens should be sum of input_tokens + cache_creation_input_tokens + cache_read_input_tokens") cachedTokens, ok := result.CachedInputTokens() assert.True(t, ok) @@ -307,7 +307,7 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { cachedCreationTokens, ok := result.CachedCreationInputTokens() assert.True(t, ok) assert.Equal(t, cachedCreationTokens, cachedCreationTokens, - "CachedCreationInputTokens should be cached_creation_input_tokens") + "CachedCreationInputTokens should be cache_creation_input_tokens") // Total tokens should be input + output. 
expectedTotal := expectedTotalInput + uint32(outputTokens) diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 9e5cc95796..9840af1650 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -203,7 +203,7 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat u.InputTokens, u.OutputTokens, u.CacheReadInputTokens, - u.CachedCreationInputTokens, + u.CacheCreationInputTokens, ) // For message_start, we store the initial usage but don't add to the accumulated // The message_delta event will contain the final totals From d8a318fea7a1be2b6e8d6cf040b33df36564195d Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:42:12 -0500 Subject: [PATCH 11/20] missing a few typos Signed-off-by: Aaron Choo --- internal/tracing/openinference/anthropic/messages.go | 2 +- internal/translator/anthropic_anthropic.go | 2 +- internal/translator/openai_gcpanthropic_stream.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index 23a52a618c..ec6940056f 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -211,7 +211,7 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer int64(u.InputTokens), int64(u.OutputTokens), int64(u.CacheReadInputTokens), - int64(u.CachedCreationInputTokens), + int64(u.CacheCreationInputTokens), ) input, _ := cost.InputTokens() cacheRead, _ := cost.CachedInputTokens() diff --git a/internal/translator/anthropic_anthropic.go b/internal/translator/anthropic_anthropic.go index 60f12cf6af..86356d03a3 100644 --- a/internal/translator/anthropic_anthropic.go +++ b/internal/translator/anthropic_anthropic.go @@ -148,7 +148,7 @@ func (a *anthropicToAnthropicTranslator) 
extractUsageFromBufferEvent(s tracing.M int64(u.InputTokens), int64(u.OutputTokens), int64(u.CacheReadInputTokens), - int64(u.CachedCreationInputTokens), + int64(u.CacheCreationInputTokens), ) // Override with message_start usage (contains input tokens and initial state) a.streamingTokenUsage.Override(messageStartUsage) diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 9840af1650..5d9e12c4a1 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -285,7 +285,7 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat u.InputTokens, u.OutputTokens, u.CacheReadInputTokens, - u.CachedCreationInputTokens, + u.CacheCreationInputTokens, ) // For message_delta, accumulate the incremental output tokens if output, ok := usage.OutputTokens(); ok { From f2a3cbbba49894d491338b79e509a7b826dd0fbf Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:45:28 -0500 Subject: [PATCH 12/20] update typo Signed-off-by: Aaron Choo --- internal/translator/anthropic_anthropic.go | 2 +- internal/translator/anthropic_usage_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/translator/anthropic_anthropic.go b/internal/translator/anthropic_anthropic.go index 86356d03a3..1f5c8d5bf0 100644 --- a/internal/translator/anthropic_anthropic.go +++ b/internal/translator/anthropic_anthropic.go @@ -103,7 +103,7 @@ func (a *anthropicToAnthropicTranslator) ResponseBody(_ map[string]string, body int64(usage.InputTokens), int64(usage.OutputTokens), int64(usage.CacheReadInputTokens), - int64(usage.CachedCreationInputTokens), + int64(usage.CacheCreationInputTokens), ) if span != nil { span.RecordResponse(anthropicResp) diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 01ab735f63..4058038688 100644 --- a/internal/translator/anthropic_usage_test.go 
+++ b/internal/translator/anthropic_usage_test.go @@ -248,7 +248,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { result := metrics.ExtractTokenUsageFromAnthropic(tt.usage.InputTokens, tt.usage.OutputTokens, tt.usage.CacheReadInputTokens, - tt.usage.CachedCreationInputTokens, + tt.usage.CacheCreationInputTokens, ) expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) From d027658142c62fd79cc765478b4a5de74d664da9 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:50:19 -0500 Subject: [PATCH 13/20] last try fixing typos Signed-off-by: Aaron Choo --- internal/translator/anthropic_usage_test.go | 2 +- internal/translator/openai_gcpanthropic.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 4058038688..4ff13ab56d 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -181,7 +181,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { result := metrics.ExtractTokenUsageFromAnthropic(tt.usage.InputTokens, tt.usage.OutputTokens, tt.usage.CacheReadInputTokens, - tt.usage.CachedCreationInputTokens, + tt.usage.CacheCreationInputTokens, ) expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) assert.Equal(t, expected, result) diff --git a/internal/translator/openai_gcpanthropic.go b/internal/translator/openai_gcpanthropic.go index dce744da00..6a916cc0e2 100644 --- a/internal/translator/openai_gcpanthropic.go +++ b/internal/translator/openai_gcpanthropic.go @@ -833,7 +833,7 @@ func (o *openAIToGCPAnthropicTranslatorV1ChatCompletion) ResponseBody(_ map[stri usage.InputTokens, usage.OutputTokens, 
usage.CacheReadInputTokens, - usage.CachedCreationInputTokens, + usage.CacheCreationInputTokens, ) inputTokens, _ := tokenUsage.InputTokens() outputTokens, _ := tokenUsage.OutputTokens() From 35b06ed9186d3db9b601c195503a9c5a06e8ffba Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 16:55:03 -0500 Subject: [PATCH 14/20] update anthropic Signed-off-by: Aaron Choo --- internal/translator/anthropic_usage_test.go | 48 +++++++++---------- .../translator/openai_gcpanthropic_test.go | 10 ++-- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 4ff13ab56d..bfce108d37 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -135,10 +135,10 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { { name: "non-streaming response without cache", usage: anthropic.Usage{ - InputTokens: 150, - OutputTokens: 75, - CacheReadInputTokens: 0, - CachedCreationInputTokens: 0, + InputTokens: 150, + OutputTokens: 75, + CacheReadInputTokens: 0, + CacheCreationInputTokens: 0, }, expectedInputTokens: 150, expectedOutputTokens: 75, @@ -149,10 +149,10 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { { name: "non-streaming response with cache read", usage: anthropic.Usage{ - InputTokens: 100, - OutputTokens: 50, - CacheReadInputTokens: 25, - CachedCreationInputTokens: 0, + InputTokens: 100, + OutputTokens: 50, + CacheReadInputTokens: 25, + CacheCreationInputTokens: 0, }, expectedInputTokens: 125, // 100 + 0 + 25 expectedOutputTokens: 50, @@ -163,10 +163,10 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { { name: "non-streaming response with both cache types", usage: anthropic.Usage{ - InputTokens: 90, - OutputTokens: 60, - CacheReadInputTokens: 15, - CachedCreationInputTokens: 10, + InputTokens: 90, + OutputTokens: 60, + CacheReadInputTokens: 15, + CacheCreationInputTokens: 10, }, expectedInputTokens: 
115, // 90 + 10 + 15 expectedOutputTokens: 60, @@ -202,10 +202,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta event with final totals", usage: anthropic.MessageDeltaUsage{ - InputTokens: 250, - OutputTokens: 120, - CacheReadInputTokens: 30, - CachedCreationInputTokens: 0, + InputTokens: 250, + OutputTokens: 120, + CacheReadInputTokens: 30, + CacheCreationInputTokens: 0, }, expectedInputTokens: 280, // 250 + 0 + 30 expectedOutputTokens: 120, @@ -216,10 +216,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta event with only output tokens", usage: anthropic.MessageDeltaUsage{ - InputTokens: 0, - OutputTokens: 85, - CacheReadInputTokens: 0, - CachedCreationInputTokens: 0, + InputTokens: 0, + OutputTokens: 85, + CacheReadInputTokens: 0, + CacheCreationInputTokens: 0, }, expectedInputTokens: 0, expectedOutputTokens: 85, @@ -230,10 +230,10 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { { name: "message_delta with cache creation tokens", usage: anthropic.MessageDeltaUsage{ - InputTokens: 150, - OutputTokens: 75, - CacheReadInputTokens: 10, - CachedCreationInputTokens: 5, + InputTokens: 150, + OutputTokens: 75, + CacheReadInputTokens: 10, + CacheCreationInputTokens: 5, }, expectedInputTokens: 165, // 150 + 5 + 10 expectedOutputTokens: 75, diff --git a/internal/translator/openai_gcpanthropic_test.go b/internal/translator/openai_gcpanthropic_test.go index ea0a11d979..e48cd64242 100644 --- a/internal/translator/openai_gcpanthropic_test.go +++ b/internal/translator/openai_gcpanthropic_test.go @@ -598,11 +598,11 @@ func TestOpenAIToGCPAnthropicTranslatorV1ChatCompletion_ResponseBody(t *testing. 
require.NoError(t, err) expectedTokenUsage := tokenUsageFrom( - int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec - uint32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedCreationTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec ) require.Equal(t, expectedTokenUsage, usedToken) From bfd00542fd11eb48c8981dc4ec6e32fad942382a Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 17:30:02 -0500 Subject: [PATCH 15/20] fix some tests Signed-off-by: Aaron Choo --- internal/apischema/openai/openai.go | 2 +- internal/translator/anthropic_usage_test.go | 4 ++-- internal/translator/openai_awsbedrock.go | 22 ++++++++++--------- .../translator/openai_gcpvertexai_test.go | 12 +++++----- .../data-plane/vcr/prometheus_metrics_test.go | 8 +++++-- 5 files changed, 27 insertions(+), 21 deletions(-) diff --git a/internal/apischema/openai/openai.go b/internal/apischema/openai/openai.go index 02e8b9eae8..e5810218b4 100644 --- a/internal/apischema/openai/openai.go +++ b/internal/apischema/openai/openai.go @@ -2554,7 +2554,7 @@ type ResponseTokensDetails struct { CachedTokens int `json:"cached_tokens,omitempty"` //nolint:tagliatelle //follow openai api // CachedCreationTokens: number of tokens that were written to the cache. 
- CachedCreationTokens int64 `json:"cached_creation_input_tokens"` //nolint:tagliatelle + CachedCreationTokens int64 `json:"cache_creation_input_tokens"` //nolint:tagliatelle // ReasoningTokens: Number of reasoning tokens (for reasoning models). ReasoningTokens int `json:"reasoning_tokens,omitempty"` //nolint:tagliatelle //follow openai api diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index bfce108d37..aa84680b1b 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -183,7 +183,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // nolint:gosec assert.Equal(t, expected, result) }) } @@ -250,7 +250,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // nolint:gosec assert.Equal(t, expected, result) }) } diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index 8318fe88aa..531e9b3c03 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -853,16 +853,18 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) 
convertEvent(event *awsbe return chunk, false } chunk.Usage = &openai.Usage{ - TotalTokens: event.Usage.TotalTokens, - PromptTokens: event.Usage.InputTokens, - CompletionTokens: event.Usage.OutputTokens, - PromptTokensDetails: &openai.PromptTokensDetails{}, - } - if event.Usage.CacheReadInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens - } - if event.Usage.CacheWriteInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens + TotalTokens: event.Usage.TotalTokens, + PromptTokens: event.Usage.InputTokens, + CompletionTokens: event.Usage.OutputTokens, + } + if event.Usage.CacheReadInputTokens != nil || event.Usage.CacheWriteInputTokens != nil { + chunk.Usage.PromptTokensDetails = &openai.PromptTokensDetails{} + if event.Usage.CacheReadInputTokens != nil { + chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens + } + if event.Usage.CacheWriteInputTokens != nil { + chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens + } } // messageStart event. 
case awsbedrock.ConverseStreamEventTypeMessageStart.String(): diff --git a/internal/translator/openai_gcpvertexai_test.go b/internal/translator/openai_gcpvertexai_test.go index 93740102ce..88cf30c0e4 100644 --- a/internal/translator/openai_gcpvertexai_test.go +++ b/internal/translator/openai_gcpvertexai_test.go @@ -913,7 +913,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 25 } }`), - wantTokenUsage: tokenUsageFrom(10, 10, -1, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, 0, 15, 25), }, { name: "response with safety ratings", @@ -993,7 +993,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 20 } }`), - wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), + wantTokenUsage: tokenUsageFrom(8, 0, 0, 12, 20), }, { name: "empty response", @@ -1025,7 +1025,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, 0, 3, 8), }, { name: "response with model version field", @@ -1080,7 +1080,7 @@ data: [DONE] "total_tokens": 14 } }`), - wantTokenUsage: tokenUsageFrom(6, 0, -1, 8, 14), + wantTokenUsage: tokenUsageFrom(6, 0, 0, 8, 14), }, { @@ -1214,7 +1214,7 @@ data: [DONE] } }`), - wantTokenUsage: tokenUsageFrom(10, 10, -1, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, 0, 15, 25), }, { name: "stream chunks with thought summary", @@ -1236,7 +1236,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, 0, 3, 8), }, } diff --git a/tests/data-plane/vcr/prometheus_metrics_test.go b/tests/data-plane/vcr/prometheus_metrics_test.go index be17251fec..4951b4994d 100644 --- a/tests/data-plane/vcr/prometheus_metrics_test.go +++ b/tests/data-plane/vcr/prometheus_metrics_test.go @@ -106,8 +106,8 @@ func 
verifyPrometheusRequestDuration(t *testing.T, metric *dto.MetricFamily, exp func verifyPrometheusTokenUsage(t *testing.T, metric *dto.MetricFamily, expectedModel string) { t.Helper() require.NotNil(t, metric) - require.Len(t, metric.Metric, 3) - var inputMetric, cachedInputMetric, outputMetric *dto.Metric + require.Len(t, metric.Metric, 4) + var inputMetric, cachedInputMetric, cachedCreationInputMetric, outputMetric *dto.Metric for _, m := range metric.Metric { for _, label := range m.Label { if *label.Name == "gen_ai_token_type" { @@ -116,6 +116,8 @@ func verifyPrometheusTokenUsage(t *testing.T, metric *dto.MetricFamily, expected inputMetric = m case "cached_input": cachedInputMetric = m + case "cached_creation_input": + cachedCreationInputMetric = m case "output": outputMetric = m } @@ -125,6 +127,7 @@ func verifyPrometheusTokenUsage(t *testing.T, metric *dto.MetricFamily, expected } require.NotNil(t, inputMetric, "Input metric not found") require.NotNil(t, cachedInputMetric, "Cached Input metric not found") + require.NotNil(t, cachedCreationInputMetric, "Cached Creation Input metric not found") require.NotNil(t, outputMetric, "Output metric not found") type testCase struct { @@ -136,6 +139,7 @@ func verifyPrometheusTokenUsage(t *testing.T, metric *dto.MetricFamily, expected cases := []testCase{ {inputMetric, "input", 8}, {cachedInputMetric, "cached_input", 0}, + {cachedCreationInputMetric, "cached_creation_input", 0}, {outputMetric, "output", 377}, } From ac126dfb0a319b5d35d7d562ccc4f3d0dd92e6dd Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 18:06:08 -0500 Subject: [PATCH 16/20] fix more tests Signed-off-by: Aaron Choo --- internal/translator/anthropic_gcpanthropic_test.go | 8 ++++---- internal/translator/anthropic_usage_test.go | 6 +++--- internal/translator/openai_awsbedrock.go | 2 +- internal/translator/openai_gcpvertexai_test.go | 10 +++++----- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git 
a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index d2c87e4935..187ed4bbc5 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -642,13 +642,13 @@ data: {"type": "message_stop"} // Assertions assert.True(t, inputSet, "Input tokens should be set") - assert.Equal(t, uint32(20), inputTokens, "Input tokens should be preserved from message_start") + assert.Equal(t, uint32(21), inputTokens, "Input tokens should be preserved from message_start") assert.True(t, outputSet, "Output tokens should be set") assert.Equal(t, uint32(0), outputTokens, "Output tokens should come from message_delta") assert.True(t, totalSet, "Total tokens should be calculated") - assert.Equal(t, uint32(20), totalTokens, "Total tokens should be input + output") + assert.Equal(t, uint32(21), totalTokens, "Total tokens should be input + output") assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") @@ -675,13 +675,13 @@ data: {"type": "message_stop"} cachedCreationTokens, cachedCreationSet = tokenUsage.CachedCreationInputTokens() assert.True(t, inputSet, "Input tokens should be set") - assert.Equal(t, uint32(20), inputTokens, "Input tokens should be preserved from message_start") + assert.Equal(t, uint32(21), inputTokens, "Input tokens should be preserved from message_start") assert.True(t, outputSet, "Output tokens should be set") assert.Equal(t, uint32(5), outputTokens, "Output tokens should come from message_delta") assert.True(t, totalSet, "Total tokens should be calculated") - assert.Equal(t, uint32(25), totalTokens, "Total tokens should be input + output") + assert.Equal(t, uint32(26), totalTokens, "Total tokens should be input + output") assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") diff --git 
a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index aa84680b1b..86e7a50772 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -171,7 +171,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { expectedInputTokens: 115, // 90 + 10 + 15 expectedOutputTokens: 60, expectedTotalTokens: 175, // 115 + 60 - expectedCachedTokens: 25, // 15 + expectedCachedTokens: 15, // 15 expectedCachedCreationTokens: 10, // 10 }, } @@ -301,12 +301,12 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { cachedTokens, ok := result.CachedInputTokens() assert.True(t, ok) - assert.Equal(t, cacheReadTokens, cachedTokens, + assert.Equal(t, uint32(cacheReadTokens), cachedTokens, "CachedInputTokens should be cache_read_input_tokens") cachedCreationTokens, ok := result.CachedCreationInputTokens() assert.True(t, ok) - assert.Equal(t, cachedCreationTokens, cachedCreationTokens, + assert.Equal(t, uint32(cachedWriteTokens), cachedCreationTokens, "CachedCreationInputTokens should be cache_creation_input_tokens") // Total tokens should be input + output. 
diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index 531e9b3c03..fad5dbb2cd 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -708,7 +708,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string tokenUsage.SetCachedInputTokens(uint32(*usage.CacheReadInputTokens)) //nolint:gosec } if usage.CacheWriteInputTokens != nil { - tokenUsage.SetCachedCreationInputTokens(uint32(*usage.CacheWriteInputTokens)) + tokenUsage.SetCachedCreationInputTokens(uint32(*usage.CacheWriteInputTokens)) //nolint:gosec } } oaiEvent, ok := o.convertEvent(event) diff --git a/internal/translator/openai_gcpvertexai_test.go b/internal/translator/openai_gcpvertexai_test.go index 88cf30c0e4..279580d2fb 100644 --- a/internal/translator/openai_gcpvertexai_test.go +++ b/internal/translator/openai_gcpvertexai_test.go @@ -1025,7 +1025,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, 0, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), // Does not support cache creation. }, { name: "response with model version field", @@ -1080,7 +1080,7 @@ data: [DONE] "total_tokens": 14 } }`), - wantTokenUsage: tokenUsageFrom(6, 0, 0, 8, 14), + wantTokenUsage: tokenUsageFrom(6, 0, -1, 8, 14), // Does not support Cache Creation. }, { @@ -1149,7 +1149,7 @@ data: [DONE] "total_tokens": 20 } }`), - wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), + wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), // Does not support Cache Creation. }, { name: "response with thought summary", @@ -1214,7 +1214,7 @@ data: [DONE] } }`), - wantTokenUsage: tokenUsageFrom(10, 10, 0, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, -1, 15, 25), // Does not support Cache Creation. 
}, { name: "stream chunks with thought summary", @@ -1236,7 +1236,7 @@ data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_t data: [DONE] `), - wantTokenUsage: tokenUsageFrom(5, 0, 0, 3, 8), + wantTokenUsage: tokenUsageFrom(5, 0, -1, 3, 8), // Does not support Cache Creation. }, } From a1c4f48a2eb638022d9b4b82ed232ded82c0ee43 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 18:20:26 -0500 Subject: [PATCH 17/20] fixed Signed-off-by: Aaron Choo --- internal/translator/openai_awsbedrock.go | 14 +++++++------- internal/translator/openai_gcpvertexai.go | 3 +-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index fad5dbb2cd..6efaea9db7 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -752,7 +752,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string PromptTokens: bedrockResp.Usage.InputTokens, CompletionTokens: bedrockResp.Usage.OutputTokens, } - if openAIResp.Usage.PromptTokensDetails == nil { + if bedrockResp.Usage.CacheReadInputTokens != nil || bedrockResp.Usage.CacheWriteInputTokens != nil { openAIResp.Usage.PromptTokensDetails = &openai.PromptTokensDetails{} } if bedrockResp.Usage.CacheReadInputTokens != nil { @@ -859,12 +859,12 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) convertEvent(event *awsbe } if event.Usage.CacheReadInputTokens != nil || event.Usage.CacheWriteInputTokens != nil { chunk.Usage.PromptTokensDetails = &openai.PromptTokensDetails{} - if event.Usage.CacheReadInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens - } - if event.Usage.CacheWriteInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens - } + } + if event.Usage.CacheReadInputTokens != nil { + chunk.Usage.PromptTokensDetails.CachedTokens = 
*event.Usage.CacheReadInputTokens + } + if event.Usage.CacheWriteInputTokens != nil { + chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens } // messageStart event. case awsbedrock.ConverseStreamEventTypeMessageStart.String(): diff --git a/internal/translator/openai_gcpvertexai.go b/internal/translator/openai_gcpvertexai.go index 74e99c5741..7aca029b4f 100644 --- a/internal/translator/openai_gcpvertexai.go +++ b/internal/translator/openai_gcpvertexai.go @@ -170,8 +170,7 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) ResponseBody(_ map[strin tokenUsage.SetOutputTokens(uint32(gcpResp.UsageMetadata.CandidatesTokenCount)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(gcpResp.UsageMetadata.TotalTokenCount)) //nolint:gosec tokenUsage.SetCachedInputTokens(uint32(gcpResp.UsageMetadata.CachedContentTokenCount)) //nolint:gosec - // Gemini does not return cache creation input tokens, set to 0. - tokenUsage.SetCachedCreationInputTokens(0) + // Gemini does not return cache creation input tokens; Skipping setCachedCreationInputTokens. 
} if span != nil { From 337bd11f957e8de684f2736e2fc01938b728efce Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 18:33:06 -0500 Subject: [PATCH 18/20] negative Signed-off-by: Aaron Choo --- internal/translator/openai_gcpvertexai_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/translator/openai_gcpvertexai_test.go b/internal/translator/openai_gcpvertexai_test.go index 279580d2fb..ef9051bb7c 100644 --- a/internal/translator/openai_gcpvertexai_test.go +++ b/internal/translator/openai_gcpvertexai_test.go @@ -913,7 +913,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 25 } }`), - wantTokenUsage: tokenUsageFrom(10, 10, 0, 15, 25), + wantTokenUsage: tokenUsageFrom(10, 10, -1, 15, 25), }, { name: "response with safety ratings", @@ -993,7 +993,7 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T "total_tokens": 20 } }`), - wantTokenUsage: tokenUsageFrom(8, 0, 0, 12, 20), + wantTokenUsage: tokenUsageFrom(8, 0, -1, 12, 20), }, { name: "empty response", From da31c228f967d99a3987f4f6436fafe72323d7d8 Mon Sep 17 00:00:00 2001 From: Aaron Choo Date: Fri, 2 Jan 2026 19:16:13 -0500 Subject: [PATCH 19/20] updated cached creation -> cache creation Signed-off-by: Aaron Choo --- api/v1alpha1/ai_gateway_route.go | 4 +- api/v1alpha1/shared_types.go | 12 +- examples/token_ratelimit/token_ratelimit.yaml | 4 +- internal/apischema/openai/openai.go | 8 +- internal/apischema/openai/openai_test.go | 38 +-- internal/controller/gateway.go | 4 +- internal/controller/gateway_test.go | 4 +- internal/extproc/mocks_test.go | 28 +- internal/extproc/processor_impl.go | 8 +- internal/extproc/processor_impl_test.go | 10 +- internal/filterapi/filterconfig.go | 4 +- internal/llmcostcel/cel.go | 32 +-- internal/llmcostcel/cel_test.go | 2 +- internal/metrics/genai.go | 6 +- internal/metrics/metrics.go | 50 ++-- internal/metrics/metrics_impl.go | 6 +- 
internal/metrics/metrics_impl_test.go | 24 +- .../openinference/anthropic/messages.go | 2 +- .../openinference/openai/response_attrs.go | 6 +- .../openinference/openai/responses_test.go | 4 +- .../tracing/openinference/openinference.go | 2 +- internal/translator/anthropic_anthropic.go | 4 +- .../translator/anthropic_anthropic_test.go | 4 +- .../translator/anthropic_gcpanthropic_test.go | 18 +- internal/translator/anthropic_usage_test.go | 250 +++++++++--------- internal/translator/openai_awsbedrock.go | 8 +- internal/translator/openai_awsbedrock_test.go | 6 +- internal/translator/openai_completions.go | 4 +- internal/translator/openai_gcpanthropic.go | 6 +- .../translator/openai_gcpanthropic_stream.go | 14 +- .../translator/openai_gcpanthropic_test.go | 10 +- internal/translator/openai_gcpvertexai.go | 2 +- internal/translator/openai_openai.go | 4 +- internal/translator/openai_responses.go | 12 +- internal/translator/openai_responses_test.go | 24 +- ...gateway.envoyproxy.io_aigatewayroutes.yaml | 12 +- site/docs/api/api.mdx | 10 +- .../testdata/aigatewayroutes/llmcosts.yaml | 2 +- tests/data-plane/testupstream_test.go | 6 +- .../data-plane/vcr/prometheus_metrics_test.go | 10 +- 40 files changed, 332 insertions(+), 332 deletions(-) diff --git a/api/v1alpha1/ai_gateway_route.go b/api/v1alpha1/ai_gateway_route.go index f99a66bfc4..2e8a109090 100644 --- a/api/v1alpha1/ai_gateway_route.go +++ b/api/v1alpha1/ai_gateway_route.go @@ -108,8 +108,8 @@ type AIGatewayRouteSpec struct { // type: TotalToken // - metadataKey: llm_cached_input_token // type: CachedInputToken - // - metadataKey: llm_cached_creation_input_token - // type: CachedCreationInputToken + // - metadataKey: llm_cache_creation_input_token + // type: CacheCreationInputToken // ``` // Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three // rate limit buckets for each unique x-user-id header value. 
One bucket is for the input token, diff --git a/api/v1alpha1/shared_types.go b/api/v1alpha1/shared_types.go index ed4e2c4dc3..09fad0431a 100644 --- a/api/v1alpha1/shared_types.go +++ b/api/v1alpha1/shared_types.go @@ -100,9 +100,9 @@ type LLMRequestCost struct { MetadataKey string `json:"metadataKey"` // Type specifies the type of the request cost. The default is "OutputToken", // and it uses "output token" as the cost. The other types are "InputToken", "TotalToken", - // "CachedInputToken", "CachedCreationInputToken", and "CEL". + // "CachedInputToken", "CacheCreationInputToken", and "CEL". // - // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;CachedCreationInputToken;TotalToken;CEL + // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;CacheCreationInputToken;TotalToken;CEL Type LLMRequestCostType `json:"type"` // CEL is the CEL expression to calculate the cost of the request. // The CEL expression must return a signed or unsigned integer. If the @@ -114,7 +114,7 @@ type LLMRequestCost struct { // * backend: the backend name in the form of "name.namespace". Type: string. // * input_tokens: the number of input tokens. Type: unsigned integer. // * cached_input_tokens: the number of cached read input tokens. Type: unsigned integer. - // * cached_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer. + // * cache_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer. // * output_tokens: the number of output tokens. Type: unsigned integer. // * total_tokens: the total number of tokens. Type: unsigned integer. // @@ -122,7 +122,7 @@ type LLMRequestCost struct { // // * "model == 'llama' ? input_tokens + output_token * 0.5 : total_tokens" // * "backend == 'foo.default' ? input_tokens + output_tokens : total_tokens" - // * "backend == 'bar.default' ? 
(input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cached_creation_input_tokens * 1.25 + output_tokens : total_tokens" + // * "backend == 'bar.default' ? (input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + cache_creation_input_tokens * 1.25 + output_tokens : total_tokens" + // * "input_tokens + output_tokens + total_tokens" + // * "input_tokens * output_tokens" + // @@ -138,8 +138,8 @@ const ( LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" // LLMRequestCostTypeCachedInputToken is the cost type of the cached input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" - // LLMRequestCostTypeCachedCreationInputToken is the cost type of the cached input token. - LLMRequestCostTypeCachedCreationInputToken LLMRequestCostType = "CachedCreationInputToken" + // LLMRequestCostTypeCacheCreationInputToken is the cost type of the cache creation input token. + LLMRequestCostTypeCacheCreationInputToken LLMRequestCostType = "CacheCreationInputToken" // LLMRequestCostTypeOutputToken is the cost type of the output token. LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken" // LLMRequestCostTypeTotalToken is the cost type of the total token. 
diff --git a/examples/token_ratelimit/token_ratelimit.yaml b/examples/token_ratelimit/token_ratelimit.yaml index 42b35a9a18..3743b0dacd 100644 --- a/examples/token_ratelimit/token_ratelimit.yaml +++ b/examples/token_ratelimit/token_ratelimit.yaml @@ -51,8 +51,8 @@ spec: type: InputToken - metadataKey: llm_cached_input_token type: CachedInputToken - - metadataKey: llm_cached_creation_input_token - type: CachedCreationInputToken + - metadataKey: llm_cache_creation_input_token + type: CacheCreationInputToken - metadataKey: llm_output_token type: OutputToken - metadataKey: llm_total_token diff --git a/internal/apischema/openai/openai.go b/internal/apischema/openai/openai.go index e5810218b4..56277e3563 100644 --- a/internal/apischema/openai/openai.go +++ b/internal/apischema/openai/openai.go @@ -1383,7 +1383,7 @@ type PromptTokensDetails struct { // Cached tokens present in the prompt. CachedTokens int `json:"cached_tokens,omitzero"` // Tokens written to the cache. - CachedCreationTokens int `json:"cached_creation_input_tokens,omitzero"` + CacheCreationTokens int `json:"cache_creation_input_tokens,omitzero"` } // ChatCompletionResponseChunk is described in the OpenAI API documentation: @@ -2539,7 +2539,7 @@ type ResponseUsageInputTokensDetails struct { CachedTokens int64 `json:"cached_tokens"` // The number of tokens that were written to the cache. - CachedCreationTokens int64 `json:"cached_creation_input_tokens"` + CacheCreationTokens int64 `json:"cache_creation_input_tokens"` } // A detailed breakdown of the output tokens. @@ -2553,8 +2553,8 @@ type ResponseTokensDetails struct { // CachedTokens: Number of cached tokens. CachedTokens int `json:"cached_tokens,omitempty"` //nolint:tagliatelle //follow openai api - // CachedCreationTokens: number of tokens that were written to the cache. - CachedCreationTokens int64 `json:"cache_creation_input_tokens"` //nolint:tagliatelle + // CacheCreationTokens: number of tokens that were written to the cache. 
+ CacheCreationTokens int64 `json:"cache_creation_input_tokens"` //nolint:tagliatelle // ReasoningTokens: Number of reasoning tokens (for reasoning models). ReasoningTokens int `json:"reasoning_tokens,omitempty"` //nolint:tagliatelle //follow openai api diff --git a/internal/apischema/openai/openai_test.go b/internal/apischema/openai/openai_test.go index 21f1fab649..44a1f2aba5 100644 --- a/internal/apischema/openai/openai_test.go +++ b/internal/apischema/openai/openai_test.go @@ -1742,30 +1742,30 @@ func TestPromptTokensDetails(t *testing.T) { { name: "with text tokens", details: PromptTokensDetails{ - TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, - CachedCreationTokens: 10, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CacheCreationTokens: 10, }, expected: `{ "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cached_creation_input_tokens": 10 + "cache_creation_input_tokens": 10 }`, }, { name: "with zero text tokens omitted", details: PromptTokensDetails{ - TextTokens: 0, - AudioTokens: 8, - CachedTokens: 384, - CachedCreationTokens: 10, + TextTokens: 0, + AudioTokens: 8, + CachedTokens: 384, + CacheCreationTokens: 10, }, expected: `{ "audio_tokens": 8, "cached_tokens": 384, - "cached_creation_input_tokens": 10 + "cache_creation_input_tokens": 10 }`, }, } @@ -1822,9 +1822,9 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - AudioTokens: 8, - CachedTokens: 384, - CachedCreationTokens: 13, + AudioTokens: 8, + CachedTokens: 384, + CacheCreationTokens: 13, }, }, expected: `{ @@ -1838,7 +1838,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "prompt_tokens_details": { "audio_tokens": 8, "cached_tokens": 384, - "cached_creation_input_tokens": 13 + "cache_creation_input_tokens": 13 } }`, }, @@ -1856,10 +1856,10 @@ func TestChatCompletionResponseUsage(t *testing.T) { RejectedPredictionTokens: 0, }, PromptTokensDetails: &PromptTokensDetails{ - 
TextTokens: 15, - AudioTokens: 8, - CachedTokens: 384, - CachedCreationTokens: 21, + TextTokens: 15, + AudioTokens: 8, + CachedTokens: 384, + CacheCreationTokens: 21, }, }, expected: `{ @@ -1875,7 +1875,7 @@ func TestChatCompletionResponseUsage(t *testing.T) { "text_tokens": 15, "audio_tokens": 8, "cached_tokens": 384, - "cached_creation_input_tokens": 21 + "cache_creation_input_tokens": 21 } }`, }, diff --git a/internal/controller/gateway.go b/internal/controller/gateway.go index b2e577114f..7469dc711e 100644 --- a/internal/controller/gateway.go +++ b/internal/controller/gateway.go @@ -404,8 +404,8 @@ func (c *GatewayController) reconcileFilterConfigSecret( fc.Type = filterapi.LLMRequestCostTypeInputToken case aigv1a1.LLMRequestCostTypeCachedInputToken: fc.Type = filterapi.LLMRequestCostTypeCachedInputToken - case aigv1a1.LLMRequestCostTypeCachedCreationInputToken: - fc.Type = filterapi.LLMRequestCostTypeCachedCreationInputToken + case aigv1a1.LLMRequestCostTypeCacheCreationInputToken: + fc.Type = filterapi.LLMRequestCostTypeCacheCreationInputToken case aigv1a1.LLMRequestCostTypeOutputToken: fc.Type = filterapi.LLMRequestCostTypeOutputToken case aigv1a1.LLMRequestCostTypeTotalToken: diff --git a/internal/controller/gateway_test.go b/internal/controller/gateway_test.go index fb26991883..e2f98b88c6 100644 --- a/internal/controller/gateway_test.go +++ b/internal/controller/gateway_test.go @@ -197,7 +197,7 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { {MetadataKey: "bar", Type: aigv1a1.LLMRequestCostTypeOutputToken}, {MetadataKey: "baz", Type: aigv1a1.LLMRequestCostTypeTotalToken}, {MetadataKey: "qux", Type: aigv1a1.LLMRequestCostTypeCachedInputToken}, - {MetadataKey: "zoo", Type: aigv1a1.LLMRequestCostTypeCachedCreationInputToken}, + {MetadataKey: "zoo", Type: aigv1a1.LLMRequestCostTypeCacheCreationInputToken}, }, }, }, @@ -280,7 +280,7 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) { require.Equal(t, 
filterapi.LLMRequestCostTypeOutputToken, fc.LLMRequestCosts[1].Type) require.Equal(t, filterapi.LLMRequestCostTypeTotalToken, fc.LLMRequestCosts[2].Type) require.Equal(t, filterapi.LLMRequestCostTypeCachedInputToken, fc.LLMRequestCosts[3].Type) - require.Equal(t, filterapi.LLMRequestCostTypeCachedCreationInputToken, fc.LLMRequestCosts[4].Type) + require.Equal(t, filterapi.LLMRequestCostTypeCacheCreationInputToken, fc.LLMRequestCosts[4].Type) require.Equal(t, filterapi.LLMRequestCostTypeCEL, fc.LLMRequestCosts[5].Type) require.Equal(t, `backend == 'foo.default' ? input_tokens + output_tokens : total_tokens`, fc.LLMRequestCosts[5].CEL) require.Len(t, fc.Models, 1) diff --git a/internal/extproc/mocks_test.go b/internal/extproc/mocks_test.go index 6d5087eb9d..fd375c5192 100644 --- a/internal/extproc/mocks_test.go +++ b/internal/extproc/mocks_test.go @@ -171,17 +171,17 @@ func (m *mockMetricsFactory) NewMetrics() metrics.Metrics { // mockMetrics implements [metrics.Metrics] for testing. type mockMetrics struct { - requestStart time.Time - originalModel string - requestModel string - responseModel string - backend string - requestSuccessCount int - requestErrorCount int - inputTokenCount int - cachedInputTokenCount int - cachedCreationInputTokenCount int - outputTokenCount int + requestStart time.Time + originalModel string + requestModel string + responseModel string + backend string + requestSuccessCount int + requestErrorCount int + inputTokenCount int + cachedInputTokenCount int + cacheCreationInputTokenCount int + outputTokenCount int // streamingOutputTokens tracks the cumulative output tokens recorded via RecordTokenLatency. 
streamingOutputTokens int timeToFirstToken float64 @@ -219,8 +219,8 @@ func (m *mockMetrics) RecordTokenUsage(_ context.Context, usage metrics.TokenUsa if cachedInput, ok := usage.CachedInputTokens(); ok { m.cachedInputTokenCount += int(cachedInput) } - if cachedCreationInput, ok := usage.CachedCreationInputTokens(); ok { - m.cachedCreationInputTokenCount += int(cachedCreationInput) + if cacheCreationInput, ok := usage.CacheCreationInputTokens(); ok { + m.cacheCreationInputTokenCount += int(cacheCreationInput) } if output, ok := usage.OutputTokens(); ok { m.outputTokenCount += int(output) @@ -285,7 +285,7 @@ func (m *mockMetrics) RequireRequestFailure(t *testing.T) { func (m *mockMetrics) RequireTokensRecorded(t *testing.T, expectedInput, expectedCachedInput, expectedWriteCachedInput, expectedOutput int) { require.Equal(t, expectedInput, m.inputTokenCount) require.Equal(t, expectedCachedInput, m.cachedInputTokenCount) - require.Equal(t, expectedWriteCachedInput, m.cachedCreationInputTokenCount) + require.Equal(t, expectedWriteCachedInput, m.cacheCreationInputTokenCount) require.Equal(t, expectedOutput, m.outputTokenCount) } diff --git a/internal/extproc/processor_impl.go b/internal/extproc/processor_impl.go index 729268644c..c7cf651fb9 100644 --- a/internal/extproc/processor_impl.go +++ b/internal/extproc/processor_impl.go @@ -533,8 +533,8 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU cost, _ = costs.InputTokens() case filterapi.LLMRequestCostTypeCachedInputToken: cost, _ = costs.CachedInputTokens() - case filterapi.LLMRequestCostTypeCachedCreationInputToken: - cost, _ = costs.CachedCreationInputTokens() + case filterapi.LLMRequestCostTypeCacheCreationInputToken: + cost, _ = costs.CacheCreationInputTokens() case filterapi.LLMRequestCostTypeOutputToken: cost, _ = costs.OutputTokens() case filterapi.LLMRequestCostTypeTotalToken: @@ -542,7 +542,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU 
case filterapi.LLMRequestCostTypeCEL: in, _ := costs.InputTokens() cachedIn, _ := costs.CachedInputTokens() - cachedCreation, _ := costs.CachedCreationInputTokens() + cacheCreation, _ := costs.CacheCreationInputTokens() out, _ := costs.OutputTokens() total, _ := costs.TotalTokens() costU64, err := llmcostcel.EvaluateProgram( @@ -551,7 +551,7 @@ func buildDynamicMetadata(config *filterapi.RuntimeConfig, costs *metrics.TokenU backendName, in, cachedIn, - cachedCreation, + cacheCreation, out, total, ) diff --git a/internal/extproc/processor_impl_test.go b/internal/extproc/processor_impl_test.go index 60f612fa17..26855743ff 100644 --- a/internal/extproc/processor_impl_test.go +++ b/internal/extproc/processor_impl_test.go @@ -259,7 +259,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.retUsedToken.SetOutputTokens(123) mt.retUsedToken.SetInputTokens(1) mt.retUsedToken.SetCachedInputTokens(1) - mt.retUsedToken.SetCachedCreationInputTokens(3) + mt.retUsedToken.SetCacheCreationInputTokens(3) celProgInt, err := llmcostcel.NewProgram("54321") require.NoError(t, err) @@ -275,7 +275,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeOutputToken, MetadataKey: "output_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeInputToken, MetadataKey: "input_token_usage"}}, {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedInputToken, MetadataKey: "cached_input_token_usage"}}, - {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedCreationInputToken, MetadataKey: "cached_creation_input_token_usage"}}, + {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCacheCreationInputToken, MetadataKey: "cache_creation_input_token_usage"}}, { CELProg: celProgInt, LLMRequestCost: &filterapi.LLMRequestCost{Type: 
filterapi.LLMRequestCostTypeCEL, MetadataKey: "cel_int"}, @@ -312,7 +312,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, float64(1), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cached_input_token_usage"].GetNumberValue()) require.Equal(t, float64(3), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. - GetStructValue().Fields["cached_creation_input_token_usage"].GetNumberValue()) + GetStructValue().Fields["cache_creation_input_token_usage"].GetNumberValue()) require.Equal(t, float64(54321), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. GetStructValue().Fields["cel_int"].GetNumberValue()) require.Equal(t, float64(9999), md.Fields[internalapi.AIGatewayFilterMetadataNamespace]. @@ -375,7 +375,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T mt.expResponseBody = final mt.retUsedToken.SetInputTokens(5) mt.retUsedToken.SetCachedInputTokens(3) - mt.retUsedToken.SetCachedCreationInputTokens(21) + mt.retUsedToken.SetCacheCreationInputTokens(21) mt.retUsedToken.SetOutputTokens(138) mt.retUsedToken.SetTotalTokens(143) _, err = p.ProcessResponseBody(t.Context(), final) @@ -385,7 +385,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T require.Equal(t, 138, mm.outputTokenCount) require.Equal(t, 138, mm.streamingOutputTokens) // accumulated output tokens from stream require.Equal(t, 3, mm.cachedInputTokenCount) - require.Equal(t, 21, mm.cachedCreationInputTokenCount) + require.Equal(t, 21, mm.cacheCreationInputTokenCount) }) } diff --git a/internal/filterapi/filterconfig.go b/internal/filterapi/filterconfig.go index b2f10de51f..947cce5f4f 100644 --- a/internal/filterapi/filterconfig.go +++ b/internal/filterapi/filterconfig.go @@ -81,8 +81,8 @@ const ( LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken" // LLMRequestCostTypeCachedInputToken specifies that the request cost is calculated from 
the cached read input token. LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken" - // LLMRequestCostTypeCachedCreationInputToken specifies that the request cost is calculated from the cache creation input token. - LLMRequestCostTypeCachedCreationInputToken LLMRequestCostType = "CachedCreationInputToken" + // LLMRequestCostTypeCacheCreationInputToken specifies that the request cost is calculated from the cache creation input token. + LLMRequestCostTypeCacheCreationInputToken LLMRequestCostType = "CacheCreationInputToken" // LLMRequestCostTypeTotalToken specifies that the request cost is calculated from the total token. LLMRequestCostTypeTotalToken LLMRequestCostType = "TotalToken" // LLMRequestCostTypeCEL specifies that the request cost is calculated from the CEL expression. diff --git a/internal/llmcostcel/cel.go b/internal/llmcostcel/cel.go index 46d06c7130..c2ad384268 100644 --- a/internal/llmcostcel/cel.go +++ b/internal/llmcostcel/cel.go @@ -16,13 +16,13 @@ import ( ) const ( - celModelNameKey = "model" - celBackendKey = "backend" - celInputTokensKey = "input_tokens" - celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 - celCachedCreationInputTokensKey = "cached_creation_input_tokens" // #nosec G101 - celOutputTokensKey = "output_tokens" - celTotalTokensKey = "total_tokens" + celModelNameKey = "model" + celBackendKey = "backend" + celInputTokensKey = "input_tokens" + celCachedInputTokensKey = "cached_input_tokens" // #nosec G101 + celCacheCreationInputTokensKey = "cache_creation_input_tokens" // #nosec G101 + celOutputTokensKey = "output_tokens" + celTotalTokensKey = "total_tokens" ) var env *cel.Env @@ -34,7 +34,7 @@ func init() { cel.Variable(celBackendKey, cel.StringType), cel.Variable(celInputTokensKey, cel.UintType), cel.Variable(celCachedInputTokensKey, cel.UintType), - cel.Variable(celCachedCreationInputTokensKey, cel.UintType), + cel.Variable(celCacheCreationInputTokensKey, cel.UintType), 
cel.Variable(celOutputTokensKey, cel.UintType), cel.Variable(celTotalTokensKey, cel.UintType), ) @@ -64,15 +64,15 @@ func NewProgram(expr string) (prog cel.Program, err error) { } // EvaluateProgram evaluates the given CEL program with the given variables. -func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, cachedCreationInputTokens, outputTokens, totalTokens uint32) (uint64, error) { +func EvaluateProgram(prog cel.Program, modelName, backend string, inputTokens, cachedInputTokens, cacheCreationInputTokens, outputTokens, totalTokens uint32) (uint64, error) { out, _, err := prog.Eval(map[string]any{ - celModelNameKey: modelName, - celBackendKey: backend, - celInputTokensKey: inputTokens, - celCachedInputTokensKey: cachedInputTokens, - celCachedCreationInputTokensKey: cachedCreationInputTokens, - celOutputTokensKey: outputTokens, - celTotalTokensKey: totalTokens, + celModelNameKey: modelName, + celBackendKey: backend, + celInputTokensKey: inputTokens, + celCachedInputTokensKey: cachedInputTokens, + celCacheCreationInputTokensKey: cacheCreationInputTokens, + celOutputTokensKey: outputTokens, + celTotalTokensKey: totalTokens, }) if err != nil || out == nil { return 0, fmt.Errorf("failed to evaluate CEL expression: %w", err) diff --git a/internal/llmcostcel/cel_test.go b/internal/llmcostcel/cel_test.go index 92a323fc63..cee9a259a5 100644 --- a/internal/llmcostcel/cel_test.go +++ b/internal/llmcostcel/cel_test.go @@ -26,7 +26,7 @@ func TestNewProgram(t *testing.T) { require.NoError(t, err) }) t.Run("variables", func(t *testing.T) { - prog, err := NewProgram("model == 'cool_model' ? (input_tokens - cached_input_tokens - cached_creation_input_tokens) * output_tokens : total_tokens") + prog, err := NewProgram("model == 'cool_model' ? 
(input_tokens - cached_input_tokens - cache_creation_input_tokens) * output_tokens : total_tokens") require.NoError(t, err) v, err := EvaluateProgram(prog, "cool_model", "cool_backend", 200, 100, 1, 2, 3) require.NoError(t, err) diff --git a/internal/metrics/genai.go b/internal/metrics/genai.go index f739bf8764..de560f77bb 100644 --- a/internal/metrics/genai.go +++ b/internal/metrics/genai.go @@ -39,9 +39,9 @@ const ( // https://github.com/open-telemetry/semantic-conventions/issues/1959 // // However, the spec says "a custom value MAY be used.", so we can use it now. - genaiTokenTypeCachedInput = "cached_input" - genaiTokenTypeCachedCreationInput = "cached_creation_input" - genaiErrorTypeFallback = "_OTHER" + genaiTokenTypeCachedInput = "cached_input" + genaiTokenTypeCacheCreationInput = "cache_creation_input" + genaiErrorTypeFallback = "_OTHER" ) // GenAIOperation represents the type of generative AI operation i.e. the endpoint being called. diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index a68d810dac..e9929df812 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -149,10 +149,10 @@ type TokenUsage struct { totalTokens uint32 // CachedInputTokens is the total number of tokens read from cache. cachedInputTokens uint32 - // CachedCreationInputTokens is the total number of tokens written to cache. - cachedCreationInputTokens uint32 + // CacheCreationInputTokens is the total number of tokens written to cache. + cacheCreationInputTokens uint32 - inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet, cachedCreationInputTokenSet bool + inputTokenSet, outputTokenSet, totalTokenSet, cachedInputTokenSet, cacheCreationInputTokenSet bool } // InputTokens returns the number of input tokens and whether it was set. 
@@ -175,9 +175,9 @@ func (u *TokenUsage) CachedInputTokens() (uint32, bool) { return u.cachedInputTokens, u.cachedInputTokenSet } -// CachedCreationInputTokens returns the number of cache creation input tokens and whether it was set. -func (u *TokenUsage) CachedCreationInputTokens() (uint32, bool) { - return u.cachedCreationInputTokens, u.cachedCreationInputTokenSet +// CacheCreationInputTokens returns the number of cache creation input tokens and whether it was set. +func (u *TokenUsage) CacheCreationInputTokens() (uint32, bool) { + return u.cacheCreationInputTokens, u.cacheCreationInputTokenSet } // SetInputTokens sets the number of input tokens and marks the field as set. @@ -204,10 +204,10 @@ func (u *TokenUsage) SetCachedInputTokens(tokens uint32) { u.cachedInputTokenSet = true } -// SetCachedCreationInputTokens sets the number of cache creation input tokens and marks the field as set. -func (u *TokenUsage) SetCachedCreationInputTokens(tokens uint32) { - u.cachedCreationInputTokens = tokens - u.cachedCreationInputTokenSet = true +// SetCacheCreationInputTokens sets the number of cache creation input tokens and marks the field as set. +func (u *TokenUsage) SetCacheCreationInputTokens(tokens uint32) { + u.cacheCreationInputTokens = tokens + u.cacheCreationInputTokenSet = true } // AddInputTokens increments the recorded input tokens and marks the field as set. @@ -228,10 +228,10 @@ func (u *TokenUsage) AddCachedInputTokens(tokens uint32) { u.cachedInputTokens += tokens } -// AddCachedCreationInputTokens increments the recorded cache creation input tokens and marks the field as set. -func (u *TokenUsage) AddCachedCreationInputTokens(tokens uint32) { - u.cachedCreationInputTokenSet = true - u.cachedCreationInputTokens += tokens +// AddCacheCreationInputTokens increments the recorded cache creation input tokens and marks the field as set. 
+func (u *TokenUsage) AddCacheCreationInputTokens(tokens uint32) { + u.cacheCreationInputTokenSet = true + u.cacheCreationInputTokens += tokens } // Override updates the TokenUsage fields with values from another TokenUsage instance. @@ -253,27 +253,27 @@ func (u *TokenUsage) Override(other TokenUsage) { u.cachedInputTokens = other.cachedInputTokens u.cachedInputTokenSet = true } - if other.cachedCreationInputTokenSet { - u.cachedCreationInputTokens = other.cachedCreationInputTokens - u.cachedCreationInputTokenSet = true + if other.cacheCreationInputTokenSet { + u.cacheCreationInputTokens = other.cacheCreationInputTokens + u.cacheCreationInputTokenSet = true } } // ExtractTokenUsageFromAnthropic extracts the correct token usage from Anthropic API response. // According to Claude API documentation, total input tokens is the summation of: -// input_tokens + cached_creation_input_tokens + cache_read_input_tokens +// input_tokens + cache_creation_input_tokens + cache_read_input_tokens // // This function works for both streaming and non-streaming responses by accepting // the common usage fields that exist in all Anthropic usage structures. 
-func ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cachedCreationTokens int64) TokenUsage { +func ExtractTokenUsageFromAnthropic(inputTokens, outputTokens, cacheReadTokens, cacheCreationTokens int64) TokenUsage { // Calculate total input tokens as per Anthropic API documentation - totalInputTokens := inputTokens + cachedCreationTokens + cacheReadTokens + totalInputTokens := inputTokens + cacheCreationTokens + cacheReadTokens var usage TokenUsage - usage.SetInputTokens(uint32(totalInputTokens)) //nolint:gosec - usage.SetOutputTokens(uint32(outputTokens)) //nolint:gosec - usage.SetTotalTokens(uint32(totalInputTokens + outputTokens)) //nolint:gosec - usage.SetCachedInputTokens(uint32(cacheReadTokens)) //nolint:gosec - usage.SetCachedCreationInputTokens(uint32(cachedCreationTokens)) //nolint:gosec + usage.SetInputTokens(uint32(totalInputTokens)) //nolint:gosec + usage.SetOutputTokens(uint32(outputTokens)) //nolint:gosec + usage.SetTotalTokens(uint32(totalInputTokens + outputTokens)) //nolint:gosec + usage.SetCachedInputTokens(uint32(cacheReadTokens)) //nolint:gosec + usage.SetCacheCreationInputTokens(uint32(cacheCreationTokens)) //nolint:gosec return usage } diff --git a/internal/metrics/metrics_impl.go b/internal/metrics/metrics_impl.go index 8f13a50104..cbf8748467 100644 --- a/internal/metrics/metrics_impl.go +++ b/internal/metrics/metrics_impl.go @@ -148,10 +148,10 @@ func (b *metricsImpl) RecordTokenUsage(ctx context.Context, usage TokenUsage, re metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput)), ) } - if cachedCreationInputTokens, ok := usage.CachedCreationInputTokens(); ok { - b.metrics.tokenUsage.Record(ctx, float64(cachedCreationInputTokens), + if cacheCreationInputTokens, ok := usage.CacheCreationInputTokens(); ok { + b.metrics.tokenUsage.Record(ctx, float64(cacheCreationInputTokens), metric.WithAttributeSet(attrs), - 
metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput)), + metric.WithAttributes(attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCacheCreationInput)), ) } if outputTokens, ok := usage.OutputTokens(); ok { diff --git a/internal/metrics/metrics_impl_test.go b/internal/metrics/metrics_impl_test.go index bcf6bc8282..720fa5899d 100644 --- a/internal/metrics/metrics_impl_test.go +++ b/internal/metrics/metrics_impl_test.go @@ -71,10 +71,10 @@ func TestRecordTokenUsage(t *testing.T) { attribute.Key(genaiAttributeResponseModel).String("test-model"), } // gen_ai.token.type values - https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-metrics/#common-attributes - inputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...) - outputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...) - cachedInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...) - cachedCreationInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput))...) + inputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeInput))...) + outputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeOutput))...) + cachedInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedInput))...) + cacheCreationInputAttrs = attribute.NewSet(append(attrs, attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCacheCreationInput))...) 
) pm.SetOriginalModel("test-model") @@ -82,8 +82,8 @@ func TestRecordTokenUsage(t *testing.T) { pm.SetResponseModel("test-model") pm.SetBackend(&filterapi.Backend{Schema: filterapi.VersionedAPISchema{Name: filterapi.APISchemaOpenAI}}) pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 10, cachedInputTokens: 8, cachedCreationInputTokens: 2, outputTokens: 5, - inputTokenSet: true, cachedInputTokenSet: true, cachedCreationInputTokenSet: true, outputTokenSet: true, + inputTokens: 10, cachedInputTokens: 8, cacheCreationInputTokens: 2, outputTokens: 5, + inputTokenSet: true, cachedInputTokenSet: true, cacheCreationInputTokenSet: true, outputTokenSet: true, }, nil) count, sum := testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, inputAttrs) @@ -94,7 +94,7 @@ func TestRecordTokenUsage(t *testing.T) { assert.Equal(t, uint64(1), count) assert.Equal(t, 8.0, sum) - count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedCreationInputAttrs) + count, sum = testotel.GetHistogramValues(t, mr, genaiMetricClientTokenUsage, cacheCreationInputAttrs) assert.Equal(t, uint64(1), count) assert.Equal(t, 2.0, sum) @@ -300,8 +300,8 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { pm.SetRequestModel("req-model") pm.SetResponseModel("res-model") pm.RecordTokenUsage(t.Context(), TokenUsage{ - inputTokens: 2, cachedInputTokens: 1, cachedCreationInputTokens: 6, outputTokens: 3, - inputTokenSet: true, cachedInputTokenSet: true, cachedCreationInputTokenSet: true, outputTokenSet: true, + inputTokens: 2, cachedInputTokens: 1, cacheCreationInputTokens: 6, outputTokens: 3, + inputTokenSet: true, cachedInputTokenSet: true, cacheCreationInputTokenSet: true, outputTokenSet: true, }, nil) inputAttrs := attribute.NewSet( @@ -328,15 +328,15 @@ func TestLabels_SetModel_RequestAndResponseDiffer(t *testing.T) { assert.Equal(t, uint64(1), count) assert.Equal(t, 1.0, sum) - cachedCreationInputAttrs := attribute.NewSet( + cacheCreationInputAttrs 
:= attribute.NewSet( attribute.Key(genaiAttributeOperationName).String(string(GenAIOperationCompletion)), attribute.Key(genaiAttributeProviderName).String(genaiProviderOpenAI), attribute.Key(genaiAttributeOriginalModel).String("orig-model"), attribute.Key(genaiAttributeRequestModel).String("req-model"), attribute.Key(genaiAttributeResponseModel).String("res-model"), - attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCachedCreationInput), + attribute.Key(genaiAttributeTokenType).String(genaiTokenTypeCacheCreationInput), ) - count, sum = getHistogramValues(t, mr, genaiMetricClientTokenUsage, cachedCreationInputAttrs) + count, sum = getHistogramValues(t, mr, genaiMetricClientTokenUsage, cacheCreationInputAttrs) assert.Equal(t, uint64(1), count) assert.Equal(t, 6.0, sum) diff --git a/internal/tracing/openinference/anthropic/messages.go b/internal/tracing/openinference/anthropic/messages.go index ec6940056f..c513611b46 100644 --- a/internal/tracing/openinference/anthropic/messages.go +++ b/internal/tracing/openinference/anthropic/messages.go @@ -215,7 +215,7 @@ func buildResponseAttributes(resp *anthropic.MessagesResponse, config *openinfer ) input, _ := cost.InputTokens() cacheRead, _ := cost.CachedInputTokens() - cacheCreation, _ := cost.CachedCreationInputTokens() + cacheCreation, _ := cost.CacheCreationInputTokens() output, _ := cost.OutputTokens() total, _ := cost.TotalTokens() diff --git a/internal/tracing/openinference/openai/response_attrs.go b/internal/tracing/openinference/openai/response_attrs.go index 5cd3561401..c629430c7a 100644 --- a/internal/tracing/openinference/openai/response_attrs.go +++ b/internal/tracing/openinference/openai/response_attrs.go @@ -58,7 +58,7 @@ func buildResponseAttributes(resp *openai.ChatCompletionResponse, config *openin attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptAudio, td.AudioTokens), attribute.Int(openinference.LLMTokenCountPromptCacheHit, td.CachedTokens), - 
attribute.Int(openinference.LLMTokenCountPromptCacheWrite, td.CachedCreationTokens), + attribute.Int(openinference.LLMTokenCountPromptCacheWrite, td.CacheCreationTokens), ) } } @@ -194,8 +194,8 @@ func buildResponsesResponseAttributes(resp *openai.Response, _ *openinference.Tr if resp.Usage.InputTokensDetails.CachedTokens > 0 { attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheHit, int(resp.Usage.InputTokensDetails.CachedTokens))) } - if resp.Usage.InputTokensDetails.CachedCreationTokens > 0 { - attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(resp.Usage.InputTokensDetails.CachedCreationTokens))) + if resp.Usage.InputTokensDetails.CacheCreationTokens > 0 { + attrs = append(attrs, attribute.Int(openinference.LLMTokenCountPromptCacheWrite, int(resp.Usage.InputTokensDetails.CacheCreationTokens))) } } diff --git a/internal/tracing/openinference/openai/responses_test.go b/internal/tracing/openinference/openai/responses_test.go index 3191c6e82d..3d15ceaabc 100644 --- a/internal/tracing/openinference/openai/responses_test.go +++ b/internal/tracing/openinference/openai/responses_test.go @@ -77,8 +77,8 @@ var ( Usage: &openai.ResponseUsage{ InputTokens: 100, InputTokensDetails: openai.ResponseUsageInputTokensDetails{ - CachedTokens: 10, - CachedCreationTokens: 50, + CachedTokens: 10, + CacheCreationTokens: 50, }, OutputTokens: 25, TotalTokens: 125, diff --git a/internal/tracing/openinference/openinference.go b/internal/tracing/openinference/openinference.go index 23fea486a9..f0f6d65ece 100644 --- a/internal/tracing/openinference/openinference.go +++ b/internal/tracing/openinference/openinference.go @@ -163,7 +163,7 @@ const ( // LLMTokenCountPromptCacheWrite represents the number of prompt tokens // created to cache (cache write). This enables tracking of cache efficiency // and cost savings from cached prompts. 
- LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cached_creation" // #nosec G101 + LLMTokenCountPromptCacheWrite = "llm.token_count.prompt_details.cache_creation" // #nosec G101 // LLMTokenCountPromptAudio represents the number of audio tokens in the prompt. // Used for multimodal models that support audio input. diff --git a/internal/translator/anthropic_anthropic.go b/internal/translator/anthropic_anthropic.go index 1f5c8d5bf0..0a5294a955 100644 --- a/internal/translator/anthropic_anthropic.go +++ b/internal/translator/anthropic_anthropic.go @@ -181,8 +181,8 @@ func (a *anthropicToAnthropicTranslator) updateTotalTokens() { if _, cachedSet := a.streamingTokenUsage.CachedInputTokens(); !cachedSet { a.streamingTokenUsage.SetCachedInputTokens(0) } - if _, cachedSet := a.streamingTokenUsage.CachedCreationInputTokens(); !cachedSet { - a.streamingTokenUsage.SetCachedCreationInputTokens(0) + if _, cachedSet := a.streamingTokenUsage.CacheCreationInputTokens(); !cachedSet { + a.streamingTokenUsage.SetCacheCreationInputTokens(0) } } diff --git a/internal/translator/anthropic_anthropic_test.go b/internal/translator/anthropic_anthropic_test.go index 6d7e12f7ab..236dd537d9 100644 --- a/internal/translator/anthropic_anthropic_test.go +++ b/internal/translator/anthropic_anthropic_test.go @@ -89,7 +89,7 @@ func TestAnthropicToAnthropic_ResponseHeaders(t *testing.T) { func TestAnthropicToAnthropic_ResponseBody_non_streaming(t *testing.T) { translator := NewAnthropicToAnthropicTranslator("", "") require.NotNil(t, translator) - const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 
👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` + const responseBody = `{"model":"claude-sonnet-4-5-20250929","id":"msg_01J5gW6Sffiem6avXSAooZZw","type":"message","role":"assistant","content":[{"type":"text","text":"Hi! 👋 How can I help you today?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":16,"service_tier":"standard"}}` headerMutation, bodyMutation, tokenUsage, responseModel, err := translator.ResponseBody(nil, strings.NewReader(responseBody), true, nil) require.NoError(t, err) @@ -108,7 +108,7 @@ func TestAnthropicToAnthropic_ResponseBody_streaming(t *testing.T) { // We split the response into two parts to simulate streaming where each part can end in the // middle of an event. 
const responseHead = `event: message_start -data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":1,"cached_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } +data: {"type":"message_start","message":{"model":"claude-sonnet-4-5-20250929","id":"msg_01BfvfMsg2gBzwsk6PZRLtDg","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":9,"cache_creation_input_tokens":0,"cache_read_input_tokens":1,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard"}} } event: content_block_start data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } diff --git a/internal/translator/anthropic_gcpanthropic_test.go b/internal/translator/anthropic_gcpanthropic_test.go index 187ed4bbc5..ad6a249af8 100644 --- a/internal/translator/anthropic_gcpanthropic_test.go +++ b/internal/translator/anthropic_gcpanthropic_test.go @@ -570,7 +570,7 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingEdgeCases(t *te } } -func tokenUsageFrom(in, cachedInput, cachedCreationInput, out, total int32) metrics.TokenUsage { +func tokenUsageFrom(in, cachedInput, cacheCreationInput, out, total int32) metrics.TokenUsage { var usage metrics.TokenUsage if in >= 0 { usage.SetInputTokens(uint32(in)) @@ -578,8 +578,8 @@ func tokenUsageFrom(in, cachedInput, cachedCreationInput, out, total int32) metr if cachedInput >= 0 { usage.SetCachedInputTokens(uint32(cachedInput)) } - if cachedCreationInput >= 0 { - usage.SetCachedCreationInputTokens(uint32(cachedCreationInput)) + if cacheCreationInput >= 0 { + 
usage.SetCacheCreationInputTokens(uint32(cacheCreationInput)) } if out >= 0 { usage.SetOutputTokens(uint32(out)) @@ -638,7 +638,7 @@ data: {"type": "message_stop"} outputTokens, outputSet := tokenUsage.OutputTokens() totalTokens, totalSet := tokenUsage.TotalTokens() cachedTokens, cachedSet := tokenUsage.CachedInputTokens() - cachedCreationTokens, cachedCreationSet := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, cacheCreationSet := tokenUsage.CacheCreationInputTokens() // Assertions assert.True(t, inputSet, "Input tokens should be set") @@ -653,8 +653,8 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedCreationSet, "cache creation tokens should be set") - assert.Equal(t, uint32(1), cachedCreationTokens, "No cache creation tokens in this scenario") + assert.True(t, cacheCreationSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cacheCreationTokens, "No cache creation tokens in this scenario") _, _, tokenUsage, _, err = translator.ResponseBody(nil, strings.NewReader(contentBlockStartChunk), false, nil) require.NoError(t, err) @@ -672,7 +672,7 @@ data: {"type": "message_stop"} outputTokens, outputSet = tokenUsage.OutputTokens() totalTokens, totalSet = tokenUsage.TotalTokens() cachedTokens, cachedSet = tokenUsage.CachedInputTokens() - cachedCreationTokens, cachedCreationSet = tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, cacheCreationSet = tokenUsage.CacheCreationInputTokens() assert.True(t, inputSet, "Input tokens should be set") assert.Equal(t, uint32(21), inputTokens, "Input tokens should be preserved from message_start") @@ -686,6 +686,6 @@ data: {"type": "message_stop"} assert.True(t, cachedSet, "Cached tokens should be set") assert.Equal(t, uint32(5), cachedTokens, "No cached tokens in this scenario") - assert.True(t, cachedCreationSet, "cache creation tokens should be 
set") - assert.Equal(t, uint32(1), cachedCreationTokens, "No cache creation tokens in this scenario") + assert.True(t, cacheCreationSet, "cache creation tokens should be set") + assert.Equal(t, uint32(1), cacheCreationTokens, "No cache creation tokens in this scenario") } diff --git a/internal/translator/anthropic_usage_test.go b/internal/translator/anthropic_usage_test.go index 86e7a50772..cb75e8a7d0 100644 --- a/internal/translator/anthropic_usage_test.go +++ b/internal/translator/anthropic_usage_test.go @@ -16,88 +16,88 @@ import ( func TestExtractLLMTokenUsage(t *testing.T) { tests := []struct { - name string - inputTokens int64 - outputTokens int64 - cacheReadTokens int64 - cachedCreationTokens int64 - expectedInputTokens uint32 - expectedOutputTokens uint32 - expectedTotalTokens uint32 - expectedCachedTokens uint32 - expectedCachedCreationTokens uint32 + name string + inputTokens int64 + outputTokens int64 + cacheReadTokens int64 + cacheCreationTokens int64 + expectedInputTokens uint32 + expectedOutputTokens uint32 + expectedTotalTokens uint32 + expectedCachedTokens uint32 + expectedCacheCreationTokens uint32 }{ { - name: "basic usage without cache", - inputTokens: 100, - outputTokens: 50, - cacheReadTokens: 0, - cachedCreationTokens: 0, - expectedInputTokens: 100, - expectedOutputTokens: 50, - expectedTotalTokens: 150, - expectedCachedTokens: 0, - expectedCachedCreationTokens: 0, + name: "basic usage without cache", + inputTokens: 100, + outputTokens: 50, + cacheReadTokens: 0, + cacheCreationTokens: 0, + expectedInputTokens: 100, + expectedOutputTokens: 50, + expectedTotalTokens: 150, + expectedCachedTokens: 0, + expectedCacheCreationTokens: 0, }, { - name: "usage with cache read tokens", - inputTokens: 80, - outputTokens: 30, - cacheReadTokens: 20, - cachedCreationTokens: 0, - expectedInputTokens: 100, // 80 + 0 + 20 - expectedOutputTokens: 30, - expectedTotalTokens: 130, // 100 + 30 - expectedCachedTokens: 20, // 20 - expectedCachedCreationTokens: 0, + 
name: "usage with cache read tokens", + inputTokens: 80, + outputTokens: 30, + cacheReadTokens: 20, + cacheCreationTokens: 0, + expectedInputTokens: 100, // 80 + 0 + 20 + expectedOutputTokens: 30, + expectedTotalTokens: 130, // 100 + 30 + expectedCachedTokens: 20, // 20 + expectedCacheCreationTokens: 0, }, { - name: "usage with cache creation tokens", - inputTokens: 60, - outputTokens: 40, - cacheReadTokens: 0, - cachedCreationTokens: 15, - expectedInputTokens: 75, // 60 + 15 + 0 - expectedOutputTokens: 40, - expectedTotalTokens: 115, // 75 + 40 - expectedCachedTokens: 0, // 0 - expectedCachedCreationTokens: 15, // 15 + name: "usage with cache creation tokens", + inputTokens: 60, + outputTokens: 40, + cacheReadTokens: 0, + cacheCreationTokens: 15, + expectedInputTokens: 75, // 60 + 15 + 0 + expectedOutputTokens: 40, + expectedTotalTokens: 115, // 75 + 40 + expectedCachedTokens: 0, // 0 + expectedCacheCreationTokens: 15, // 15 }, { - name: "usage with both cache types", - inputTokens: 70, - outputTokens: 25, - cacheReadTokens: 10, - cachedCreationTokens: 5, - expectedInputTokens: 85, // 70 + 5 + 10 - expectedOutputTokens: 25, - expectedTotalTokens: 110, // 85 + 25 - expectedCachedTokens: 10, // 10 - expectedCachedCreationTokens: 5, // 5 + name: "usage with both cache types", + inputTokens: 70, + outputTokens: 25, + cacheReadTokens: 10, + cacheCreationTokens: 5, + expectedInputTokens: 85, // 70 + 5 + 10 + expectedOutputTokens: 25, + expectedTotalTokens: 110, // 85 + 25 + expectedCachedTokens: 10, // 10 + expectedCacheCreationTokens: 5, // 5 }, { - name: "zero values", - inputTokens: 0, - outputTokens: 0, - cacheReadTokens: 0, - cachedCreationTokens: 0, - expectedInputTokens: 0, - expectedOutputTokens: 0, - expectedTotalTokens: 0, - expectedCachedTokens: 0, - expectedCachedCreationTokens: 0, + name: "zero values", + inputTokens: 0, + outputTokens: 0, + cacheReadTokens: 0, + cacheCreationTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 0, + 
expectedTotalTokens: 0, + expectedCachedTokens: 0, + expectedCacheCreationTokens: 0, }, { - name: "large values", - inputTokens: 100000, - outputTokens: 50000, - cacheReadTokens: 25000, - cachedCreationTokens: 15000, - expectedInputTokens: 140000, // 100000 + 15000 + 25000 - expectedOutputTokens: 50000, - expectedTotalTokens: 190000, // 140000 + 50000 - expectedCachedTokens: 25000, // 25000 - expectedCachedCreationTokens: 15000, + name: "large values", + inputTokens: 100000, + outputTokens: 50000, + cacheReadTokens: 25000, + cacheCreationTokens: 15000, + expectedInputTokens: 140000, // 100000 + 15000 + 25000 + expectedOutputTokens: 50000, + expectedTotalTokens: 190000, // 140000 + 50000 + expectedCachedTokens: 25000, // 25000 + expectedCacheCreationTokens: 15000, }, } @@ -107,15 +107,15 @@ func TestExtractLLMTokenUsage(t *testing.T) { tt.inputTokens, tt.outputTokens, tt.cacheReadTokens, - tt.cachedCreationTokens, + tt.cacheCreationTokens, ) expected := tokenUsageFrom( - int32(tt.expectedInputTokens), // nolint:gosec - int32(tt.expectedCachedTokens), // nolint:gosec - int32(tt.expectedCachedCreationTokens), // nolint:gosec - int32(tt.expectedOutputTokens), // nolint:gosec - int32(tt.expectedTotalTokens), // nolint:gosec + int32(tt.expectedInputTokens), // nolint:gosec + int32(tt.expectedCachedTokens), // nolint:gosec + int32(tt.expectedCacheCreationTokens), // nolint:gosec + int32(tt.expectedOutputTokens), // nolint:gosec + int32(tt.expectedTotalTokens), // nolint:gosec ) assert.Equal(t, expected, result) }) @@ -124,13 +124,13 @@ func TestExtractLLMTokenUsage(t *testing.T) { func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.Usage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 - expectedCachedCreationTokens uint32 + name string + usage anthropic.Usage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + 
expectedCachedTokens uint32 + expectedCacheCreationTokens uint32 }{ { name: "non-streaming response without cache", @@ -140,11 +140,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 0, CacheCreationInputTokens: 0, }, - expectedInputTokens: 150, - expectedOutputTokens: 75, - expectedTotalTokens: 225, - expectedCachedTokens: 0, - expectedCachedCreationTokens: 0, + expectedInputTokens: 150, + expectedOutputTokens: 75, + expectedTotalTokens: 225, + expectedCachedTokens: 0, + expectedCacheCreationTokens: 0, }, { name: "non-streaming response with cache read", @@ -154,11 +154,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 25, CacheCreationInputTokens: 0, }, - expectedInputTokens: 125, // 100 + 0 + 25 - expectedOutputTokens: 50, - expectedTotalTokens: 175, // 125 + 50 - expectedCachedTokens: 25, // 25 - expectedCachedCreationTokens: 0, // 0 + expectedInputTokens: 125, // 100 + 0 + 25 + expectedOutputTokens: 50, + expectedTotalTokens: 175, // 125 + 50 + expectedCachedTokens: 25, // 25 + expectedCacheCreationTokens: 0, // 0 }, { name: "non-streaming response with both cache types", @@ -168,11 +168,11 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { CacheReadInputTokens: 15, CacheCreationInputTokens: 10, }, - expectedInputTokens: 115, // 90 + 10 + 15 - expectedOutputTokens: 60, - expectedTotalTokens: 175, // 115 + 60 - expectedCachedTokens: 15, // 15 - expectedCachedCreationTokens: 10, // 10 + expectedInputTokens: 115, // 90 + 10 + 15 + expectedOutputTokens: 60, + expectedTotalTokens: 175, // 115 + 60 + expectedCachedTokens: 15, // 15 + expectedCacheCreationTokens: 10, // 10 }, } @@ -183,7 +183,7 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // 
nolint:gosec + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCacheCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // nolint:gosec assert.Equal(t, expected, result) }) } @@ -191,13 +191,13 @@ func TestExtractLLMTokenUsageFromUsage(t *testing.T) { func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tests := []struct { - name string - usage anthropic.MessageDeltaUsage - expectedInputTokens int32 - expectedOutputTokens int32 - expectedTotalTokens int32 - expectedCachedTokens uint32 - expectedCachedCreationTokens uint32 + name string + usage anthropic.MessageDeltaUsage + expectedInputTokens int32 + expectedOutputTokens int32 + expectedTotalTokens int32 + expectedCachedTokens uint32 + expectedCacheCreationTokens uint32 }{ { name: "message_delta event with final totals", @@ -207,11 +207,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 30, CacheCreationInputTokens: 0, }, - expectedInputTokens: 280, // 250 + 0 + 30 - expectedOutputTokens: 120, - expectedTotalTokens: 400, // 280 + 120 - expectedCachedTokens: 30, // 30 - expectedCachedCreationTokens: 0, + expectedInputTokens: 280, // 250 + 0 + 30 + expectedOutputTokens: 120, + expectedTotalTokens: 400, // 280 + 120 + expectedCachedTokens: 30, // 30 + expectedCacheCreationTokens: 0, }, { name: "message_delta event with only output tokens", @@ -221,11 +221,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { CacheReadInputTokens: 0, CacheCreationInputTokens: 0, }, - expectedInputTokens: 0, - expectedOutputTokens: 85, - expectedTotalTokens: 85, - expectedCachedTokens: 0, - expectedCachedCreationTokens: 0, + expectedInputTokens: 0, + expectedOutputTokens: 85, + expectedTotalTokens: 85, + expectedCachedTokens: 0, + expectedCacheCreationTokens: 0, }, { name: "message_delta with cache creation tokens", @@ -235,11 +235,11 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { 
CacheReadInputTokens: 10, CacheCreationInputTokens: 5, }, - expectedInputTokens: 165, // 150 + 5 + 10 - expectedOutputTokens: 75, - expectedTotalTokens: 240, // 165 + 75 - expectedCachedTokens: 10, // 10 - expectedCachedCreationTokens: 5, // 5 + expectedInputTokens: 165, // 150 + 5 + 10 + expectedOutputTokens: 75, + expectedTotalTokens: 240, // 165 + 75 + expectedCachedTokens: 10, // 10 + expectedCacheCreationTokens: 5, // 5 }, } @@ -250,7 +250,7 @@ func TestExtractLLMTokenUsageFromDeltaUsage(t *testing.T) { tt.usage.CacheReadInputTokens, tt.usage.CacheCreationInputTokens, ) - expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCachedCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // nolint:gosec + expected := tokenUsageFrom(tt.expectedInputTokens, int32(tt.expectedCachedTokens), int32(tt.expectedCacheCreationTokens), tt.expectedOutputTokens, tt.expectedTotalTokens) // nolint:gosec assert.Equal(t, expected, result) }) } @@ -304,10 +304,10 @@ func TestExtractLLMTokenUsage_ClaudeAPIDocumentationCompliance(t *testing.T) { assert.Equal(t, uint32(cacheReadTokens), cachedTokens, "CachedInputTokens should be cache_read_input_tokens") - cachedCreationTokens, ok := result.CachedCreationInputTokens() + cacheCreationTokens, ok := result.CacheCreationInputTokens() assert.True(t, ok) - assert.Equal(t, uint32(cachedWriteTokens), cachedCreationTokens, - "CachedCreationInputTokens should be cache_creation_input_tokens") + assert.Equal(t, uint32(cachedWriteTokens), cacheCreationTokens, + "CacheCreationInputTokens should be cache_creation_input_tokens") // Total tokens should be input + output. 
expectedTotal := expectedTotalInput + uint32(outputTokens) diff --git a/internal/translator/openai_awsbedrock.go b/internal/translator/openai_awsbedrock.go index 6efaea9db7..f0ecf6a69e 100644 --- a/internal/translator/openai_awsbedrock.go +++ b/internal/translator/openai_awsbedrock.go @@ -708,7 +708,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string tokenUsage.SetCachedInputTokens(uint32(*usage.CacheReadInputTokens)) //nolint:gosec } if usage.CacheWriteInputTokens != nil { - tokenUsage.SetCachedCreationInputTokens(uint32(*usage.CacheWriteInputTokens)) //nolint:gosec + tokenUsage.SetCacheCreationInputTokens(uint32(*usage.CacheWriteInputTokens)) //nolint:gosec } } oaiEvent, ok := o.convertEvent(event) @@ -760,8 +760,8 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) ResponseBody(_ map[string openAIResp.Usage.PromptTokensDetails.CachedTokens = *bedrockResp.Usage.CacheReadInputTokens } if bedrockResp.Usage.CacheWriteInputTokens != nil { - tokenUsage.SetCachedCreationInputTokens(uint32(*bedrockResp.Usage.CacheWriteInputTokens)) //nolint:gosec - openAIResp.Usage.PromptTokensDetails.CachedCreationTokens = *bedrockResp.Usage.CacheWriteInputTokens + tokenUsage.SetCacheCreationInputTokens(uint32(*bedrockResp.Usage.CacheWriteInputTokens)) //nolint:gosec + openAIResp.Usage.PromptTokensDetails.CacheCreationTokens = *bedrockResp.Usage.CacheWriteInputTokens } } @@ -864,7 +864,7 @@ func (o *openAIToAWSBedrockTranslatorV1ChatCompletion) convertEvent(event *awsbe chunk.Usage.PromptTokensDetails.CachedTokens = *event.Usage.CacheReadInputTokens } if event.Usage.CacheWriteInputTokens != nil { - chunk.Usage.PromptTokensDetails.CachedCreationTokens = *event.Usage.CacheWriteInputTokens + chunk.Usage.PromptTokensDetails.CacheCreationTokens = *event.Usage.CacheWriteInputTokens } // messageStart event. 
case awsbedrock.ConverseStreamEventTypeMessageStart.String(): diff --git a/internal/translator/openai_awsbedrock_test.go b/internal/translator/openai_awsbedrock_test.go index c533ec6dbf..fd9ef5c6f8 100644 --- a/internal/translator/openai_awsbedrock_test.go +++ b/internal/translator/openai_awsbedrock_test.go @@ -1474,8 +1474,8 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) PromptTokens: 10, CompletionTokens: 20, PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: 5, - CachedCreationTokens: 7, + CachedTokens: 5, + CacheCreationTokens: 7, }, }, Choices: []openai.ChatCompletionResponseChoice{ @@ -1725,7 +1725,7 @@ func TestOpenAIToAWSBedrockTranslatorV1ChatCompletion_ResponseBody(t *testing.T) expectedUsage.SetCachedInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec } if tt.input.Usage.CacheWriteInputTokens != nil { - expectedUsage.SetCachedCreationInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec + expectedUsage.SetCacheCreationInputTokens(uint32(tt.output.Usage.PromptTokensDetails.CacheCreationTokens)) //nolint:gosec } } else { expectedUsage = tokenUsageFrom(-1, -1, -1, -1, -1) diff --git a/internal/translator/openai_completions.go b/internal/translator/openai_completions.go index 7f5008d8dd..8bd894214d 100644 --- a/internal/translator/openai_completions.go +++ b/internal/translator/openai_completions.go @@ -171,8 +171,8 @@ func (o *openAIToOpenAITranslatorV1Completion) extractUsageFromBufferEvent(span tokenUsage.SetOutputTokens(uint32(usage.CompletionTokens)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(usage.TotalTokens)) //nolint:gosec if usage.PromptTokensDetails != nil { - tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec - tokenUsage.SetCachedCreationInputTokens(uint32(usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec + 
tokenUsage.SetCachedInputTokens(uint32(usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCacheCreationInputTokens(uint32(usage.PromptTokensDetails.CacheCreationTokens)) //nolint:gosec } // Do not mark buffering done; keep scanning to return the latest usage in this batch. } diff --git a/internal/translator/openai_gcpanthropic.go b/internal/translator/openai_gcpanthropic.go index 6a916cc0e2..716053cf4b 100644 --- a/internal/translator/openai_gcpanthropic.go +++ b/internal/translator/openai_gcpanthropic.go @@ -839,14 +839,14 @@ func (o *openAIToGCPAnthropicTranslatorV1ChatCompletion) ResponseBody(_ map[stri outputTokens, _ := tokenUsage.OutputTokens() totalTokens, _ := tokenUsage.TotalTokens() cachedTokens, _ := tokenUsage.CachedInputTokens() - cacheWriteTokens, _ := tokenUsage.CachedCreationInputTokens() + cacheWriteTokens, _ := tokenUsage.CacheCreationInputTokens() openAIResp.Usage = openai.Usage{ CompletionTokens: int(outputTokens), PromptTokens: int(inputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), - CachedCreationTokens: int(cacheWriteTokens), + CachedTokens: int(cachedTokens), + CacheCreationTokens: int(cacheWriteTokens), }, } diff --git a/internal/translator/openai_gcpanthropic_stream.go b/internal/translator/openai_gcpanthropic_stream.go index 5d9e12c4a1..1846ec7358 100644 --- a/internal/translator/openai_gcpanthropic_stream.go +++ b/internal/translator/openai_gcpanthropic_stream.go @@ -110,7 +110,7 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t p.tokenUsage.SetTotalTokens(inputTokens + outputTokens) totalTokens, _ := p.tokenUsage.TotalTokens() cachedTokens, _ := p.tokenUsage.CachedInputTokens() - cachedCreationTokens, _ := p.tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, _ := p.tokenUsage.CacheCreationInputTokens() finalChunk := openai.ChatCompletionResponseChunk{ ID: p.activeMessageID, Created: p.created, 
@@ -121,8 +121,8 @@ func (p *anthropicStreamParser) Process(body io.Reader, endOfStream bool, span t CompletionTokens: int(outputTokens), TotalTokens: int(totalTokens), PromptTokensDetails: &openai.PromptTokensDetails{ - CachedTokens: int(cachedTokens), - CachedCreationTokens: int(cachedCreationTokens), + CachedTokens: int(cachedTokens), + CacheCreationTokens: int(cacheCreationTokens), }, }, Model: p.requestModel, @@ -213,8 +213,8 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat if cached, ok := usage.CachedInputTokens(); ok { p.tokenUsage.SetCachedInputTokens(cached) } - if cachedCreation, ok := usage.CachedCreationInputTokens(); ok { - p.tokenUsage.SetCachedCreationInputTokens(cachedCreation) + if cacheCreation, ok := usage.CacheCreationInputTokens(); ok { + p.tokenUsage.SetCacheCreationInputTokens(cacheCreation) } // reset the toolIndex for each message @@ -298,10 +298,10 @@ func (p *anthropicStreamParser) handleAnthropicStreamEvent(eventType []byte, dat p.tokenUsage.AddCachedInputTokens(cached) } // Update input tokens to include write cache tokens from delta - if cached, ok := usage.CachedCreationInputTokens(); ok { + if cached, ok := usage.CacheCreationInputTokens(); ok { p.tokenUsage.AddInputTokens(cached) // Accumulate any additional cache tokens from delta - p.tokenUsage.AddCachedCreationInputTokens(cached) + p.tokenUsage.AddCacheCreationInputTokens(cached) } if event.Delta.StopReason != "" { p.stopReason = event.Delta.StopReason diff --git a/internal/translator/openai_gcpanthropic_test.go b/internal/translator/openai_gcpanthropic_test.go index e48cd64242..1e36924adc 100644 --- a/internal/translator/openai_gcpanthropic_test.go +++ b/internal/translator/openai_gcpanthropic_test.go @@ -598,11 +598,11 @@ func TestOpenAIToGCPAnthropicTranslatorV1ChatCompletion_ResponseBody(t *testing. 
require.NoError(t, err) expectedTokenUsage := tokenUsageFrom( - int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedCreationTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec - int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CachedTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.PromptTokensDetails.CacheCreationTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.CompletionTokens), // nolint:gosec + int32(tt.expectedOpenAIResponse.Usage.TotalTokens), // nolint:gosec ) require.Equal(t, expectedTokenUsage, usedToken) diff --git a/internal/translator/openai_gcpvertexai.go b/internal/translator/openai_gcpvertexai.go index 7aca029b4f..1fa59c2f9a 100644 --- a/internal/translator/openai_gcpvertexai.go +++ b/internal/translator/openai_gcpvertexai.go @@ -170,7 +170,7 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) ResponseBody(_ map[strin tokenUsage.SetOutputTokens(uint32(gcpResp.UsageMetadata.CandidatesTokenCount)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(gcpResp.UsageMetadata.TotalTokenCount)) //nolint:gosec tokenUsage.SetCachedInputTokens(uint32(gcpResp.UsageMetadata.CachedContentTokenCount)) //nolint:gosec - // Gemini does not return cache creation input tokens; Skipping setCachedCreationInputTokens. + // Gemini does not return cache creation input tokens; Skipping setCacheCreationInputTokens. 
} if span != nil { diff --git a/internal/translator/openai_openai.go b/internal/translator/openai_openai.go index 65b33a34cd..01b300f17c 100644 --- a/internal/translator/openai_openai.go +++ b/internal/translator/openai_openai.go @@ -141,8 +141,8 @@ func (o *openAIToOpenAITranslatorV1ChatCompletion) ResponseBody(_ map[string]str tokenUsage.SetOutputTokens(uint32(resp.Usage.CompletionTokens)) //nolint:gosec tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) //nolint:gosec if resp.Usage.PromptTokensDetails != nil { - tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec - tokenUsage.SetCachedCreationInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedCreationTokens)) //nolint:gosec + tokenUsage.SetCachedInputTokens(uint32(resp.Usage.PromptTokensDetails.CachedTokens)) //nolint:gosec + tokenUsage.SetCacheCreationInputTokens(uint32(resp.Usage.PromptTokensDetails.CacheCreationTokens)) //nolint:gosec } // Fallback to request model for test or non-compliant OpenAI backends responseModel = cmp.Or(resp.Model, o.requestModel) diff --git a/internal/translator/openai_responses.go b/internal/translator/openai_responses.go index 30934c5c63..3f61e7a8e9 100644 --- a/internal/translator/openai_responses.go +++ b/internal/translator/openai_responses.go @@ -128,11 +128,11 @@ func (o *openAIToOpenAITranslatorV1Responses) handleNonStreamingResponse(body io // TODO: Add reasoning token usage if resp.Usage != nil { - tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 - tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 - tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 - tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 - tokenUsage.SetCachedCreationInputTokens(uint32(resp.Usage.InputTokensDetails.CachedCreationTokens)) // #nosec G115 + tokenUsage.SetInputTokens(uint32(resp.Usage.InputTokens)) // #nosec G115 + 
tokenUsage.SetOutputTokens(uint32(resp.Usage.OutputTokens)) // #nosec G115 + tokenUsage.SetTotalTokens(uint32(resp.Usage.TotalTokens)) // #nosec G115 + tokenUsage.SetCachedInputTokens(uint32(resp.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 + tokenUsage.SetCacheCreationInputTokens(uint32(resp.Usage.InputTokensDetails.CacheCreationTokens)) // #nosec G115 } // Record non-streaming response to span if tracing is enabled. @@ -180,7 +180,7 @@ func (o *openAIToOpenAITranslatorV1Responses) extractUsageFromBufferEvent(span t tokenUsage.SetTotalTokens(uint32(respComplEvent.Response.Usage.TotalTokens)) // #nosec G115 tokenUsage.SetCachedInputTokens(uint32(respComplEvent.Response.Usage.InputTokensDetails.CachedTokens)) // #nosec G115 // Openai does not support cache creation response. - tokenUsage.SetCachedCreationInputTokens(uint32(0)) // #nosec G115 + tokenUsage.SetCacheCreationInputTokens(uint32(0)) // #nosec G115 } // Record streaming chunk to span if tracing is enabled. if span != nil { diff --git a/internal/translator/openai_responses_test.go b/internal/translator/openai_responses_test.go index 363933d525..eb1a757218 100644 --- a/internal/translator/openai_responses_test.go +++ b/internal/translator/openai_responses_test.go @@ -247,9 +247,9 @@ func TestResponsesOpenAIToOpenAITranslator_ResponseBody(t *testing.T) { require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, ok := tokenUsage.CacheCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedCreationTokens) + require.Equal(t, uint32(0), cacheCreationTokens) }) t.Run("non-streaming response with fallback model", func(t *testing.T) { @@ -363,9 +363,9 @@ data: [DONE] require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, ok := tokenUsage.CacheCreationInputTokens() require.True(t, ok) - 
require.Equal(t, uint32(0), cachedCreationTokens) + require.Equal(t, uint32(0), cacheCreationTokens) }) t.Run("streaming response with fallback model", func(t *testing.T) { @@ -462,9 +462,9 @@ data: [DONE] cachedTokens, _ := tokenUsage.CachedInputTokens() require.Equal(t, uint32(2), cachedTokens) - cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, ok := tokenUsage.CacheCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedCreationTokens) + require.Equal(t, uint32(0), cacheCreationTokens) }) t.Run("streaming read error", func(t *testing.T) { @@ -554,9 +554,9 @@ func TestResponses_HandleNonStreamingResponse(t *testing.T) { cachedTokens, _ := tokenUsage.CachedInputTokens() require.Equal(t, uint32(2), cachedTokens) - cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, ok := tokenUsage.CacheCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedCreationTokens) + require.Equal(t, uint32(0), cacheCreationTokens) }) t.Run("invalid JSON", func(t *testing.T) { @@ -619,9 +619,9 @@ data: [DONE] require.True(t, ok) require.Equal(t, uint32(2), cachedTokens) - cachedCreationTokens, ok := tokenUsage.CachedCreationInputTokens() + cacheCreationTokens, ok := tokenUsage.CacheCreationInputTokens() require.True(t, ok) - require.Equal(t, uint32(0), cachedCreationTokens) + require.Equal(t, uint32(0), cacheCreationTokens) }) t.Run("model extraction", func(t *testing.T) { @@ -686,11 +686,11 @@ data: [DONE] _, outputSet := tokenUsage.OutputTokens() _, totalSet := tokenUsage.TotalTokens() _, cachedSet := tokenUsage.CachedInputTokens() - _, cachedCreationSet := tokenUsage.CachedCreationInputTokens() + _, cacheCreationSet := tokenUsage.CacheCreationInputTokens() require.False(t, totalSet) require.False(t, cachedSet) - require.False(t, cachedCreationSet) + require.False(t, cacheCreationSet) require.False(t, inputSet) require.False(t, outputSet) }) diff --git 
a/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml b/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml index e75770a409..f256789b72 100644 --- a/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml +++ b/manifests/charts/ai-gateway-crds-helm/templates/aigateway.envoyproxy.io_aigatewayroutes.yaml @@ -173,8 +173,8 @@ spec: metadataKey: llm_input_token\n\t type: InputToken\n\t- metadataKey: llm_output_token\n\t type: OutputToken\n\t- metadataKey: llm_total_token\n\t \ type: TotalToken\n\t- metadataKey: llm_cached_input_token\n\t - \ type: CachedInputToken\n- metadataKey: llm_cached_creation_input_token\n - \ type: CachedCreationInputToken\n```\nThen, with the following + \ type: CachedInputToken\n- metadataKey: llm_cache_creation_input_token\n + \ type: CacheCreationInputToken\n```\nThen, with the following BackendTrafficPolicy of Envoy Gateway, you can have three\nrate limit buckets for each unique x-user-id header value. One bucket is for the input token,\nthe other is for the output token, and @@ -230,7 +230,7 @@ spec: \"name.namespace\". Type: string.\n\t* input_tokens: the number of input tokens. Type: unsigned integer.\n\t* cached_input_tokens: the number of cached read input tokens. Type: unsigned integer.\n\t* - cached_creation_input_tokens: the number of cache creation + cache_creation_input_tokens: the number of cache creation input tokens. Type: unsigned integer.\n\t* output_tokens: the number of output tokens. Type: unsigned integer.\n\t* total_tokens: the total number of tokens. Type: unsigned integer.\n\nFor @@ -238,7 +238,7 @@ spec: == 'llama' ? input_tokens + output_token * 0.5 : total_tokens\"\n\t* \"backend == 'foo.default' ? input_tokens + output_tokens : total_tokens\"\n\t* \"backend == 'bar.default' ? 
(input_tokens - - cached_input_tokens) + cached_input_tokens * 0.1 + cached_creation_input_tokens + - cached_input_tokens) + cached_input_tokens * 0.1 + cache_creation_input_tokens * 1.25 + output_tokens : total_tokens\"\n\t* \"input_tokens + output_tokens + total_tokens\"\n\t* \"input_tokens * output_tokens\"" type: string @@ -250,12 +250,12 @@ spec: description: |- Type specifies the type of the request cost. The default is "OutputToken", and it uses "output token" as the cost. The other types are "InputToken", "TotalToken", - "CachedInputToken", "CachedCreationInputToken", and "CEL". + "CachedInputToken", "CacheCreationInputToken", and "CEL". enum: - OutputToken - InputToken - CachedInputToken - - CachedCreationInputToken + - CacheCreationInputToken - TotalToken - CEL type: string diff --git a/site/docs/api/api.mdx b/site/docs/api/api.mdx index 0516d744cb..bac8159967 100644 --- a/site/docs/api/api.mdx +++ b/site/docs/api/api.mdx @@ -757,7 +757,7 @@ AIGatewayRouteSpec details the AIGatewayRoute configuration. name="llmRequestCosts" type="[LLMRequestCost](#llmrequestcost) array" required="false" - description="LLMRequestCosts specifies how to capture the cost of the LLM-related request, notably the token usage.
The AI Gateway filter will capture each specified number and store it in the Envoy's dynamic
metadata per HTTP request. The namespaced key is `io.envoy.ai_gateway`,
For example, let's say we have the following LLMRequestCosts configuration:
```yaml
llmRequestCosts:
- metadataKey: llm_input_token
type: InputToken
- metadataKey: llm_output_token
type: OutputToken
- metadataKey: llm_total_token
type: TotalToken
- metadataKey: llm_cached_input_token
type: CachedInputToken
- metadataKey: llm_cached_creation_input_token
type: CachedCreationInputToken
```
Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three
rate limit buckets for each unique x-user-id header value. One bucket is for the input token,
the other is for the output token, and the last one is for the total token.
Each bucket will be reduced by the corresponding token usage captured by the AI Gateway filter.
```yaml
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: BackendTrafficPolicy
metadata:
name: some-example-token-rate-limit
namespace: default
spec:
targetRefs:
- group: gateway.networking.k8s.io
kind: HTTPRoute
name: usage-rate-limit
rateLimit:
type: Global
global:
rules:
- clientSelectors:
# Do the rate limiting based on the x-user-id header.
- headers:
- name: x-user-id
type: Distinct
limit:
# Configures the number of `tokens` allowed per hour.
requests: 10000
unit: Hour
cost:
request:
from: Number
# Setting the request cost to zero allows to only check the rate limit budget,
# and not consume the budget on the request path.
number: 0
# This specifies the cost of the response retrieved from the dynamic metadata set by the AI Gateway filter.
# The extracted value will be used to consume the rate limit budget, and subsequent requests will be rate limited
# if the budget is exhausted.
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_input_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_output_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_total_token
```
Note that when multiple AIGatewayRoute resources are attached to the same Gateway, and
different costs are configured for the same metadata key, the ai-gateway will pick one of them
to configure the metadata key in the generated HTTPRoute, and ignore the rest." + description="LLMRequestCosts specifies how to capture the cost of the LLM-related request, notably the token usage.
The AI Gateway filter will capture each specified number and store it in Envoy's dynamic
metadata per HTTP request. The namespaced key is `io.envoy.ai_gateway`.
For example, let's say we have the following LLMRequestCosts configuration:
```yaml
llmRequestCosts:
- metadataKey: llm_input_token
type: InputToken
- metadataKey: llm_output_token
type: OutputToken
- metadataKey: llm_total_token
type: TotalToken
- metadataKey: llm_cached_input_token
type: CachedInputToken
- metadataKey: llm_cache_creation_input_token
type: CacheCreationInputToken
```
Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three
rate limit buckets for each unique x-user-id header value. One bucket is for the input token,
another is for the output token, and the last one is for the total token.
Each bucket will be reduced by the corresponding token usage captured by the AI Gateway filter.
```yaml
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: BackendTrafficPolicy
metadata:
name: some-example-token-rate-limit
namespace: default
spec:
targetRefs:
- group: gateway.networking.k8s.io
kind: HTTPRoute
name: usage-rate-limit
rateLimit:
type: Global
global:
rules:
- clientSelectors:
# Do the rate limiting based on the x-user-id header.
- headers:
- name: x-user-id
type: Distinct
limit:
# Configures the number of `tokens` allowed per hour.
requests: 10000
unit: Hour
cost:
request:
from: Number
# Setting the request cost to zero allows to only check the rate limit budget,
# and not consume the budget on the request path.
number: 0
# This specifies the cost of the response retrieved from the dynamic metadata set by the AI Gateway filter.
# The extracted value will be used to consume the rate limit budget, and subsequent requests will be rate limited
# if the budget is exhausted.
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_input_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_output_token
- clientSelectors:
- headers:
- name: x-user-id
type: Distinct
limit:
requests: 10000
unit: Hour
cost:
request:
from: Number
number: 0
response:
from: Metadata
metadata:
namespace: io.envoy.ai_gateway
key: llm_total_token
```
Note that when multiple AIGatewayRoute resources are attached to the same Gateway, and
different costs are configured for the same metadata key, the ai-gateway will pick one of them
to configure the metadata key in the generated HTTPRoute, and ignore the rest." /> @@ -1664,12 +1664,12 @@ LLMRequestCost configures each request cost. name="type" type="[LLMRequestCostType](#llmrequestcosttype)" required="true" - description="Type specifies the type of the request cost. The default is `OutputToken`,
and it uses `output token` as the cost. The other types are `InputToken`, `TotalToken`,
`CachedInputToken`, `CachedCreationInputToken`, and `CEL`." + description="Type specifies the type of the request cost. The default is `OutputToken`,
and it uses `output token` as the cost. The other types are `InputToken`, `TotalToken`,
`CachedInputToken`, `CacheCreationInputToken`, and `CEL`." /> @@ -1697,10 +1697,10 @@ LLMRequestCostType specifies the type of the LLMRequestCost. required="false" description="LLMRequestCostTypeCachedInputToken is the cost type of the cached input token.
" /> Date: Fri, 2 Jan 2026 19:22:35 -0500 Subject: [PATCH 20/20] update missing Signed-off-by: Aaron Choo --- tests/data-plane/vcr/prometheus_metrics_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data-plane/vcr/prometheus_metrics_test.go b/tests/data-plane/vcr/prometheus_metrics_test.go index 0e365e956b..5a9161f441 100644 --- a/tests/data-plane/vcr/prometheus_metrics_test.go +++ b/tests/data-plane/vcr/prometheus_metrics_test.go @@ -127,7 +127,7 @@ func verifyPrometheusTokenUsage(t *testing.T, metric *dto.MetricFamily, expected } require.NotNil(t, inputMetric, "Input metric not found") require.NotNil(t, cachedInputMetric, "Cached Input metric not found") - require.NotNil(t, cacheCreationInputMetric, "Cached Creation Input metric not found") + require.NotNil(t, cacheCreationInputMetric, "Cache Creation Input metric not found") require.NotNil(t, outputMetric, "Output metric not found") type testCase struct {