1 change: 1 addition & 0 deletions docker/.env.example
@@ -27,6 +27,7 @@ GID='1000'
# LLM_PROVIDER='anthropic'
# ANTHROPIC_API_KEY=sk-ant-xxxx
# ANTHROPIC_MODEL_PREF='claude-2'
# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.

# LLM_PROVIDER='lmstudio'
# LMSTUDIO_BASE_PATH='http://your-server:1234/v1'
1 change: 1 addition & 0 deletions server/.env.example
@@ -24,6 +24,7 @@ SIG_SALT='salt' # Please generate random string at least 32 chars long.
# LLM_PROVIDER='anthropic'
# ANTHROPIC_API_KEY=sk-ant-xxxx
# ANTHROPIC_MODEL_PREF='claude-2'
# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.

# LLM_PROVIDER='lmstudio'
# LMSTUDIO_BASE_PATH='http://your-server:1234/v1'
66 changes: 64 additions & 2 deletions server/utils/AiProviders/anthropic/index.js
@@ -34,6 +34,9 @@ class AnthropicLLM {

this.embedder = embedder ?? new NativeEmbedder();
this.defaultTemp = 0.7;
this.cacheControl = this.#parseCacheControl(
process.env.ANTHROPIC_CACHE_CONTROL
);
this.log(`Initialized with ${this.model}`);
}

@@ -57,6 +60,62 @@ class AnthropicLLM {
return true;
}

/**
* Parses the cache control ENV variable
* @param {string} value - The ENV value (5m or 1h)
* @returns {null|object} Cache control configuration
*/
#parseCacheControl(value) {
if (!value) return null;
const normalized = value.toLowerCase().trim();
if (normalized === "5m" || normalized === "1h") {
return { type: "ephemeral", ttl: normalized };
}
return null;
}
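// Illustration only, not part of this PR: the mapping #parseCacheControl produces.
// Any unrecognized value falls back to null, so caching is silently disabled
// rather than throwing. The "2h" value below is just an example of invalid input.
//   ANTHROPIC_CACHE_CONTROL="5m"  -> { type: "ephemeral", ttl: "5m" }
//   ANTHROPIC_CACHE_CONTROL="1h"  -> { type: "ephemeral", ttl: "1h" }
//   ANTHROPIC_CACHE_CONTROL="2h"  -> null (unsupported TTL, caching disabled)
//   unset / empty                 -> null (caching disabled)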

/**
* Checks if content meets minimum requirements for caching
* Per Anthropic docs: minimum 1024 tokens
*
* Certain models (Haiku 3.5 and Haiku 3) have a higher minimum of 2048 tokens, but
* in testing, content around 1024 tokens can still be passed without errors;
* Anthropic simply ignores the cache breakpoint until the 2048-token minimum is met.
* https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
* @param {string} content - The content to check
* @returns {boolean}
*/
#shouldCache(content) {
if (!this.cacheControl || !content) return false;
// Rough token estimate: ~4 chars per token
// Minimum 1024 tokens = ~4096 characters
const estimatedTokens = content.length / 4;
return estimatedTokens >= 1024;
}
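// Illustration only, not part of this PR: with the ~4 chars/token heuristic above,
// a 5,000-character system prompt estimates to 1,250 tokens (>= 1024) and is
// eligible for caching, while a 2,000-character prompt estimates to 500 tokens
// and is always sent uncached.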

/**
* Builds system parameter with cache control if applicable
* @param {string} systemContent - The system prompt content
* @returns {string|array} System parameter for API call
*/
#buildSystemWithCache(systemContent) {
if (!systemContent) return systemContent;

// If caching is enabled and the content is large enough,
// apply cache control.
if (this.#shouldCache(systemContent)) {
return [
{
type: "text",
text: systemContent,
cache_control: this.cacheControl,
},
];
}

return systemContent;
}
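// Illustration only, not part of this PR: the two shapes #buildSystemWithCache can
// return for the `system` parameter of anthropic.messages.create(). With caching
// disabled (or a prompt below the size threshold), the existing plain-string
// behavior is unchanged:
//   system: "You are a helpful assistant..."
// With ANTHROPIC_CACHE_CONTROL set and a large enough prompt, the prompt is sent
// as a single text content block carrying the cache_control breakpoint:
//   system: [
//     {
//       type: "text",
//       text: "You are a helpful assistant...",
//       cache_control: { type: "ephemeral", ttl: "5m" },
//     },
//   ]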

/**
* Generates appropriate content array for a message + attachments.
* @param {{userPrompt:string, attachments: import("../../helpers").Attachment[]}}
@@ -105,18 +164,20 @@ class AnthropicLLM {

async getChatCompletion(messages = null, { temperature = 0.7 }) {
try {
const systemContent = messages[0].content;
const result = await LLMPerformanceMonitor.measureAsyncFunction(
this.anthropic.messages.create({
model: this.model,
max_tokens: 4096,
system: messages[0].content, // Strip out the system message
system: this.#buildSystemWithCache(systemContent), // Apply cache control if enabled
messages: messages.slice(1), // Pop off the system message
temperature: Number(temperature ?? this.defaultTemp),
})
);

const promptTokens = result.output.usage.input_tokens;
const completionTokens = result.output.usage.output_tokens;

return {
textResponse: result.output.content[0].text,
metrics: {
@@ -134,11 +195,12 @@ class AnthropicLLM {
}

async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
const systemContent = messages[0].content;
const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
this.anthropic.messages.stream({
model: this.model,
max_tokens: 4096,
system: messages[0].content, // Strip out the system message
system: this.#buildSystemWithCache(systemContent), // Apply cache control if enabled
messages: messages.slice(1), // Pop off the system message
temperature: Number(temperature ?? this.defaultTemp),
}),
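As a minimal sketch of how this change could be verified manually (not part of the diff): per Anthropic's prompt-caching documentation, the response usage object also reports cache_creation_input_tokens on the request that writes the cache and cache_read_input_tokens on later requests inside the TTL, so a helper like the hypothetical one below could be dropped next to getChatCompletion() while testing. The cache field names come from Anthropic's docs, not from this PR.

// Hypothetical debugging helper, assuming the cache usage fields documented by Anthropic.
// `result.output` is the SDK response wrapped by LLMPerformanceMonitor above.
function logCacheUsage(result) {
  const usage = result?.output?.usage ?? {};
  console.log(
    `input=${usage.input_tokens ?? 0}`,
    `cache_write=${usage.cache_creation_input_tokens ?? 0}`, // tokens written to the cache
    `cache_read=${usage.cache_read_input_tokens ?? 0}` // tokens served from the cache
  );
}

On the first request with a sufficiently large system prompt, cache_write should be non-zero; repeating the same prompt within the TTL should report cache_read instead.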
64 changes: 62 additions & 2 deletions server/utils/agents/aibitat/providers/anthropic.js
@@ -25,12 +25,72 @@ class AnthropicProvider extends Provider {
super(client);

this.model = model;
this.cacheControl = this.#parseCacheControl(
process.env.ANTHROPIC_CACHE_CONTROL
);
}

get supportsAgentStreaming() {
return true;
}

/**
* Parses the cache control ENV variable
* @param {string} value - The ENV value (5m or 1h)
* @returns {null|object} Cache control configuration
*/
#parseCacheControl(value) {
if (!value) return null;
const normalized = value.toLowerCase().trim();
if (normalized === "5m" || normalized === "1h") {
return { type: "ephemeral", ttl: normalized };
}
return null;
}

/**
* Checks if content meets minimum requirements for caching
* Per Anthropic docs: minimum 1024 tokens
*
* Certain models (Haiku 3.5 and Haiku 3) have a higher minimum of 2048 tokens, but
* in testing, content around 1024 tokens can still be passed without errors;
* Anthropic simply ignores the cache breakpoint until the 2048-token minimum is met.
* https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
* @param {string} content - The content to check
* @returns {boolean}
*/
#shouldCache(content) {
if (!this.cacheControl || !content) return false;
// Rough token estimate: ~4 chars per token
// Minimum 1024 tokens = ~4096 characters
const estimatedTokens = content.length / 4;
return estimatedTokens >= 1024;
}

/**
* Builds system parameter with cache control if applicable
* @param {string} systemContent - The system prompt content
* @returns {string|array} System parameter for API call
*/
#buildSystemWithCache(systemContent) {
if (!systemContent) return systemContent;

// If caching is enabled and the content is large enough,
// apply cache control.
if (this.#shouldCache(systemContent)) {
return [
{
type: "text",
text: systemContent,
cache_control: this.cacheControl,
},
];
}

// Otherwise, return as plain string (no caching)
return systemContent;
}

#prepareMessages(messages = []) {
// Extract system prompt and filter out any system messages from the main chat.
let systemPrompt =
@@ -149,7 +209,7 @@ class AnthropicProvider extends Provider {
{
model: this.model,
max_tokens: 4096,
system: systemPrompt,
system: this.#buildSystemWithCache(systemPrompt), // Apply cache control if enabled
messages: chats,
stream: true,
...(Array.isArray(functions) && functions?.length > 0
@@ -276,7 +336,7 @@ class AnthropicProvider extends Provider {
{
model: this.model,
max_tokens: 4096,
system: systemPrompt,
system: this.#buildSystemWithCache(systemPrompt), // Apply cache control if enabled
messages: chats,
stream: false,
...(Array.isArray(functions) && functions?.length > 0