1 change: 1 addition & 0 deletions docker/.env.example
@@ -27,6 +27,7 @@ GID='1000'
# LLM_PROVIDER='anthropic'
# ANTHROPIC_API_KEY=sk-ant-xxxx
# ANTHROPIC_MODEL_PREF='claude-2'
# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.

# LLM_PROVIDER='lmstudio'
# LMSTUDIO_BASE_PATH='http://your-server:1234/v1'
1 change: 1 addition & 0 deletions server/.env.example
@@ -24,6 +24,7 @@ SIG_SALT='salt' # Please generate random string at least 32 chars long.
# LLM_PROVIDER='anthropic'
# ANTHROPIC_API_KEY=sk-ant-xxxx
# ANTHROPIC_MODEL_PREF='claude-2'
# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.

# LLM_PROVIDER='lmstudio'
# LMSTUDIO_BASE_PATH='http://your-server:1234/v1'
66 changes: 64 additions & 2 deletions server/utils/AiProviders/anthropic/index.js
@@ -34,6 +34,9 @@ class AnthropicLLM {

this.embedder = embedder ?? new NativeEmbedder();
this.defaultTemp = 0.7;
this.cacheControl = this.#parseCacheControl(
process.env.ANTHROPIC_CACHE_CONTROL
);
this.log(`Initialized with ${this.model}`);
}

@@ -57,6 +60,62 @@ class AnthropicLLM {
return true;
}

/**
* Parses the cache control ENV variable
* @param {string} value - The ENV value (5m or 1h)
* @returns {null|object} Cache control configuration
*/
#parseCacheControl(value) {
if (!value) return null;
const normalized = value.toLowerCase().trim();
if (normalized === "5m" || normalized === "1h") {
return { type: "ephemeral", ttl: normalized };
}
return null;
}
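// Illustration only, not part of this PR: the mapping #parseCacheControl produces.
// Any unrecognized value falls back to null, so caching is silently disabled
// rather than throwing. The "2h" value below is just an example of invalid input.
//   ANTHROPIC_CACHE_CONTROL="5m"  -> { type: "ephemeral", ttl: "5m" }
//   ANTHROPIC_CACHE_CONTROL="1h"  -> { type: "ephemeral", ttl: "1h" }
//   ANTHROPIC_CACHE_CONTROL="2h"  -> null (unsupported TTL, caching disabled)
//   unset / empty                 -> null (caching disabled)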

/**
* Checks if content meets minimum requirements for caching
* Per Anthropic docs: minimum 1024 tokens
*
* Certain models (Haiku 3.5 and Haiku 3) have a higher minimum of 2048 tokens, but
* in testing, content around 1024 tokens can still be passed without errors;
* Anthropic simply ignores the cache breakpoint until the 2048-token minimum is met.
* https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
* @param {string} content - The content to check
* @returns {boolean}
*/
#shouldCache(content) {
if (!this.cacheControl || !content) return false;
// Rough token estimate: ~4 chars per token
// Minimum 1024 tokens = ~4096 characters
const estimatedTokens = content.length / 4;
return estimatedTokens >= 1024;
}
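// Illustration only, not part of this PR: with the ~4 chars/token heuristic above,
// a 5,000-character system prompt estimates to 1,250 tokens (>= 1024) and is
// eligible for caching, while a 2,000-character prompt estimates to 500 tokens
// and is always sent uncached.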

/**
* Builds system parameter with cache control if applicable
* @param {string} systemContent - The system prompt content
* @returns {string|array} System parameter for API call
*/
#buildSystemWithCache(systemContent) {
if (!systemContent) return systemContent;

// If caching is enabled and the content is large enough,
// apply cache control.
if (this.#shouldCache(systemContent)) {
return [
{
type: "text",
text: systemContent,
cache_control: this.cacheControl,
},
];
}

return systemContent;
}
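// Illustration only, not part of this PR: the two shapes #buildSystemWithCache can
// return for the `system` parameter of anthropic.messages.create(). With caching
// disabled (or a prompt below the size threshold), the existing plain-string
// behavior is unchanged:
//   system: "You are a helpful assistant..."
// With ANTHROPIC_CACHE_CONTROL set and a large enough prompt, the prompt is sent
// as a single text content block carrying the cache_control breakpoint:
//   system: [
//     {
//       type: "text",
//       text: "You are a helpful assistant...",
//       cache_control: { type: "ephemeral", ttl: "5m" },
//     },
//   ]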

/**
* Generates appropriate content array for a message + attachments.
* @param {{userPrompt:string, attachments: import("../../helpers").Attachment[]}}
@@ -105,18 +164,20 @@ class AnthropicLLM {

async getChatCompletion(messages = null, { temperature = 0.7 }) {
try {
const systemContent = messages[0].content;
const result = await LLMPerformanceMonitor.measureAsyncFunction(
this.anthropic.messages.create({
model: this.model,
max_tokens: 4096,
system: messages[0].content, // Strip out the system message
system: this.#buildSystemWithCache(systemContent), // Apply cache control if enabled
messages: messages.slice(1), // Pop off the system message
temperature: Number(temperature ?? this.defaultTemp),
})
);

const promptTokens = result.output.usage.input_tokens;
const completionTokens = result.output.usage.output_tokens;

return {
textResponse: result.output.content[0].text,
metrics: {
@@ -134,11 +195,12 @@ class AnthropicLLM {
}

async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
const systemContent = messages[0].content;
const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
this.anthropic.messages.stream({
model: this.model,
max_tokens: 4096,
system: messages[0].content, // Strip out the system message
system: this.#buildSystemWithCache(systemContent), // Apply cache control if enabled
messages: messages.slice(1), // Pop off the system message
temperature: Number(temperature ?? this.defaultTemp),
}),
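As a minimal sketch of how this change could be verified manually (not part of the diff): per Anthropic's prompt-caching documentation, the response usage object also reports cache_creation_input_tokens on the request that writes the cache and cache_read_input_tokens on later requests inside the TTL, so a helper like the hypothetical one below could be dropped next to getChatCompletion() while testing. The cache field names come from Anthropic's docs, not from this PR.

// Hypothetical debugging helper, assuming the cache usage fields documented by Anthropic.
// `result.output` is the SDK response wrapped by LLMPerformanceMonitor above.
function logCacheUsage(result) {
  const usage = result?.output?.usage ?? {};
  console.log(
    `input=${usage.input_tokens ?? 0}`,
    `cache_write=${usage.cache_creation_input_tokens ?? 0}`, // tokens written to the cache
    `cache_read=${usage.cache_read_input_tokens ?? 0}` // tokens served from the cache
  );
}

On the first request with a sufficiently large system prompt, cache_write should be non-zero; repeating the same prompt within the TTL should report cache_read instead.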
64 changes: 62 additions & 2 deletions server/utils/agents/aibitat/providers/anthropic.js
@@ -25,12 +25,72 @@ class AnthropicProvider extends Provider {
super(client);

this.model = model;
this.cacheControl = this.#parseCacheControl(
process.env.ANTHROPIC_CACHE_CONTROL
);
}

get supportsAgentStreaming() {
return true;
}

/**
* Parses the cache control ENV variable
* @param {string} value - The ENV value (5m or 1h)
* @returns {null|object} Cache control configuration
*/
#parseCacheControl(value) {
if (!value) return null;
const normalized = value.toLowerCase().trim();
if (normalized === "5m" || normalized === "1h") {
return { type: "ephemeral", ttl: normalized };
}
return null;
}

/**
* Checks if content meets minimum requirements for caching
* Per Anthropic docs: minimum 1024 tokens
*
* Certain models (Haiku 3.5 and Haiku 3) have a higher minimum of 2048 tokens, but
* in testing, content around 1024 tokens can still be passed without errors;
* Anthropic simply ignores the cache breakpoint until the 2048-token minimum is met.
* https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
* @param {string} content - The content to check
* @returns {boolean}
*/
#shouldCache(content) {
if (!this.cacheControl || !content) return false;
// Rough token estimate: ~4 chars per token
// Minimum 1024 tokens = ~4096 characters
const estimatedTokens = content.length / 4;
return estimatedTokens >= 1024;
}

/**
* Builds system parameter with cache control if applicable
* @param {string} systemContent - The system prompt content
* @returns {string|array} System parameter for API call
*/
#buildSystemWithCache(systemContent) {
if (!systemContent) return systemContent;

// If caching is enabled and the content is large enough,
// apply cache control.
if (this.#shouldCache(systemContent)) {
return [
{
type: "text",
text: systemContent,
cache_control: this.cacheControl,
},
];
}

// Otherwise, return as plain string (no caching)
return systemContent;
}

#prepareMessages(messages = []) {
// Extract system prompt and filter out any system messages from the main chat.
let systemPrompt =
@@ -149,7 +209,7 @@ class AnthropicProvider extends Provider {
{
model: this.model,
max_tokens: 4096,
system: systemPrompt,
system: this.#buildSystemWithCache(systemPrompt), // Apply cache control if enabled
messages: chats,
stream: true,
...(Array.isArray(functions) && functions?.length > 0
@@ -276,7 +336,7 @@ class AnthropicProvider extends Provider {
{
model: this.model,
max_tokens: 4096,
system: systemPrompt,
system: this.#buildSystemWithCache(systemPrompt), // Apply cache control if enabled
messages: chats,
stream: false,
...(Array.isArray(functions) && functions?.length > 0