tetherto · simon-iribarren · Apr 9, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026
@@ -0,0 +1,60 @@
+import {
+  completion,
+  loadModel,
+  unloadModel,
+  LLAMA_3_2_1B_INST_Q4_0,
+  SDK_SERVER_ERROR_CODES,
+} from "@qvac/sdk";
+
+// Example: request a specific ctx_size when loading a model and report a
+// MODEL_MEMORY_EXCEEDED rejection with a hint instead of a raw error dump.
+//
+// The context size comes from the first CLI argument. A missing argument, or
+// one that Number() turns into NaN or 0, falls back to 32768.
+const requestedCtxSize = Number(process.argv[2]) || 32768;
+
+console.log(`Loading model: ${LLAMA_3_2_1B_INST_Q4_0.name}`);
+console.log(`Requested ctx_size: ${requestedCtxSize}`);
+
+try {
+  // Load the model with the requested context size, printing progress updates.
+  const modelId = await loadModel({
+    modelSrc: LLAMA_3_2_1B_INST_Q4_0,
+    modelType: "llm",
+    modelConfig: {
+      ctx_size: requestedCtxSize,
+    },
+    onProgress: (progress) =>
+      console.log(`Loading: ${progress.percentage.toFixed(1)}%`),
+  });
+
+  console.log(`Model loaded with ctx_size=${requestedCtxSize}`);
+
+  // Run one streamed completion and echo each token as it arrives.
+  const result = completion({
+    modelId,
+    history: [{ role: "user", content: "Say hello in one sentence." }],
+    stream: true,
+  });
+
+  process.stdout.write("Response: ");
+  for await (const token of result.tokenStream) {
+    process.stdout.write(token);
+  }
+  // Terminate the streamed line.
+  console.log();
+
+  // Unload the model; clearStorage is explicitly false.
+  await unloadModel({ modelId, clearStorage: false });
+} catch (error: unknown) {
+  // Errors carrying the MODEL_MEMORY_EXCEEDED code get a dedicated hint;
+  // anything else is printed as-is. Either way the process exits with status 1.
+  if (
+    error instanceof Error &&
+    "code" in error &&
+    (error as { code: number }).code ===
+      SDK_SERVER_ERROR_CODES.MODEL_MEMORY_EXCEEDED
+  ) {
+    console.error(`\nMemory validation failed: ${error.message}`);
+    console.error(
+      "The SDK detected that loading with the requested ctx_size",
+    );
+    console.error(
+      "would likely exceed available memory and crash the app.",
+    );
+    console.error("Re-run with a smaller ctx_size, for example:");
+    console.error(`  bun run examples/memory-safe-loading.ts 2048`);
+  } else {
+    console.error("Error:", error);
+  }
+  process.exit(1);
+}
@@ -17,7 +17,7 @@ const verbositySchema = z.union([
 
 // Base schema - validates types, all fields optional (for client-side validation)
 export const llmConfigBaseSchema = z.object({
-  ctx_size: z.number().optional(),
+  ctx_size: z.number().int().min(1).optional(),
   temp: z.number().min(0).max(2).optional(),
   top_p: z.number().min(0).max(1).optional(),
   top_k: z.number().int().min(0).max(128).optional(),

@@ -120,6 +120,13 @@ export interface ResolveResult<
   artifacts?: Partial<Record<K, string>>;
 }
 
+/**
+ * Arguments handed to a plugin's optional `validateBeforeLoad` hook.
+ */
+export interface ValidateBeforeLoadParams {
+  /** Model configuration as an untyped key/value record. */
+  modelConfig: Record<string, unknown>;
+  /** Size of the model file. NOTE(review): presumably bytes — confirm against the caller. */
+  modelFileSize: number;
+  /** Memory considered available for the load. NOTE(review): presumably bytes — confirm against the caller. */
+  availableMemory: number;
+  /**
+   * Optional per-token KV-cache size.
+   * NOTE(review): looks like bytes per context token, used in place of a heuristic — confirm.
+   */
+  kvBytesPerToken?: number;
+}
+
 export interface QvacPlugin<
   TConfig = Record<string, unknown>,
   TArtifactKeys extends string = string,
@@ -142,6 +149,12 @@ export interface QvacPlugin<
     modelConfig: TConfig,
     ctx: ResolveContext,
   ) => Promise<ResolveResult<TConfig, TArtifactKeys>>;
+  /**
+   * Optional hook to validate resource requirements before loading.
+   * Called after config resolution and model path resolution, before createModel.
+   * Throw to reject the load (e.g., insufficient memory for the requested ctx_size).
+   */
+  validateBeforeLoad?: (params: ValidateBeforeLoadParams) => void;
 }
 
 // Non-streaming plugin invoke
@@ -262,6 +275,7 @@ export const pluginDefinitionRuntimeSchema = z
       .catchall(z.unknown())
       .optional(),
     resolveConfig: functionRuntimeSchema.optional(),
+    validateBeforeLoad: functionRuntimeSchema.optional(),
     skipPrimaryModelPathValidation: z.boolean().optional(),
   })
   .catchall(z.unknown());

@@ -166,6 +166,14 @@ export const qvacConfigSchema = z.object({
    * ```
    */
   deviceDefaults: z.array(devicePatternSchema).optional(),
+
+  /**
+   * Disable pre-load memory validation for model loading.
+   * When set to true, the SDK skips memory estimation before loading a model,
+   * which may lead to OOM crashes if the model + KV cache exceeds available memory.
+   * Defaults to false (memory validation is enabled).
+   */
+  unsafeDisableMemoryValidation: z.boolean().optional(),
 });
 
 export type QvacConfig = z.infer<typeof qvacConfigSchema>;
@@ -16,6 +16,7 @@ export const SDK_SERVER_ERROR_CODES = {
   MODEL_FILE_LOCATE_FAILED: 52203,
   PROJECTION_MODEL_REQUIRED: 52204,
   VAD_MODEL_REQUIRED: 52205,
+  MODEL_MEMORY_EXCEEDED: 52211,
   TTS_ARTIFACTS_REQUIRED: 52208,
   TTS_REFERENCE_AUDIO_REQUIRED: 52209,
   PARAKEET_ARTIFACTS_REQUIRED: 52210,
@@ -161,6 +162,16 @@ const serverErrorDefinitions: ErrorCodesMap = {
     message: (modelType: string, modelPath: string) =>
       `Failed to locate ${modelType} model file: ${modelPath}`,
   },
+  [SDK_SERVER_ERROR_CODES.MODEL_MEMORY_EXCEEDED]: {
+    name: "MODEL_MEMORY_EXCEEDED",
+    message: (
+      estimatedMB: string,
+      availableMB: string,
+      requestedCtx: string,
+      suggestedCtx: string,
+    ) =>
+      `Model requires approximately ${estimatedMB} MB but only ${availableMB} MB is available. Try reducing ctx_size from ${requestedCtx} to ${suggestedCtx} or lower.`,
+  },
   [SDK_SERVER_ERROR_CODES.PROJECTION_MODEL_REQUIRED]: {
     name: "PROJECTION_MODEL_REQUIRED",
     message: "Projection model source is required for multimodal LLM models",

@@ -0,0 +1,104 @@
+import { LLM_CONFIG_DEFAULTS } from "@/schemas/llamacpp-config";
+
+// Unit conversion factors (binary units).
+const BYTES_PER_MB = 1024 * 1024;
+const BYTES_PER_GB = 1024 * BYTES_PER_MB;
+// Fraction of the available memory that an estimate may occupy before the
+// load is reported as unsafe.
+const MEMORY_USAGE_THRESHOLD = 0.80;
+// Upper bound applied to the context size returned by computeSafeCtxSize.
+const MAX_CTX_SIZE = 131072;
+// Overhead model used by estimateOverhead: a fixed 128 MB plus 10% of the
+// model file size.
+const OVERHEAD_FIXED_BYTES = 128 * BYTES_PER_MB;
+const OVERHEAD_MODEL_FRACTION = 0.10;
+
+// Fallback heuristic brackets when GGUF metadata is unavailable.
+// Calibrated from actual model architectures with f16 KV cache:
+//   Llama-3.2-1B  (737MB Q4):  32,768 bytes/token
+//   Llama-3.2-3B  (~2GB Q4):  114,688 bytes/token
+//   Llama-3.1-8B  (~4.5GB Q4): 131,072 bytes/token
+interface KvBytesPerTokenBracket {
+  // Exclusive upper bound on the model file size (bytes) covered by this bracket.
+  maxFileSize: number;
+  // KV-cache bytes per context token assumed for models in this bracket.
+  bytesPerToken: number;
+}
+
+// Listed in ascending maxFileSize order: estimateKvBytesPerToken returns the
+// first bracket whose bound is strictly greater than the model file size.
+// The final bracket is unbounded.
+const KV_BRACKETS: KvBytesPerTokenBracket[] = [
+  { maxFileSize: 1 * BYTES_PER_GB, bytesPerToken: 48_000 },
+  { maxFileSize: 3 * BYTES_PER_GB, bytesPerToken: 128_000 },
+  { maxFileSize: 6 * BYTES_PER_GB, bytesPerToken: 200_000 },
+  { maxFileSize: 15 * BYTES_PER_GB, bytesPerToken: 350_000 },
+  { maxFileSize: Infinity, bytesPerToken: 500_000 },
+];
+
+/**
+ * Heuristic KV-cache cost per context token, chosen from the model file size.
+ *
+ * @param modelFileSize - Model file size in bytes.
+ * @returns The `bytesPerToken` of the first bracket whose `maxFileSize` is
+ *   strictly greater than `modelFileSize`, or of the last bracket when no
+ *   bracket matches.
+ */
+export function estimateKvBytesPerToken(modelFileSize: number): number {
+  const matched = KV_BRACKETS.find(
+    ({ maxFileSize }) => modelFileSize < maxFileSize,
+  );
+  const chosen = matched ?? KV_BRACKETS[KV_BRACKETS.length - 1]!;
+  return chosen.bytesPerToken;
+}
+
+/**
+ * Estimates the memory needed on top of the model file and KV cache.
+ *
+ * @param modelFileSize - Model file size in bytes.
+ * @returns `OVERHEAD_FIXED_BYTES` plus the floored `OVERHEAD_MODEL_FRACTION`
+ *   share of `modelFileSize`.
+ */
+export function estimateOverhead(modelFileSize: number): number {
+  const proportionalBytes = Math.floor(modelFileSize * OVERHEAD_MODEL_FRACTION);
+  return OVERHEAD_FIXED_BYTES + proportionalBytes;
+}
+
+/**
+ * Picks the per-token KV-cache size to use for an estimate.
+ *
+ * A caller-supplied value wins when it is defined and greater than zero;
+ * otherwise the file-size heuristic from `estimateKvBytesPerToken` is used.
+ */
+function resolveKvBytesPerToken(
+  modelFileSize: number,
+  exactKvBytesPerToken?: number,
+): number {
+  return exactKvBytesPerToken !== undefined && exactKvBytesPerToken > 0
+    ? exactKvBytesPerToken
+    : estimateKvBytesPerToken(modelFileSize);
+}
+
+/**
+ * Estimates the total bytes needed to load a model at a given context size.
+ *
+ * @param modelFileSize - Model file size in bytes.
+ * @param ctxSize - Context size in tokens.
+ * @param exactKvBytesPerToken - Optional per-token KV-cache size; when absent
+ *   or not positive, a file-size heuristic is used instead.
+ * @returns Model file size + KV cache (`ctxSize` × bytes per token) + overhead.
+ */
+export function estimateMemoryRequired(
+  modelFileSize: number,
+  ctxSize: number,
+  exactKvBytesPerToken?: number,
+): number {
+  const kvCacheBytes =
+    ctxSize * resolveKvBytesPerToken(modelFileSize, exactKvBytesPerToken);
+  const overheadBytes = estimateOverhead(modelFileSize);
+  return modelFileSize + kvCacheBytes + overheadBytes;
+}
+
+/**
+ * Computes a context size to suggest for the given memory budget.
+ *
+ * The KV-cache budget is `availableMemory * MEMORY_USAGE_THRESHOLD` minus the
+ * model file size and the estimated overhead. The number of tokens that fits
+ * in that budget is then clamped to
+ * `[LLM_CONFIG_DEFAULTS.ctx_size, MAX_CTX_SIZE]`.
+ *
+ * Because of the lower clamp, the result is never below the default context
+ * size — including when the budget is zero or negative — so the returned
+ * value is not guaranteed to fit in memory.
+ *
+ * @param availableMemory - Memory available for the load, in bytes.
+ * @param modelFileSize - Model file size in bytes.
+ * @param exactKvBytesPerToken - Optional per-token KV-cache size; when absent
+ *   or not positive, a file-size heuristic is used instead.
+ */
+export function computeSafeCtxSize(
+  availableMemory: number,
+  modelFileSize: number,
+  exactKvBytesPerToken?: number,
+): number {
+  const minimumCtx = LLM_CONFIG_DEFAULTS.ctx_size;
+  const kvBudget =
+    availableMemory * MEMORY_USAGE_THRESHOLD -
+    modelFileSize -
+    estimateOverhead(modelFileSize);
+
+  // Nothing left for the KV cache: fall back to the default context size.
+  if (kvBudget <= 0) {
+    return minimumCtx;
+  }
+
+  const perTokenBytes = resolveKvBytesPerToken(modelFileSize, exactKvBytesPerToken);
+  const tokensThatFit = Math.floor(kvBudget / perTokenBytes);
+  const atLeastMinimum = Math.max(tokensThatFit, minimumCtx);
+  return Math.min(atLeastMinimum, MAX_CTX_SIZE);
+}
+
+/**
+ * Outcome of `validateMemoryForModel`.
+ */
+export interface MemoryValidationResult {
+  /**
+   * True when the estimate is within the usage threshold, or when the
+   * available memory passed in was zero or negative.
+   */
+  safe: boolean;
+  /** Estimated bytes required for the model at the requested context size. */
+  estimatedBytes: number;
+  /** The `availableMemory` value that was passed in. */
+  availableBytes: number;
+  /** The context size that was validated. */
+  requestedCtxSize: number;
+  /**
+   * Equal to the requested context size when `safe`; otherwise a suggestion
+   * derived from `computeSafeCtxSize`.
+   */
+  suggestedCtxSize: number;
+  /** True when a positive `exactKvBytesPerToken` was supplied by the caller. */
+  exact: boolean;
+}
+
+/**
+ * Checks whether a model is expected to fit in memory at a given context size.
+ *
+ * The load is considered safe when the estimated requirement is at most
+ * `availableMemory * MEMORY_USAGE_THRESHOLD`. A zero or negative
+ * `availableMemory` is always reported as safe, since there is nothing to
+ * compare the estimate against.
+ *
+ * @param modelFileSize - Model file size in bytes.
+ * @param ctxSize - Requested context size in tokens.
+ * @param availableMemory - Memory available for the load, in bytes.
+ * @param exactKvBytesPerToken - Optional per-token KV-cache size; when absent
+ *   or not positive, a file-size heuristic is used instead.
+ * @returns The verdict, the numbers behind it, and a suggested context size
+ *   that never exceeds the requested one.
+ */
+export function validateMemoryForModel(
+  modelFileSize: number,
+  ctxSize: number,
+  availableMemory: number,
+  exactKvBytesPerToken?: number,
+): MemoryValidationResult {
+  const estimatedBytes = estimateMemoryRequired(modelFileSize, ctxSize, exactKvBytesPerToken);
+  const threshold = availableMemory * MEMORY_USAGE_THRESHOLD;
+  const safe = availableMemory <= 0 || estimatedBytes <= threshold;
+
+  let suggestedCtxSize = ctxSize;
+  if (!safe) {
+    const computedCtxSize = computeSafeCtxSize(
+      availableMemory,
+      modelFileSize,
+      exactKvBytesPerToken,
+    );
+    // computeSafeCtxSize never returns less than the default ctx_size, so for
+    // a request that is already below that default it would "suggest" a larger
+    // context than the one that just failed. Cap the suggestion at the request
+    // so callers are never told to reduce ctx_size to a bigger number.
+    // (Written as a comparison so a NaN ctxSize still yields the computed value.)
+    suggestedCtxSize = ctxSize < computedCtxSize ? ctxSize : computedCtxSize;
+  }
+
+  return {
+    safe,
+    estimatedBytes,
+    availableBytes: availableMemory,
+    requestedCtxSize: ctxSize,
+    suggestedCtxSize,
+    exact: exactKvBytesPerToken !== undefined && exactKvBytesPerToken > 0,
+  };
+}
@@ -11,10 +11,12 @@ import {
   translateResponseSchema,
   ModelType,
   llmConfigBaseSchema,
+  LLM_CONFIG_DEFAULTS,
   ADDON_LLM,
   type CreateModelParams,
   type PluginModelResult,
   type ResolveContext,
+  type ValidateBeforeLoadParams,
   type LlmConfig,
   type LlmConfigInput,
 } from "@/schemas";
@@ -26,6 +28,9 @@ import { completion } from "@/server/bare/plugins/llamacpp-completion/ops/comple
 import { finetune } from "@/server/bare/plugins/llamacpp-completion/ops/finetune";
 import { translate } from "@/server/bare/ops/translate";
 import { attachModelExecutionMs } from "@/profiling/model-execution";
+import { validateMemoryForModel } from "@/server/bare/plugins/llamacpp-completion/memory-estimator";
+import { ModelMemoryExceededError } from "@/utils/errors-server";
+import { getServerLogger } from "@/logging";
 
 function transformLlmConfig(llmConfig: LlmConfig) {
   const transformed = JSON.parse(
@@ -109,6 +114,34 @@ export const llmPlugin = definePlugin({
     };
   },
 
+  /**
+   * Rejects the load when the estimated memory footprint is not considered safe.
+   *
+   * Reads `ctx_size` from the model config (falling back to
+   * `LLM_CONFIG_DEFAULTS.ctx_size`), runs `validateMemoryForModel`, and logs a
+   * warning when the load is allowed but the estimate exceeds 60% of the
+   * available memory.
+   *
+   * @throws ModelMemoryExceededError when the validation result is not safe.
+   */
+  validateBeforeLoad(params: ValidateBeforeLoadParams) {
+    const ctxSize =
+      (params.modelConfig["ctx_size"] as number | undefined) ??
+      LLM_CONFIG_DEFAULTS.ctx_size;
+    const result = validateMemoryForModel(
+      params.modelFileSize,
+      ctxSize,
+      params.availableMemory,
+      params.kvBytesPerToken,
+    );
+
+    if (!result.safe) {
+      throw new ModelMemoryExceededError(
+        result.estimatedBytes,
+        result.availableBytes,
+        result.requestedCtxSize,
+        result.suggestedCtxSize,
+      );
+    }
+
+    // validateMemoryForModel reports `safe` for a zero or negative
+    // availableBytes because there is nothing to compare against. Without the
+    // `> 0` guard every such load would log a misleading
+    // "... with 0 MB available" warning, since any estimate exceeds 0 * 0.6.
+    const highUsageBytes = result.availableBytes * 0.6;
+    if (result.availableBytes > 0 && result.estimatedBytes > highUsageBytes) {
+      const toMB = (bytes: number) => Math.round(bytes / (1024 * 1024));
+      getServerLogger().warn(
+        `Memory usage will be high: estimated ${toMB(result.estimatedBytes)} MB ` +
+          `with ${toMB(result.availableBytes)} MB available`,
+      );
+    }
+  },
+
   createModel(params: CreateModelParams): PluginModelResult {
     const llmConfig = (params.modelConfig ?? {}) as LlmConfig;
 

@@ -23,6 +23,7 @@ const configRegistry: QvacConfig = {
   httpDownloadConcurrency: undefined,
   registryDownloadMaxRetries: undefined,
   deviceDefaults: undefined,
+  unsafeDisableMemoryValidation: undefined,
 };
 
 let configIsSet = false;
@@ -116,6 +117,19 @@ export function setSDKConfig(config: QvacConfig) {
     );
   }
 
+  if (
+    config.unsafeDisableMemoryValidation !== undefined &&
+    config.unsafeDisableMemoryValidation !== null
+  ) {
+    configRegistry.unsafeDisableMemoryValidation =
+      config.unsafeDisableMemoryValidation;
+    if (config.unsafeDisableMemoryValidation) {
+      logger.warn(
+        "Memory validation disabled via unsafeDisableMemoryValidation — OOM crashes may occur",
+      );
+    }
+  }
+
   // Mark config as set - now it's immutable
   configIsSet = true;
 }
@@ -131,3 +145,7 @@ function getDefaultCacheDir() {
 export function getConfiguredCacheDir(): string {
   return configRegistry.cacheDirectory || getDefaultCacheDir();
 }
+
+/**
+ * Returns true unless `unsafeDisableMemoryValidation` in the config registry
+ * is exactly `true`; an unset value therefore leaves validation enabled.
+ */
+export function isMemoryValidationEnabled(): boolean {
+  const explicitlyDisabled =
+    configRegistry.unsafeDisableMemoryValidation === true;
+  return !explicitlyDisabled;
+}
@@ -0,0 +1,52 @@
+import os from "bare-os";
+
+/**
+ * Snapshot of host platform details returned by `getPlatformInfo`.
+ */
+export interface PlatformInfo {
+  /** Value of `os.platform()`, or "unknown" when that call throws. */
+  os: string;
+  /** Value of `os.arch()`, or "unknown" when that call throws. */
+  arch: string;
+  /** Value of `os.totalmem()`, or 0 when that call throws. */
+  totalMemory: number;
+  /**
+   * Estimated, not measured: a platform-dependent fraction of `totalMemory`,
+   * or 0 when `totalMemory` is zero or negative.
+   */
+  availableMemory: number;
+}
+
+/**
+ * Invokes `fn` and returns its result; if `fn` throws, the error is discarded
+ * and `fallback` is returned instead.
+ */
+function safeCall<T>(fn: () => T, fallback: T): T {
+  let outcome = fallback;
+  try {
+    outcome = fn();
+  } catch {
+    // Keep the fallback already stored in `outcome`.
+  }
+  return outcome;
+}
+
+// On macOS (and iOS), os.freemem() returns only truly unallocated pages
+// ("Pages free" from vm_stat), which is always tiny because the OS
+// aggressively caches files in inactive/purgeable memory. The actual
+// memory available for new allocations is much larger.
+// We use totalMemory * fraction as a realistic estimate instead.
+// This fraction is applied on every platform other than "ios" and "android".
+const AVAILABLE_MEMORY_FRACTION_DESKTOP = 0.7;
+// Modern iOS (iPhone 12+) allows ~60-70% of total RAM before jetsam kill.
+// Android varies but typically 50-65%. We use 65% as a balanced estimate.
+// This fraction is applied when the platform is "ios" or "android".
+const AVAILABLE_MEMORY_FRACTION_MOBILE = 0.65;
+
+/**
+ * Estimates usable memory as a fixed fraction of total memory.
+ *
+ * @param totalMemory - Total system memory; zero or negative yields 0.
+ * @param platform - Platform identifier; "ios" and "android" use the mobile
+ *   fraction, everything else the desktop fraction.
+ * @returns The floored product of `totalMemory` and the chosen fraction.
+ */
+function estimateAvailableMemory(
+  totalMemory: number,
+  platform: string,
+): number {
+  if (totalMemory <= 0) {
+    return 0;
+  }
+
+  switch (platform) {
+    case "ios":
+    case "android":
+      return Math.floor(totalMemory * AVAILABLE_MEMORY_FRACTION_MOBILE);
+    default:
+      return Math.floor(totalMemory * AVAILABLE_MEMORY_FRACTION_DESKTOP);
+  }
+}
+
+/**
+ * Collects platform, architecture and memory figures for the host.
+ *
+ * Each `os` call is wrapped so that a throwing call degrades to "unknown"
+ * (strings) or 0 (total memory) instead of propagating. Available memory is
+ * derived from the total via `estimateAvailableMemory`.
+ */
+export function getPlatformInfo(): PlatformInfo {
+  const platformName = safeCall(() => os.platform(), "unknown");
+  const totalBytes = safeCall(() => os.totalmem(), 0);
+  const architecture = safeCall(() => os.arch(), "unknown");
+  const availableBytes = estimateAvailableMemory(totalBytes, platformName);
+
+  return {
+    os: platformName,
+    arch: architecture,
+    totalMemory: totalBytes,
+    availableMemory: availableBytes,
+  };
+}