Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions packages/sdk/examples/memory-safe-loading.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import {
completion,
loadModel,
unloadModel,
LLAMA_3_2_1B_INST_Q4_0,
SDK_SERVER_ERROR_CODES,
} from "@qvac/sdk";

// Example: request a (possibly too large) context size and show how the SDK's
// pre-load memory validation error can be recognised and reported.

// Used when no usable numeric argument is supplied on the command line.
const DEFAULT_CTX_SIZE = 32768;

// First CLI argument (argv[2]); a missing, non-numeric or zero value falls
// back to the default because of the `||`.
const [, , ctxArg] = process.argv;
const requestedCtxSize = Number(ctxArg) || DEFAULT_CTX_SIZE;

/**
 * Returns true when `error` is an Error carrying the SDK's
 * MODEL_MEMORY_EXCEEDED code.
 */
function isMemoryExceededError(
  error: unknown,
): error is Error & { code: number } {
  if (!(error instanceof Error)) return false;
  if (!("code" in error)) return false;
  return (
    (error as { code: number }).code ===
    SDK_SERVER_ERROR_CODES.MODEL_MEMORY_EXCEEDED
  );
}

console.log(`Loading model: ${LLAMA_3_2_1B_INST_Q4_0.name}`);
console.log(`Requested ctx_size: ${requestedCtxSize}`);

try {
  const modelId = await loadModel({
    modelSrc: LLAMA_3_2_1B_INST_Q4_0,
    modelType: "llm",
    modelConfig: { ctx_size: requestedCtxSize },
    onProgress: ({ percentage }) =>
      console.log(`Loading: ${percentage.toFixed(1)}%`),
  });

  console.log(`Model loaded with ctx_size=${requestedCtxSize}`);

  // Stream a short completion to prove the loaded model is usable.
  const { tokenStream } = completion({
    modelId,
    history: [{ role: "user", content: "Say hello in one sentence." }],
    stream: true,
  });

  process.stdout.write("Response: ");
  for await (const piece of tokenStream) {
    process.stdout.write(piece);
  }
  console.log();

  await unloadModel({ modelId, clearStorage: false });
} catch (error: unknown) {
  if (!isMemoryExceededError(error)) {
    console.error("Error:", error);
  } else {
    // One console.error call per line, in the same order as the guidance text.
    const guidance = [
      `\nMemory validation failed: ${error.message}`,
      "The SDK detected that loading with the requested ctx_size",
      "would likely exceed available memory and crash the app.",
      "Re-run with a smaller ctx_size, for example:",
      ` bun run examples/memory-safe-loading.ts 2048`,
    ];
    for (const line of guidance) {
      console.error(line);
    }
  }
  // Any failure (memory-related or not) ends the example with a non-zero code.
  process.exit(1);
}
2 changes: 1 addition & 1 deletion packages/sdk/schemas/llamacpp-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ const verbositySchema = z.union([

// Base schema - validates types, all fields optional (for client-side validation)
export const llmConfigBaseSchema = z.object({
ctx_size: z.number().optional(),
ctx_size: z.number().int().min(1).optional(),
temp: z.number().min(0).max(2).optional(),
top_p: z.number().min(0).max(1).optional(),
top_k: z.number().int().min(0).max(128).optional(),
Expand Down
14 changes: 14 additions & 0 deletions packages/sdk/schemas/plugin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,13 @@ export interface ResolveResult<
artifacts?: Partial<Record<K, string>>;
}

/**
 * Arguments handed to a plugin's optional `validateBeforeLoad` hook.
 *
 * NOTE(review): units are not stated here; the llama.cpp completion plugin
 * forwards the numeric fields to its memory estimator, which treats them as
 * byte counts — confirm for other plugins.
 */
export interface ValidateBeforeLoadParams {
/** Model configuration; the llama.cpp completion plugin reads `ctx_size` from it. */
modelConfig: Record<string, unknown>;
/** Size of the model file (presumably bytes — see note above). */
modelFileSize: number;
/** Memory considered available for the load (presumably bytes — see note above). */
availableMemory: number;
/**
 * Optional KV-cache cost per context token. When omitted, the llama.cpp
 * completion plugin's estimator falls back to a file-size based heuristic.
 */
kvBytesPerToken?: number;
}

export interface QvacPlugin<
TConfig = Record<string, unknown>,
TArtifactKeys extends string = string,
Expand All @@ -142,6 +149,12 @@ export interface QvacPlugin<
modelConfig: TConfig,
ctx: ResolveContext,
) => Promise<ResolveResult<TConfig, TArtifactKeys>>;
/**
* Optional hook to validate resource requirements before loading.
* Called after config resolution and model path resolution, before createModel.
* Throw to reject the load (e.g., insufficient memory for the requested ctx_size).
*/
validateBeforeLoad?: (params: ValidateBeforeLoadParams) => void;
}

// Non-streaming plugin invoke
Expand Down Expand Up @@ -262,6 +275,7 @@ export const pluginDefinitionRuntimeSchema = z
.catchall(z.unknown())
.optional(),
resolveConfig: functionRuntimeSchema.optional(),
validateBeforeLoad: functionRuntimeSchema.optional(),
skipPrimaryModelPathValidation: z.boolean().optional(),
})
.catchall(z.unknown());
Expand Down
8 changes: 8 additions & 0 deletions packages/sdk/schemas/sdk-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,14 @@ export const qvacConfigSchema = z.object({
* ```
*/
deviceDefaults: z.array(devicePatternSchema).optional(),

/**
* Disable pre-load memory validation for model loading.
* When set to true, the SDK skips memory estimation before loading a model,
* which may lead to OOM crashes if the model + KV cache exceeds available memory.
* Defaults to false (memory validation is enabled).
*/
unsafeDisableMemoryValidation: z.boolean().optional(),
});

/** Static TypeScript type of the SDK configuration, inferred from `qvacConfigSchema`. */
export type QvacConfig = z.infer<typeof qvacConfigSchema>;
11 changes: 11 additions & 0 deletions packages/sdk/schemas/sdk-errors-server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ export const SDK_SERVER_ERROR_CODES = {
MODEL_FILE_LOCATE_FAILED: 52203,
PROJECTION_MODEL_REQUIRED: 52204,
VAD_MODEL_REQUIRED: 52205,
MODEL_MEMORY_EXCEEDED: 52211,
TTS_ARTIFACTS_REQUIRED: 52208,
TTS_REFERENCE_AUDIO_REQUIRED: 52209,
PARAKEET_ARTIFACTS_REQUIRED: 52210,
Expand Down Expand Up @@ -161,6 +162,16 @@ const serverErrorDefinitions: ErrorCodesMap = {
message: (modelType: string, modelPath: string) =>
`Failed to locate ${modelType} model file: ${modelPath}`,
},
[SDK_SERVER_ERROR_CODES.MODEL_MEMORY_EXCEEDED]: {
name: "MODEL_MEMORY_EXCEEDED",
message: (
estimatedMB: string,
availableMB: string,
requestedCtx: string,
suggestedCtx: string,
) =>
`Model requires approximately ${estimatedMB} MB but only ${availableMB} MB is available. Try reducing ctx_size from ${requestedCtx} to ${suggestedCtx} or lower.`,
},
[SDK_SERVER_ERROR_CODES.PROJECTION_MODEL_REQUIRED]: {
name: "PROJECTION_MODEL_REQUIRED",
message: "Projection model source is required for multimodal LLM models",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import { LLM_CONFIG_DEFAULTS } from "@/schemas/llamacpp-config";

// Binary size units.
const BYTES_PER_MB = 1024 ** 2;
const BYTES_PER_GB = BYTES_PER_MB * 1024;
// Share of the available memory an estimated load may occupy.
const MEMORY_USAGE_THRESHOLD = 0.8;
// Upper clamp applied to suggested context sizes.
const MAX_CTX_SIZE = 131072;
// Overhead model: a fixed part plus a fraction of the model file size.
const OVERHEAD_FIXED_BYTES = 128 * BYTES_PER_MB;
const OVERHEAD_MODEL_FRACTION = 0.1;

/** One row of the fallback table: files smaller than `maxFileSize` cost `bytesPerToken`. */
interface KvBytesPerTokenBracket {
  maxFileSize: number;
  bytesPerToken: number;
}

// Fallback heuristic used when GGUF metadata is unavailable. Each row is
// [exclusive upper file size in GB, KV-cache bytes per token].
// Calibration points (f16 KV cache, actual model architectures):
//   Llama-3.2-1B (737MB Q4):   32,768 bytes/token
//   Llama-3.2-3B (~2GB Q4):   114,688 bytes/token
//   Llama-3.1-8B (~4.5GB Q4): 131,072 bytes/token
const KV_BRACKET_ROWS: ReadonlyArray<readonly [number, number]> = [
  [1, 48_000],
  [3, 128_000],
  [6, 200_000],
  [15, 350_000],
  [Infinity, 500_000],
];

const KV_BRACKETS: KvBytesPerTokenBracket[] = KV_BRACKET_ROWS.map(
  ([maxGb, bytesPerToken]) => ({
    maxFileSize: maxGb * BYTES_PER_GB,
    bytesPerToken,
  }),
);

/**
 * Heuristic KV-cache cost per context token, chosen from the model file size.
 *
 * @param modelFileSize - Model file size in bytes.
 * @returns The `bytesPerToken` of the first bracket whose `maxFileSize` is
 *   strictly greater than `modelFileSize`; the last bracket's value when no
 *   bracket matches.
 */
export function estimateKvBytesPerToken(modelFileSize: number): number {
  const match = KV_BRACKETS.find(
    (bracket) => modelFileSize < bracket.maxFileSize,
  );
  const chosen = match ?? KV_BRACKETS[KV_BRACKETS.length - 1]!;
  return chosen.bytesPerToken;
}

/**
 * Estimates memory overhead on top of the model weights and KV cache.
 *
 * @param modelFileSize - Model file size in bytes.
 * @returns `OVERHEAD_FIXED_BYTES` plus `OVERHEAD_MODEL_FRACTION` of the file
 *   size, the proportional part rounded down to a whole byte.
 */
export function estimateOverhead(modelFileSize: number): number {
  const proportionalBytes = Math.floor(modelFileSize * OVERHEAD_MODEL_FRACTION);
  return OVERHEAD_FIXED_BYTES + proportionalBytes;
}

/**
 * Picks the KV-cache bytes-per-token figure to use.
 *
 * @param modelFileSize - Model file size in bytes (only used for the fallback).
 * @param exactKvBytesPerToken - Caller-supplied figure; used only when it is
 *   defined and greater than zero.
 * @returns The supplied figure when usable, otherwise the file-size heuristic.
 */
function resolveKvBytesPerToken(
  modelFileSize: number,
  exactKvBytesPerToken?: number,
): number {
  // `!(x > 0)` rather than `x <= 0` so that NaN also takes the fallback path.
  if (exactKvBytesPerToken === undefined || !(exactKvBytesPerToken > 0)) {
    return estimateKvBytesPerToken(modelFileSize);
  }
  return exactKvBytesPerToken;
}

/**
 * Estimates the total bytes needed to load a model with a given context size.
 *
 * @param modelFileSize - Model file size in bytes.
 * @param ctxSize - Requested context size in tokens.
 * @param exactKvBytesPerToken - Optional KV-cache bytes per token; the
 *   file-size heuristic is used when this is missing or not positive.
 * @returns Model file size + KV cache (`ctxSize` × bytes per token) + overhead.
 */
export function estimateMemoryRequired(
  modelFileSize: number,
  ctxSize: number,
  exactKvBytesPerToken?: number,
): number {
  const perTokenBytes = resolveKvBytesPerToken(
    modelFileSize,
    exactKvBytesPerToken,
  );
  const overheadBytes = estimateOverhead(modelFileSize);
  return modelFileSize + ctxSize * perTokenBytes + overheadBytes;
}

/**
 * Computes a context size to suggest for the given memory budget.
 *
 * The KV budget is `MEMORY_USAGE_THRESHOLD` of `availableMemory` minus the
 * model file size and the estimated overhead.
 *
 * Note: the result is never below `LLM_CONFIG_DEFAULTS.ctx_size`. When the
 * budget is exhausted, or only fits fewer tokens than the default, the default
 * is returned even though it may not fit in memory.
 *
 * @param availableMemory - Memory available for the load, in bytes.
 * @param modelFileSize - Model file size in bytes.
 * @param exactKvBytesPerToken - Optional KV-cache bytes per token; the
 *   file-size heuristic is used when this is missing or not positive.
 * @returns A context size in `[LLM_CONFIG_DEFAULTS.ctx_size, MAX_CTX_SIZE]`.
 */
export function computeSafeCtxSize(
  availableMemory: number,
  modelFileSize: number,
  exactKvBytesPerToken?: number,
): number {
  const minimumCtx = LLM_CONFIG_DEFAULTS.ctx_size;
  const kvBudget =
    availableMemory * MEMORY_USAGE_THRESHOLD -
    modelFileSize -
    estimateOverhead(modelFileSize);

  // Nothing left for the KV cache: fall back to the default context size.
  if (kvBudget <= 0) {
    return minimumCtx;
  }

  const perTokenBytes = resolveKvBytesPerToken(
    modelFileSize,
    exactKvBytesPerToken,
  );
  const fittingCtx = Math.floor(kvBudget / perTokenBytes);
  const atLeastDefault = Math.max(fittingCtx, minimumCtx);
  return Math.min(atLeastDefault, MAX_CTX_SIZE);
}

/** Outcome of `validateMemoryForModel`. */
export interface MemoryValidationResult {
/** True when available memory is non-positive or the estimate is within the usage threshold. */
safe: boolean;
/** Estimated bytes required (model file + KV cache + overhead). */
estimatedBytes: number;
/** The `availableMemory` value that was passed in, unchanged. */
availableBytes: number;
/** The `ctxSize` value that was passed in, unchanged. */
requestedCtxSize: number;
/** Equals the requested size when `safe`; otherwise a value derived from `computeSafeCtxSize`. */
suggestedCtxSize: number;
/** True when a positive `exactKvBytesPerToken` was supplied instead of the file-size heuristic. */
exact: boolean;
}

/**
 * Checks whether a model with the requested context size is expected to fit
 * into the available memory.
 *
 * The load is considered safe when the estimate stays within
 * `MEMORY_USAGE_THRESHOLD` of `availableMemory`. A non-positive
 * `availableMemory` is also reported as safe, so no load is rejected in that
 * case.
 *
 * @param modelFileSize - Model file size in bytes.
 * @param ctxSize - Requested context size in tokens.
 * @param availableMemory - Memory available for the load, in bytes.
 * @param exactKvBytesPerToken - Optional KV-cache bytes per token; the
 *   file-size heuristic is used when this is missing or not positive.
 * @returns The validation verdict together with the figures it was based on.
 *   When unsafe, `suggestedCtxSize` is never larger than `ctxSize`.
 */
export function validateMemoryForModel(
  modelFileSize: number,
  ctxSize: number,
  availableMemory: number,
  exactKvBytesPerToken?: number,
): MemoryValidationResult {
  const estimatedBytes = estimateMemoryRequired(
    modelFileSize,
    ctxSize,
    exactKvBytesPerToken,
  );
  const threshold = availableMemory * MEMORY_USAGE_THRESHOLD;
  const safe = availableMemory <= 0 || estimatedBytes <= threshold;

  let suggestedCtxSize = ctxSize;
  if (!safe) {
    const computed = computeSafeCtxSize(
      availableMemory,
      modelFileSize,
      exactKvBytesPerToken,
    );
    // computeSafeCtxSize never goes below the default ctx_size, so for small
    // requests it can exceed what was asked for. Suggesting a larger context
    // to someone who is already out of memory is misleading; cap it at the
    // requested size.
    suggestedCtxSize = computed > ctxSize ? ctxSize : computed;
  }

  return {
    safe,
    estimatedBytes,
    availableBytes: availableMemory,
    requestedCtxSize: ctxSize,
    suggestedCtxSize,
    exact: exactKvBytesPerToken !== undefined && exactKvBytesPerToken > 0,
  };
}
33 changes: 33 additions & 0 deletions packages/sdk/server/bare/plugins/llamacpp-completion/plugin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@ import {
translateResponseSchema,
ModelType,
llmConfigBaseSchema,
LLM_CONFIG_DEFAULTS,
ADDON_LLM,
type CreateModelParams,
type PluginModelResult,
type ResolveContext,
type ValidateBeforeLoadParams,
type LlmConfig,
type LlmConfigInput,
} from "@/schemas";
Expand All @@ -26,6 +28,9 @@ import { completion } from "@/server/bare/plugins/llamacpp-completion/ops/comple
import { finetune } from "@/server/bare/plugins/llamacpp-completion/ops/finetune";
import { translate } from "@/server/bare/ops/translate";
import { attachModelExecutionMs } from "@/profiling/model-execution";
import { validateMemoryForModel } from "@/server/bare/plugins/llamacpp-completion/memory-estimator";
import { ModelMemoryExceededError } from "@/utils/errors-server";
import { getServerLogger } from "@/logging";

function transformLlmConfig(llmConfig: LlmConfig) {
const transformed = JSON.parse(
Expand Down Expand Up @@ -109,6 +114,34 @@ export const llmPlugin = definePlugin({
};
},

/**
 * Pre-load memory check for llama.cpp models.
 *
 * Estimates the memory needed for the configured `ctx_size` (or the default
 * when none is set). Logs a warning when the load is allowed but the estimate
 * exceeds 60% of a known (positive) available-memory figure.
 *
 * @throws ModelMemoryExceededError when the estimate does not fit.
 */
validateBeforeLoad(params: ValidateBeforeLoadParams) {
  const ctxSize =
    (params.modelConfig["ctx_size"] as number | undefined) ??
    LLM_CONFIG_DEFAULTS.ctx_size;
  const result = validateMemoryForModel(
    params.modelFileSize,
    ctxSize,
    params.availableMemory,
    params.kvBytesPerToken,
  );

  if (!result.safe) {
    throw new ModelMemoryExceededError(
      result.estimatedBytes,
      result.availableBytes,
      result.requestedCtxSize,
      result.suggestedCtxSize,
    );
  }

  // validateMemoryForModel reports `safe` when availableBytes is non-positive.
  // Without the `> 0` guard the comparison below would then always be true
  // and every load would log a bogus "... with 0 MB available" warning.
  const highUsage =
    result.availableBytes > 0 &&
    result.estimatedBytes > result.availableBytes * 0.6;

  if (highUsage) {
    getServerLogger().warn(
      `Memory usage will be high: estimated ${Math.round(result.estimatedBytes / (1024 * 1024))} MB ` +
        `with ${Math.round(result.availableBytes / (1024 * 1024))} MB available`,
    );
  }
},

createModel(params: CreateModelParams): PluginModelResult {
const llmConfig = (params.modelConfig ?? {}) as LlmConfig;

Expand Down
18 changes: 18 additions & 0 deletions packages/sdk/server/bare/registry/config-registry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ const configRegistry: QvacConfig = {
httpDownloadConcurrency: undefined,
registryDownloadMaxRetries: undefined,
deviceDefaults: undefined,
unsafeDisableMemoryValidation: undefined,
};

let configIsSet = false;
Expand Down Expand Up @@ -116,6 +117,19 @@ export function setSDKConfig(config: QvacConfig) {
);
}

if (
config.unsafeDisableMemoryValidation !== undefined &&
config.unsafeDisableMemoryValidation !== null
) {
configRegistry.unsafeDisableMemoryValidation =
config.unsafeDisableMemoryValidation;
if (config.unsafeDisableMemoryValidation) {
logger.warn(
"Memory validation disabled via unsafeDisableMemoryValidation — OOM crashes may occur",
);
}
}

// Mark config as set - now it's immutable
configIsSet = true;
}
Expand All @@ -131,3 +145,7 @@ function getDefaultCacheDir() {
/**
 * Returns the cache directory to use.
 *
 * @returns The configured `cacheDirectory` when it is set to a non-empty
 *   value, otherwise the result of `getDefaultCacheDir()`.
 */
export function getConfiguredCacheDir(): string {
  const configured = configRegistry.cacheDirectory;
  return configured ? configured : getDefaultCacheDir();
}

/**
 * Reports whether pre-load memory validation should run.
 *
 * @returns `false` only when `unsafeDisableMemoryValidation` is exactly
 *   `true`; an unset value leaves validation enabled.
 */
export function isMemoryValidationEnabled(): boolean {
  const explicitlyDisabled =
    configRegistry.unsafeDisableMemoryValidation === true;
  return !explicitlyDisabled;
}
52 changes: 52 additions & 0 deletions packages/sdk/server/bare/utils/platform.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import os from "bare-os";

/** Snapshot of host platform details as assembled by `getPlatformInfo`. */
export interface PlatformInfo {
/** Result of `os.platform()`, or "unknown" when that call throws. */
os: string;
/** Result of `os.arch()`, or "unknown" when that call throws. */
arch: string;
/** Result of `os.totalmem()` (presumably bytes — confirm against bare-os), or 0 when that call throws. */
totalMemory: number;
/** Estimated usable memory: a platform-dependent fraction of `totalMemory`, not a measured free-memory value. */
availableMemory: number;
}

/**
 * Runs `fn` and returns its result, substituting `fallback` if it throws.
 *
 * @param fn - Callback to invoke.
 * @param fallback - Value returned when `fn` throws; the thrown value is discarded.
 * @returns Whatever `fn` returned, or `fallback` on failure.
 */
function safeCall<T>(fn: () => T, fallback: T): T {
  let outcome = fallback;
  try {
    outcome = fn();
  } catch {
    // Deliberately ignored: callers only want a best-effort value.
  }
  return outcome;
}

// Why a fraction of total memory instead of os.freemem():
// on macOS (and iOS) os.freemem() reports only truly unallocated pages
// ("Pages free" in vm_stat). That number is always tiny because the OS keeps
// file caches in inactive/purgeable memory, so far more memory is actually
// available for new allocations than freemem suggests.
const AVAILABLE_MEMORY_FRACTION_DESKTOP = 0.7;
// Modern iOS (iPhone 12+) tolerates roughly 60-70% of total RAM before a
// jetsam kill; Android varies but is typically 50-65%. 65% is used as a
// balanced estimate for both.
const AVAILABLE_MEMORY_FRACTION_MOBILE = 0.65;

/**
 * Estimates how much memory can realistically be used on this platform.
 *
 * @param totalMemory - Total system memory; values <= 0 yield 0.
 * @param platform - Platform identifier; "ios" and "android" use the mobile
 *   fraction, everything else the desktop fraction.
 * @returns `totalMemory` times the platform fraction, rounded down.
 */
function estimateAvailableMemory(
  totalMemory: number,
  platform: string,
): number {
  if (totalMemory <= 0) {
    return 0;
  }

  let fraction: number;
  switch (platform) {
    case "ios":
    case "android":
      fraction = AVAILABLE_MEMORY_FRACTION_MOBILE;
      break;
    default:
      fraction = AVAILABLE_MEMORY_FRACTION_DESKTOP;
      break;
  }

  return Math.floor(totalMemory * fraction);
}

/**
 * Collects platform details, tolerating failures of the underlying `os` calls.
 *
 * Each lookup goes through `safeCall`, so a throwing call yields "unknown"
 * (platform, arch) or 0 (total memory) instead of propagating.
 *
 * @returns Platform name, architecture, total memory and the estimated
 *   available memory derived from the total.
 */
export function getPlatformInfo(): PlatformInfo {
  const detectedPlatform = safeCall(() => os.platform(), "unknown");
  const totalBytes = safeCall(() => os.totalmem(), 0);
  const detectedArch = safeCall(() => os.arch(), "unknown");

  const info: PlatformInfo = {
    os: detectedPlatform,
    arch: detectedArch,
    totalMemory: totalBytes,
    availableMemory: estimateAvailableMemory(totalBytes, detectedPlatform),
  };
  return info;
}
Loading
Loading