diff --git a/pkg/llama/context.go b/pkg/llama/context.go
index e523813..85303af 100644
--- a/pkg/llama/context.go
+++ b/pkg/llama/context.go
@@ -23,7 +23,8 @@ var FFITypeContextParams = ffi.NewType(
 	&ffi.TypePointer, &ffi.TypePointer,
 	&ffi.TypeUint8, &ffi.TypeUint8,
 	&ffi.TypeUint8, &ffi.TypeUint8,
-	&ffi.TypeUint8, &ffi.TypeUint8)
+	&ffi.TypeUint8, &ffi.TypeUint8,
+	&ffi.TypePointer, &ffi.TypeUint32)
 
 var (
 	// LLAMA_API struct llama_context_params llama_context_default_params(void);
diff --git a/pkg/llama/llama.go b/pkg/llama/llama.go
index 68f172a..7a75cbf 100644
--- a/pkg/llama/llama.go
+++ b/pkg/llama/llama.go
@@ -341,6 +341,11 @@ type ContextParams struct {
 	OpOffload uint8 // offload host tensor operations to device
 	SwaFull   uint8 // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 	KVUnified uint8 // use a unified buffer across the input sequences when computing the attentions
+	// [EXPERIMENTAL]
+	// backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
+	// note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
+	Samplers  uintptr // llama_sampler_seq_config *
+	NSamplers uint32  // number of sampler chains
 }
 
 // Model quantize parameters