 # //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
 # LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
 #
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -419,6 +420,7 @@
 # LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35
 LLAMA_FTYPE_MOSTLY_TQ1_0 = 36
 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
+LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
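
The new constant can be passed to the quantization API bound further down in this module. A minimal sketch, with placeholder file paths; whether an arbitrary model quantizes cleanly to MXFP4 depends on the underlying libllama build, so treat this as illustrative only:

```python
import ctypes
import llama_cpp

# Quantize a GGUF file to the new MXFP4 MoE format (ftype 38).
# Both paths are hypothetical placeholders.
params = llama_cpp.llama_model_quantize_default_params()
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_MXFP4_MOE

ret = llama_cpp.llama_model_quantize(
    b"model-f16.gguf",    # input path (placeholder)
    b"model-mxfp4.gguf",  # output path (placeholder)
    ctypes.byref(params),
)
assert ret == 0, "quantization failed"
```
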
@@ -691,6 +693,7 @@ class llama_model_kv_override(ctypes.Structure):
 # bool use_mmap; // use mmap if possible
 # bool use_mlock; // force system to keep model in RAM
 # bool check_tensors; // validate model tensor data
+# bool use_extra_bufts; // use extra buffer types (used for weight repacking)
 # };
 class llama_model_params(ctypes.Structure):
     """Parameters for llama_model
@@ -708,7 +711,8 @@ class llama_model_params(ctypes.Structure):
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM
-        check_tensors (bool): validate model tensor data"""
+        check_tensors (bool): validate model tensor data
+        use_extra_bufts (bool): use extra buffer types (used for weight repacking)"""
 
     if TYPE_CHECKING:
         devices: CtypesArray[ctypes.c_void_p]  # NOTE: unused
@@ -724,6 +728,7 @@ class llama_model_params(ctypes.Structure):
         use_mmap: bool
         use_mlock: bool
         check_tensors: bool
+        use_extra_bufts: bool
 
     _fields_ = [
         ("devices", ctypes.c_void_p),  # NOTE: unused
@@ -739,6 +744,7 @@ class llama_model_params(ctypes.Structure):
         ("use_mmap", ctypes.c_bool),
         ("use_mlock", ctypes.c_bool),
         ("check_tensors", ctypes.c_bool),
+        ("use_extra_bufts", ctypes.c_bool),
     ]
 
 
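Since `use_extra_bufts` is a plain ctypes bool, it can be toggled on the default params before loading. A minimal sketch with a placeholder model path; upstream llama.cpp enables this field by default at the time of writing, so setting it to `False` opts out of runtime weight repacking (e.g. the Q4_0 repack noted in the ftype comment above):

```python
import llama_cpp

llama_cpp.llama_backend_init()

mparams = llama_cpp.llama_model_default_params()
# Disable extra buffer types (CPU weight repacking).
mparams.use_extra_bufts = False

# Hypothetical model path.
model = llama_cpp.llama_model_load_from_file(b"model.gguf", mparams)
```
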
@@ -787,6 +793,9 @@ class llama_model_params(ctypes.Structure):
 # bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 # // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
 # // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+# bool kv_unified; // use a unified buffer across the input sequences when computing the attention
+# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+# // ref: https://github.com/ggml-org/llama.cpp/pull/14363
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -821,6 +830,7 @@ class llama_context_params(ctypes.Structure):
         no_perf (bool): whether to measure performance timings
         op_offload (bool): offload host tensor operations to device
         swa_full (bool): use full-size SWA cache
+        kv_unified (bool): use a unified buffer across the input sequences when computing the attention
     """
 
     if TYPE_CHECKING:
@@ -853,6 +863,7 @@ class llama_context_params(ctypes.Structure):
         no_perf: bool
         op_offload: bool
         swa_full: bool
+        kv_unified: bool
 
     _fields_ = [
         ("n_ctx", ctypes.c_uint32),
@@ -884,6 +895,7 @@ class llama_context_params(ctypes.Structure):
         ("no_perf", ctypes.c_bool),
         ("op_offload", ctypes.c_bool),
         ("swa_full", ctypes.c_bool),
+        ("kv_unified", ctypes.c_bool),
     ]
 
 
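The new context flag is set the same way. A minimal sketch, assuming `model` is a `llama_model_p` handle from `llama_model_load_from_file()`; per the header comment above, consider leaving it disabled when `n_seq_max > 1` and the sequences do not share a large prefix:

```python
import llama_cpp

cparams = llama_cpp.llama_context_default_params()
cparams.n_seq_max = 4
# Unified KV buffer across sequences; see ggml-org/llama.cpp#14363.
cparams.kv_unified = False

# `model` is assumed to be a previously loaded llama_model_p handle.
ctx = llama_cpp.llama_init_from_model(model, cparams)
```
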
@@ -1651,6 +1663,14 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool:
     ...
 
 
+# // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+# LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+@ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool)
+def llama_model_is_diffusion(model: llama_model_p, /) -> bool:
+    """Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)"""
+    ...
+
+
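A small usage sketch for the new predicate (the model path is a placeholder). Diffusion LMs such as LLaDA and Dream generate by iterative denoising rather than left-to-right sampling, so a caller might branch on this flag:

```python
import llama_cpp

llama_cpp.llama_backend_init()
mparams = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_model_load_from_file(b"model.gguf", mparams)  # placeholder path

if llama_cpp.llama_model_is_diffusion(model):
    # e.g. drive an iterative denoising loop instead of autoregressive decoding
    print("diffusion-based model")
else:
    print("autoregressive model")
```
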
 # // Returns 0 on success
 # LLAMA_API uint32_t llama_model_quantize(
 #         const char * fname_inp,
@@ -2833,6 +2853,7 @@ def llama_synchronize(ctx: llama_context_p, /):
 # // in the order they have appeared in the batch.
 # // Rows: number of tokens for which llama_batch.logits[i] != 0
 # // Cols: n_vocab
+# // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -2873,6 +2894,7 @@ def llama_get_logits_ith(
 # // in the order they have appeared in the batch.
 # // shape: [n_outputs*n_embd]
 # // Otherwise, returns NULL.
+# // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
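Given the two deprecation TODOs added above, new code should prefer the `*_ith` accessors. A minimal sketch reading the logits of the last output in a decoded batch; it assumes `ctx` and `model` are live handles and that `llama_decode()` has already run with logits enabled for that token:

```python
import llama_cpp

# `ctx` / `model` are assumed to be live llama_context_p / llama_model_p handles.
vocab = llama_cpp.llama_model_get_vocab(model)
n_vocab = llama_cpp.llama_vocab_n_tokens(vocab)

logits = llama_cpp.llama_get_logits_ith(ctx, -1)  # -1 = last output in the batch
best = max(range(n_vocab), key=lambda i: logits[i])
print(f"argmax token id: {best}")

# Embeddings work the same way (requires embeddings=True in the context params):
# emb = llama_cpp.llama_get_embeddings_ith(ctx, -1)
```
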
@@ -3020,6 +3042,13 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token:
     ...
 
 
+# LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
+@ctypes_function("llama_vocab_mask", [llama_vocab_p_ctypes], llama_token)
+def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
+    """mask"""
+    ...
+
+
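The new accessor mirrors `llama_vocab_bos`/`eos`/`pad` and returns the vocabulary's mask token (used by diffusion models, which fill in masked positions). A sketch assuming a loaded `model` handle; like the other special-token getters, it is expected to return `LLAMA_TOKEN_NULL` (-1) when the vocab defines no mask token:

```python
import llama_cpp

# `model` is assumed to be a loaded llama_model_p handle.
vocab = llama_cpp.llama_model_get_vocab(model)

mask_id = llama_cpp.llama_vocab_mask(vocab)
if mask_id == llama_cpp.LLAMA_TOKEN_NULL:  # -1: vocab has no mask token
    print("no mask token in this vocab")
else:
    print(f"mask token id: {mask_id}")
```
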
 # LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
 @ctypes_function(
     "llama_vocab_get_add_bos",
@@ -4176,6 +4205,7 @@ def llama_log_set(
 
 # int32_t n_p_eval;
 # int32_t n_eval;
+# int32_t n_reused; // number of times a ggml compute graph had been reused
 # };
 class llama_perf_context_data(ctypes.Structure):
     _fields_ = [
@@ -4185,6 +4215,7 @@ class llama_perf_context_data(ctypes.Structure):
         ("t_eval_ms", ctypes.c_double),
         ("n_p_eval", ctypes.c_int32),
         ("n_eval", ctypes.c_int32),
+        ("n_reused", ctypes.c_int32),
     ]
 
 
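The new counter is read like the existing perf fields, via `llama_perf_context()`, which returns this struct by value. A minimal sketch, assuming `ctx` is a live context that has processed some batches; graph reuse means the ggml compute graph did not have to be rebuilt between similar decode calls:

```python
import llama_cpp

# `ctx` is assumed to be a live llama_context_p that has processed some batches.
perf = llama_cpp.llama_perf_context(ctx)
print(f"prompt tokens evaluated: {perf.n_p_eval}")
print(f"tokens generated:        {perf.n_eval}")
print(f"compute graphs reused:   {perf.n_reused}")
```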