83 commits
b760272
hexagon: guard HMX clock request for v75+ platforms (#22377)
trivikram-reddy1 Apr 26, 2026
f454bd7
opencl: add iq4_nl support (#22272)
lhez Apr 26, 2026
2dd8416
ggml-cpu: optimize avx2 q6_k (#22345)
netrunnereve Apr 26, 2026
0c6ee1c
ggml-cpu : re-enable fast gelu_quick_f16 (#22339)
CISC Apr 26, 2026
b1a5bd4
CUDA: better coalesce data-access for contiguous concat (#22330)
ORippler Apr 26, 2026
7ec36aa
Github: set meta backend code owner (#22388)
JohannesGaessler Apr 26, 2026
78433f6
Fix recurrent state serialization for partial reads and writes (#22362)
gaugarg-nv Apr 26, 2026
06a811d
add performance-portable tuning for register-tile and subgroup matmul…
SharmaRithik Apr 26, 2026
f535774
pr2wt : symlink .pi (#22386)
ggerganov Apr 26, 2026
5594d13
common: fix missing exports in llama-common (#22340)
max-krasnyansky Apr 27, 2026
f84270e
ggml : use 64 bytes aligned tile buffers (#21058)
angt Apr 27, 2026
d13540b
convert : remove input_scale for dequantized fp8 modelopt (#22356)
CISC Apr 27, 2026
0f1bb60
model : remove duplicate wo_s scale after build_attn (Qwen3, LLaMA) (…
ynankani Apr 27, 2026
e940b3d
download : prefer q8_0 when q4_k not available (#22428)
ggerganov Apr 27, 2026
42401c7
Fix type casting for unaccounted memory calculation (#22424)
rankaiyx Apr 27, 2026
ceaf47c
fix: rpc-server cache may not work in Windows environments (#22394)
unraido Apr 27, 2026
4414c04
Additional test for common/gemma4 : handle parsing edge cases (#22420)
hextriclosan Apr 27, 2026
665abc6
add fast mat-vec kernels for i-quants (#22344)
SharmaRithik Apr 27, 2026
983ca89
server: (router) Forward form-data to model server (Fixes #22044) (#2…
tha80 Apr 27, 2026
434b2a1
ggml-webgpu: add Q1_0 support (#22374)
SharmaRithik Apr 27, 2026
516e8d7
server: use pos_next instead of n_tokens for m-rope (#22439)
am17an Apr 28, 2026
14e733e
spec : refactor params (#22397)
ggerganov Apr 28, 2026
c3e08f4
CANN: add new ops, optimize existing ops (#21204)
hipudding Apr 28, 2026
d530d6e
ggml : revert to -lm linking instead of find_library (#22355)
angt Apr 28, 2026
50494a2
ggml : skip already registered backends and devices (#22296)
angt Apr 28, 2026
698d19b
ggml: improve SPIR-V headers detection with __has_include (#21918)
EmilAskerov Apr 28, 2026
1982117
vulkan: add barrier after writetimestamp (#21865)
jeffbolznv Apr 28, 2026
f42e29f
webui: Server tools (#21237)
allozaur Apr 28, 2026
fd9eb46
Add DeepSeek V4 GGUF conversion
nisparks Apr 25, 2026
8e6ee61
Optimize GGUF conversion paths
nisparks Apr 25, 2026
a922f7d
Bring up native FP4 FP8 quant support
nisparks Apr 25, 2026
5ec1ff6
WIP DeepSeek V4 runtime support
nisparks Apr 26, 2026
bc341ef
Implement DeepSeek4 runtime state save
nisparks Apr 26, 2026
43de75b
Tune DeepSeek4 F8 scale decode
nisparks Apr 26, 2026
f69bf66
Port DeepSeek4 performance hot paths
nisparks Apr 26, 2026
781246c
Tune fused DeepSeek4 F8 MMVQ
nisparks Apr 26, 2026
14660a6
Add CUDA warp TOP_K fast path
nisparks Apr 26, 2026
c2744c7
Tune DeepSeek4 F8 row blocking
nisparks Apr 26, 2026
a299c77
Tune Q8 activation quantization
nisparks Apr 26, 2026
55acc5b
Tune DeepSeek4 copy and RMSNorm kernels
nisparks Apr 26, 2026
056d7a5
Avoid DeepSeek4 hc_post vector transpose
nisparks Apr 26, 2026
f29cbee
Add DeepSeek4 HC weighted sum op
nisparks Apr 26, 2026
ee9e652
Improve prompt cache reuse for full-removal memory
nisparks Apr 26, 2026
b990219
Test prompt cache full-removal allocation
nisparks Apr 26, 2026
cdbc7ba
Broaden HC weighted-sum test shapes
nisparks Apr 26, 2026
a4ef65f
Avoid FP8 packer scale expansion temporary
nisparks Apr 26, 2026
0bc9344
Avoid MXFP4 packer nibble expansion
nisparks Apr 26, 2026
ec69799
Validate DeepSeek4 native scale storage
nisparks Apr 26, 2026
a7d9255
Harden HC weighted-sum shape checks
nisparks Apr 26, 2026
0afe2c9
Cover F8 in CPU unsupported op switches
nisparks Apr 26, 2026
8c8641f
Complete F8 CPU op switch coverage
nisparks Apr 26, 2026
ba5dcb0
Test DeepSeek4 native packers
nisparks Apr 26, 2026
86a851b
Keep DeepSeek4 packer test Python 3.8 compatible
nisparks Apr 26, 2026
32bec0e
Add MoE selective-copy trace logging
nisparks Apr 27, 2026
12ae263
Add MoE copy LRU simulator
nisparks Apr 27, 2026
a635524
Keep MoE LRU simulator within slot budget
nisparks Apr 27, 2026
85a5596
Report MoE LRU cache footprint
nisparks Apr 27, 2026
6f45300
Validate MoE LRU trace metadata
nisparks Apr 27, 2026
868cd1f
Guard MoE LRU simulator byte accounting
nisparks Apr 27, 2026
933ea33
Prototype MoE LRU expert cache
nisparks Apr 27, 2026
f069485
Harden MoE cache slot parsing
nisparks Apr 27, 2026
6309b76
Clear MoE cache tensor metadata
nisparks Apr 27, 2026
3944c54
Summarize MoE runtime cache logs
nisparks Apr 27, 2026
6f2ca57
Document MoE LRU simulator modes
nisparks Apr 27, 2026
bf84c10
Key MoE ID cache by expert count
nisparks Apr 27, 2026
a3f6b9e
Guard empty MoE selective copies
nisparks Apr 27, 2026
ce3917d
Log MoE cache bypass reasons
nisparks Apr 27, 2026
8ff0511
Summarize MoE cache bypass logs
nisparks Apr 27, 2026
75962ee
Cover MoE bypass-only runtime logs
nisparks Apr 27, 2026
f38a6f2
Report MoE bypass slots
nisparks Apr 27, 2026
f79849d
Group MoE runtime stats by slots
nisparks Apr 27, 2026
506016f
Report MoE runtime cache footprint
nisparks Apr 27, 2026
65986f3
Validate MoE runtime cache footprint
nisparks Apr 27, 2026
a60eb34
Experiment with MoE LRU cache balancing
nisparks Apr 27, 2026
6c85a6d
Simulate speculative MoE expert prefetch
nisparks Apr 27, 2026
ccc7cb7
Prototype MoE set-Markov cache retention
nisparks Apr 27, 2026
dda2c9b
Prime MoE cache from prompt bypasses
nisparks Apr 27, 2026
f696ef5
Tune IQ4_XS MMVQ row blocking
nisparks Apr 27, 2026
1245d8d
Skip IQ4_XS Q8 activation sums
nisparks Apr 27, 2026
7209909
Fix DeepSeek4 arch smoke coverage
nisparks Apr 27, 2026
19e7b86
Make backend ops smoke bounded
nisparks Apr 27, 2026
95dba05
Optimize DeepSeek V4 native cache and reasoning
nisparks Apr 28, 2026
f8a7572
fix(server): prevent GGML_ABORT when prompt cache pos_min == -1
May 2, 2026
9 changes: 5 additions & 4 deletions CODEOWNERS
@@ -53,28 +53,29 @@
/examples/speculative/ @ggerganov
/ggml/cmake/ @ggerganov
/ggml/include/ @ggerganov
/ggml/src/ggml-backend-meta.cpp @JohannesGaessler
/ggml/src/ggml-cann/ @ggml-org/ggml-cann
/ggml/src/ggml-common.h @ggerganov
/ggml/src/ggml-cpu/ @ggerganov
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
/ggml/src/ggml-cuda/ @ggml-org/ggml-cuda
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
/ggml/src/ggml-hip/ @IMbackK
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
/ggml/src/ggml-impl.h @ggerganov
/ggml/src/ggml-metal/ @ggml-org/ggml-metal
/ggml/src/ggml-opencl/ @ggml-org/ggml-opencl
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @ggml-org/ggml-rpc
/ggml/src/ggml-sycl/ @ggml-org/ggml-sycl
/ggml/src/ggml-threading.* @ggerganov
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
/ggml/src/ggml-virtgpu/ @kpouget
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
/ggml/src/ggml-webgpu/ @ggml-org/ggml-webgpu
/ggml/src/ggml-zdnn/ @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
/ggml/src/ggml.c @ggerganov
/ggml/src/ggml.cpp @ggerganov
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
582 changes: 371 additions & 211 deletions common/arg.cpp

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions common/arg.h
@@ -25,7 +25,8 @@ struct common_arg {
const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr;
std::string help;
bool is_sparam = false; // is current arg a sampling param?
bool is_sampling = false; // is current arg a sampling param?
bool is_spec = false; // is current arg a speculative decoding param?
bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
void (*handler_void) (common_params & params) = nullptr;
void (*handler_string) (common_params & params, const std::string &) = nullptr;
@@ -74,7 +75,8 @@ struct common_arg {
common_arg & set_examples(std::initializer_list<enum llama_example> examples);
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
common_arg & set_env(const char * env);
common_arg & set_sparam();
common_arg & set_sampling();
common_arg & set_spec();
common_arg & set_preset_only();
bool in_example(enum llama_example ex);
bool is_exclude(enum llama_example ex);
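In this PR, set_sparam() is split into set_sampling() and set_spec() so that speculative-decoding arguments can be tagged separately from sampling arguments, mirroring the new is_sampling / is_spec flags above. A minimal registration sketch, assuming the common_arg constructor shape used in common/arg.cpp; the option names and target fields are illustrative, not taken from this diff:

common_arg(
    {"--temp"}, "N",
    "sampling temperature",
    [](common_params & params, const std::string & value) {
        params.sampling.temp = std::stof(value); // assumed field layout
    }
).set_sampling(); // previously: .set_sparam()

common_arg(
    {"--draft-max"}, "N",
    "maximum number of tokens to draft",
    [](common_params & params, const std::string & value) {
        params.speculative.draft.n_max = std::stoi(value); // field from the common.h change below
    }
).set_spec(); // new tag for speculative-decoding params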
14 changes: 7 additions & 7 deletions common/common.cpp
@@ -70,7 +70,7 @@ common_time_meas::~common_time_meas() {
// CPU utils
//

int32_t cpu_get_num_physical_cores() {
int32_t common_cpu_get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
@@ -185,11 +185,11 @@ static int cpu_count_math_cpus(int n_cpu) {
/**
* Returns number of CPUs on system that are useful for math.
*/
int32_t cpu_get_num_math() {
int32_t common_cpu_get_num_math() {
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
if (n_cpu < 1) {
return cpu_get_num_physical_cores();
return common_cpu_get_num_physical_cores();
}
if (is_hybrid_cpu()) {
cpu_set_t affinity;
@@ -202,7 +202,7 @@ int32_t cpu_get_num_math() {
}
}
#endif
return cpu_get_num_physical_cores();
return common_cpu_get_num_physical_cores();
}

// Helper for setting process priority
@@ -263,15 +263,15 @@ bool set_process_priority(enum ggml_sched_priority prio) {
//


void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model) {
int32_t n_set = 0;

if (cpuparams.n_threads < 0) {
// Assuming everything about cpuparams is invalid
if (role_model != nullptr) {
cpuparams = *role_model;
} else {
cpuparams.n_threads = cpu_get_num_math();
cpuparams.n_threads = common_cpu_get_num_math();
}
}

@@ -1521,7 +1521,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
return cparams;
}

struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params) {
struct ggml_threadpool_params tpp;

ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
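The CPU helpers are renamed with the library-wide common_ prefix (cpu_get_num_physical_cores and cpu_get_num_math become common_cpu_get_num_physical_cores and common_cpu_get_num_math), and postprocess_cpu_params now operates on the renamed common_cpu_params struct. A hedged caller-side sketch of the update:

// before: cpuparams.n_threads = cpu_get_num_math();
common_cpu_params cpuparams;                     // struct renamed in common.h
cpuparams.n_threads = common_cpu_get_num_math(); // renamed helper

postprocess_cpu_params(cpuparams); // defaults and mask validation, now on the renamed types

struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(cpuparams);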
92 changes: 56 additions & 36 deletions common/common.h
@@ -54,7 +54,7 @@ struct common_control_vector_load_info;
// CPU utils
//

struct cpu_params {
struct common_cpu_params {
int n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU
@@ -63,8 +63,8 @@ struct cpu_params {
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};

int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
int32_t common_cpu_get_num_physical_cores();
int32_t common_cpu_get_num_math();

//
// Common params
@@ -297,60 +297,80 @@ struct common_params_model {

struct common_ngram_mod;

struct common_params_speculative {
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding

// general-purpose speculative decoding parameters

int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)

// ngram-based speculative decoding

uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed

std::shared_ptr<common_ngram_mod> ngram_mod;
// draft-model-based speculative decoding parameters
struct common_params_speculative_draft {
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding

std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)

// draft-model speculative decoding
common_params_model mparams;

struct common_params_model mparams_dft;
llama_model * model = nullptr; // a llama_model that can be shared by multiple speculative contexts

llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts

llama_context_params cparams_dft; // these are the parameters for the draft llama_context
llama_context_params cparams; // these are the parameters for the draft llama_context

int32_t n_ctx = 0; // draft context size
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
common_cpu_params cpuparams;
common_cpu_params cpuparams_batch;

std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
};

struct common_params_speculative_ngram_mod {
int32_t n_match = 24;

int32_t n_max = 64;
int32_t n_min = 48;

// shared instance of the ngram container for all speculative decoding contexts
std::shared_ptr<common_ngram_mod> obj;
};

struct common_params_speculative_ngram_map {
uint16_t size_n = 12; // ngram size for lookup
uint16_t size_m = 48; // mgram size for speculative tokens
uint16_t min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
};

struct common_params_speculative_ngram_cache {
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding
};

struct common_params_speculative {
// TODO: become a vector in order to support "chains of speculators"
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE;

common_params_speculative_draft draft;

common_params_speculative_ngram_mod ngram_mod;
common_params_speculative_ngram_map ngram_simple;
common_params_speculative_ngram_map ngram_map_k;
common_params_speculative_ngram_map ngram_map_k4v;

common_params_speculative_ngram_cache ngram_cache;

bool has_dft() const {
return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
}
};

struct common_params_vocoder {
struct common_params_model model;

std::string speaker_file = ""; // speaker file path // NOLINT
std::string speaker_file; // speaker file path

bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy
};

struct common_params_diffusion {
@@ -433,8 +453,8 @@ struct common_params {

enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
common_cpu_params cpuparams;
common_cpu_params cpuparams_batch;

ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
@@ -678,7 +698,7 @@ std::string common_params_get_system_info(const common_params & params);

bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);

//
@@ -846,7 +866,7 @@ common_init_result_ptr common_init_from_params(common_params & params);

struct llama_model_params common_model_params_to_llama ( common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
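The previously flat common_params_speculative is reorganized: the draft-model fields move into a nested common_params_speculative_draft, and the ngram fields split into per-strategy structs (ngram_mod, ngram_simple, ngram_map_k, ngram_map_k4v) plus a separate cache struct. A before/after sketch of the field paths implied by this header, assuming the enclosing common_params member is named speculative as in upstream llama.cpp; the values are illustrative:

common_params params;

// before: params.speculative.n_max, params.speculative.mparams_dft, params.speculative.cparams_dft
params.speculative.draft.n_max        = 16;
params.speculative.draft.p_min        = 0.75f;
params.speculative.draft.mparams.path = "models/draft.gguf"; // illustrative path

// ngram-based strategies are now grouped per variant
params.speculative.ngram_simple.size_n = 12; // was ngram_size_n
params.speculative.ngram_mod.n_match   = 24;

if (params.speculative.has_dft()) {
    // a draft model is configured; build its context from params.speculative.draft.cparams
}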
57 changes: 40 additions & 17 deletions common/debug.cpp
@@ -1,9 +1,38 @@
#include "debug.h"

#include "common.h"
#include "log.h"

#include <cmath>
#include <regex>
#include <string>
#include <vector>

struct common_debug_cb_user_data::impl {
std::vector<uint8_t> data;
std::vector<std::regex> tensor_filters;
bool abort_on_nan{false};
};

common_debug_cb_user_data::common_debug_cb_user_data() : pimpl(std::make_unique<impl>()) {}
common_debug_cb_user_data::~common_debug_cb_user_data() = default;

common_debug_cb_user_data::common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan)
: pimpl(std::make_unique<impl>())
{
for (const auto & pattern : filter_patterns) {
try {
std::string anchored_pattern = "^" + pattern;
pimpl->tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
} catch (const std::regex_error & e) {
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
}
}
pimpl->abort_on_nan = abort_on_nan;

params.cb_eval = common_debug_cb_eval;
params.cb_eval_user_data = this;
}

static std::string common_ggml_ne_string(const ggml_tensor * t) {
std::string str;
@@ -47,8 +76,7 @@ static float common_ggml_get_float_value(const uint8_t * data,

#define INDENT " "

template <bool abort>
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n, bool abort_on_nan) {
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -94,7 +122,7 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
LOG(INDENT "sum = %f\n", sum);
}

if constexpr (abort) {
if (abort_on_nan) {
if (std::isnan(sum)) {
LOG("encountered NaN - aborting\n");
exit(0);
Expand All @@ -112,8 +140,9 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (base_callback_data *) user_data;
bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (common_debug_cb_user_data *) user_data;
auto * pimpl = cb_data->pimpl.get();

const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
@@ -122,10 +151,10 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
return true; // Always retrieve data
}

bool matches_filter = cb_data->tensor_filters.empty();
bool matches_filter = pimpl->tensor_filters.empty();

if (!matches_filter) {
for (const auto & filter : cb_data->tensor_filters) {
for (const auto & filter : pimpl->tensor_filters) {
if (std::regex_search(t->name, filter)) {
matches_filter = true;
break;
@@ -148,20 +177,14 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b

if (!is_host) {
auto n_bytes = ggml_nbytes(t);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
pimpl->data.resize(n_bytes);
ggml_backend_tensor_get(t, pimpl->data.data(), 0, n_bytes);
}

if (!ggml_is_quantized(t->type) && matches_filter) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
uint8_t * data = is_host ? (uint8_t *) t->data : pimpl->data.data();
common_debug_print_tensor(data, t->type, t->ne, t->nb, 3, pimpl->abort_on_nan);
}

return true;
}

// Explicit template instantiations
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
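The debug callback drops its compile-time template parameter in favor of a runtime abort_on_nan flag kept behind a pimpl, which is why the explicit instantiations above are deleted. A usage sketch based on the constructor shown earlier in this diff; the filter patterns are illustrative:

common_params params;
std::vector<std::string> filters = { "attn", "ffn_out" }; // compiled as anchored regexes ^attn, ^ffn_out

// the constructor registers common_debug_cb_eval and this object on
// params.cb_eval / params.cb_eval_user_data
common_debug_cb_user_data debug_data(params, filters, /*abort_on_nan=*/true);

// during graph evaluation, non-quantized tensors whose names match a filter
// are printed, and a NaN sum exits at runtime instead of via a template branch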