Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1073,6 +1073,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.completion = true;
}
));
// --profile: boolean switch that turns profiling on for the listed examples
add_opt(
    common_arg(
        {"--profile"},
        "enable cross-backend profiling (CPU, BLAS, CUDA)",
        [](common_params & p) { p.profiling = true; }
    ).set_examples({
        LLAMA_EXAMPLE_CLI,
        LLAMA_EXAMPLE_SERVER,
        LLAMA_EXAMPLE_COMPLETION,
        LLAMA_EXAMPLE_DEBUG
    })
);
// --profile-output FNAME: remembers the output path and implicitly enables profiling,
// so the flag can be used on its own without --profile
add_opt(
    common_arg(
        {"--profile-output"}, "FNAME",
        "write profiling JSON output to FNAME (default: stdout)",
        [](common_params & p, const std::string & fname) {
            p.profiling_output = fname;
            p.profiling        = true;
        }
    ).set_examples({
        LLAMA_EXAMPLE_CLI,
        LLAMA_EXAMPLE_SERVER,
        LLAMA_EXAMPLE_COMPLETION,
        LLAMA_EXAMPLE_DEBUG
    })
);
add_opt(common_arg(
{"--verbose-prompt"},
string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
Expand Down
9 changes: 9 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "gguf.h"

#include "common.h"
#include "ggml-profiler.h"
#include "log.h"
#include "llama.h"
#include "sampling.h"
Expand Down Expand Up @@ -1231,6 +1232,14 @@ common_init_result::common_init_result(common_params & params) :
return;
}

// Turn on scheduler-level profiling when the user asked for it (params.profiling).
// This runs on the freshly created context, before it is handed over to pimpl->context.
if (params.profiling) {
    ggml_backend_sched_t sched = llama_context_get_sched(lctx);
    if (sched != nullptr) {
        ggml_backend_sched_set_profiling(sched, true);
        LOG_INF("%s: profiling enabled\n", __func__);
    } else {
        // do not drop the request silently: tell the user that the flag had no effect
        LOG_WRN("%s: profiling was requested but the context has no scheduler, profiling stays disabled\n", __func__);
    }
}

pimpl->context.reset(lctx);
}

Expand Down
5 changes: 5 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#pragma once

#include "ggml-opt.h"
#include "ggml-profiler.h"
#include "ggml.h"
#include "llama-cpp.h"

Expand Down Expand Up @@ -669,6 +670,10 @@ struct common_params {

bool spm_infill = false; // suffix/prefix/middle pattern for infill

// profiling
// both fields are filled in by the --profile / --profile-output command-line options
bool profiling = false; // enable cross-backend profiling
std::string profiling_output; // path to write profiling JSON output (empty = no file; examples/debug then prints a human-readable summary to stdout instead)

// batched-bench params
bool batched_bench_output_jsonl = false;

Expand Down
17 changes: 17 additions & 0 deletions examples/debug/debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,23 @@ int main(int argc, char ** argv) {
return 1;
}

// When profiling was requested, emit what the scheduler collected:
// a printed summary if no output path was given, otherwise a JSON file at that path.
// A failed export is only reported, it does not abort the run.
if (params.profiling) {
    if (ggml_backend_sched_t sched = llama_context_get_sched(ctx)) {
        const auto & out_path = params.profiling_output;
        if (out_path.empty()) {
            ggml_backend_sched_print_profiling(sched);
        } else if (ggml_backend_sched_export_profiling_json(sched, out_path.c_str()) == 0) {
            LOG("\nProfiling data exported to: %s\n", out_path.c_str());
        } else {
            LOG_ERR("\nFailed to export profiling data to: %s\n", out_path.c_str());
        }
    }
}

LOG("\n");
llama_perf_context_print(ctx);

Expand Down
17 changes: 17 additions & 0 deletions ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,23 @@ extern "C" {

// use only reference implementations
bool use_ref;

// profiler context (set by backend when profiling is enabled, NULL otherwise)
// when non-NULL, the compute loop will record per-node timing
// opaque to this header: it is only meant to be handed back to profiling_record_fn
void * profiling_context;

// callback for recording a profile record from C code (set by backend when profiling)
// params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4]
//   context   - presumably the profiling_context field above - TODO confirm
//   type      - event kind as a plain int; presumably a ggml_profile_event_type value - TODO confirm
//   name      - name of the recorded operation
//   split_id  - graph split the record belongs to
//   start_ns  - start of the timed interval (nanoseconds, going by the _ns suffix)
//   end_ns    - end of the timed interval (nanoseconds, going by the _ns suffix)
//   bytes     - byte count associated with the record
//   extra     - optional additional text
//   ne_src0/1 - 4-element dimension arrays, presumably of the node's first two sources - TODO confirm
// NOTE(review): whether this pointer may be NULL while profiling_context is non-NULL is not
// stated here - callers should probably check both
void (*profiling_record_fn)(void * context,
int type,
const char * name,
int split_id,
uint64_t start_ns,
uint64_t end_ns,
uint64_t bytes,
const char * extra,
const int64_t ne_src0[4],
const int64_t ne_src1[4]);
};

// numa strategies
Expand Down
104 changes: 104 additions & 0 deletions ggml/include/ggml-profiler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#pragma once

#include "ggml-backend.h"
#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

//
// Profiler
//

// Profile event types
// Tags a ggml_profile_record with the kind of interval that was timed.
// No explicit values are assigned, so the enumerators are numbered from 0 in declaration order.
enum ggml_profile_event_type {
GGML_PROFILE_EVENT_OP, // single operation execution (computation kernel)
GGML_PROFILE_EVENT_COPY, // data transfer between devices
};

// A single profiling record representing a timed interval
// The interval is delimited by start_ns and end_ns; presumably end_ns >= start_ns and the
// duration is end_ns - start_ns - TODO confirm against the code that fills records in.
// NOTE(review): name and extra are plain const char pointers; who owns the strings and how long
// they stay valid is not stated in this header - confirm before keeping records around.
typedef struct ggml_profile_record {
enum ggml_profile_event_type type;
const char * name; // operation name (e.g., "mul_mat", "copy_H2D")
int backend_id; // scheduler's backend index (0 = highest priority)
int split_id; // which graph split (0..n_splits-1)
uint64_t start_ns; // start timestamp in nanoseconds
uint64_t end_ns; // end timestamp in nanoseconds
uint64_t bytes; // bytes transferred (for copy) or tensor size (for ops)
const char * extra; // fusion name for fused ops, or NULL
int64_t ne_src0[4]; // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT)
int64_t ne_src1[4]; // src[1] tensor dimensions (e.g. input matrix for MUL_MAT)
} ggml_profile_record;

// Backend profiler interface - each backend optionally implements this
// to provide fine-grained operation timing
// It is a table of function pointers plus an opaque context; every callback takes a
// `void * context` argument, presumably the `context` member below - TODO confirm.
// NOTE(review): thread-safety of the callbacks and whether individual pointers may be NULL
// are not specified in this header.
struct ggml_backend_profiler {
void * context; // backend-specific profiler context

// Enable or disable profiling on this backend
void (*enable)(void * context, bool enable);

// Clear all recorded data
void (*reset)(void * context);

// Set the current split ID (called by scheduler before graph_compute)
void (*set_split_id)(void * context, int split_id);

// Get recorded profiling data
// Returns the number of records; sets *out to point to internal storage
// The returned pointer remains valid until the next reset or disable call
// (the storage is not handed over to the caller, so it must not be freed or modified)
int (*get_records)(void * context, const ggml_profile_record ** out);

// Free the profiler context
void (*free_context)(void * context);
};

// Handle type used by the API: a pointer to the interface struct above
typedef struct ggml_backend_profiler * ggml_backend_profiler_t;

// Register a profiler on a backend (called by backend during init)
// The profiler is owned by the backend and will be freed when the backend is freed
// NOTE(review): what happens to a previously registered profiler when this is called again
// is not stated here - confirm in the implementation
GGML_API void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler);

// Get the profiler associated with a backend (returns NULL if none)
// The result is a borrowed handle: ownership stays with the backend (see above)
GGML_API ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend);

//
// Scheduler profiling API
//

// Enable or disable profiling on a scheduler
// When enabled, the scheduler will:
// - Time data copy operations between backends
// - Enable profiling on all backends that support it
// - Collect profiling records from all backends after each graph compute
GGML_API void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable);

// Check if profiling is enabled on a scheduler
GGML_API bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched);

// Get profiling data from the last graph compute
// Records are owned by the scheduler; valid until the next compute or reset
// Returns the number of records
// `records` is an output parameter: it receives a pointer to the scheduler-owned array,
// which the caller must not free
GGML_API int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records);

// Print a human-readable summary of the last profiling run to stdout
// Groups records by operation name and shows total/count/min/max/avg time
GGML_API void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched);

// Reset profiling data (clear all recorded data)
// Record pointers obtained earlier from ggml_backend_sched_get_profiling_records are
// no longer valid afterwards (see the lifetime note on that function)
GGML_API void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched);

// Get current time in nanoseconds (for manual profiling if needed)
// NOTE(review): the clock source / epoch is not specified here - presumably only differences
// between two calls are meaningful
GGML_API uint64_t ggml_profiler_time_ns(void);

// Export profiling data as JSON to a file
// Returns 0 on success, -1 on error
// `filepath` is the path of the file to write
GGML_API int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath);

// Export profiling data as JSON to a FILE pointer
// NOTE(review): the return value is not documented; presumably it follows the same
// 0 on success / -1 on error convention as the export function above - confirm
GGML_API int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp);

#ifdef __cplusplus
}
#endif
2 changes: 2 additions & 0 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -195,12 +195,14 @@ add_library(ggml-base
../include/ggml-backend.h
../include/ggml-cpp.h
../include/ggml-opt.h
../include/ggml-profiler.h
../include/gguf.h
ggml.c
ggml.cpp
ggml-alloc.c
ggml-backend.cpp
ggml-opt.cpp
ggml-profiler.cpp
ggml-threading.cpp
ggml-threading.h
ggml-quants.c
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-backend-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// ggml-backend internal header

#include "ggml-backend.h"
#include "ggml-profiler.h"

#ifdef __cplusplus
extern "C" {
Expand Down Expand Up @@ -124,6 +125,9 @@ extern "C" {
struct ggml_backend_i iface;
ggml_backend_dev_t device;
void * context;

// Optional profiler (set by backend during init, NULL if not profiling)
ggml_backend_profiler_t profiler;
};

struct ggml_backend_event {
Expand Down
Loading
Loading