ggml-org · pwilkin · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
@@ -1073,6 +1073,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.completion = true;
         }
     ));
+    add_opt(common_arg(
+        {"--profile"},
+        "enable cross-backend profiling (CPU, BLAS, CUDA)",
+        [](common_params & params) {
+            params.profiling = true;  // checked by common_init_result, which enables profiling on the context's scheduler
+        }
+    ).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG}));
+    add_opt(common_arg(
+        {"--profile-output"}, "FNAME",
+        "write profiling JSON output to FNAME (default: print a summary to stdout)",
+        [](common_params & params, const std::string & value) {
+            params.profiling        = true;   // naming an output file implies --profile
+            params.profiling_output = value;  // left empty, the tool prints a summary instead of exporting JSON
+        }
+    ).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG}));
     add_opt(common_arg(
         {"--verbose-prompt"},
         string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),

@@ -2,6 +2,7 @@
 #include "gguf.h"
 
 #include "common.h"
+#include "ggml-profiler.h"
 #include "log.h"
 #include "llama.h"
 #include "sampling.h"
@@ -1231,6 +1232,14 @@ common_init_result::common_init_result(common_params & params) :
         return;
     }
 
+    // --profile: switch on profiling for this context's backend scheduler, if it exposes one
+    if (params.profiling) {
+        if (ggml_backend_sched_t prof_sched = llama_context_get_sched(lctx); prof_sched != nullptr) {
+            ggml_backend_sched_set_profiling(prof_sched, true);
+            LOG_INF("%s: profiling enabled\n", __func__);
+        }
+    }
+
     pimpl->context.reset(lctx);
 }
 

@@ -3,6 +3,7 @@
 #pragma once
 
 #include "ggml-opt.h"
+#include "ggml-profiler.h"
 #include "ggml.h"
 #include "llama-cpp.h"
 
@@ -669,6 +670,10 @@ struct common_params {
 
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
+    // profiling (set by the --profile / --profile-output options)
+    bool        profiling = false;  // enable cross-backend profiling
+    std::string profiling_output;   // path to write profiling JSON output (empty = no JSON file; the tool prints a summary to stdout instead)
+
     // batched-bench params
     bool batched_bench_output_jsonl = false;
 

@@ -244,6 +244,23 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    // Report profiling results if profiling was requested:
+    //   - no output file given: print the summary via ggml_backend_sched_print_profiling
+    //   - output file given:    export JSON to it and log whether the export succeeded
+    // Nothing is reported when the context exposes no scheduler.
+    if (params.profiling) {
+        if (ggml_backend_sched_t prof_sched = llama_context_get_sched(ctx); prof_sched != nullptr) {
+            const char * prof_path = params.profiling_output.c_str();
+            if (params.profiling_output.empty()) {
+                ggml_backend_sched_print_profiling(prof_sched);
+            } else if (ggml_backend_sched_export_profiling_json(prof_sched, prof_path) == 0) {
+                LOG("\nProfiling data exported to: %s\n", prof_path);
+            } else {
+                LOG_ERR("\nFailed to export profiling data to: %s\n", prof_path);
+            }
+        }
+    }
+
     LOG("\n");
     llama_perf_context_print(ctx);
 

@@ -22,6 +22,23 @@ extern "C" {
 
         // use only reference implementations
         bool use_ref;
+
+        // profiler context (set by backend when profiling is enabled, NULL otherwise)
+        // when non-NULL, the compute loop will record per-node timing
+        void * profiling_context;
+
+        // callback for recording a profile record from C code (set by backend when profiling)
+        // params: context, then the fields of ggml_profile_record (ggml-profiler.h) in order, minus backend_id
+        void (*profiling_record_fn)(void *        context,
+                                    int           type,  // presumably an enum ggml_profile_event_type value - TODO confirm
+                                    const char *  name,
+                                    int           split_id,
+                                    uint64_t      start_ns,
+                                    uint64_t      end_ns,
+                                    uint64_t      bytes,
+                                    const char *  extra,
+                                    const int64_t ne_src0[4],
+                                    const int64_t ne_src1[4]);
     };
 
     // numa strategies

@@ -0,0 +1,104 @@
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// Profiler
+//
+
+// Profile event types: the kind of interval a ggml_profile_record describes (stored in its `type` field)
+enum ggml_profile_event_type {
+    GGML_PROFILE_EVENT_OP,    // single operation execution (computation kernel)
+    GGML_PROFILE_EVENT_COPY,  // data transfer between devices
+};
+
+// A single profiling record representing a timed interval (start_ns..end_ns) on one backend within one graph split
+typedef struct ggml_profile_record {
+    enum ggml_profile_event_type type;
+    const char *                 name;        // operation name (e.g., "mul_mat", "copy_H2D")
+    int                          backend_id;  // scheduler's backend index (0 = highest priority)
+    int                          split_id;    // which graph split (0..n_splits-1)
+    uint64_t                     start_ns;    // start timestamp in nanoseconds
+    uint64_t                     end_ns;      // end timestamp in nanoseconds
+    uint64_t                     bytes;       // bytes transferred (for copy) or tensor size (for ops)
+    const char *                 extra;       // fusion name for fused ops, or NULL (string lifetime not stated here - NOTE(review): confirm)
+    int64_t                      ne_src0[4];  // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT)
+    int64_t                      ne_src1[4];  // src[1] tensor dimensions (e.g. input matrix for MUL_MAT)
+} ggml_profile_record;
+
+// Backend profiler interface - each backend optionally implements this
+// to provide fine-grained operation timing (callbacks presumably receive the `context` member below - TODO confirm)
+struct ggml_backend_profiler {
+    void * context;  // backend-specific profiler context
+
+    // Enable or disable profiling on this backend
+    void (*enable)(void * context, bool enable);
+
+    // Clear all recorded data
+    void (*reset)(void * context);
+
+    // Set the current split ID (called by scheduler before graph_compute)
+    void (*set_split_id)(void * context, int split_id);
+
+    // Get recorded profiling data
+    // Returns the number of records; sets *out to point to internal storage
+    // The returned pointer remains valid until the next reset or disable call
+    int (*get_records)(void * context, const ggml_profile_record ** out);
+
+    // Free the profiler context
+    void (*free_context)(void * context);
+};
+
+typedef struct ggml_backend_profiler * ggml_backend_profiler_t;  // handle taken/returned by ggml_backend_set_profiler / ggml_backend_get_profiler
+
+// Register a profiler on a backend (called by backend during init)
+// The profiler is owned by the backend and will be freed when the backend is freed
+GGML_API void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler);
+
+// Get the profiler associated with a backend (returns NULL if none); ownership stays with the backend
+GGML_API ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend);
+
+//
+// Scheduler profiling API
+//
+
+// Enable or disable profiling on a scheduler
+// When enabled, the scheduler will:
+//   - Time data copy operations between backends
+//   - Enable profiling on all backends that support it
+//   - Collect profiling records from all backends after each graph compute
+GGML_API void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable);
+
+// Check if profiling is enabled on a scheduler
+GGML_API bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched);
+
+// Get profiling data from the last graph compute
+// Records are owned by the scheduler; valid until the next compute or reset
+// Returns the number of records; *records presumably receives the scheduler-owned array - TODO confirm
+GGML_API int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records);
+
+// Print a human-readable summary of the last profiling run to stdout
+// Groups records by operation name and shows total/count/min/max/avg time
+GGML_API void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched);
+
+// Reset profiling data (clear all recorded data)
+GGML_API void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched);
+
+// Get current time in nanoseconds (for manual profiling if needed)
+GGML_API uint64_t ggml_profiler_time_ns(void);
+
+// Export profiling data as JSON to a file
+// Returns 0 on success, -1 on error
+GGML_API int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath);
+
+// Export profiling data as JSON to a FILE pointer (NOTE(review): return value undocumented; presumably 0 / -1 as above - confirm)
+GGML_API int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
@@ -195,12 +195,14 @@ add_library(ggml-base
             ../include/ggml-backend.h
             ../include/ggml-cpp.h
             ../include/ggml-opt.h
+            ../include/ggml-profiler.h
             ../include/gguf.h
             ggml.c
             ggml.cpp
             ggml-alloc.c
             ggml-backend.cpp
             ggml-opt.cpp
+            ggml-profiler.cpp
             ggml-threading.cpp
             ggml-threading.h
             ggml-quants.c

diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
@@ -3,6 +3,7 @@
 // ggml-backend internal header
 
 #include "ggml-backend.h"
+#include "ggml-profiler.h"
 
 #ifdef  __cplusplus
 extern "C" {
@@ -124,6 +125,9 @@ extern "C" {
         struct ggml_backend_i iface;
         ggml_backend_dev_t device;
         void * context;
+
+        // Optional profiler (set by backend during init, NULL if not profiling); see ggml_backend_set_profiler() in ggml-profiler.h
+        ggml_backend_profiler_t profiler;
     };
 
     struct ggml_backend_event {