llama : llama_perf + option to disable timings during decode #9355
Changes from 2 commits (of the 9 in this PR):
471e7e1, ade52b6, 6cce78c, fd46535, f42de24, 7362f28, 44f0218, f35e9b8, 444b757
```diff
@@ -343,7 +343,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        //bool no_perf;   // whether to measure performance timings, TODO: implement
+        bool no_perf;     // whether to measure performance timings

         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
```

Comment on lines 344 to 348:

> This is minor […]

> I don't think this will be a breaking change, since […]

> AFAIK such changes still break external bindings, such as:
> https://github.com/abetlen/llama-cpp-python/blob/c032fc65b0873337ed39e5d63e15468a5d797646/llama_cpp/llama_cpp.py#L841
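For orientation, here is a minimal sketch of how a caller might opt out of timing collection with the new flag. The model path is a placeholder, the loader entry points (`llama_load_model_from_file`, `llama_new_context_with_model`) are assumed to match the llama.h of this era, and error handling is mostly elided:

```cpp
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    // "model.gguf" is a placeholder path, not from this PR
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.no_perf = true; // new flag: skip timing measurements during llama_decode()

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... tokenize, build a llama_batch, call llama_decode() as usual ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```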
```diff
@@ -1168,11 +1168,30 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //

+    // performance timing information
+    struct llama_perf_data {
+        // llama_context
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_p_eval;
+        int32_t n_eval;
+
+        // llama_sampler_chain
+        double t_sample_ms;
+
+        int32_t n_sample;
+    };
+
+    enum llama_perf_type {
+        LLAMA_PERF_TYPE_CONTEXT       = 0,
+        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+    };
+
+    LLAMA_API struct llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type);
+
+    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
+    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
```
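And a sketch of consuming the new counters after decoding. Here `ctx` is assumed to be a live `llama_context *` that has already run `llama_decode()`, and the per-token arithmetic is just one way to use the fields:

```cpp
#include <cstdio>
#include "llama.h"

// Read the context-side timings, derive per-token costs, and reset the counters.
void report_timings(llama_context * ctx) {
    llama_perf_data perf = llama_perf_get(ctx, LLAMA_PERF_TYPE_CONTEXT);

    // Guard against division by zero on a context that has not decoded anything yet.
    const double p_eval_per_tok = perf.n_p_eval > 0 ? perf.t_p_eval_ms / perf.n_p_eval : 0.0;
    const double eval_per_tok   = perf.n_eval   > 0 ? perf.t_eval_ms   / perf.n_eval   : 0.0;

    printf("prompt eval: %5d tokens, %8.2f ms/token\n", perf.n_p_eval, p_eval_per_tok);
    printf("       eval: %5d tokens, %8.2f ms/token\n", perf.n_eval,   eval_per_tok);

    // Alternatively, let the library format the report, then clear the counters:
    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
    llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);
}
```

The same entry points serve sampler chains via `LLAMA_PERF_TYPE_SAMPLER_CHAIN`, which is presumably why the first argument is typed `const void *` rather than a concrete context pointer.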