diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index 70355920b89..d53978548a1 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -26,17 +26,28 @@ options: -h, --help --numa numa mode (default: disabled) -r, --repetitions number of times to repeat each test (default: 5) - --prio <0|1|2|3> process/thread priority (default: 0) + --prio <-1|0|1|2|3> process/thread priority (default: 0) --delay <0...N> (seconds) delay between each test (default: 0) -o, --output output format printed to stdout (default: md) -oe, --output-err output format printed to stderr (default: none) --list-devices list available devices and exit -v, --verbose verbose output --progress print test progress indicators + --no-warmup skip warmup runs before benchmarking + -fitt, --fit-target fit model to device memory with this margin per device in MiB (default: off) + -fitc, --fit-ctx minimum ctx size for --fit-target (default: 4096) -rpc, --rpc register RPC devices (comma separated) test parameters: -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -hf, -hfr, --hf-repo /[:quant] Hugging Face model repository; quant is optional, case-insensitive + default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist. + example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M + (default: unused) + -hff, --hf-file Hugging Face model file. If specified, it will override the quant in --hf-repo + (default: unused) + -hft, --hf-token Hugging Face access token + (default: value from HF_TOKEN environment variable) -p, --n-prompt (default: 512) -n, --n-gen (default: 128) -pg (default: ) @@ -49,21 +60,21 @@ test parameters: -C, --cpu-mask (default: 0x0) --cpu-strict <0|1> (default: 0) --poll <0...100> (default: 50) - -ngl, --n-gpu-layers (default: 99) + -ngl, --n-gpu-layers (default: -1) -ncmoe, --n-cpu-moe (default: 0) - -sm, --split-mode (default: layer) + -sm, --split-mode (default: layer) -mg, --main-gpu (default: 0) -nkvo, --no-kv-offload <0|1> (default: 0) - -fa, --flash-attn <0|1> (default: 0) + -fa, --flash-attn (default: auto) -dev, --device (default: auto) -mmp, --mmap <0|1> (default: 1) + -dio, --direct-io <0|1> (default: 0) -embd, --embeddings <0|1> (default: 0) -ts, --tensor-split (default: 0) - -ot --override-tensors =;... + -ot --override-tensor =;... (default: disabled) -nopo, --no-op-offload <0|1> (default: 0) - -fitt, --fit-target fit model to device memory with this margin per device in MiB (default: off) - -fitc, --fit-ctx minimum ctx size for --fit-target (default: 4096) + --no-host <0|1> (default: 0) Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Ranges can be given as @@ -97,12 +108,12 @@ $ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0. | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 128 | 132.19 ± 0.55 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 256 | 129.37 ± 0.54 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 512 | 123.83 ± 0.25 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 128 | 82.17 ± 0.31 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 256 | 80.74 ± 0.23 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 512 | 78.08 ± 0.07 | ### Prompt processing with different batch sizes @@ -112,10 +123,10 @@ $ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 | model | size | params | backend | ngl | n_batch | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 128 | pp 1024 | 1436.51 ± 3.66 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 256 | pp 1024 | 1932.43 ± 23.48 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 512 | pp 1024 | 2254.45 ± 15.59 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 1024 | pp 1024 | 2498.61 ± 13.58 | ### Different numbers of threads @@ -171,10 +182,10 @@ $ ./llama-bench -d 0,512 | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: | -| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 | -| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 | -| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 | -| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | pp512 | 7340.20 ± 23.45 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | tg128 | 120.60 ± 0.59 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | pp512 @ d512 | 6425.91 ± 18.88 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | tg128 @ d512 | 116.71 ± 0.60 | ## Output formats @@ -188,8 +199,8 @@ $ ./llama-bench -o md | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | pp 512 | 2368.80 ± 93.24 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 128 | 131.42 ± 0.59 | ### CSV @@ -198,9 +209,9 @@ $ ./llama-bench -o csv ``` ```csv -build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts -"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434" -"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617" +build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,n_cpu_moe,split_mode,main_gpu,no_kv_offload,flash_attn,devices,tensor_split,tensor_buft_overrides,use_mmap,use_direct_io,embeddings,no_op_offload,no_host,fit_target,fit_min_ctx,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","-1","0","layer","0","0","-1","auto","0.00","none","1","0","0","0","0","0","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434" +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","-1","0","layer","0","0","-1","auto","0.00","none","1","0","0","0","0","0","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617" ``` ### JSON @@ -229,14 +240,22 @@ $ ./llama-bench -o json "poll": 50, "type_k": "f16", "type_v": "f16", - "n_gpu_layers": 99, + "n_gpu_layers": -1, + "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, - "flash_attn": false, + "flash_attn": -1, + "devices": "auto", "tensor_split": "0.00", + "tensor_buft_overrides": "none", "use_mmap": true, + "use_direct_io": false, "embeddings": false, + "no_op_offload": 0, + "no_host": false, + "fit_target": 0, + "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, @@ -266,14 +285,22 @@ $ ./llama-bench -o json "poll": 50, "type_k": "f16", "type_v": "f16", - "n_gpu_layers": 99, + "n_gpu_layers": -1, + "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, - "flash_attn": false, + "flash_attn": -1, + "devices": "auto", "tensor_split": "0.00", + "tensor_buft_overrides": "none", "use_mmap": true, + "use_direct_io": false, "embeddings": false, + "no_op_offload": 0, + "no_host": false, + "fit_target": 0, + "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, @@ -296,8 +323,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]} -{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": -1, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": -1, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": -1, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": -1, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]} ``` @@ -310,7 +337,7 @@ $ ./llama-bench -o sql ``` ```sql -CREATE TABLE IF NOT EXISTS test ( +CREATE TABLE IF NOT EXISTS llama_bench ( build_commit TEXT, build_number INTEGER, cpu_info TEXT, @@ -329,13 +356,21 @@ CREATE TABLE IF NOT EXISTS test ( type_k TEXT, type_v TEXT, n_gpu_layers INTEGER, + n_cpu_moe INTEGER, split_mode TEXT, main_gpu INTEGER, no_kv_offload INTEGER, flash_attn INTEGER, + devices TEXT, tensor_split TEXT, + tensor_buft_overrides TEXT, use_mmap INTEGER, + use_direct_io INTEGER, embeddings INTEGER, + no_op_offload INTEGER, + no_host INTEGER, + fit_target INTEGER, + fit_min_ctx INTEGER, n_prompt INTEGER, n_gen INTEGER, n_depth INTEGER, @@ -346,6 +381,6 @@ CREATE TABLE IF NOT EXISTS test ( stddev_ts REAL ); -INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613'); -INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647'); +INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, n_cpu_moe, split_mode, main_gpu, no_kv_offload, flash_attn, devices, tensor_split, tensor_buft_overrides, use_mmap, use_direct_io, embeddings, no_op_offload, no_host, fit_target, fit_min_ctx, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '-1', '0', 'layer', '0', '0', '-1', 'auto', '0.00', 'none', '1', '0', '0', '0', '0', '0', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613'); +INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, n_cpu_moe, split_mode, main_gpu, no_kv_offload, flash_attn, devices, tensor_split, tensor_buft_overrides, use_mmap, use_direct_io, embeddings, no_op_offload, no_host, fit_target, fit_min_ctx, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '-1', '0', 'layer', '0', '0', '-1', 'auto', '0.00', 'none', '1', '0', '0', '0', '0', '0', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647'); ``` diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index d9732096866..a85f86c3ab2 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -19,6 +19,7 @@ #include #include +#include "arg.h" #include "build-info.h" #include "common.h" #include "download.h" @@ -275,9 +276,11 @@ static std::string pair_str(const std::pair & p) { return buf; } -static std::vector parse_int_range(const std::string & s) { +static std::vector parse_int_range(const std::string & s, bool allow_negative = false) { // first[-last[(+|*)step]] - std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))"); + std::regex range_regex(allow_negative + ? R"(^(-?\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))" + : R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))"); std::smatch match; std::string::const_iterator search_start(s.cbegin()); @@ -337,7 +340,7 @@ struct cmd_params { std::vector split_mode; std::vector main_gpu; std::vector no_kv_offload; - std::vector flash_attn; + std::vector flash_attn; std::vector> devices; std::vector> tensor_split; std::vector> tensor_buft_overrides; @@ -376,12 +379,12 @@ static const cmd_params cmd_params_defaults = { /* cpu_mask */ { "0x0" }, /* cpu_strict */ { false }, /* poll */ { 50 }, - /* n_gpu_layers */ { 99 }, + /* n_gpu_layers */ { -1 }, /* n_cpu_moe */ { 0 }, /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, /* main_gpu */ { 0 }, /* no_kv_offload */ { false }, - /* flash_attn */ { false }, + /* flash_attn */ { LLAMA_FLASH_ATTN_TYPE_AUTO }, /* devices */ { {} }, /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, /* tensor_buft_overrides*/ { std::vector{ { nullptr, nullptr } } }, @@ -451,7 +454,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); + printf(" -fa, --flash-attn (default: %s)\n", join(transform_to_str(cmd_params_defaults.flash_attn, llama_flash_attn_type_name), ",").c_str()); printf(" -dev, --device (default: auto)\n"); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str()); @@ -710,7 +713,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = parse_int_range(argv[i]); + auto p = parse_int_range(argv[i], /*allow_negative=*/true); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); } else if (arg == "-ncmoe" || arg == "--n-cpu-moe") { if (++i >= argc) { @@ -793,8 +796,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); - params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); + auto p = string_split(argv[i], split_delim); + + std::vector types; + for (const auto & v : p) { + llama_flash_attn_type type; + if (common_arg_utils::is_truthy(v)) { + type = LLAMA_FLASH_ATTN_TYPE_ENABLED; + } else if (common_arg_utils::is_falsey(v)) { + type = LLAMA_FLASH_ATTN_TYPE_DISABLED; + } else if (common_arg_utils::is_autoy(v)) { + type = LLAMA_FLASH_ATTN_TYPE_AUTO; + } else { + invalid_param = true; + break; + } + types.push_back(type); + } + if (invalid_param) { + break; + } + params.flash_attn.insert(params.flash_attn.end(), types.begin(), types.end()); } else if (arg == "-mmp" || arg == "--mmap") { if (++i >= argc) { invalid_param = true; @@ -1138,7 +1160,7 @@ struct cmd_params_instance { llama_split_mode split_mode; int main_gpu; bool no_kv_offload; - bool flash_attn; + llama_flash_attn_type flash_attn; std::vector devices; std::vector tensor_split; std::vector tensor_buft_overrides; @@ -1222,7 +1244,7 @@ struct cmd_params_instance { cparams.type_k = type_k; cparams.type_v = type_v; cparams.offload_kqv = !no_kv_offload; - cparams.flash_attn_type = flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED; + cparams.flash_attn_type = flash_attn; cparams.embeddings = embeddings; cparams.op_offload = !no_op_offload; cparams.swa_full = false; @@ -1400,7 +1422,7 @@ struct test { llama_split_mode split_mode; int main_gpu; bool no_kv_offload; - bool flash_attn; + llama_flash_attn_type flash_attn; std::vector devices; std::vector tensor_split; std::vector tensor_buft_overrides; @@ -1522,10 +1544,10 @@ struct test { field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe" || - field == "fit_target" || field == "fit_min_ctx") { + field == "fit_target" || field == "fit_min_ctx" || field == "flash_attn") { return INT; } - if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || + if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") { return BOOL; } @@ -1594,7 +1616,7 @@ struct test { split_mode_str(split_mode), std::to_string(main_gpu), std::to_string(no_kv_offload), - std::to_string(flash_attn), + std::to_string((int) flash_attn), devices_to_string(devices), tensor_split_str, tensor_buft_overrides_str, @@ -1779,7 +1801,7 @@ struct markdown_printer : public printer { return 6; } if (field == "flash_attn") { - return 2; + return 3; } if (field == "devices") { return -12;