@@ -1,7 +1,9 @@
+#include "arg.h"
+#include "log.h"
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "llama-vocab.h"
+#include "../src/llama-vocab.h"
 
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
@@ -18,55 +20,49 @@
 #include <vector>
 
 static void print_usage(int, char ** argv) {
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -c 8192 -b 2048 -ub 512\n", argv[0]);
-    LOG_TEE("\n");
+    LOG_INF("\nexample usage:\n");
+    LOG_INF("\n    %s -m model.gguf -c 8192 -b 2048 -ub 512\n", argv[0]);
+    LOG_INF("\n");
 }
 
 int main(int argc, char ** argv) {
+    common_params params;
 
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-    // init LLM
+    common_init();
 
+    // init LLM
     llama_backend_init();
     llama_numa_init(params.numa);
 
     // initialize the model
+    common_init_result llama_init = common_init_from_params(params);
 
-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
 
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+    if (model == nullptr || ctx == nullptr) {
+        LOG_ERR("%s : failed to init\n", __func__);
         return 1;
     }
 
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
-
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
-    if (ctx == NULL) {
-        fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
-        return 1;
+    // print system information
+    {
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+        LOG_INF("\n");
     }
 
     const unsigned int n_kv_max = llama_n_ctx(ctx);
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    llama_token bos = vocab->token_bos();
+    const unsigned int n_vocab = llama_vocab_n_tokens(vocab);
 
-    const llama_vocab * vocab = llama_get_vocab(ctx);
-    llama_token bos = llama_token_bos_impl(*vocab);
-    //llama_token eos = llama_token_eos_impl(*vocab);
-
-    const unsigned int n_vocab = llama_n_vocab(model);
-
-    // decode in batches of ctx_params.n_batch tokens
+    // decode in batches of n_batch tokens
     auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
@@ -83,7 +79,7 @@ int main(int argc, char ** argv) {
 
             const int ret = llama_decode(ctx, batch_view);
             if (ret != 0) {
-                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                LOG_INF("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                 return false;
             }
 
@@ -96,64 +92,66 @@ int main(int argc, char ** argv) {
     const unsigned int pp = params.n_ubatch;
     const unsigned int tg = params.n_ubatch / 4;
 
-    if (!params.sweep_bench_output_jsonl) {
-        LOG_TEE("\n");
-        LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-        LOG_TEE("\n");
-        LOG_TEE("|%6s | %6s | %6s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s");
-        LOG_TEE("|%6s-|-%6s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "------", "--------", "--------", "--------", "--------");
-    }
+    const unsigned int n_threads       = params.cpuparams.n_threads;
+    const unsigned int n_threads_batch = params.cpuparams_batch.n_threads;
+    const int32_t n_batch = llama_n_batch(ctx);
+
+    LOG_INF("\n");
+    LOG_INF("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, n_threads, n_threads_batch);
+    LOG_INF("\n");
+    LOG_INF("|%6s | %6s | %6s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s");
+    LOG_INF("|%6s-|-%6s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "------", "--------", "--------", "--------", "--------");
 
     llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
 
     // warm up
     {
-        llama_batch_add(batch, bos, 0, { 0 }, false);
+        common_batch_add(batch, bos, 0, { 0 }, false);
 
-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+        if (!decode_helper(ctx, batch, n_batch)) {
+            LOG_INF("%s: llama_decode() failed\n", __func__);
             return 1;
         }
     }
 
-    llama_batch_clear(batch);
-    llama_kv_cache_clear(ctx);
+    common_batch_clear(batch);
+    llama_kv_self_clear(ctx);
 
     for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
         // clean up KV cache before generation
-        llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
+        llama_kv_self_seq_rm(ctx, 0, n_kv, -1);
 
         // first measure token generation performance at this context size
         const auto t_tg_start = ggml_time_us();
 
         for (unsigned int i = 0; i < tg; ++i) {
-            llama_batch_clear(batch);
-            llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);
+            common_batch_clear(batch);
+            common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);
 
-            if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                LOG_TEE("%s: llama_decode() failed\n", __func__);
+            if (!decode_helper(ctx, batch, n_batch)) {
+                LOG_INF("%s: llama_decode() failed\n", __func__);
                 return 1;
             }
         }
 
         const auto t_tg_end = ggml_time_us();
 
         // clean up KV cache after generation
-        llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
+        llama_kv_self_seq_rm(ctx, 0, n_kv, -1);
 
         // prepare batch of pp size for prompt processing performance measurement
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         for (unsigned int i = 0; i < pp; ++i) {
-            llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
+            common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
         }
         batch.logits[batch.n_tokens - 1] = true;
 
         // measure prompt processing performance
         const auto t_pp_start = ggml_time_us();
 
-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+        if (!decode_helper(ctx, batch, n_batch)) {
+            LOG_INF("%s: llama_decode() failed\n", __func__);
             return 1;
         }
 
@@ -166,23 +164,9 @@ int main(int argc, char ** argv) {
         const float speed_pp = pp / t_pp;
         const float speed_tg = tg / t_tg;
 
-        if (params.sweep_bench_output_jsonl) {
-            LOG_TEE(
-                "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
-                "\"pp\": %d, \"tg\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f }\n",
-                n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
-                pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg
-            );
-        } else {
-            LOG_TEE("|%6d | %6d | %6d | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg);
-        }
+        LOG_INF("|%6d | %6d | %6d | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg);
     }
 
-    llama_batch_free(batch);
-
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     return 0;
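
Note: the context collapsed at "@@ -83,7 +79,7 @@" hides the unchanged body of decode_helper, which builds a per-chunk view into the batch before calling llama_decode. A minimal sketch of that pattern, modeled on llama.cpp's batched-bench example and assuming the current llama_batch field layout (the function name decode_in_chunks is illustrative, not the exact elided code):

    #include "llama.h"

    #include <algorithm>
    #include <cstdint>

    // decode `batch` in chunks of at most n_batch tokens; each view shares the
    // original batch's buffers, so no token data is copied
    static bool decode_in_chunks(llama_context * ctx, llama_batch & batch, int32_t n_batch) {
        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

            llama_batch batch_view = {
                n_tokens,
                batch.token    + i,
                nullptr,           // embd: unused when token ids are provided
                batch.pos      + i,
                batch.n_seq_id + i,
                batch.seq_id   + i,
                batch.logits   + i,
            };

            if (llama_decode(ctx, batch_view) != 0) {
                return false;
            }
        }
        return true;
    }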