@@ -1083,9 +1083,9 @@ enum e_model {
     MODEL_70B,
 };
 
-static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t kB = 1000;
+static const size_t MB = 1000*kB;
+static const size_t GB = 1000*MB;
 
 struct llama_hparams {
     bool vocab_only;
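For reference, the two unit conventions differ by about 4.9% at the megabyte scale and 7.4% at the gigabyte scale, since 1 MiB = 1,048,576 bytes while 1 MB = 1,000,000 bytes. A minimal standalone C++ sketch (illustrative byte count, not taken from a real model file) that prints the same size under both conventions:

```cpp
#include <cstdio>
#include <cstdint>

int main() {
    const uint64_t MiB = 1024ULL * 1024, GiB = 1024ULL * MiB; // binary (IEC), pre-change
    const uint64_t MB  = 1000ULL * 1000, GB  = 1000ULL * MB;  // decimal (SI), post-change

    const uint64_t bytes = 7365960704ULL; // illustrative file size only
    printf("%.2f MiB vs %.2f MB\n", bytes / double(MiB), bytes / double(MB)); // 7024.73 MiB vs 7365.96 MB
    printf("%.2f GiB vs %.2f GB\n", bytes / double(GiB), bytes / double(GB)); // 6.86 GiB vs 7.37 GB
    return 0;
}
```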
@@ -1481,7 +1481,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1e6);
         }
     }
 #endif
@@ -2520,9 +2520,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
     if (ml.n_bytes < GB) {
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size = %.2f MB (%.2f BPW) \n", __func__, ml.n_bytes/1e6, ml.n_bytes*8.0/ml.n_elements);
     } else {
-        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size = %.2f GB (%.2f BPW) \n", __func__, ml.n_bytes/1e9, ml.n_bytes*8.0/ml.n_elements);
     }
 
     // general kv
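The model-size line above switches between MB and GB at the GB threshold and also reports bits per weight, BPW = n_bytes * 8 / n_elements. A self-contained sketch of that formatting logic, with made-up byte and element counts standing in for ml.n_bytes and ml.n_elements:

```cpp
#include <cstdio>
#include <cstdint>

// Mirrors the branch above: sizes below 1 GB are reported in MB, larger ones in GB.
static void print_model_size(uint64_t n_bytes, uint64_t n_elements) {
    const double bpw = n_bytes * 8.0 / n_elements; // bits per weight
    if (n_bytes < 1000ULL * 1000 * 1000) {
        printf("model size = %.2f MB (%.2f BPW)\n", n_bytes / 1e6, bpw);
    } else {
        printf("model size = %.2f GB (%.2f BPW)\n", n_bytes / 1e9, bpw);
    }
}

int main() {
    // Made-up numbers, roughly in the range of a 4-bit quantized 7B model.
    print_model_size(3825065984ULL, 6738415616ULL); // prints: model size = 3.83 GB (4.54 BPW)
    return 0;
}
```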
@@ -2558,7 +2558,7 @@ static void llm_load_tensors(
 
     ml.calc_sizes(ctx_size, mmapped_size);
 
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1e6);
 
     // create the ggml context
     {
@@ -3207,7 +3207,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory
 
-        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1e6);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3226,7 +3226,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS
 
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1e6);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -7878,7 +7878,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = tensor->type;
             new_data = tensor->data;
             new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1e6);
         } else {
             const size_t nelements = ggml_nelements(tensor);
 
@@ -7938,7 +7938,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 workers.clear();
             }
 
-            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1e6, new_size/1e6);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -7976,8 +7976,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     gguf_free(ctx_out);
 
-    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1e6);
+    LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1e6);
 
     // print histogram for all tensors
     {
@@ -8478,7 +8478,7 @@ struct llama_context * llama_new_context_with_model(
 
         {
            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1e6);
         }
 
         // resized during inference
@@ -8523,7 +8523,7 @@ struct llama_context * llama_new_context_with_model(
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1e6);
 
             // recreate allocator with exact memory requirements
             ggml_allocr_free(ctx->alloc);
@@ -8537,7 +8537,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1e6);
 
             // calculate total VRAM usage
             auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8558,9 +8558,9 @@ struct llama_context * llama_new_context_with_model(
             size_t total_vram_size = model_vram_size + ctx_vram_size;
 
             LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
-                    total_vram_size / 1024.0 / 1024.0,
-                    model_vram_size / 1024.0 / 1024.0,
-                    ctx_vram_size / 1024.0 / 1024.0);
+                    total_vram_size / 1e6,
+                    model_vram_size / 1e6,
+                    ctx_vram_size / 1e6);
 #endif
         }
 
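The VRAM summary above accumulates per-tensor byte counts through an add_tensor lambda and then prints the model, context, and combined totals with the new decimal divisor. A self-contained sketch of the same accumulation pattern, using a stand-in struct and made-up sizes instead of real ggml tensors:

```cpp
#include <cstdio>
#include <cstdint>
#include <vector>

// Stand-in for ggml_tensor: only the byte count matters for this sketch.
struct fake_tensor { uint64_t nbytes; };

int main() {
    // Made-up per-tensor sizes for weights held in VRAM and for context
    // buffers (KV cache, scratch); the values are illustrative only.
    std::vector<fake_tensor> model_tensors = {{3500000000ULL}, {250000000ULL}};
    std::vector<fake_tensor> ctx_tensors   = {{1073741824ULL}, {300000000ULL}};

    // Same accumulation shape as the add_tensor lambda in the hunk above.
    auto add_tensor = [](const fake_tensor & t, uint64_t & size) { size += t.nbytes; };

    uint64_t model_vram_size = 0, ctx_vram_size = 0;
    for (const auto & t : model_tensors) add_tensor(t, model_vram_size);
    for (const auto & t : ctx_tensors)   add_tensor(t, ctx_vram_size);

    const uint64_t total_vram_size = model_vram_size + ctx_vram_size;
    printf("total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n",
           total_vram_size / 1e6, model_vram_size / 1e6, ctx_vram_size / 1e6);
    // prints: total VRAM used: 5123.74 MB (model: 3750.00 MB, context: 1373.74 MB)
    return 0;
}
```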
@@ -8581,7 +8581,7 @@ struct llama_context * llama_new_context_with_model(
 
         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1e6);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \