diff --git a/samples/cpp/benchmark_app/main.cpp b/samples/cpp/benchmark_app/main.cpp index 3ab21dc0489286..2ca07ea86e4a03 100644 --- a/samples/cpp/benchmark_app/main.cpp +++ b/samples/cpp/benchmark_app/main.cpp @@ -639,8 +639,8 @@ int main(int argc, char* argv[]) { } // 获取.bin文件的大小 - long fileSize = getFileSize(binPath); - std::cout << "my test: The size of '" << binPath << "' is " << fileSize/(1024*1024) << " Mbytes." << std::endl; + int fileSize = static_cast(getFileSize(binPath)/(1024*1024)); + std::cout << "my test: The size of '" << binPath << "' is " << fileSize << " Mbytes." << std::endl; auto duration_ms = get_duration_ms_till_now(startTime); slog::info << "Read model took " << double_to_string(duration_ms) << " ms" << slog::endl; @@ -788,6 +788,8 @@ int main(int argc, char* argv[]) { // -------------------------------------------------------- next_step(); startTime = Time::now(); + // my test + // device_config["weights_size"] = fileSize; compiledModel = core.compile_model(model, device_name, device_config); duration_ms = get_duration_ms_till_now(startTime); slog::info << "Compile model took " << double_to_string(duration_ms) << " ms" << slog::endl; diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index 637a5e45596357..71303d5eaba989 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -34,6 +34,7 @@ # include "openvino/proxy/plugin.hpp" # include "openvino/proxy/properties.hpp" #endif +#include "openvino/op/constant.hpp" ov::ICore::~ICore() = default; @@ -722,6 +723,8 @@ ov::SoPtr ov::CoreImpl::compile_model(const std::shared_ptr< OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::LoadTime, "Core::compile_model::model"); std::string deviceName = device_name; ov::AnyMap config_with_batch = config; + config_with_batch["weights_size"] = coreConfig.get_weight_size(); + // if auto-batching is applicable, the below function will patch the device name and config accordingly: auto model = apply_auto_batching(model_, deviceName, config_with_batch); @@ -1525,6 +1528,15 @@ bool ov::CoreImpl::CoreConfig::get_enable_mmap() const { return _flag_enable_mmap; } +int ov::CoreImpl::CoreConfig::get_weight_size() const { + return _weight_size; +} + +void ov::CoreImpl::CoreConfig::set_weight_size(const int& size) { + std::lock_guard lock(_cacheConfigMutex); + _weight_size = size; +} + // Creating thread-safe copy of config including shared_ptr to ICacheManager // Passing empty or not-existing name will return global cache config ov::CoreImpl::CoreConfig::CacheConfig ov::CoreImpl::CoreConfig::get_cache_config_for_device( @@ -1582,7 +1594,24 @@ void ov::CoreImpl::add_mutex(const std::string& dev_name) { std::shared_ptr ov::CoreImpl::read_model(const std::string& modelPath, const std::string& binPath) const { OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::ReadTime, "CoreImpl::read_model from file"); - return ov::util::read_model(modelPath, binPath, extensions, coreConfig.get_enable_mmap()); + // return ov::util::read_model(modelPath, binPath, extensions, coreConfig.get_enable_mmap()); + auto ov_model = ov::util::read_model(modelPath, binPath, extensions, coreConfig.get_enable_mmap()); + // cal weight size + float total_weight_size = 0; + for (auto& op : ov_model->get_ordered_ops()) { + if (auto constop = std::dynamic_pointer_cast(op)) { + auto weight = static_cast(constop->get_byte_size()) / (1024 * 1024); + total_weight_size += weight; + // std::cout << "my test weight: " << weight << std::endl; + } + } + //my test fix number + // total_weight_size = 5; + + const_cast(coreConfig).set_weight_size(static_cast(total_weight_size)); + // std::cout << "my test Total weight: " << total_weight_size << std::endl; + + return ov_model; } std::shared_ptr ov::CoreImpl::read_model(const std::string& model, diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 40f2a15bb725e0..fc85df1e00c3d4 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -83,6 +83,9 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this _cacheConfigPerDevice; bool _flag_enable_mmap = true; + int _weight_size = 0; }; struct CacheContent { diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index b1a57cedde5a22..85a3b09da15590 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -370,6 +370,8 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { ov::hint::kv_cache_precision.name(), ". Supported values: u8, bf16, f16, f32"); } + } else if (key == "weights_size") { + weightSize = val.as(); } else { OPENVINO_THROW("NotFound: Unsupported property ", key, " by CPU plugin."); } diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 7580d0bb61baf9..bfe38ea603d359 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -100,6 +100,7 @@ struct Config { int modelPreferThreads = -1; ModelType modelType = ModelType::Unknown; + int weightSize = 0; #ifdef CPU_DEBUG_CAPS DebugCapsConfig debugCaps; diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp index 08ba43f9eff106..82576c0a28611a 100644 --- a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp +++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp @@ -235,7 +235,12 @@ std::vector> get_streams_info_table(const int input_streams, } else if (hint_model_distribution_policy.size() == 0) { for (auto& row : proc_socket_table) { if (row[PROC_SOCKET_ID] == current_socket_id) { - n_threads_per_stream = std::max(n_threads_per_stream, row[ALL_PROC]); + // n_threads_per_stream = std::max(n_threads_per_stream, row[ALL_PROC]); + // my test: add update here? + n_threads_per_stream = + model_prefer_threads > 0 + ? std::min(std::max(n_threads_per_stream, row[ALL_PROC]), model_prefer_threads) + : std::max(n_threads_per_stream, row[ALL_PROC]); } } } else { @@ -468,6 +473,7 @@ int get_model_prefer_threads(const int num_streams, Config& config) { const int sockets = get_num_sockets(); auto model_prefer = 0; + bool isNewXeon = false; if (-1 == config.modelPreferThreads) { const auto isa = dnnl::get_effective_cpu_isa(); float isaSpecificThreshold = 1.0f; @@ -484,7 +490,9 @@ int get_model_prefer_threads(const int num_streams, isaSpecificThreshold = 2.0f; break; case dnnl::cpu_isa::avx512_core_amx: + case dnnl::cpu_isa::avx512_core_amx_fp16: isaSpecificThreshold = 4.0f; + isNewXeon = true; break; default: isaSpecificThreshold = 1.0f; @@ -552,6 +560,7 @@ int get_model_prefer_threads(const int num_streams, // latency if (num_streams <= sockets && num_streams > 0) { + bool llm_related = has_matmul_with_compressed_weights(model); if (proc_type_table[0][EFFICIENT_CORE_PROC] > 0 && proc_type_table[0][MAIN_CORE_PROC] > 0) { #ifdef __APPLE__ if ((proc_type_table.size() == 1) && (proc_type_table[0][EFFICIENT_CORE_PROC] > 0)) { @@ -560,7 +569,6 @@ int get_model_prefer_threads(const int num_streams, : proc_type_table[0][ALL_PROC]; } #else - bool llm_related = has_matmul_with_compressed_weights(model); bool int8_intensive = ov::op::util::has_op_with_type(model) || llm_related; const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores; const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores; @@ -574,6 +582,11 @@ int get_model_prefer_threads(const int num_streams, : proc_type_table[0][MAIN_CORE_PROC]) : proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC]; #endif + } else if (isNewXeon && !llm_related && proc_type_table.size() > 1 && proc_type_table[1][MAIN_CORE_PROC] >= 32) { + // // my test + // model_prefer = 32; + // TODO: config.weightSize threshold need tobe updated + model_prefer = (config.weightSize <= 100) ? 32 : proc_type_table[1][MAIN_CORE_PROC]; } } else { // throughput model_prefer = config.modelPreferThreads;