Skip to content

Commit

Permalink
CPU core numbers heuristic by weight size
Browse files Browse the repository at this point in the history
  • Loading branch information
liubo-intel committed Jul 4, 2024
1 parent 0c4e86f commit 4c7cf04
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 5 deletions.
6 changes: 4 additions & 2 deletions samples/cpp/benchmark_app/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -639,8 +639,8 @@ int main(int argc, char* argv[]) {
}

// Get the size of the .bin file
long fileSize = getFileSize(binPath);
std::cout << "my test: The size of '" << binPath << "' is " << fileSize/(1024*1024) << " Mbytes." << std::endl;
int fileSize = static_cast<int>(getFileSize(binPath)/(1024*1024));
std::cout << "my test: The size of '" << binPath << "' is " << fileSize << " Mbytes." << std::endl;

auto duration_ms = get_duration_ms_till_now(startTime);
slog::info << "Read model took " << double_to_string(duration_ms) << " ms" << slog::endl;
Expand Down Expand Up @@ -788,6 +788,8 @@ int main(int argc, char* argv[]) {
// --------------------------------------------------------
next_step();
startTime = Time::now();
// my test
// device_config["weights_size"] = fileSize;
compiledModel = core.compile_model(model, device_name, device_config);
duration_ms = get_duration_ms_till_now(startTime);
slog::info << "Compile model took " << double_to_string(duration_ms) << " ms" << slog::endl;
Expand Down
31 changes: 30 additions & 1 deletion src/inference/src/dev/core_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
# include "openvino/proxy/plugin.hpp"
# include "openvino/proxy/properties.hpp"
#endif
#include "openvino/op/constant.hpp"

ov::ICore::~ICore() = default;

Expand Down Expand Up @@ -722,6 +723,8 @@ ov::SoPtr<ov::ICompiledModel> ov::CoreImpl::compile_model(const std::shared_ptr<
OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::LoadTime, "Core::compile_model::model");
std::string deviceName = device_name;
ov::AnyMap config_with_batch = config;
config_with_batch["weights_size"] = coreConfig.get_weight_size();

// if auto-batching is applicable, the below function will patch the device name and config accordingly:
auto model = apply_auto_batching(model_, deviceName, config_with_batch);

Expand Down Expand Up @@ -1525,6 +1528,15 @@ bool ov::CoreImpl::CoreConfig::get_enable_mmap() const {
return _flag_enable_mmap;
}

/**
 * @brief Returns the weight size currently stored in this core configuration.
 *
 * The value is whatever was last passed to set_weight_size().
 * NOTE(review): the setter takes _cacheConfigMutex but this read does not;
 * confirm whether concurrent read/write of the value is possible.
 *
 * @return The stored weight size.
 */
int ov::CoreImpl::CoreConfig::get_weight_size() const {
    const int stored_size = _weight_size;
    return stored_size;
}

void ov::CoreImpl::CoreConfig::set_weight_size(const int& size) {
std::lock_guard<std::mutex> lock(_cacheConfigMutex);
_weight_size = size;
}

// Creating thread-safe copy of config including shared_ptr to ICacheManager
// Passing empty or not-existing name will return global cache config
ov::CoreImpl::CoreConfig::CacheConfig ov::CoreImpl::CoreConfig::get_cache_config_for_device(
Expand Down Expand Up @@ -1582,7 +1594,24 @@ void ov::CoreImpl::add_mutex(const std::string& dev_name) {

/**
 * @brief Reads a model from file and records the total size of its constants.
 *
 * After the model is read, the byte sizes of all Constant nodes are summed
 * and the total, converted to whole MiB (rounded down), is stored in the core
 * configuration via CoreConfig::set_weight_size().
 *
 * @param modelPath Path to the model file.
 * @param binPath   Path to the weights file.
 * @return The model returned by ov::util::read_model.
 */
std::shared_ptr<ov::Model> ov::CoreImpl::read_model(const std::string& modelPath, const std::string& binPath) const {
    OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::ReadTime, "CoreImpl::read_model from file");
    auto ov_model = ov::util::read_model(modelPath, binPath, extensions, coreConfig.get_enable_mmap());

    // Accumulate exact byte counts and convert to MiB once at the end; summing
    // per-constant float MiB values loses precision on models with many constants.
    size_t total_weight_bytes = 0;
    for (const auto& node : ov_model->get_ordered_ops()) {
        if (const auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node)) {
            total_weight_bytes += constant->get_byte_size();
        }
    }
    constexpr size_t bytes_per_mib = 1024 * 1024;
    const int total_weight_mib = static_cast<int>(total_weight_bytes / bytes_per_mib);

    // NOTE(review): this const method mutates coreConfig through const_cast.
    // Confirm that the CoreImpl instance is never a truly const object, and
    // consider declaring the stored weight size mutable instead of casting.
    const_cast<ov::CoreImpl::CoreConfig&>(coreConfig).set_weight_size(total_weight_mib);

    return ov_model;
}

std::shared_ptr<ov::Model> ov::CoreImpl::read_model(const std::string& model,
Expand Down
4 changes: 4 additions & 0 deletions src/inference/src/dev/core_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this<ov::ICore

bool get_enable_mmap() const;

int get_weight_size() const;
void set_weight_size(const int& size);

// Creating thread-safe copy of config including shared_ptr to ICacheManager
// Passing empty or not-existing name will return global cache config
CacheConfig get_cache_config_for_device(const ov::Plugin& plugin, ov::AnyMap& parsedConfig) const;
Expand All @@ -92,6 +95,7 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this<ov::ICore
CacheConfig _cacheConfig;
std::map<std::string, CacheConfig> _cacheConfigPerDevice;
bool _flag_enable_mmap = true;
int _weight_size = 0;
};

struct CacheContent {
Expand Down
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,8 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
ov::hint::kv_cache_precision.name(),
". Supported values: u8, bf16, f16, f32");
}
} else if (key == "weights_size") {
weightSize = val.as<int>();
} else {
OPENVINO_THROW("NotFound: Unsupported property ", key, " by CPU plugin.");
}
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ struct Config {

int modelPreferThreads = -1;
ModelType modelType = ModelType::Unknown;
int weightSize = 0;

#ifdef CPU_DEBUG_CAPS
DebugCapsConfig debugCaps;
Expand Down
21 changes: 19 additions & 2 deletions src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,12 @@ std::vector<std::vector<int>> get_streams_info_table(const int input_streams,
} else if (hint_model_distribution_policy.size() == 0) {
for (auto& row : proc_socket_table) {
if (row[PROC_SOCKET_ID] == current_socket_id) {
n_threads_per_stream = std::max(n_threads_per_stream, row[ALL_PROC]);
// n_threads_per_stream = std::max(n_threads_per_stream, row[ALL_PROC]);
// my test: add update here?
n_threads_per_stream =
model_prefer_threads > 0
? std::min(std::max(n_threads_per_stream, row[ALL_PROC]), model_prefer_threads)
: std::max(n_threads_per_stream, row[ALL_PROC]);
}
}
} else {
Expand Down Expand Up @@ -468,6 +473,7 @@ int get_model_prefer_threads(const int num_streams,
Config& config) {
const int sockets = get_num_sockets();
auto model_prefer = 0;
bool isNewXeon = false;
if (-1 == config.modelPreferThreads) {
const auto isa = dnnl::get_effective_cpu_isa();
float isaSpecificThreshold = 1.0f;
Expand All @@ -484,7 +490,9 @@ int get_model_prefer_threads(const int num_streams,
isaSpecificThreshold = 2.0f;
break;
case dnnl::cpu_isa::avx512_core_amx:
case dnnl::cpu_isa::avx512_core_amx_fp16:
isaSpecificThreshold = 4.0f;
isNewXeon = true;
break;
default:
isaSpecificThreshold = 1.0f;
Expand Down Expand Up @@ -552,6 +560,7 @@ int get_model_prefer_threads(const int num_streams,

// latency
if (num_streams <= sockets && num_streams > 0) {
bool llm_related = has_matmul_with_compressed_weights(model);
if (proc_type_table[0][EFFICIENT_CORE_PROC] > 0 && proc_type_table[0][MAIN_CORE_PROC] > 0) {
#ifdef __APPLE__
if ((proc_type_table.size() == 1) && (proc_type_table[0][EFFICIENT_CORE_PROC] > 0)) {
Expand All @@ -560,7 +569,6 @@ int get_model_prefer_threads(const int num_streams,
: proc_type_table[0][ALL_PROC];
}
#else
bool llm_related = has_matmul_with_compressed_weights(model);
bool int8_intensive = ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(model) || llm_related;
const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores;
const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores;
Expand All @@ -574,6 +582,15 @@ int get_model_prefer_threads(const int num_streams,
: proc_type_table[0][MAIN_CORE_PROC])
: proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC];
#endif
} else if (isNewXeon && !llm_related && proc_type_table.size() > 1) {
// // my test
// model_prefer = 32;
// TODO: the config.weightSize threshold needs to be updated
if (config.weightSize <= 100 && proc_type_table[1][MAIN_CORE_PROC] > 32) {
model_prefer = 32;
} else {
model_prefer = proc_type_table[1][MAIN_CORE_PROC];
}
}
} else { // throughput
model_prefer = config.modelPreferThreads;
Expand Down

0 comments on commit 4c7cf04

Please sign in to comment.