diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 001ffe6cc8f92..24557bb81bce3 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -211,13 +211,23 @@ static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "ses /// /// Key for using the ORT format model flatbuffer bytes directly for initializers. /// This avoids copying the bytes and reduces peak memory usage during model loading and initialization. -/// Requires `session.use_ort_model_bytes_directly` to be true. +/// Requires `session.use_ort_model_bytes_directly` or `session.use_memory_mapped_ort_model` to be true. /// If set, the flatbuffer bytes provided when creating the InferenceSession MUST remain valid for the entire /// duration of the InferenceSession. /// static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers = "session.use_ort_model_bytes_for_initializers"; +/// +/// Key for using memory-mapped I/O to load ORT format model files. +/// When set to "1" and the session is created from a file path, ORT will use memory-mapped I/O +/// to load the .ort model file instead of reading it into a heap-allocated buffer. +/// When combined with session.use_ort_model_bytes_for_initializers, initializer Tensors will point directly into the mapped bytes; +/// the mapping must then remain valid for the lifetime of the InferenceSession, and the model weights are immutable. +/// If the memory mapping fails, the model load fails; any fallback to non-mapped loading must be handled by the caller. +/// +static const char* const kOrtSessionOptionsConfigUseMemoryMappedOrtModel = "session.use_memory_mapped_ort_model"; + // This should only be specified when exporting an ORT format model for use on a different platform. // If the ORT format model will be used on ARM platforms set to "1". 
For other platforms set to "0" // Available since version 1.11. diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc index aeddef0c5188f..8faec1423cd3c 100644 --- a/onnxruntime/core/platform/posix/env.cc +++ b/onnxruntime/core/platform/posix/env.cc @@ -54,6 +54,7 @@ limitations under the License. #include #include "core/common/logging/logging.h" #include "core/common/narrow.h" +#include "core/common/safeint.h" #include "core/platform/scoped_resource.h" #include "core/platform/EigenNonBlockingThreadPool.h" @@ -430,9 +431,21 @@ class PosixEnv : public Env { return Status::OK(); } + // Validate that the file is large enough for the requested mapping. + struct stat file_stat; + if (fstat(file_descriptor.Get(), &file_stat) != 0) { + return ReportSystemError("fstat", file_path); + } + const size_t requested_end = SafeInt(offset) + length; + ORT_RETURN_IF(static_cast(file_stat.st_size) < requested_end, + "File \"", file_path, + "\" is too small for the requested mapping (file size: ", + file_stat.st_size, " bytes, requested offset + length: ", + requested_end, " bytes)."); + static const size_t page_size = narrow(sysconf(_SC_PAGESIZE)); const FileOffsetType offset_to_page = offset % static_cast(page_size); - const size_t mapped_length = length + static_cast(offset_to_page); + const size_t mapped_length = SafeInt(length) + static_cast(offset_to_page); const FileOffsetType mapped_offset = offset - offset_to_page; void* const mapped_base = mmap(nullptr, mapped_length, PROT_READ | PROT_WRITE, MAP_PRIVATE, file_descriptor.Get(), mapped_offset); diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 91255f6ed7376..4d80b5afff4b8 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -424,6 +424,22 @@ Status WindowsEnv::MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path, " - ", std::system_category().message(error_code)); } + // Validate that 
the file is large enough for the requested mapping. + LARGE_INTEGER actual_size; + if (!GetFileSizeEx(file_handle.get(), &actual_size)) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "GetFileSizeEx ", ToUTF8String(Basename(file_path)), + " fail, errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } + const size_t requested_end = SafeInt(offset) + length; + ORT_RETURN_IF(static_cast(actual_size.QuadPart) < requested_end, + "File ", ToUTF8String(Basename(file_path)), + " is too small for the requested mapping (file size: ", + actual_size.QuadPart, " bytes, requested offset + length: ", + requested_end, " bytes)."); + wil::unique_hfile file_mapping_handle{ CreateFileMappingW(file_handle.get(), nullptr, diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 5b5b38176d357..f6f0d5d80b9e9 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1747,10 +1747,36 @@ static Status LoadOrtModelBytes(const PathString& model_uri, return Status::OK(); } +static Status LoadOrtModelBytesMapped(const PathString& model_uri, + gsl::span& bytes, + Env::MappedMemoryPtr& mapped_memory) { + size_t num_bytes = 0; + ORT_RETURN_IF_ERROR(Env::Default().GetFileLength(model_uri.c_str(), num_bytes)); + ORT_RETURN_IF(num_bytes == 0, "Cannot memory-map an empty file: ", ToUTF8String(model_uri)); + + ORT_RETURN_IF_ERROR(Env::Default().MapFileIntoMemory(model_uri.c_str(), 0, num_bytes, mapped_memory)); + + bytes = gsl::span(reinterpret_cast(mapped_memory.get()), num_bytes); + + return Status::OK(); +} + Status InferenceSession::LoadOrtModel(const PathString& model_uri) { return LoadOrtModelWithLoader( [&]() { model_location_ = model_uri; + + const auto& config_options = GetSessionOptions().config_options; + const bool use_mmap = + config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, 
"0") == "1"; + + if (use_mmap) { + ORT_RETURN_IF_ERROR( + LoadOrtModelBytesMapped(model_location_, ort_format_model_bytes_, ort_format_model_mapped_memory_)); + LOGS(*session_logger_, INFO) << "ORT model loaded via memory-mapped I/O."; + return Status::OK(); + } + ORT_RETURN_IF_ERROR( LoadOrtModelBytes(model_location_, ort_format_model_bytes_, ort_format_model_bytes_data_holder_)); return Status::OK(); @@ -1760,6 +1786,11 @@ Status InferenceSession::LoadOrtModel(const PathString& model_uri) { Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len) { return LoadOrtModelWithLoader([&]() { const auto& config_options = GetSessionOptions().config_options; + + if (config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1") { + LOGS(*session_logger_, WARNING) << "session.use_memory_mapped_ort_model is ignored when loading from a buffer."; + } + const auto use_ort_model_bytes_directly = config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "0") == "1"; @@ -1858,8 +1889,8 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function load_ort ORT_RETURN_IF(nullptr == fbs_model, "Missing Model. Invalid ORT format model."); // if we're using the bytes directly because kOrtSessionOptionsConfigUseORTModelBytesDirectly was set and the user - // provided an existing buffer of bytes when creating the InferenceSession, ort_format_model_bytes_data_holder_ - // will be empty. + // provided an existing buffer of bytes when creating the InferenceSession, or because we memory-mapped the file, + // ort_format_model_bytes_data_holder_ will be empty. // if that is the case we also allow creating initializers that directly use those bytes. 
const auto& config_options = session_options_.config_options; using_ort_model_bytes_for_initializers_ = @@ -2681,6 +2712,7 @@ common::Status InferenceSession::Initialize() { if (!using_ort_model_bytes_for_initializers_) { ort_format_model_bytes_ = gsl::span(); std::vector().swap(ort_format_model_bytes_data_holder_); + ort_format_model_mapped_memory_.reset(); } // once the model is saved, we may remove unnecessary attributes for inference diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index 705e420eb1137..1d2acb08241d3 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -15,6 +15,7 @@ #include "core/common/path_string.h" #include "core/common/profiler.h" #include "core/common/status.h" +#include "core/platform/env.h" #include "core/framework/execution_providers.h" #include "core/framework/framework_common.h" #include "core/framework/iexecutor.h" @@ -1025,6 +1026,8 @@ class InferenceSession { // We store them currently in the ort_format_model_bytes_data_holder_ to make the Load + Initialize // behave the same way as for an ONNX model, as we need some of the bytes for the Load (create the Model) // and some for the Initialize (create SessionState). + // If "session.use_memory_mapped_ort_model" is set, we memory-map the file instead and store the + // mapping in ort_format_model_mapped_memory_. // Short term we free them after Initialize. // Longer term we may want to directly refer to offsets in this buffer for initializers so we don't need to copy // those into new OrtValue instances, at which point we won't free them until the InferenceSession goes away. 
@@ -1033,9 +1036,13 @@ class InferenceSession { // This holds the actual model data // In case if the session is started with an input byte array contains model data, and the caller // specifies that ORT should use the model bytes directly by setting the session config option - // "session.use_ort_model_bytes_directly" to "1", this will be empty + // "session.use_ort_model_bytes_directly" to "1", this will be empty. + // Also empty when using memory-mapped loading, as the data is held by ort_format_model_mapped_memory_. std::vector ort_format_model_bytes_data_holder_; + // Holds the memory-mapped file data when session.use_memory_mapped_ort_model is set. + Env::MappedMemoryPtr ort_format_model_mapped_memory_; + bool using_ort_model_bytes_for_initializers_{false}; // Container to store pre-packed weights to share between sessions. diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc index 84e85c7bba7ee..ec4f8967fd2a3 100644 --- a/onnxruntime/test/framework/ort_model_only_test.cc +++ b/onnxruntime/test/framework/ort_model_only_test.cc @@ -37,6 +37,7 @@ struct OrtModelTestInfo { bool run_use_buffer{false}; bool disable_copy_ort_buffer{false}; bool use_buffer_for_initializers{false}; + bool use_memory_mapped_load{false}; TransformerLevel optimization_level = TransformerLevel::Level3; }; @@ -49,10 +50,15 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) { if (test_info.disable_copy_ort_buffer) { ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "1")); + } - if (test_info.use_buffer_for_initializers) { - ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1")); - } + if (test_info.use_memory_mapped_load) { + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1")); + } + + if (test_info.use_buffer_for_initializers && + 
(test_info.disable_copy_ort_buffer || (test_info.use_memory_mapped_load && !test_info.run_use_buffer))) { + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1")); } so.graph_optimization_level = test_info.optimization_level; @@ -557,6 +563,31 @@ TEST(OrtModelOnlyTests, LoadOrtFormatModelFromBufferNoCopyInitializersUseBuffer) RunOrtModel(test_info); } +// Load the model from a file using memory-mapped I/O +TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMapped) { + OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel(); + test_info.use_memory_mapped_load = true; + RunOrtModel(test_info); +} + +// Load the model from a file using memory-mapped I/O, with initializers referencing the mapped bytes +TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedWithInitializersFromMap) { + OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel(); + test_info.use_memory_mapped_load = true; + test_info.use_buffer_for_initializers = true; + RunOrtModel(test_info); +} + +// Verify that mmap loading fails gracefully on a non-existent file +TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedFailsOnMissingFile) { + SessionOptions so; + so.session_logid = "MemoryMappedMissingFile"; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1")); + InferenceSessionWrapper session_object{so, GetEnvironment()}; + auto status = session_object.Load(ORT_TSTR("nonexistent_model.ort")); + ASSERT_FALSE(status.IsOK()); +} + // regression test for 2 issues covered by PR #17000 (internally reported issue). // 1) allocation planner broke in minimal build when subgraph had no nodes. 
// 2) usage of a sequence data type caused an exception due to IsSparseTensor() throwing diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index fe172acb24a34..3f878ac796ee3 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -187,6 +187,9 @@ ABSL_FLAG(int, spin_backoff_max, 1, "legacy single-SpinPause behavior. Values >= 2 enable exp-backoff (typical: 4 or 8) to reduce " "CPU/power density during the spin window. Values above 64 are clamped to 64."); ABSL_FLAG(bool, n, DefaultPerformanceTestConfig().run_config.exit_after_session_creation, "Allows user to measure session creation time to measure impact of enabling any initialization optimizations."); +ABSL_FLAG(uint32_t, hold_ms_after_session_creation, DefaultPerformanceTestConfig().run_config.hold_ms_after_session_creation, + "When used with -n, keeps the process alive for the specified number of milliseconds after session creation.\n" + "Prints 'SESSION_READY' to stdout before sleeping. 
Useful for multi-process memory measurements."); ABSL_FLAG(bool, l, DefaultPerformanceTestConfig().model_info.load_via_path, "Provides file as binary in memory by using fopen before session creation."); ABSL_FLAG(bool, g, DefaultPerformanceTestConfig().run_config.enable_cuda_io_binding, "[TensorRT RTX | TensorRT | CUDA] Enables tensor input and output bindings on CUDA before session run."); ABSL_FLAG(bool, X, DefaultPerformanceTestConfig().run_config.use_extensions, "Registers custom ops from onnxruntime-extensions."); @@ -529,6 +532,13 @@ bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int a // -n test_config.run_config.exit_after_session_creation = absl::GetFlag(FLAGS_n); + // --hold_ms_after_session_creation + test_config.run_config.hold_ms_after_session_creation = absl::GetFlag(FLAGS_hold_ms_after_session_creation); + if (test_config.run_config.hold_ms_after_session_creation > 0 && + !test_config.run_config.exit_after_session_creation) { + fprintf(stderr, "WARNING: --hold_ms_after_session_creation has no effect without -n.\n"); + } + // -l test_config.model_info.load_via_path = absl::GetFlag(FLAGS_l); diff --git a/onnxruntime/test/perftest/main.cc b/onnxruntime/test/perftest/main.cc index 512f217a77151..2c7764507571d 100644 --- a/onnxruntime/test/perftest/main.cc +++ b/onnxruntime/test/perftest/main.cc @@ -3,7 +3,10 @@ // onnxruntime dependencies #include +#include +#include #include +#include #include "command_args_parser.h" #include "performance_runner.h" #include "utils.h" @@ -127,6 +130,11 @@ int RunPerfTest(Ort::Env& env, const perftest::PerformanceTestConfig& test_confi // Exit if user enabled -n option so that user can measure session creation time if (test_config.run_config.exit_after_session_creation) { perf_runner.LogSessionCreationTime(); + if (test_config.run_config.hold_ms_after_session_creation > 0) { + std::cout << "SESSION_READY" << std::endl; + std::this_thread::sleep_for( + 
std::chrono::milliseconds(test_config.run_config.hold_ms_after_session_creation)); + } return 0; } diff --git a/onnxruntime/test/perftest/test_configuration.h b/onnxruntime/test/perftest/test_configuration.h index 643a4fbc539ec..dea0b196f6bcb 100644 --- a/onnxruntime/test/perftest/test_configuration.h +++ b/onnxruntime/test/perftest/test_configuration.h @@ -76,6 +76,7 @@ struct RunConfig { int spin_backoff_max = 1; // 1 means no backoff (default) bool spin_backoff_max_set = false; bool exit_after_session_creation = false; + uint32_t hold_ms_after_session_creation{0}; std::basic_string register_custom_op_path; bool enable_cuda_io_binding{false}; bool use_extensions = false; diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc index 924f9da41abef..cf110bd17b211 100644 --- a/onnxruntime/test/platform/file_io_test.cc +++ b/onnxruntime/test/platform/file_io_test.cc @@ -151,6 +151,11 @@ TEST(FileIoTest, MapFileIntoMemory) { // invalid - negative offset ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK()); + + // invalid - requested length exceeds file size + auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory); + ASSERT_FALSE(status.IsOK()); + ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos); } } #else @@ -184,6 +189,11 @@ TEST(FileIoTest, MapFileIntoMemory) { // invalid - negative offset ASSERT_STATUS_NOT_OK(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory)); + + // invalid - requested length exceeds file size + auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory); + ASSERT_FALSE(status.IsOK()); + ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos); } } #endif diff --git a/tools/python/benchmark_mmap_ort.py b/tools/python/benchmark_mmap_ort.py new file mode 
100644 index 0000000000000..c52dd52976662 --- /dev/null +++ b/tools/python/benchmark_mmap_ort.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python3 +""" +Developer benchmark for memory-mapped .ort model loading. + +Compares session construction time and process memory across loading configurations: + 1. Standard .ort load (file read into heap buffer) + 2. Memory-mapped .ort load (session.use_memory_mapped_ort_model) + 3. Memory-mapped + direct initializers (+ session.use_ort_model_bytes_for_initializers) + +Not intended for CI gating or official performance measurement. + +Usage: + python benchmark_mmap_ort.py --perf-test --model + python benchmark_mmap_ort.py --perf-test --model --multi-process + +Requirements: + - Built onnxruntime_perf_test executable (with --hold_ms_after_session_creation support for --multi-process) + - .ort model file + - psutil package (pip install psutil) for memory measurements +""" + +import argparse +import json +import os +import re +import statistics +import subprocess +import sys +import time + +try: + import psutil + + HAS_PSUTIL = True +except ImportError: + HAS_PSUTIL = False + +IS_WINDOWS = sys.platform == "win32" + + +def _get_private_and_ws(ps: "psutil.Process") -> tuple[int, int]: + """Get private memory and working set for a process. + + On Windows, memory_info() exposes 'private' and 'wset' directly. + On POSIX, use memory_full_info().uss for true private (unique set size), + falling back to RSS if memory_full_info() is unavailable. 
+ """ + if IS_WINDOWS: + mem = ps.memory_info() + return getattr(mem, "private", mem.rss), getattr(mem, "wset", mem.rss) + # POSIX: prefer USS (unique set size) for accurate private memory + try: + mem_full = ps.memory_full_info() + return mem_full.uss, mem_full.rss + except (psutil.AccessDenied, AttributeError): + mem = ps.memory_info() + return mem.rss, mem.rss + + +# -- Helpers -- + + +def parse_perf_test_output(output: str) -> dict: + """Parse onnxruntime_perf_test stdout for session creation time.""" + metrics = {} + for key, pattern in { + "session_creation_time_s": r"Session creation time cost:\s+([\d.]+)\s+s", + "peak_working_set_bytes": r"Peak working set size:\s+(\d+)\s+bytes", + }.items(): + match = re.search(pattern, output) + if match: + val = match.group(1) + metrics[key] = float(val) if "." in val else int(val) + return metrics + + +def build_perf_test_cmd(perf_test_exe: str, model_path: str, session_configs: dict) -> list[str]: + """Build the onnxruntime_perf_test command line for session-only mode.""" + cmd = [perf_test_exe] + if session_configs: + config_str = " ".join(f"{k}|{v}" for k, v in session_configs.items()) + cmd.extend(["-C", config_str]) + cmd.append("-n") + cmd.append(model_path) + return cmd + + +def run_session_benchmark(perf_test_exe: str, model_path: str, session_configs: dict) -> dict: + """Run a single session-creation benchmark, capturing timing and memory.""" + cmd = build_perf_test_cmd(perf_test_exe, model_path, session_configs) + + if HAS_PSUTIL: + # Launch and poll memory during execution + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ps = psutil.Process(proc.pid) + peak_private = 0 + peak_ws = 0 + try: + while proc.poll() is None: + try: + private, ws = _get_private_and_ws(ps) + peak_private = max(peak_private, private) + peak_ws = max(peak_ws, ws) + except (psutil.NoSuchProcess, psutil.AccessDenied): + break + time.sleep(0.005) + except psutil.NoSuchProcess: + pass # process exited during 
polling, peak already captured + try: + stdout, _ = proc.communicate(timeout=30) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + return {} + if proc.returncode != 0: + return {} + metrics = parse_perf_test_output(stdout.decode(errors="replace") if isinstance(stdout, bytes) else stdout) + if peak_private > 0: + metrics["peak_private_bytes"] = peak_private + metrics["peak_working_set_bytes"] = peak_ws + return metrics + + # Fallback without psutil: timing only + result = subprocess.run(cmd, check=False, capture_output=True, text=True, timeout=300) + return parse_perf_test_output(result.stdout) if result.returncode == 0 else {} + + +# -- Single-process benchmark -- + + +def run_configuration( + perf_test_exe: str, + model_path: str, + config_name: str, + session_configs: dict, + num_iterations: int = 10, + warmup_iterations: int = 2, +) -> dict: + """Run a configuration multiple times and return aggregated results.""" + print(f"\n{'=' * 60}") + print(f" {config_name}") + print(f" Warmup: {warmup_iterations}, Iterations: {num_iterations}") + print(f"{'=' * 60}") + + for i in range(warmup_iterations): + run_session_benchmark(perf_test_exe, model_path, session_configs) + print(f" Warmup {i + 1}: done") + + session_times = [] + private_samples = [] + ws_samples = [] + + for i in range(num_iterations): + metrics = run_session_benchmark(perf_test_exe, model_path, session_configs) + if not metrics: + print(f" Run {i + 1}: FAILED") + continue + t = metrics.get("session_creation_time_s", 0) * 1000 + p = metrics.get("peak_private_bytes", 0) / 1024 / 1024 + w = metrics.get("peak_working_set_bytes", 0) / 1024 / 1024 + session_times.append(t) + private_samples.append(p) + ws_samples.append(w) + print(f" Run {i + 1}: session={t:.2f}ms, private={p:.1f}MB, ws={w:.1f}MB") + + result = {"config_name": config_name} + if session_times: + result["session_ms"] = { + "mean": statistics.mean(session_times), + "stdev": statistics.stdev(session_times) if len(session_times) > 
1 else 0, + } + if any(p > 0 for p in private_samples): + result["private_mb"] = {"mean": statistics.mean(private_samples)} + if any(w > 0 for w in ws_samples): + result["ws_mb"] = {"mean": statistics.mean(ws_samples)} + return result + + +# -- Multi-process benchmark -- + + +def run_multi_process_benchmark( + perf_test_exe: str, + model_path: str, + session_configs: dict, + num_processes: int = 4, + config_name: str = "", +) -> dict: + """Launch N processes with live ORT sessions and measure concurrent memory. + + Requires onnxruntime_perf_test built with --hold_ms_after_session_creation support + and psutil for memory measurement. + """ + if not HAS_PSUTIL: + print(" WARNING: psutil not installed, skipping multi-process benchmark") + return {} + + print(f"\n{'=' * 60}") + print(f" {config_name} ({num_processes} processes)") + print(f"{'=' * 60}") + + cmd = build_perf_test_cmd(perf_test_exe, model_path, session_configs) + cmd.insert(-1, "--hold_ms_after_session_creation=30000") # insert before model_path + + # Launch all processes + ps_processes = [] + try: + for i in range(num_processes): + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + try: + ps = psutil.Process(proc.pid) + except psutil.NoSuchProcess: + ps = None + ps_processes.append((i, proc, ps)) + print(f" Started process {i + 1} (PID={proc.pid})") + + # Wait for each process to signal SESSION_READY + for i, proc, _ps in ps_processes: + for line in proc.stdout: + if b"SESSION_READY" in line: + print(f" Process {i + 1}: ready") + break + + time.sleep(0.5) # stabilization + + # Measure memory (all processes alive with loaded sessions) + total_private = 0 + total_ws = 0 + per_process = [] + for i, proc, ps in ps_processes: + if ps and proc.poll() is None: + try: + private, ws = _get_private_and_ws(ps) + private_mb = private / 1024 / 1024 + ws_mb = ws / 1024 / 1024 + total_private += private_mb + total_ws += ws_mb + per_process.append({"pid": proc.pid, "private_mb": private_mb, 
"ws_mb": ws_mb}) + print(f" Process {i + 1} (PID={proc.pid}): private={private_mb:.1f}MB, ws={ws_mb:.1f}MB") + except (psutil.NoSuchProcess, psutil.AccessDenied) as e: + print(f" Process {i + 1}: could not read memory ({e})") + else: + print(f" Process {i + 1}: not running") + finally: + # Cleanup: ensure all child processes are terminated + for _, proc, _ in ps_processes: + proc.terminate() + for _, proc, _ in ps_processes: + try: + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + proc.kill() + + return { + "config_name": config_name, + "num_processes": num_processes, + "total_private_mb": total_private, + "total_ws_mb": total_ws, + "per_process": per_process, + } + + +# -- Output -- + + +def print_summary(results: list[dict]): + """Print results table with relative comparison.""" + print(f"\n{'=' * 90}") + print("BENCHMARK RESULTS SUMMARY") + print(f"{'=' * 90}") + + header = f"{'Configuration':<45} {'Session (ms)':<15} {'Private (MB)':<15} {'WS (MB)':<15}" + print(header) + print("-" * len(header)) + + for r in results: + name = r.get("config_name", "?") + t = r.get("session_ms", {}).get("mean", 0) + p = r.get("private_mb", {}).get("mean", 0) + w = r.get("ws_mb", {}).get("mean", 0) + print(f"{name:<45} {t:<15.2f} {p:<15.1f} {w:<15.1f}") + + # Relative to .ort standard baseline + if len(results) >= 2: + baseline = next((r for r in results if r.get("config_name", "").startswith("1.")), results[0]) + bt = baseline.get("session_ms", {}).get("mean", 0) + bp = baseline.get("private_mb", {}).get("mean", 0) + print(f"\nRelative to {baseline['config_name']}:") + print("-" * 60) + for r in results: + if r is baseline: + continue + name = r.get("config_name", "?") + rt = r.get("session_ms", {}).get("mean", 0) + rp = r.get("private_mb", {}).get("mean", 0) + parts = [f" {name}:"] + if bt > 0: + parts.append(f" Session: {(rt - bt) / bt * 100:+.1f}%") + if bp > 0: + parts.append(f" Private: {(rp - bp) / bp * 100:+.1f}%") + print("\n".join(parts)) + + +# -- Main -- + 
+ +CONFIGS = [ + ("1. .ort standard load (baseline)", {}), + ("2. .ort memory-mapped load", {"session.use_memory_mapped_ort_model": "1"}), + ( + "3. .ort mmap + direct initializers", + {"session.use_memory_mapped_ort_model": "1", "session.use_ort_model_bytes_for_initializers": "1"}, + ), +] + + +def main(): + parser = argparse.ArgumentParser(description="Benchmark memory-mapped .ort model loading") + parser.add_argument("--perf-test", required=True, help="Path to onnxruntime_perf_test executable") + parser.add_argument("--model", required=True, help="Path to .ort model file") + parser.add_argument("--iterations", type=int, default=10, help="Number of measured iterations per config") + parser.add_argument("--multi-process", action="store_true", help="Run multi-process memory sharing benchmark") + parser.add_argument("--num-processes", type=int, default=4, help="Number of processes for --multi-process") + parser.add_argument("--output", help="Save results to JSON file") + args = parser.parse_args() + + perf_test = os.path.abspath(args.perf_test) + model_path = os.path.abspath(args.model) + + for path, label in [(perf_test, "perf_test"), (model_path, "model")]: + if not os.path.exists(path): + print(f"ERROR: {label} not found: {path}") + sys.exit(1) + + model_size_mb = os.path.getsize(model_path) / 1024 / 1024 + print(f"\nModel: {os.path.basename(model_path)} ({model_size_mb:.1f} MB)") + print(f"Perf test: {perf_test}") + print(f"Iterations: {args.iterations}") + if not HAS_PSUTIL: + print("WARNING: psutil not installed — memory metrics will not be collected") + + # Single-process benchmarks + results = [] + for config_name, session_configs in CONFIGS: + results.append( + run_configuration(perf_test, model_path, config_name, session_configs, num_iterations=args.iterations) + ) + print_summary(results) + + # Multi-process benchmarks + mp_results = [] + if args.multi_process: + for config_name, session_configs in CONFIGS: + mp_results.append( + 
run_multi_process_benchmark( + perf_test, model_path, session_configs, num_processes=args.num_processes, config_name=config_name + ) + ) + if mp_results: + print(f"\n{'=' * 60}") + print("MULTI-PROCESS RESULTS") + print(f"{'=' * 60}") + for r in mp_results: + if r: + print(f" {r['config_name']} ({r['num_processes']} processes):") + print(f" Total private: {r['total_private_mb']:.1f} MB") + print(f" Total WS: {r['total_ws_mb']:.1f} MB") + + if args.output: + with open(args.output, "w") as f: + json.dump( + { + "model": os.path.basename(model_path), + "model_size_mb": model_size_mb, + "single": results, + "multi": mp_results, + }, + f, + indent=2, + ) + print(f"\nResults saved to: {args.output}") + + +if __name__ == "__main__": + main()