diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 001ffe6cc8f92..24557bb81bce3 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -211,13 +211,23 @@ static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "ses
///
/// Key for using the ORT format model flatbuffer bytes directly for initializers.
/// This avoids copying the bytes and reduces peak memory usage during model loading and initialization.
-/// Requires `session.use_ort_model_bytes_directly` to be true.
+/// Requires `session.use_ort_model_bytes_directly` or `session.use_memory_mapped_ort_model` to be true.
/// If set, the flatbuffer bytes provided when creating the InferenceSession MUST remain valid for the entire
/// duration of the InferenceSession.
///
static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers =
"session.use_ort_model_bytes_for_initializers";
+///
+/// Key for using memory-mapped I/O to load ORT format model files.
+/// When set to "1" and the session is created from a file path, ORT will use memory-mapped I/O
+/// to load the .ort model file instead of reading it into a heap-allocated buffer.
+/// When combined with session.use_ort_model_bytes_for_initializers, initializer tensors reference the mapped
+/// bytes directly; the mapping must then remain valid for the lifetime of the session and the weights are read-only.
+/// If memory mapping fails, the model load fails; the caller is responsible for any fallback loading strategy.
+///
+static const char* const kOrtSessionOptionsConfigUseMemoryMappedOrtModel = "session.use_memory_mapped_ort_model";
+
// This should only be specified when exporting an ORT format model for use on a different platform.
// If the ORT format model will be used on ARM platforms set to "1". For other platforms set to "0"
// Available since version 1.11.
diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc
index aeddef0c5188f..8faec1423cd3c 100644
--- a/onnxruntime/core/platform/posix/env.cc
+++ b/onnxruntime/core/platform/posix/env.cc
@@ -54,6 +54,7 @@ limitations under the License.
#include
#include "core/common/logging/logging.h"
#include "core/common/narrow.h"
+#include "core/common/safeint.h"
#include "core/platform/scoped_resource.h"
#include "core/platform/EigenNonBlockingThreadPool.h"
@@ -430,9 +431,21 @@ class PosixEnv : public Env {
return Status::OK();
}
+ // Validate that the file is large enough for the requested mapping.
+ struct stat file_stat;
+ if (fstat(file_descriptor.Get(), &file_stat) != 0) {
+ return ReportSystemError("fstat", file_path);
+ }
+ const size_t requested_end = SafeInt<size_t>(offset) + length;
+ ORT_RETURN_IF(static_cast<size_t>(file_stat.st_size) < requested_end,
+ "File \"", file_path,
+ "\" is too small for the requested mapping (file size: ",
+ file_stat.st_size, " bytes, requested offset + length: ",
+ requested_end, " bytes).");
+
 static const size_t page_size = narrow<size_t>(sysconf(_SC_PAGESIZE));
 const FileOffsetType offset_to_page = offset % static_cast<FileOffsetType>(page_size);
- const size_t mapped_length = length + static_cast<size_t>(offset_to_page);
+ const size_t mapped_length = SafeInt<size_t>(length) + static_cast<size_t>(offset_to_page);
const FileOffsetType mapped_offset = offset - offset_to_page;
void* const mapped_base =
mmap(nullptr, mapped_length, PROT_READ | PROT_WRITE, MAP_PRIVATE, file_descriptor.Get(), mapped_offset);
diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
index 91255f6ed7376..4d80b5afff4b8 100644
--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@@ -424,6 +424,22 @@ Status WindowsEnv::MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path,
" - ", std::system_category().message(error_code));
}
+ // Validate that the file is large enough for the requested mapping.
+ LARGE_INTEGER actual_size;
+ if (!GetFileSizeEx(file_handle.get(), &actual_size)) {
+ const auto error_code = GetLastError();
+ return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+ "GetFileSizeEx ", ToUTF8String(Basename(file_path)),
+ " fail, errcode = ", error_code,
+ " - ", std::system_category().message(error_code));
+ }
+ const size_t requested_end = SafeInt<size_t>(offset) + length;
+ ORT_RETURN_IF(static_cast<size_t>(actual_size.QuadPart) < requested_end,
+ "File ", ToUTF8String(Basename(file_path)),
+ " is too small for the requested mapping (file size: ",
+ actual_size.QuadPart, " bytes, requested offset + length: ",
+ requested_end, " bytes).");
+
wil::unique_hfile file_mapping_handle{
CreateFileMappingW(file_handle.get(),
nullptr,
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 5b5b38176d357..f6f0d5d80b9e9 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -1747,10 +1747,36 @@ static Status LoadOrtModelBytes(const PathString& model_uri,
return Status::OK();
}
+static Status LoadOrtModelBytesMapped(const PathString& model_uri,
+ gsl::span<const uint8_t>& bytes,
+ Env::MappedMemoryPtr& mapped_memory) {
+ size_t num_bytes = 0;
+ ORT_RETURN_IF_ERROR(Env::Default().GetFileLength(model_uri.c_str(), num_bytes));
+ ORT_RETURN_IF(num_bytes == 0, "Cannot memory-map an empty file: ", ToUTF8String(model_uri));
+
+ ORT_RETURN_IF_ERROR(Env::Default().MapFileIntoMemory(model_uri.c_str(), 0, num_bytes, mapped_memory));
+
+ bytes = gsl::span<const uint8_t>(reinterpret_cast<const uint8_t*>(mapped_memory.get()), num_bytes);
+
+ return Status::OK();
+}
+
Status InferenceSession::LoadOrtModel(const PathString& model_uri) {
return LoadOrtModelWithLoader(
[&]() {
model_location_ = model_uri;
+
+ const auto& config_options = GetSessionOptions().config_options;
+ const bool use_mmap =
+ config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1";
+
+ if (use_mmap) {
+ ORT_RETURN_IF_ERROR(
+ LoadOrtModelBytesMapped(model_location_, ort_format_model_bytes_, ort_format_model_mapped_memory_));
+ LOGS(*session_logger_, INFO) << "ORT model loaded via memory-mapped I/O.";
+ return Status::OK();
+ }
+
ORT_RETURN_IF_ERROR(
LoadOrtModelBytes(model_location_, ort_format_model_bytes_, ort_format_model_bytes_data_holder_));
return Status::OK();
@@ -1760,6 +1786,11 @@ Status InferenceSession::LoadOrtModel(const PathString& model_uri) {
Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len) {
return LoadOrtModelWithLoader([&]() {
const auto& config_options = GetSessionOptions().config_options;
+
+ if (config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1") {
+ LOGS(*session_logger_, WARNING) << "session.use_memory_mapped_ort_model is ignored when loading from a buffer.";
+ }
+
const auto use_ort_model_bytes_directly =
config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "0") == "1";
@@ -1858,8 +1889,8 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function<Status()> load_ort
ORT_RETURN_IF(nullptr == fbs_model, "Missing Model. Invalid ORT format model.");
// if we're using the bytes directly because kOrtSessionOptionsConfigUseORTModelBytesDirectly was set and the user
- // provided an existing buffer of bytes when creating the InferenceSession, ort_format_model_bytes_data_holder_
- // will be empty.
+ // provided an existing buffer of bytes when creating the InferenceSession, or because we memory-mapped the file,
+ // ort_format_model_bytes_data_holder_ will be empty.
// if that is the case we also allow creating initializers that directly use those bytes.
const auto& config_options = session_options_.config_options;
using_ort_model_bytes_for_initializers_ =
@@ -2681,6 +2712,7 @@ common::Status InferenceSession::Initialize() {
if (!using_ort_model_bytes_for_initializers_) {
 ort_format_model_bytes_ = gsl::span<const uint8_t>();
 std::vector<uint8_t>().swap(ort_format_model_bytes_data_holder_);
+ ort_format_model_mapped_memory_.reset();
}
// once the model is saved, we may remove unnecessary attributes for inference
diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h
index 705e420eb1137..1d2acb08241d3 100644
--- a/onnxruntime/core/session/inference_session.h
+++ b/onnxruntime/core/session/inference_session.h
@@ -15,6 +15,7 @@
#include "core/common/path_string.h"
#include "core/common/profiler.h"
#include "core/common/status.h"
+#include "core/platform/env.h"
#include "core/framework/execution_providers.h"
#include "core/framework/framework_common.h"
#include "core/framework/iexecutor.h"
@@ -1025,6 +1026,8 @@ class InferenceSession {
// We store them currently in the ort_format_model_bytes_data_holder_ to make the Load + Initialize
// behave the same way as for an ONNX model, as we need some of the bytes for the Load (create the Model)
// and some for the Initialize (create SessionState).
+ // If "session.use_memory_mapped_ort_model" is set, we memory-map the file instead and store the
+ // mapping in ort_format_model_mapped_memory_.
// Short term we free them after Initialize.
// Longer term we may want to directly refer to offsets in this buffer for initializers so we don't need to copy
// those into new OrtValue instances, at which point we won't free them until the InferenceSession goes away.
@@ -1033,9 +1036,13 @@ class InferenceSession {
// This holds the actual model data
// In case if the session is started with an input byte array contains model data, and the caller
// specifies that ORT should use the model bytes directly by setting the session config option
- // "session.use_ort_model_bytes_directly" to "1", this will be empty
+ // "session.use_ort_model_bytes_directly" to "1", this will be empty.
+ // Also empty when using memory-mapped loading, as the data is held by ort_format_model_mapped_memory_.
 std::vector<uint8_t> ort_format_model_bytes_data_holder_;
+ // Holds the memory-mapped file data when session.use_memory_mapped_ort_model is set.
+ Env::MappedMemoryPtr ort_format_model_mapped_memory_;
+
bool using_ort_model_bytes_for_initializers_{false};
// Container to store pre-packed weights to share between sessions.
diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc
index 84e85c7bba7ee..ec4f8967fd2a3 100644
--- a/onnxruntime/test/framework/ort_model_only_test.cc
+++ b/onnxruntime/test/framework/ort_model_only_test.cc
@@ -37,6 +37,7 @@ struct OrtModelTestInfo {
bool run_use_buffer{false};
bool disable_copy_ort_buffer{false};
bool use_buffer_for_initializers{false};
+ bool use_memory_mapped_load{false};
TransformerLevel optimization_level = TransformerLevel::Level3;
};
@@ -49,10 +50,15 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) {
if (test_info.disable_copy_ort_buffer) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "1"));
+ }
- if (test_info.use_buffer_for_initializers) {
- ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
- }
+ if (test_info.use_memory_mapped_load) {
+ ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1"));
+ }
+
+ if (test_info.use_buffer_for_initializers &&
+ (test_info.disable_copy_ort_buffer || (test_info.use_memory_mapped_load && !test_info.run_use_buffer))) {
+ ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
}
so.graph_optimization_level = test_info.optimization_level;
@@ -557,6 +563,31 @@ TEST(OrtModelOnlyTests, LoadOrtFormatModelFromBufferNoCopyInitializersUseBuffer)
RunOrtModel(test_info);
}
+// Load the model from a file using memory-mapped I/O
+TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMapped) {
+ OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel();
+ test_info.use_memory_mapped_load = true;
+ RunOrtModel(test_info);
+}
+
+// Load the model from a file using memory-mapped I/O, with initializers referencing the mapped bytes
+TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedWithInitializersFromMap) {
+ OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel();
+ test_info.use_memory_mapped_load = true;
+ test_info.use_buffer_for_initializers = true;
+ RunOrtModel(test_info);
+}
+
+// Verify that mmap loading fails gracefully on a non-existent file
+TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedFailsOnMissingFile) {
+ SessionOptions so;
+ so.session_logid = "MemoryMappedMissingFile";
+ ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1"));
+ InferenceSessionWrapper session_object{so, GetEnvironment()};
+ auto status = session_object.Load(ORT_TSTR("nonexistent_model.ort"));
+ ASSERT_FALSE(status.IsOK());
+}
+
// regression test for 2 issues covered by PR #17000 (internally reported issue).
// 1) allocation planner broke in minimal build when subgraph had no nodes.
// 2) usage of a sequence data type caused an exception due to IsSparseTensor() throwing
diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc
index fe172acb24a34..3f878ac796ee3 100644
--- a/onnxruntime/test/perftest/command_args_parser.cc
+++ b/onnxruntime/test/perftest/command_args_parser.cc
@@ -187,6 +187,9 @@ ABSL_FLAG(int, spin_backoff_max, 1,
"legacy single-SpinPause behavior. Values >= 2 enable exp-backoff (typical: 4 or 8) to reduce "
"CPU/power density during the spin window. Values above 64 are clamped to 64.");
ABSL_FLAG(bool, n, DefaultPerformanceTestConfig().run_config.exit_after_session_creation, "Allows user to measure session creation time to measure impact of enabling any initialization optimizations.");
+ABSL_FLAG(uint32_t, hold_ms_after_session_creation, DefaultPerformanceTestConfig().run_config.hold_ms_after_session_creation,
+ "When used with -n, keeps the process alive for the specified number of milliseconds after session creation.\n"
+ "Prints 'SESSION_READY' to stdout before sleeping. Useful for multi-process memory measurements.");
ABSL_FLAG(bool, l, DefaultPerformanceTestConfig().model_info.load_via_path, "Provides file as binary in memory by using fopen before session creation.");
ABSL_FLAG(bool, g, DefaultPerformanceTestConfig().run_config.enable_cuda_io_binding, "[TensorRT RTX | TensorRT | CUDA] Enables tensor input and output bindings on CUDA before session run.");
ABSL_FLAG(bool, X, DefaultPerformanceTestConfig().run_config.use_extensions, "Registers custom ops from onnxruntime-extensions.");
@@ -529,6 +532,13 @@ bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int a
// -n
test_config.run_config.exit_after_session_creation = absl::GetFlag(FLAGS_n);
+ // --hold_ms_after_session_creation
+ test_config.run_config.hold_ms_after_session_creation = absl::GetFlag(FLAGS_hold_ms_after_session_creation);
+ if (test_config.run_config.hold_ms_after_session_creation > 0 &&
+ !test_config.run_config.exit_after_session_creation) {
+ fprintf(stderr, "WARNING: --hold_ms_after_session_creation has no effect without -n.\n");
+ }
+
// -l
test_config.model_info.load_via_path = absl::GetFlag(FLAGS_l);
diff --git a/onnxruntime/test/perftest/main.cc b/onnxruntime/test/perftest/main.cc
index 512f217a77151..2c7764507571d 100644
--- a/onnxruntime/test/perftest/main.cc
+++ b/onnxruntime/test/perftest/main.cc
@@ -3,7 +3,10 @@
// onnxruntime dependencies
#include
+#include <chrono>
+#include <iostream>
#include
+#include <thread>
#include "command_args_parser.h"
#include "performance_runner.h"
#include "utils.h"
@@ -127,6 +130,11 @@ int RunPerfTest(Ort::Env& env, const perftest::PerformanceTestConfig& test_confi
// Exit if user enabled -n option so that user can measure session creation time
if (test_config.run_config.exit_after_session_creation) {
perf_runner.LogSessionCreationTime();
+ if (test_config.run_config.hold_ms_after_session_creation > 0) {
+ std::cout << "SESSION_READY" << std::endl;
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(test_config.run_config.hold_ms_after_session_creation));
+ }
return 0;
}
diff --git a/onnxruntime/test/perftest/test_configuration.h b/onnxruntime/test/perftest/test_configuration.h
index 643a4fbc539ec..dea0b196f6bcb 100644
--- a/onnxruntime/test/perftest/test_configuration.h
+++ b/onnxruntime/test/perftest/test_configuration.h
@@ -76,6 +76,7 @@ struct RunConfig {
int spin_backoff_max = 1; // 1 means no backoff (default)
bool spin_backoff_max_set = false;
bool exit_after_session_creation = false;
+ uint32_t hold_ms_after_session_creation{0};
 std::basic_string<ORTCHAR_T> register_custom_op_path;
bool enable_cuda_io_binding{false};
bool use_extensions = false;
diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc
index 924f9da41abef..cf110bd17b211 100644
--- a/onnxruntime/test/platform/file_io_test.cc
+++ b/onnxruntime/test/platform/file_io_test.cc
@@ -151,6 +151,11 @@ TEST(FileIoTest, MapFileIntoMemory) {
// invalid - negative offset
ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK());
+
+ // invalid - requested length exceeds file size
+ auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory);
+ ASSERT_FALSE(status.IsOK());
+ ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos);
}
}
#else
@@ -184,6 +189,11 @@ TEST(FileIoTest, MapFileIntoMemory) {
// invalid - negative offset
ASSERT_STATUS_NOT_OK(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory));
+
+ // invalid - requested length exceeds file size
+ auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory);
+ ASSERT_FALSE(status.IsOK());
+ ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos);
}
}
#endif
diff --git a/tools/python/benchmark_mmap_ort.py b/tools/python/benchmark_mmap_ort.py
new file mode 100644
index 0000000000000..c52dd52976662
--- /dev/null
+++ b/tools/python/benchmark_mmap_ort.py
@@ -0,0 +1,385 @@
+#!/usr/bin/env python3
+"""
+Developer benchmark for memory-mapped .ort model loading.
+
+Compares session construction time and process memory across loading configurations:
+ 1. Standard .ort load (file read into heap buffer)
+ 2. Memory-mapped .ort load (session.use_memory_mapped_ort_model)
+ 3. Memory-mapped + direct initializers (+ session.use_ort_model_bytes_for_initializers)
+
+Not intended for CI gating or official performance measurement.
+
+Usage:
+ python benchmark_mmap_ort.py --perf-test <perf_test_exe> --model <model.ort>
+ python benchmark_mmap_ort.py --perf-test <perf_test_exe> --model <model.ort> --multi-process
+
+Requirements:
+ - Built onnxruntime_perf_test executable (with --hold_ms_after_session_creation support for --multi-process)
+ - .ort model file
+ - psutil package (pip install psutil) for memory measurements
+"""
+
+import argparse
+import json
+import os
+import re
+import statistics
+import subprocess
+import sys
+import time
+
+try:
+ import psutil
+
+ HAS_PSUTIL = True
+except ImportError:
+ HAS_PSUTIL = False
+
+IS_WINDOWS = sys.platform == "win32"
+
+
+def _get_private_and_ws(ps: "psutil.Process") -> tuple[int, int]:
+ """Get private memory and working set for a process.
+
+ On Windows, memory_info() exposes 'private' and 'wset' directly.
+ On POSIX, use memory_full_info().uss for true private (unique set size),
+ falling back to RSS if memory_full_info() is unavailable.
+ """
+ if IS_WINDOWS:
+ mem = ps.memory_info()
+ return getattr(mem, "private", mem.rss), getattr(mem, "wset", mem.rss)
+ # POSIX: prefer USS (unique set size) for accurate private memory
+ try:
+ mem_full = ps.memory_full_info()
+ return mem_full.uss, mem_full.rss
+ except (psutil.AccessDenied, AttributeError):
+ mem = ps.memory_info()
+ return mem.rss, mem.rss
+
+
+# -- Helpers --
+
+
+def parse_perf_test_output(output: str) -> dict:
+ """Parse onnxruntime_perf_test stdout for session creation time."""
+ metrics = {}
+ for key, pattern in {
+ "session_creation_time_s": r"Session creation time cost:\s+([\d.]+)\s+s",
+ "peak_working_set_bytes": r"Peak working set size:\s+(\d+)\s+bytes",
+ }.items():
+ match = re.search(pattern, output)
+ if match:
+ val = match.group(1)
+ metrics[key] = float(val) if "." in val else int(val)
+ return metrics
+
+
+def build_perf_test_cmd(perf_test_exe: str, model_path: str, session_configs: dict) -> list[str]:
+ """Build the onnxruntime_perf_test command line for session-only mode."""
+ cmd = [perf_test_exe]
+ if session_configs:
+ config_str = " ".join(f"{k}|{v}" for k, v in session_configs.items())
+ cmd.extend(["-C", config_str])
+ cmd.append("-n")
+ cmd.append(model_path)
+ return cmd
+
+
+def run_session_benchmark(perf_test_exe: str, model_path: str, session_configs: dict) -> dict:
+ """Run a single session-creation benchmark, capturing timing and memory."""
+ cmd = build_perf_test_cmd(perf_test_exe, model_path, session_configs)
+
+ if HAS_PSUTIL:
+ # Launch and poll memory during execution
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ ps = psutil.Process(proc.pid)
+ peak_private = 0
+ peak_ws = 0
+ try:
+ while proc.poll() is None:
+ try:
+ private, ws = _get_private_and_ws(ps)
+ peak_private = max(peak_private, private)
+ peak_ws = max(peak_ws, ws)
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
+ break
+ time.sleep(0.005)
+ except psutil.NoSuchProcess:
+ pass # process exited during polling, peak already captured
+ try:
+ stdout, _ = proc.communicate(timeout=30)
+ except subprocess.TimeoutExpired:
+ proc.kill()
+ proc.wait()
+ return {}
+ if proc.returncode != 0:
+ return {}
+ metrics = parse_perf_test_output(stdout.decode(errors="replace") if isinstance(stdout, bytes) else stdout)
+ if peak_private > 0:
+ metrics["peak_private_bytes"] = peak_private
+ metrics["peak_working_set_bytes"] = peak_ws
+ return metrics
+
+ # Fallback without psutil: timing only
+ result = subprocess.run(cmd, check=False, capture_output=True, text=True, timeout=300)
+ return parse_perf_test_output(result.stdout) if result.returncode == 0 else {}
+
+
+# -- Single-process benchmark --
+
+
+def run_configuration(
+ perf_test_exe: str,
+ model_path: str,
+ config_name: str,
+ session_configs: dict,
+ num_iterations: int = 10,
+ warmup_iterations: int = 2,
+) -> dict:
+ """Run a configuration multiple times and return aggregated results."""
+ print(f"\n{'=' * 60}")
+ print(f" {config_name}")
+ print(f" Warmup: {warmup_iterations}, Iterations: {num_iterations}")
+ print(f"{'=' * 60}")
+
+ for i in range(warmup_iterations):
+ run_session_benchmark(perf_test_exe, model_path, session_configs)
+ print(f" Warmup {i + 1}: done")
+
+ session_times = []
+ private_samples = []
+ ws_samples = []
+
+ for i in range(num_iterations):
+ metrics = run_session_benchmark(perf_test_exe, model_path, session_configs)
+ if not metrics:
+ print(f" Run {i + 1}: FAILED")
+ continue
+ t = metrics.get("session_creation_time_s", 0) * 1000
+ p = metrics.get("peak_private_bytes", 0) / 1024 / 1024
+ w = metrics.get("peak_working_set_bytes", 0) / 1024 / 1024
+ session_times.append(t)
+ private_samples.append(p)
+ ws_samples.append(w)
+ print(f" Run {i + 1}: session={t:.2f}ms, private={p:.1f}MB, ws={w:.1f}MB")
+
+ result = {"config_name": config_name}
+ if session_times:
+ result["session_ms"] = {
+ "mean": statistics.mean(session_times),
+ "stdev": statistics.stdev(session_times) if len(session_times) > 1 else 0,
+ }
+ if any(p > 0 for p in private_samples):
+ result["private_mb"] = {"mean": statistics.mean(private_samples)}
+ if any(w > 0 for w in ws_samples):
+ result["ws_mb"] = {"mean": statistics.mean(ws_samples)}
+ return result
+
+
+# -- Multi-process benchmark --
+
+
+def run_multi_process_benchmark(
+ perf_test_exe: str,
+ model_path: str,
+ session_configs: dict,
+ num_processes: int = 4,
+ config_name: str = "",
+) -> dict:
+ """Launch N processes with live ORT sessions and measure concurrent memory.
+
+ Requires onnxruntime_perf_test built with --hold_ms_after_session_creation support
+ and psutil for memory measurement.
+ """
+ if not HAS_PSUTIL:
+ print(" WARNING: psutil not installed, skipping multi-process benchmark")
+ return {}
+
+ print(f"\n{'=' * 60}")
+ print(f" {config_name} ({num_processes} processes)")
+ print(f"{'=' * 60}")
+
+ cmd = build_perf_test_cmd(perf_test_exe, model_path, session_configs)
+ cmd.insert(-1, "--hold_ms_after_session_creation=30000") # insert before model_path
+
+ # Launch all processes
+ ps_processes = []
+ try:
+ for i in range(num_processes):
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ try:
+ ps = psutil.Process(proc.pid)
+ except psutil.NoSuchProcess:
+ ps = None
+ ps_processes.append((i, proc, ps))
+ print(f" Started process {i + 1} (PID={proc.pid})")
+
+ # Wait for each process to signal SESSION_READY
+ for i, proc, _ps in ps_processes:
+ for line in proc.stdout:
+ if b"SESSION_READY" in line:
+ print(f" Process {i + 1}: ready")
+ break
+
+ time.sleep(0.5) # stabilization
+
+ # Measure memory (all processes alive with loaded sessions)
+ total_private = 0
+ total_ws = 0
+ per_process = []
+ for i, proc, ps in ps_processes:
+ if ps and proc.poll() is None:
+ try:
+ private, ws = _get_private_and_ws(ps)
+ private_mb = private / 1024 / 1024
+ ws_mb = ws / 1024 / 1024
+ total_private += private_mb
+ total_ws += ws_mb
+ per_process.append({"pid": proc.pid, "private_mb": private_mb, "ws_mb": ws_mb})
+ print(f" Process {i + 1} (PID={proc.pid}): private={private_mb:.1f}MB, ws={ws_mb:.1f}MB")
+ except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
+ print(f" Process {i + 1}: could not read memory ({e})")
+ else:
+ print(f" Process {i + 1}: not running")
+ finally:
+ # Cleanup: ensure all child processes are terminated
+ for _, proc, _ in ps_processes:
+ proc.terminate()
+ for _, proc, _ in ps_processes:
+ try:
+ proc.wait(timeout=10)
+ except subprocess.TimeoutExpired:
+ proc.kill()
+
+ return {
+ "config_name": config_name,
+ "num_processes": num_processes,
+ "total_private_mb": total_private,
+ "total_ws_mb": total_ws,
+ "per_process": per_process,
+ }
+
+
+# -- Output --
+
+
+def print_summary(results: list[dict]):
+ """Print results table with relative comparison."""
+ print(f"\n{'=' * 90}")
+ print("BENCHMARK RESULTS SUMMARY")
+ print(f"{'=' * 90}")
+
+ header = f"{'Configuration':<45} {'Session (ms)':<15} {'Private (MB)':<15} {'WS (MB)':<15}"
+ print(header)
+ print("-" * len(header))
+
+ for r in results:
+ name = r.get("config_name", "?")
+ t = r.get("session_ms", {}).get("mean", 0)
+ p = r.get("private_mb", {}).get("mean", 0)
+ w = r.get("ws_mb", {}).get("mean", 0)
+ print(f"{name:<45} {t:<15.2f} {p:<15.1f} {w:<15.1f}")
+
+ # Relative to .ort standard baseline
+ if len(results) >= 2:
+ baseline = next((r for r in results if r.get("config_name", "").startswith("1.")), results[0])
+ bt = baseline.get("session_ms", {}).get("mean", 0)
+ bp = baseline.get("private_mb", {}).get("mean", 0)
+ print(f"\nRelative to {baseline['config_name']}:")
+ print("-" * 60)
+ for r in results:
+ if r is baseline:
+ continue
+ name = r.get("config_name", "?")
+ rt = r.get("session_ms", {}).get("mean", 0)
+ rp = r.get("private_mb", {}).get("mean", 0)
+ parts = [f" {name}:"]
+ if bt > 0:
+ parts.append(f" Session: {(rt - bt) / bt * 100:+.1f}%")
+ if bp > 0:
+ parts.append(f" Private: {(rp - bp) / bp * 100:+.1f}%")
+ print("\n".join(parts))
+
+
+# -- Main --
+
+
+CONFIGS = [
+ ("1. .ort standard load (baseline)", {}),
+ ("2. .ort memory-mapped load", {"session.use_memory_mapped_ort_model": "1"}),
+ (
+ "3. .ort mmap + direct initializers",
+ {"session.use_memory_mapped_ort_model": "1", "session.use_ort_model_bytes_for_initializers": "1"},
+ ),
+]
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Benchmark memory-mapped .ort model loading")
+ parser.add_argument("--perf-test", required=True, help="Path to onnxruntime_perf_test executable")
+ parser.add_argument("--model", required=True, help="Path to .ort model file")
+ parser.add_argument("--iterations", type=int, default=10, help="Number of measured iterations per config")
+ parser.add_argument("--multi-process", action="store_true", help="Run multi-process memory sharing benchmark")
+ parser.add_argument("--num-processes", type=int, default=4, help="Number of processes for --multi-process")
+ parser.add_argument("--output", help="Save results to JSON file")
+ args = parser.parse_args()
+
+ perf_test = os.path.abspath(args.perf_test)
+ model_path = os.path.abspath(args.model)
+
+ for path, label in [(perf_test, "perf_test"), (model_path, "model")]:
+ if not os.path.exists(path):
+ print(f"ERROR: {label} not found: {path}")
+ sys.exit(1)
+
+ model_size_mb = os.path.getsize(model_path) / 1024 / 1024
+ print(f"\nModel: {os.path.basename(model_path)} ({model_size_mb:.1f} MB)")
+ print(f"Perf test: {perf_test}")
+ print(f"Iterations: {args.iterations}")
+ if not HAS_PSUTIL:
+ print("WARNING: psutil not installed — memory metrics will not be collected")
+
+ # Single-process benchmarks
+ results = []
+ for config_name, session_configs in CONFIGS:
+ results.append(
+ run_configuration(perf_test, model_path, config_name, session_configs, num_iterations=args.iterations)
+ )
+ print_summary(results)
+
+ # Multi-process benchmarks
+ mp_results = []
+ if args.multi_process:
+ for config_name, session_configs in CONFIGS:
+ mp_results.append(
+ run_multi_process_benchmark(
+ perf_test, model_path, session_configs, num_processes=args.num_processes, config_name=config_name
+ )
+ )
+ if mp_results:
+ print(f"\n{'=' * 60}")
+ print("MULTI-PROCESS RESULTS")
+ print(f"{'=' * 60}")
+ for r in mp_results:
+ if r:
+ print(f" {r['config_name']} ({r['num_processes']} processes):")
+ print(f" Total private: {r['total_private_mb']:.1f} MB")
+ print(f" Total WS: {r['total_ws_mb']:.1f} MB")
+
+ if args.output:
+ with open(args.output, "w") as f:
+ json.dump(
+ {
+ "model": os.path.basename(model_path),
+ "model_size_mb": model_size_mb,
+ "single": results,
+ "multi": mp_results,
+ },
+ f,
+ indent=2,
+ )
+ print(f"\nResults saved to: {args.output}")
+
+
+if __name__ == "__main__":
+ main()