diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 001ffe6cc8f92..24557bb81bce3 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -211,13 +211,23 @@ static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "ses /// /// Key for using the ORT format model flatbuffer bytes directly for initializers. /// This avoids copying the bytes and reduces peak memory usage during model loading and initialization. -/// Requires `session.use_ort_model_bytes_directly` to be true. +/// Requires `session.use_ort_model_bytes_directly` or `session.use_memory_mapped_ort_model` to be true. /// If set, the flatbuffer bytes provided when creating the InferenceSession MUST remain valid for the entire /// duration of the InferenceSession. /// static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers = "session.use_ort_model_bytes_for_initializers"; +/// +/// Key for using memory-mapped I/O to load ORT format model files. +/// When set to "1" and the session is created from a file path, ORT will use memory-mapped I/O +/// to load the .ort model file instead of reading it into a heap-allocated buffer. +/// When combined with session.use_ort_model_bytes_for_initializers, initializer Tensors will point directly into the mapped bytes; +/// the mapping must then remain valid for the lifetime of the InferenceSession, and the model weights are immutable. +/// If the memory mapping fails, the model load fails; any fallback to non-mapped loading must be handled by the caller. +/// +static const char* const kOrtSessionOptionsConfigUseMemoryMappedOrtModel = "session.use_memory_mapped_ort_model"; + // This should only be specified when exporting an ORT format model for use on a different platform. // If the ORT format model will be used on ARM platforms set to "1". 
For other platforms set to "0" // Available since version 1.11. diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc index aeddef0c5188f..8faec1423cd3c 100644 --- a/onnxruntime/core/platform/posix/env.cc +++ b/onnxruntime/core/platform/posix/env.cc @@ -54,6 +54,7 @@ limitations under the License. #include #include "core/common/logging/logging.h" #include "core/common/narrow.h" +#include "core/common/safeint.h" #include "core/platform/scoped_resource.h" #include "core/platform/EigenNonBlockingThreadPool.h" @@ -430,9 +431,21 @@ class PosixEnv : public Env { return Status::OK(); } + // Validate that the file is large enough for the requested mapping. + struct stat file_stat; + if (fstat(file_descriptor.Get(), &file_stat) != 0) { + return ReportSystemError("fstat", file_path); + } + const size_t requested_end = SafeInt(offset) + length; + ORT_RETURN_IF(static_cast(file_stat.st_size) < requested_end, + "File \"", file_path, + "\" is too small for the requested mapping (file size: ", + file_stat.st_size, " bytes, requested offset + length: ", + requested_end, " bytes)."); + static const size_t page_size = narrow(sysconf(_SC_PAGESIZE)); const FileOffsetType offset_to_page = offset % static_cast(page_size); - const size_t mapped_length = length + static_cast(offset_to_page); + const size_t mapped_length = SafeInt(length) + static_cast(offset_to_page); const FileOffsetType mapped_offset = offset - offset_to_page; void* const mapped_base = mmap(nullptr, mapped_length, PROT_READ | PROT_WRITE, MAP_PRIVATE, file_descriptor.Get(), mapped_offset); diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 91255f6ed7376..4d80b5afff4b8 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -424,6 +424,22 @@ Status WindowsEnv::MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path, " - ", std::system_category().message(error_code)); } + // Validate that 
the file is large enough for the requested mapping. + LARGE_INTEGER actual_size; + if (!GetFileSizeEx(file_handle.get(), &actual_size)) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "GetFileSizeEx ", ToUTF8String(Basename(file_path)), + " fail, errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } + const size_t requested_end = SafeInt(offset) + length; + ORT_RETURN_IF(static_cast(actual_size.QuadPart) < requested_end, + "File ", ToUTF8String(Basename(file_path)), + " is too small for the requested mapping (file size: ", + actual_size.QuadPart, " bytes, requested offset + length: ", + requested_end, " bytes)."); + wil::unique_hfile file_mapping_handle{ CreateFileMappingW(file_handle.get(), nullptr, diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 5b5b38176d357..f6f0d5d80b9e9 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1747,10 +1747,36 @@ static Status LoadOrtModelBytes(const PathString& model_uri, return Status::OK(); } +static Status LoadOrtModelBytesMapped(const PathString& model_uri, + gsl::span& bytes, + Env::MappedMemoryPtr& mapped_memory) { + size_t num_bytes = 0; + ORT_RETURN_IF_ERROR(Env::Default().GetFileLength(model_uri.c_str(), num_bytes)); + ORT_RETURN_IF(num_bytes == 0, "Cannot memory-map an empty file: ", ToUTF8String(model_uri)); + + ORT_RETURN_IF_ERROR(Env::Default().MapFileIntoMemory(model_uri.c_str(), 0, num_bytes, mapped_memory)); + + bytes = gsl::span(reinterpret_cast(mapped_memory.get()), num_bytes); + + return Status::OK(); +} + Status InferenceSession::LoadOrtModel(const PathString& model_uri) { return LoadOrtModelWithLoader( [&]() { model_location_ = model_uri; + + const auto& config_options = GetSessionOptions().config_options; + const bool use_mmap = + config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, 
"0") == "1"; + + if (use_mmap) { + ORT_RETURN_IF_ERROR( + LoadOrtModelBytesMapped(model_location_, ort_format_model_bytes_, ort_format_model_mapped_memory_)); + LOGS(*session_logger_, INFO) << "ORT model loaded via memory-mapped I/O."; + return Status::OK(); + } + ORT_RETURN_IF_ERROR( LoadOrtModelBytes(model_location_, ort_format_model_bytes_, ort_format_model_bytes_data_holder_)); return Status::OK(); @@ -1760,6 +1786,11 @@ Status InferenceSession::LoadOrtModel(const PathString& model_uri) { Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len) { return LoadOrtModelWithLoader([&]() { const auto& config_options = GetSessionOptions().config_options; + + if (config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1") { + LOGS(*session_logger_, WARNING) << "session.use_memory_mapped_ort_model is ignored when loading from a buffer."; + } + const auto use_ort_model_bytes_directly = config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "0") == "1"; @@ -1858,8 +1889,8 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function load_ort ORT_RETURN_IF(nullptr == fbs_model, "Missing Model. Invalid ORT format model."); // if we're using the bytes directly because kOrtSessionOptionsConfigUseORTModelBytesDirectly was set and the user - // provided an existing buffer of bytes when creating the InferenceSession, ort_format_model_bytes_data_holder_ - // will be empty. + // provided an existing buffer of bytes when creating the InferenceSession, or because we memory-mapped the file, + // ort_format_model_bytes_data_holder_ will be empty. // if that is the case we also allow creating initializers that directly use those bytes. 
const auto& config_options = session_options_.config_options; using_ort_model_bytes_for_initializers_ = @@ -2681,6 +2712,7 @@ common::Status InferenceSession::Initialize() { if (!using_ort_model_bytes_for_initializers_) { ort_format_model_bytes_ = gsl::span(); std::vector().swap(ort_format_model_bytes_data_holder_); + ort_format_model_mapped_memory_.reset(); } // once the model is saved, we may remove unnecessary attributes for inference diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index 705e420eb1137..1d2acb08241d3 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -15,6 +15,7 @@ #include "core/common/path_string.h" #include "core/common/profiler.h" #include "core/common/status.h" +#include "core/platform/env.h" #include "core/framework/execution_providers.h" #include "core/framework/framework_common.h" #include "core/framework/iexecutor.h" @@ -1025,6 +1026,8 @@ class InferenceSession { // We store them currently in the ort_format_model_bytes_data_holder_ to make the Load + Initialize // behave the same way as for an ONNX model, as we need some of the bytes for the Load (create the Model) // and some for the Initialize (create SessionState). + // If "session.use_memory_mapped_ort_model" is set, we memory-map the file instead and store the + // mapping in ort_format_model_mapped_memory_. // Short term we free them after Initialize. // Longer term we may want to directly refer to offsets in this buffer for initializers so we don't need to copy // those into new OrtValue instances, at which point we won't free them until the InferenceSession goes away. 
@@ -1033,9 +1036,13 @@ class InferenceSession { // This holds the actual model data // In case if the session is started with an input byte array contains model data, and the caller // specifies that ORT should use the model bytes directly by setting the session config option - // "session.use_ort_model_bytes_directly" to "1", this will be empty + // "session.use_ort_model_bytes_directly" to "1", this will be empty. + // Also empty when using memory-mapped loading, as the data is held by ort_format_model_mapped_memory_. std::vector ort_format_model_bytes_data_holder_; + // Holds the memory-mapped file data when session.use_memory_mapped_ort_model is set. + Env::MappedMemoryPtr ort_format_model_mapped_memory_; + bool using_ort_model_bytes_for_initializers_{false}; // Container to store pre-packed weights to share between sessions. diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc index 84e85c7bba7ee..ec4f8967fd2a3 100644 --- a/onnxruntime/test/framework/ort_model_only_test.cc +++ b/onnxruntime/test/framework/ort_model_only_test.cc @@ -37,6 +37,7 @@ struct OrtModelTestInfo { bool run_use_buffer{false}; bool disable_copy_ort_buffer{false}; bool use_buffer_for_initializers{false}; + bool use_memory_mapped_load{false}; TransformerLevel optimization_level = TransformerLevel::Level3; }; @@ -49,10 +50,15 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) { if (test_info.disable_copy_ort_buffer) { ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "1")); + } - if (test_info.use_buffer_for_initializers) { - ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1")); - } + if (test_info.use_memory_mapped_load) { + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1")); + } + + if (test_info.use_buffer_for_initializers && + 
(test_info.disable_copy_ort_buffer || (test_info.use_memory_mapped_load && !test_info.run_use_buffer))) { + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1")); } so.graph_optimization_level = test_info.optimization_level; @@ -557,6 +563,31 @@ TEST(OrtModelOnlyTests, LoadOrtFormatModelFromBufferNoCopyInitializersUseBuffer) RunOrtModel(test_info); } +// Load the model from a file using memory-mapped I/O +TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMapped) { + OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel(); + test_info.use_memory_mapped_load = true; + RunOrtModel(test_info); +} + +// Load the model from a file using memory-mapped I/O, with initializers referencing the mapped bytes +TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedWithInitializersFromMap) { + OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel(); + test_info.use_memory_mapped_load = true; + test_info.use_buffer_for_initializers = true; + RunOrtModel(test_info); +} + +// Verify that mmap loading fails gracefully on a non-existent file +TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedFailsOnMissingFile) { + SessionOptions so; + so.session_logid = "MemoryMappedMissingFile"; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1")); + InferenceSessionWrapper session_object{so, GetEnvironment()}; + auto status = session_object.Load(ORT_TSTR("nonexistent_model.ort")); + ASSERT_FALSE(status.IsOK()); +} + // regression test for 2 issues covered by PR #17000 (internally reported issue). // 1) allocation planner broke in minimal build when subgraph had no nodes. 
// 2) usage of a sequence data type caused an exception due to IsSparseTensor() throwing diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index fe172acb24a34..3f878ac796ee3 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -187,6 +187,9 @@ ABSL_FLAG(int, spin_backoff_max, 1, "legacy single-SpinPause behavior. Values >= 2 enable exp-backoff (typical: 4 or 8) to reduce " "CPU/power density during the spin window. Values above 64 are clamped to 64."); ABSL_FLAG(bool, n, DefaultPerformanceTestConfig().run_config.exit_after_session_creation, "Allows user to measure session creation time to measure impact of enabling any initialization optimizations."); +ABSL_FLAG(uint32_t, hold_ms_after_session_creation, DefaultPerformanceTestConfig().run_config.hold_ms_after_session_creation, + "When used with -n, keeps the process alive for the specified number of milliseconds after session creation.\n" + "Prints 'SESSION_READY' to stdout before sleeping. 
Useful for multi-process memory measurements."); ABSL_FLAG(bool, l, DefaultPerformanceTestConfig().model_info.load_via_path, "Provides file as binary in memory by using fopen before session creation."); ABSL_FLAG(bool, g, DefaultPerformanceTestConfig().run_config.enable_cuda_io_binding, "[TensorRT RTX | TensorRT | CUDA] Enables tensor input and output bindings on CUDA before session run."); ABSL_FLAG(bool, X, DefaultPerformanceTestConfig().run_config.use_extensions, "Registers custom ops from onnxruntime-extensions."); @@ -529,6 +532,13 @@ bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int a // -n test_config.run_config.exit_after_session_creation = absl::GetFlag(FLAGS_n); + // --hold_ms_after_session_creation + test_config.run_config.hold_ms_after_session_creation = absl::GetFlag(FLAGS_hold_ms_after_session_creation); + if (test_config.run_config.hold_ms_after_session_creation > 0 && + !test_config.run_config.exit_after_session_creation) { + fprintf(stderr, "WARNING: --hold_ms_after_session_creation has no effect without -n.\n"); + } + // -l test_config.model_info.load_via_path = absl::GetFlag(FLAGS_l); diff --git a/onnxruntime/test/perftest/main.cc b/onnxruntime/test/perftest/main.cc index 512f217a77151..2c7764507571d 100644 --- a/onnxruntime/test/perftest/main.cc +++ b/onnxruntime/test/perftest/main.cc @@ -3,7 +3,10 @@ // onnxruntime dependencies #include +#include +#include #include +#include #include "command_args_parser.h" #include "performance_runner.h" #include "utils.h" @@ -127,6 +130,11 @@ int RunPerfTest(Ort::Env& env, const perftest::PerformanceTestConfig& test_confi // Exit if user enabled -n option so that user can measure session creation time if (test_config.run_config.exit_after_session_creation) { perf_runner.LogSessionCreationTime(); + if (test_config.run_config.hold_ms_after_session_creation > 0) { + std::cout << "SESSION_READY" << std::endl; + std::this_thread::sleep_for( + 
std::chrono::milliseconds(test_config.run_config.hold_ms_after_session_creation)); + } return 0; } diff --git a/onnxruntime/test/perftest/test_configuration.h b/onnxruntime/test/perftest/test_configuration.h index 643a4fbc539ec..dea0b196f6bcb 100644 --- a/onnxruntime/test/perftest/test_configuration.h +++ b/onnxruntime/test/perftest/test_configuration.h @@ -76,6 +76,7 @@ struct RunConfig { int spin_backoff_max = 1; // 1 means no backoff (default) bool spin_backoff_max_set = false; bool exit_after_session_creation = false; + uint32_t hold_ms_after_session_creation{0}; std::basic_string register_custom_op_path; bool enable_cuda_io_binding{false}; bool use_extensions = false; diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc index 924f9da41abef..cf110bd17b211 100644 --- a/onnxruntime/test/platform/file_io_test.cc +++ b/onnxruntime/test/platform/file_io_test.cc @@ -151,6 +151,11 @@ TEST(FileIoTest, MapFileIntoMemory) { // invalid - negative offset ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK()); + + // invalid - requested length exceeds file size + auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory); + ASSERT_FALSE(status.IsOK()); + ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos); } } #else @@ -184,6 +189,11 @@ TEST(FileIoTest, MapFileIntoMemory) { // invalid - negative offset ASSERT_STATUS_NOT_OK(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory)); + + // invalid - requested length exceeds file size + auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory); + ASSERT_FALSE(status.IsOK()); + ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos); } } #endif diff --git a/tools/python/benchmark_mmap_ort.py b/tools/python/benchmark_mmap_ort.py new file mode 
100644 index 0000000000000..c52dd52976662 --- /dev/null +++ b/tools/python/benchmark_mmap_ort.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python3 +""" +Developer benchmark for memory-mapped .ort model loading. + +Compares session construction time and process memory across loading configurations: + 1. Standard .ort load (file read into heap buffer) + 2. Memory-mapped .ort load (session.use_memory_mapped_ort_model) + 3. Memory-mapped + direct initializers (+ session.use_ort_model_bytes_for_initializers) + +Not intended for CI gating or official performance measurement. + +Usage: + python benchmark_mmap_ort.py --perf-test --model + python benchmark_mmap_ort.py --perf-test --model --multi-process + +Requirements: + - Built onnxruntime_perf_test executable (with --hold_ms_after_session_creation support for --multi-process) + - .ort model file + - psutil package (pip install psutil) for memory measurements +""" + +import argparse +import json +import os +import re +import statistics +import subprocess +import sys +import time + +try: + import psutil + + HAS_PSUTIL = True +except ImportError: + HAS_PSUTIL = False + +IS_WINDOWS = sys.platform == "win32" + + +def _get_private_and_ws(ps: "psutil.Process") -> tuple[int, int]: + """Get private memory and working set for a process. + + On Windows, memory_info() exposes 'private' and 'wset' directly. + On POSIX, use memory_full_info().uss for true private (unique set size), + falling back to RSS if memory_full_info() is unavailable. 
+ """ + if IS_WINDOWS: + mem = ps.memory_info() + return getattr(mem, "private", mem.rss), getattr(mem, "wset", mem.rss) + # POSIX: prefer USS (unique set size) for accurate private memory + try: + mem_full = ps.memory_full_info() + return mem_full.uss, mem_full.rss + except (psutil.AccessDenied, AttributeError): + mem = ps.memory_info() + return mem.rss, mem.rss + + +# -- Helpers -- + + +def parse_perf_test_output(output: str) -> dict: + """Parse onnxruntime_perf_test stdout for session creation time.""" + metrics = {} + for key, pattern in { + "session_creation_time_s": r"Session creation time cost:\s+([\d.]+)\s+s", + "peak_working_set_bytes": r"Peak working set size:\s+(\d+)\s+bytes", + }.items(): + match = re.search(pattern, output) + if match: + val = match.group(1) + metrics[key] = float(val) if "." in val else int(val) + return metrics + + +def build_perf_test_cmd(perf_test_exe: str, model_path: str, session_configs: dict) -> list[str]: + """Build the onnxruntime_perf_test command line for session-only mode.""" + cmd = [perf_test_exe] + if session_configs: + config_str = " ".join(f"{k}|{v}" for k, v in session_configs.items()) + cmd.extend(["-C", config_str]) + cmd.append("-n") + cmd.append(model_path) + return cmd + + +def run_session_benchmark(perf_test_exe: str, model_path: str, session_configs: dict) -> dict: + """Run a single session-creation benchmark, capturing timing and memory.""" + cmd = build_perf_test_cmd(perf_test_exe, model_path, session_configs) + + if HAS_PSUTIL: + # Launch and poll memory during execution + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ps = psutil.Process(proc.pid) + peak_private = 0 + peak_ws = 0 + try: + while proc.poll() is None: + try: + private, ws = _get_private_and_ws(ps) + peak_private = max(peak_private, private) + peak_ws = max(peak_ws, ws) + except (psutil.NoSuchProcess, psutil.AccessDenied): + break + time.sleep(0.005) + except psutil.NoSuchProcess: + pass # process exited during 
polling, peak already captured + try: + stdout, _ = proc.communicate(timeout=30) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + return {} + if proc.returncode != 0: + return {} + metrics = parse_perf_test_output(stdout.decode(errors="replace") if isinstance(stdout, bytes) else stdout) + if peak_private > 0: + metrics["peak_private_bytes"] = peak_private + metrics["peak_working_set_bytes"] = peak_ws + return metrics + + # Fallback without psutil: timing only + result = subprocess.run(cmd, check=False, capture_output=True, text=True, timeout=300) + return parse_perf_test_output(result.stdout) if result.returncode == 0 else {} + + +# -- Single-process benchmark -- + + +def run_configuration( + perf_test_exe: str, + model_path: str, + config_name: str, + session_configs: dict, + num_iterations: int = 10, + warmup_iterations: int = 2, +) -> dict: + """Run a configuration multiple times and return aggregated results.""" + print(f"\n{'=' * 60}") + print(f" {config_name}") + print(f" Warmup: {warmup_iterations}, Iterations: {num_iterations}") + print(f"{'=' * 60}") + + for i in range(warmup_iterations): + run_session_benchmark(perf_test_exe, model_path, session_configs) + print(f" Warmup {i + 1}: done") + + session_times = [] + private_samples = [] + ws_samples = [] + + for i in range(num_iterations): + metrics = run_session_benchmark(perf_test_exe, model_path, session_configs) + if not metrics: + print(f" Run {i + 1}: FAILED") + continue + t = metrics.get("session_creation_time_s", 0) * 1000 + p = metrics.get("peak_private_bytes", 0) / 1024 / 1024 + w = metrics.get("peak_working_set_bytes", 0) / 1024 / 1024 + session_times.append(t) + private_samples.append(p) + ws_samples.append(w) + print(f" Run {i + 1}: session={t:.2f}ms, private={p:.1f}MB, ws={w:.1f}MB") + + result = {"config_name": config_name} + if session_times: + result["session_ms"] = { + "mean": statistics.mean(session_times), + "stdev": statistics.stdev(session_times) if len(session_times) > 
1 else 0, + } + if any(p > 0 for p in private_samples): + result["private_mb"] = {"mean": statistics.mean(private_samples)} + if any(w > 0 for w in ws_samples): + result["ws_mb"] = {"mean": statistics.mean(ws_samples)} + return result + + +# -- Multi-process benchmark -- + + +def run_multi_process_benchmark( + perf_test_exe: str, + model_path: str, + session_configs: dict, + num_processes: int = 4, + config_name: str = "", +) -> dict: + """Launch N processes with live ORT sessions and measure concurrent memory. + + Requires onnxruntime_perf_test built with --hold_ms_after_session_creation support + and psutil for memory measurement. + """ + if not HAS_PSUTIL: + print(" WARNING: psutil not installed, skipping multi-process benchmark") + return {} + + print(f"\n{'=' * 60}") + print(f" {config_name} ({num_processes} processes)") + print(f"{'=' * 60}") + + cmd = build_perf_test_cmd(perf_test_exe, model_path, session_configs) + cmd.insert(-1, "--hold_ms_after_session_creation=30000") # insert before model_path + + # Launch all processes + ps_processes = [] + try: + for i in range(num_processes): + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + try: + ps = psutil.Process(proc.pid) + except psutil.NoSuchProcess: + ps = None + ps_processes.append((i, proc, ps)) + print(f" Started process {i + 1} (PID={proc.pid})") + + # Wait for each process to signal SESSION_READY + for i, proc, _ps in ps_processes: + for line in proc.stdout: + if b"SESSION_READY" in line: + print(f" Process {i + 1}: ready") + break + + time.sleep(0.5) # stabilization + + # Measure memory (all processes alive with loaded sessions) + total_private = 0 + total_ws = 0 + per_process = [] + for i, proc, ps in ps_processes: + if ps and proc.poll() is None: + try: + private, ws = _get_private_and_ws(ps) + private_mb = private / 1024 / 1024 + ws_mb = ws / 1024 / 1024 + total_private += private_mb + total_ws += ws_mb + per_process.append({"pid": proc.pid, "private_mb": private_mb, 
"ws_mb": ws_mb}) + print(f" Process {i + 1} (PID={proc.pid}): private={private_mb:.1f}MB, ws={ws_mb:.1f}MB") + except (psutil.NoSuchProcess, psutil.AccessDenied) as e: + print(f" Process {i + 1}: could not read memory ({e})") + else: + print(f" Process {i + 1}: not running") + finally: + # Cleanup: ensure all child processes are terminated + for _, proc, _ in ps_processes: + proc.terminate() + for _, proc, _ in ps_processes: + try: + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + proc.kill() + + return { + "config_name": config_name, + "num_processes": num_processes, + "total_private_mb": total_private, + "total_ws_mb": total_ws, + "per_process": per_process, + } + + +# -- Output -- + + +def print_summary(results: list[dict]): + """Print results table with relative comparison.""" + print(f"\n{'=' * 90}") + print("BENCHMARK RESULTS SUMMARY") + print(f"{'=' * 90}") + + header = f"{'Configuration':<45} {'Session (ms)':<15} {'Private (MB)':<15} {'WS (MB)':<15}" + print(header) + print("-" * len(header)) + + for r in results: + name = r.get("config_name", "?") + t = r.get("session_ms", {}).get("mean", 0) + p = r.get("private_mb", {}).get("mean", 0) + w = r.get("ws_mb", {}).get("mean", 0) + print(f"{name:<45} {t:<15.2f} {p:<15.1f} {w:<15.1f}") + + # Relative to .ort standard baseline + if len(results) >= 2: + baseline = next((r for r in results if r.get("config_name", "").startswith("1.")), results[0]) + bt = baseline.get("session_ms", {}).get("mean", 0) + bp = baseline.get("private_mb", {}).get("mean", 0) + print(f"\nRelative to {baseline['config_name']}:") + print("-" * 60) + for r in results: + if r is baseline: + continue + name = r.get("config_name", "?") + rt = r.get("session_ms", {}).get("mean", 0) + rp = r.get("private_mb", {}).get("mean", 0) + parts = [f" {name}:"] + if bt > 0: + parts.append(f" Session: {(rt - bt) / bt * 100:+.1f}%") + if bp > 0: + parts.append(f" Private: {(rp - bp) / bp * 100:+.1f}%") + print("\n".join(parts)) + + +# -- Main -- + 
+ +CONFIGS = [ + ("1. .ort standard load (baseline)", {}), + ("2. .ort memory-mapped load", {"session.use_memory_mapped_ort_model": "1"}), + ( + "3. .ort mmap + direct initializers", + {"session.use_memory_mapped_ort_model": "1", "session.use_ort_model_bytes_for_initializers": "1"}, + ), +] + + +def main(): + parser = argparse.ArgumentParser(description="Benchmark memory-mapped .ort model loading") + parser.add_argument("--perf-test", required=True, help="Path to onnxruntime_perf_test executable") + parser.add_argument("--model", required=True, help="Path to .ort model file") + parser.add_argument("--iterations", type=int, default=10, help="Number of measured iterations per config") + parser.add_argument("--multi-process", action="store_true", help="Run multi-process memory sharing benchmark") + parser.add_argument("--num-processes", type=int, default=4, help="Number of processes for --multi-process") + parser.add_argument("--output", help="Save results to JSON file") + args = parser.parse_args() + + perf_test = os.path.abspath(args.perf_test) + model_path = os.path.abspath(args.model) + + for path, label in [(perf_test, "perf_test"), (model_path, "model")]: + if not os.path.exists(path): + print(f"ERROR: {label} not found: {path}") + sys.exit(1) + + model_size_mb = os.path.getsize(model_path) / 1024 / 1024 + print(f"\nModel: {os.path.basename(model_path)} ({model_size_mb:.1f} MB)") + print(f"Perf test: {perf_test}") + print(f"Iterations: {args.iterations}") + if not HAS_PSUTIL: + print("WARNING: psutil not installed — memory metrics will not be collected") + + # Single-process benchmarks + results = [] + for config_name, session_configs in CONFIGS: + results.append( + run_configuration(perf_test, model_path, config_name, session_configs, num_iterations=args.iterations) + ) + print_summary(results) + + # Multi-process benchmarks + mp_results = [] + if args.multi_process: + for config_name, session_configs in CONFIGS: + mp_results.append( + 
run_multi_process_benchmark( + perf_test, model_path, session_configs, num_processes=args.num_processes, config_name=config_name + ) + ) + if mp_results: + print(f"\n{'=' * 60}") + print("MULTI-PROCESS RESULTS") + print(f"{'=' * 60}") + for r in mp_results: + if r: + print(f" {r['config_name']} ({r['num_processes']} processes):") + print(f" Total private: {r['total_private_mb']:.1f} MB") + print(f" Total WS: {r['total_ws_mb']:.1f} MB") + + if args.output: + with open(args.output, "w") as f: + json.dump( + { + "model": os.path.basename(model_path), + "model_size_mb": model_size_mb, + "single": results, + "multi": mp_results, + }, + f, + indent=2, + ) + print(f"\nResults saved to: {args.output}") + + +if __name__ == "__main__": + main()