Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 30 additions & 4 deletions onnxruntime/core/framework/tensorprotoutils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -329,22 +329,48 @@ Status TensorProtoWithExternalDataToTensorProto(
}

Status ValidateExternalDataPath(const std::filesystem::path& base_dir,
const std::filesystem::path& location) {
const std::filesystem::path& location,
const std::filesystem::path& model_path) {
// Reject absolute paths
ORT_RETURN_IF(location.is_absolute(),
"Absolute paths not allowed for external data location");
if (!base_dir.empty()) {
// Resolve and verify the path stays within model directory
auto base_canonical = std::filesystem::weakly_canonical(base_dir);
// If the symlink exists, it resolves to the target path;
// so if the symllink is outside the directory it would be caught here.
// so if the symlink is outside the directory it would be caught here.
auto resolved = std::filesystem::weakly_canonical(base_dir / location);

// Check that resolved path starts with base directory
auto [base_end, resolved_it] = std::mismatch(
base_canonical.begin(), base_canonical.end(),
resolved.begin(), resolved.end());
ORT_RETURN_IF(base_end != base_canonical.end(),
"External data path: ", location, " escapes model directory: ", base_dir);

if (base_end != base_canonical.end()) {
// If validation against logical base_dir fails, we check against the
// real (canonical) path of the model file to support symlinked models
// (e.g. models in Hugging Face Hub local cache).
if (!model_path.empty()) {
auto real_model_dir = std::filesystem::weakly_canonical(model_path).parent_path();

auto [real_base_end, real_resolved_it] = std::mismatch(
real_model_dir.begin(), real_model_dir.end(),
resolved.begin(), resolved.end());

if (real_base_end == real_model_dir.end()) {
return Status::OK();
}

return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
"External data path: ", location, " (resolved path: ", resolved,
") escapes both model directory: ", base_dir,
" and real model directory: ", real_model_dir);
}

return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
"External data path: ", location, " (resolved path: ", resolved,
") escapes model directory: ", base_dir);
}
}
return Status::OK();
}
Expand Down
17 changes: 10 additions & 7 deletions onnxruntime/core/framework/tensorprotoutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -526,16 +526,19 @@ Status TensorProtoWithExternalDataToTensorProto(
ONNX_NAMESPACE::TensorProto& new_tensor_proto);

/// <summary>
/// The functions will make sure the 'location' specified in the external data is under the 'base_dir'.
/// Validates if the external data path is under the model directory.
/// If the model is a symlink, it checks against both the logical model directory (base_dir)
/// and the real/canonical directory of the model.
/// If the `base_dir` is empty, the function only ensures that `location` is not an absolute path.
/// </summary>
/// <param name="base_dir">model location directory</param>
/// <param name="location">location is a string retrieved from TensorProto external data that is not
/// an in-memory tag</param>
/// <returns>The function will fail if the resolved full path is not under the model directory
/// or one of the subdirectories</returns>
/// <param name="base_dir">Logical model location directory</param>
/// <param name="location">Location string retrieved from TensorProto external data</param>
/// <param name="model_path">Optional path to the model file, used for canonical path validation if base_dir check fails</param>
/// <returns>The function will fail if the resolved full path is not under the logical model directory
/// nor the real directory of the model path</returns>
Status ValidateExternalDataPath(const std::filesystem::path& base_dir,
const std::filesystem::path& location);
const std::filesystem::path& location,
const std::filesystem::path& model_path = {});

#endif // !defined(SHARED_PROVIDER)

Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/graph/graph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3771,7 +3771,7 @@ Status Graph::ConvertInitializersIntoOrtValues() {
std::unique_ptr<onnxruntime::ExternalDataInfo> external_data_info;
ORT_RETURN_IF_ERROR(onnxruntime::ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info));
const auto& location = external_data_info->GetRelPath();
auto st = utils::ValidateExternalDataPath(model_dir, location);
auto st = utils::ValidateExternalDataPath(model_dir, location, model_path);
if (!st.IsOK()) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
"External data path validation failed for initializer: ", tensor_proto.name(),
Expand Down
5 changes: 0 additions & 5 deletions onnxruntime/core/providers/shared_library/provider_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -453,11 +453,6 @@ inline bool HasExternalDataInMemory(const ONNX_NAMESPACE::TensorProto& ten_proto
return g_host->Utils__HasExternalDataInMemory(ten_proto);
}

inline Status ValidateExternalDataPath(const std::filesystem::path& base_dir,
const std::filesystem::path& location) {
return g_host->Utils__ValidateExternalDataPath(base_dir, location);
}

} // namespace utils

namespace graph_utils {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1004,9 +1004,6 @@ struct ProviderHost {

virtual bool Utils__HasExternalDataInMemory(const ONNX_NAMESPACE::TensorProto& ten_proto) = 0;

virtual Status Utils__ValidateExternalDataPath(const std::filesystem::path& base_path,
const std::filesystem::path& location) = 0;

// Model
virtual std::unique_ptr<Model> Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path,
const IOnnxRuntimeOpSchemaRegistryList* local_registries,
Expand Down
5 changes: 0 additions & 5 deletions onnxruntime/core/session/provider_bridge_ort.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1295,11 +1295,6 @@ struct ProviderHostImpl : ProviderHost {
return onnxruntime::utils::HasExternalDataInMemory(ten_proto);
}

Status Utils__ValidateExternalDataPath(const std::filesystem::path& base_path,
const std::filesystem::path& location) override {
return onnxruntime::utils::ValidateExternalDataPath(base_path, location);
}

// Model (wrapped)
std::unique_ptr<Model> Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path,
const IOnnxRuntimeOpSchemaRegistryList* local_registries,
Expand Down
250 changes: 250 additions & 0 deletions onnxruntime/test/python/onnxruntime_test_python_symlink_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import os
import shutil
import struct
import tempfile
import unittest

import numpy as np
from onnx import TensorProto, helper, save

import onnxruntime as ort


class TestSymLinkOnnxModelExternalData(unittest.TestCase):
def test_symlink_model_and_data_under_same_directory(self):
# The following directory structure simulates huggingface hub local cache:
# temp_dir/ (This corresponds to .cache/huggingface/hub/model_id/)
# blobs/
# guid1
# guid2
# snapshots/version/
# model.onnx -> ../../blobs/guid1
# data.bin -> ../../blobs/guid2

self.temp_dir = tempfile.mkdtemp()
try:
blobs_dir = os.path.join(self.temp_dir, "blobs")
os.makedirs(blobs_dir)

snapshots_dir = os.path.join(self.temp_dir, "snapshots", "version")
os.makedirs(snapshots_dir)

# Create real files in blobs
# We'll use the helper to create the model, but we need to control where files end up.
# Let's manually create the data file in blobs
data_blob_path = os.path.join(blobs_dir, "guid2")
vals = [float(i) for i in range(10)]
with open(data_blob_path, "wb") as f:
f.writelines(struct.pack("f", v) for v in vals)

# Create model in blobs (referencing "data.bin" as external data)
# When loaded from snapshots/version/model.onnx, ORT looks for snapshots/version/data.bin

input_ = helper.make_tensor_value_info("input", TensorProto.FLOAT, [10])
output = helper.make_tensor_value_info("output", TensorProto.FLOAT, [10])
tensor = helper.make_tensor("external_data", TensorProto.FLOAT, [10], vals)
tensor.data_location = TensorProto.EXTERNAL
tensor.ClearField("float_data")
tensor.ClearField("raw_data")

k = tensor.external_data.add()
k.key = "location"
k.value = "data.bin" # Relative path

offset = tensor.external_data.add()
offset.key = "offset"
offset.value = "0"

length = tensor.external_data.add()
length.key = "length"
length.value = str(len(vals) * 4)

const_node = helper.make_node("Constant", [], ["const_out"], value=tensor)
add_node = helper.make_node("Add", ["input", "const_out"], ["output"])
graph = helper.make_graph([const_node, add_node], "test_graph", [input_], [output])
model = helper.make_model(graph)

model_blob_path = os.path.join(blobs_dir, "guid1")
save(model, model_blob_path)

# Now create symlinks in snapshots
model_symlink_path = os.path.join(snapshots_dir, "model.onnx")
data_symlink_path = os.path.join(snapshots_dir, "data.bin")

try:
os.symlink(model_blob_path, model_symlink_path)
os.symlink(data_blob_path, data_symlink_path)
except (OSError, NotImplementedError) as e:
self.skipTest(f"Skipping symlink test: symlink creation is not supported in this environment: {e}")

sess = ort.InferenceSession(model_symlink_path, providers=["CPUExecutionProvider"])

input_data = np.zeros(10, dtype=np.float32)
res = sess.run(["output"], {"input": input_data})
expected = np.array([float(i) for i in range(10)], dtype=np.float32)
np.testing.assert_allclose(res[0], expected)

finally:
shutil.rmtree(self.temp_dir)

def test_symlink_with_data_in_model_sub_dir(self):
# working directory structure (data is in model sub directory):
# temp_dir/
# blobs/
# guid1
# data/guid2
# snapshots/version/
# model.onnx -> ../../blobs/guid1
# data.bin -> ../../blobs/data/guid2

self.temp_dir = tempfile.mkdtemp()
try:
blobs_dir = os.path.join(self.temp_dir, "blobs")
os.makedirs(blobs_dir)
data_dir = os.path.join(blobs_dir, "data")
os.makedirs(data_dir)

snapshots_dir = os.path.join(self.temp_dir, "snapshots", "version")
os.makedirs(snapshots_dir)

# Create real files in blobs
# We'll use the helper to create the model, but we need to control where files end up.
# Let's manually create the data file in blobs
data_blob_path = os.path.join(data_dir, "guid2")
vals = [float(i) for i in range(10)]
with open(data_blob_path, "wb") as f:
f.writelines(struct.pack("f", v) for v in vals)

# Create model in blobs (referencing "data.bin" as external data)
# When loaded from snapshots/version/model.onnx, ORT looks for snapshots/version/data.bin

input_ = helper.make_tensor_value_info("input", TensorProto.FLOAT, [10])
output = helper.make_tensor_value_info("output", TensorProto.FLOAT, [10])
tensor = helper.make_tensor("external_data", TensorProto.FLOAT, [10], vals)
tensor.data_location = TensorProto.EXTERNAL
tensor.ClearField("float_data")
tensor.ClearField("raw_data")

k = tensor.external_data.add()
k.key = "location"
k.value = "data.bin" # Relative path

offset = tensor.external_data.add()
offset.key = "offset"
offset.value = "0"

length = tensor.external_data.add()
length.key = "length"
length.value = str(len(vals) * 4)

const_node = helper.make_node("Constant", [], ["const_out"], value=tensor)
add_node = helper.make_node("Add", ["input", "const_out"], ["output"])
graph = helper.make_graph([const_node, add_node], "test_graph", [input_], [output])
model = helper.make_model(graph)

model_blob_path = os.path.join(blobs_dir, "guid1")
save(model, model_blob_path)

# Now create symlinks in snapshots
model_symlink_path = os.path.join(snapshots_dir, "model.onnx")
data_symlink_path = os.path.join(snapshots_dir, "data.bin")

try:
os.symlink(model_blob_path, model_symlink_path)
os.symlink(data_blob_path, data_symlink_path)
except (OSError, NotImplementedError) as e:
self.skipTest(f"Skipping symlink test: symlink creation is not supported in this environment: {e}")

sess = ort.InferenceSession(model_symlink_path, providers=["CPUExecutionProvider"])

input_data = np.zeros(10, dtype=np.float32)
res = sess.run(["output"], {"input": input_data})
expected = np.array([float(i) for i in range(10)], dtype=np.float32)
np.testing.assert_allclose(res[0], expected)

finally:
shutil.rmtree(self.temp_dir)

def test_symlink_with_data_not_in_model_sub_dir(self):
# working directory structure (data is not in model directory or its sub directories):
# temp_dir/
# model/
# guid1
# data/
# guid2
# snapshots/version/
# model.onnx -> ../../model/guid1
# data.bin -> ../../data/guid2

self.temp_dir = tempfile.mkdtemp()
try:
model_dir = os.path.join(self.temp_dir, "model")
os.makedirs(model_dir)
data_dir = os.path.join(self.temp_dir, "data")
os.makedirs(data_dir)

snapshots_dir = os.path.join(self.temp_dir, "snapshots", "version")
os.makedirs(snapshots_dir)

# Create real files in data_dir
# We'll use the helper to create the model, but we need to control where files end up.
# Let's manually create the data file in data_dir
data_blob_path = os.path.join(data_dir, "guid2")
vals = [float(i) for i in range(10)]
with open(data_blob_path, "wb") as f:
f.writelines(struct.pack("f", v) for v in vals)

# Create model in model_dir (referencing "data.bin" as external data)
# When loaded from snapshots/version/model.onnx, ORT looks for snapshots/version/data.bin

input_ = helper.make_tensor_value_info("input", TensorProto.FLOAT, [10])
output = helper.make_tensor_value_info("output", TensorProto.FLOAT, [10])
tensor = helper.make_tensor("external_data", TensorProto.FLOAT, [10], vals)
tensor.data_location = TensorProto.EXTERNAL
tensor.ClearField("float_data")
tensor.ClearField("raw_data")

k = tensor.external_data.add()
k.key = "location"
k.value = "data.bin" # Relative path

offset = tensor.external_data.add()
offset.key = "offset"
offset.value = "0"

length = tensor.external_data.add()
length.key = "length"
length.value = str(len(vals) * 4)

const_node = helper.make_node("Constant", [], ["const_out"], value=tensor)
add_node = helper.make_node("Add", ["input", "const_out"], ["output"])
graph = helper.make_graph([const_node, add_node], "test_graph", [input_], [output])
model = helper.make_model(graph)

model_blob_path = os.path.join(model_dir, "guid1")
save(model, model_blob_path)

# Now create symlinks in snapshots
model_symlink_path = os.path.join(snapshots_dir, "model.onnx")
data_symlink_path = os.path.join(snapshots_dir, "data.bin")

try:
os.symlink(model_blob_path, model_symlink_path)
os.symlink(data_blob_path, data_symlink_path)
except (OSError, NotImplementedError) as e:
self.skipTest(f"Skipping symlink test: symlink creation is not supported in this environment: {e}")

with self.assertRaises(Exception) as cm:
ort.InferenceSession(model_symlink_path, providers=["CPUExecutionProvider"])

# We expect an error about external data not under model directory or the real model directory.
self.assertIn("External data path validation failed", str(cm.exception))
finally:
shutil.rmtree(self.temp_dir)


if __name__ == "__main__":
unittest.main()
3 changes: 3 additions & 0 deletions tools/ci_build/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -1816,6 +1816,9 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
onnx_test = False

if onnx_test:
log.info("Testing Symlink ONNX Model and External Data")
run_subprocess([sys.executable, "onnxruntime_test_python_symlink_data.py"], cwd=cwd, dll_path=dll_path)

# Disable python onnx tests for TensorRT and CANN EP, because many tests are
# not supported yet.
if args.use_tensorrt or args.use_cann:
Expand Down
Loading