Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion onnxruntime/core/framework/tensorprotoutils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,8 @@ Status TensorProtoWithExternalDataToTensorProto(
}

Status ValidateExternalDataPath(const std::filesystem::path& base_dir,
const std::filesystem::path& location) {
const std::filesystem::path& location,
const std::filesystem::path& model_path) {
// Reject absolute paths
ORT_RETURN_IF(location.is_absolute(),
"Absolute paths not allowed for external data location");
Expand All @@ -339,10 +340,29 @@ Status ValidateExternalDataPath(const std::filesystem::path& base_dir,
// If the symlink exists, it resolves to the target path;
// so if the symllink is outside the directory it would be caught here.
Comment thread
tianleiwu marked this conversation as resolved.
Outdated
auto resolved = std::filesystem::weakly_canonical(base_dir / location);

// Check that resolved path starts with base directory
auto [base_end, resolved_it] = std::mismatch(
base_canonical.begin(), base_canonical.end(),
resolved.begin(), resolved.end());

if (base_end != base_canonical.end()) {
// If validation against logical base_dir fails, we check against the
// real (canonical) path of the model file to support symlinked models
// (e.g. models in Hugging Face Hub local cache).
if (!model_path.empty()) {
auto real_model_dir = std::filesystem::weakly_canonical(model_path).parent_path();
Comment thread
tianleiwu marked this conversation as resolved.

auto [real_base_end, real_resolved_it] = std::mismatch(
real_model_dir.begin(), real_model_dir.end(),
resolved.begin(), resolved.end());

if (real_base_end == real_model_dir.end()) {
return Status::OK();
}
}
}

ORT_RETURN_IF(base_end != base_canonical.end(),
"External data path: ", location, " escapes model directory: ", base_dir);
Comment thread
tianleiwu marked this conversation as resolved.
Outdated
}
Expand Down
17 changes: 10 additions & 7 deletions onnxruntime/core/framework/tensorprotoutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -526,16 +526,19 @@ Status TensorProtoWithExternalDataToTensorProto(
ONNX_NAMESPACE::TensorProto& new_tensor_proto);

/// <summary>
/// The functions will make sure the 'location' specified in the external data is under the 'base_dir'.
/// Validates if the external data path is under the model directory.
/// If the model is a symlink, it checks against both the logical model directory (base_dir)
/// and the real/canonical directory of the model.
/// If the `base_dir` is empty, the function only ensures that `location` is not an absolute path.
/// </summary>
/// <param name="base_dir">model location directory</param>
/// <param name="location">location is a string retrieved from TensorProto external data that is not
/// an in-memory tag</param>
/// <returns>The function will fail if the resolved full path is not under the model directory
/// or one of the subdirectories</returns>
/// <param name="base_dir">Logical model location directory</param>
/// <param name="location">Location string retrieved from TensorProto external data</param>
/// <param name="model_path">Optional path to the model file, used for canonical path validation if base_dir check fails</param>
/// <returns>The function will fail if the resolved full path is not under the logical model directory
/// nor the real directory of the model path</returns>
Status ValidateExternalDataPath(const std::filesystem::path& base_dir,
const std::filesystem::path& location);
const std::filesystem::path& location,
const std::filesystem::path& model_path = {});

#endif // !defined(SHARED_PROVIDER)

Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/graph/graph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3771,7 +3771,7 @@ Status Graph::ConvertInitializersIntoOrtValues() {
std::unique_ptr<onnxruntime::ExternalDataInfo> external_data_info;
ORT_RETURN_IF_ERROR(onnxruntime::ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info));
const auto& location = external_data_info->GetRelPath();
auto st = utils::ValidateExternalDataPath(model_dir, location);
auto st = utils::ValidateExternalDataPath(model_dir, location, model_path);
if (!st.IsOK()) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
"External data path validation failed for initializer: ", tensor_proto.name(),
Expand Down
5 changes: 0 additions & 5 deletions onnxruntime/core/providers/shared_library/provider_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -453,11 +453,6 @@ inline bool HasExternalDataInMemory(const ONNX_NAMESPACE::TensorProto& ten_proto
return g_host->Utils__HasExternalDataInMemory(ten_proto);
}

inline Status ValidateExternalDataPath(const std::filesystem::path& base_dir,
const std::filesystem::path& location) {
return g_host->Utils__ValidateExternalDataPath(base_dir, location);
}

} // namespace utils

namespace graph_utils {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1004,9 +1004,6 @@ struct ProviderHost {

virtual bool Utils__HasExternalDataInMemory(const ONNX_NAMESPACE::TensorProto& ten_proto) = 0;

virtual Status Utils__ValidateExternalDataPath(const std::filesystem::path& base_path,
const std::filesystem::path& location) = 0;

// Model
virtual std::unique_ptr<Model> Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path,
const IOnnxRuntimeOpSchemaRegistryList* local_registries,
Expand Down
5 changes: 0 additions & 5 deletions onnxruntime/core/session/provider_bridge_ort.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1295,11 +1295,6 @@ struct ProviderHostImpl : ProviderHost {
return onnxruntime::utils::HasExternalDataInMemory(ten_proto);
}

Status Utils__ValidateExternalDataPath(const std::filesystem::path& base_path,
const std::filesystem::path& location) override {
return onnxruntime::utils::ValidateExternalDataPath(base_path, location);
}

// Model (wrapped)
std::unique_ptr<Model> Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path,
const IOnnxRuntimeOpSchemaRegistryList* local_registries,
Expand Down
263 changes: 263 additions & 0 deletions onnxruntime/test/python/onnxruntime_test_python_symlink_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import os
import shutil
import struct
import tempfile
import unittest

import numpy as np
import onnx
Comment thread Fixed
from onnx import TensorProto, helper

import onnxruntime as ort


class TestWhitelistedData(unittest.TestCase):
def test_huggingface_hub_symlink(self):
# working directory structure (simulate huggingface hub local cache):
# temp_dir/
Comment thread
tianleiwu marked this conversation as resolved.
Outdated
# blobs/
# guid1
# guid2
# snapshots/version/onnx/
# model.onnx -> ../../blobs/guid1
# data.bin -> ../../blobs/guid2

self.temp_dir = tempfile.mkdtemp()
try:
blobs_dir = os.path.join(self.temp_dir, "blobs")
os.makedirs(blobs_dir)

snapshots_dir = os.path.join(self.temp_dir, "snapshots", "version", "onnx")
os.makedirs(snapshots_dir)

# Create real files in blobs
# We'll use the helper to create the model, but we need to control where files end up.
# Let's manually create the data file in blobs
data_blob_path = os.path.join(blobs_dir, "guid2")
vals = [float(i) for i in range(10)]
with open(data_blob_path, "wb") as f:
f.writelines(struct.pack("f", v) for v in vals)

# Create model in blobs (referencing "data.bin" as external data)
# Note: The model proto will just say "data.bin".
# When loaded from snapshots/onnx/model.onnx, ORT looks for snapshots/onnx/data.bin

input_ = helper.make_tensor_value_info("input", TensorProto.FLOAT, [10])
output = helper.make_tensor_value_info("output", TensorProto.FLOAT, [10])
tensor = helper.make_tensor("external_data", TensorProto.FLOAT, [10], vals)
tensor.data_location = TensorProto.EXTERNAL
tensor.ClearField("float_data")
tensor.ClearField("raw_data")

k = tensor.external_data.add()
k.key = "location"
k.value = "data.bin" # Relative path

offset = tensor.external_data.add()
offset.key = "offset"
offset.value = "0"

length = tensor.external_data.add()
length.key = "length"
length.value = str(len(vals) * 4)

const_node = helper.make_node("Constant", [], ["const_out"], value=tensor)
add_node = helper.make_node("Add", ["input", "const_out"], ["output"])
graph = helper.make_graph([const_node, add_node], "test_graph", [input_], [output])
model = helper.make_model(graph)

model_blob_path = os.path.join(blobs_dir, "guid1")
onnx.save(model, model_blob_path)

# Now create symlinks in snapshots
model_symlink_path = os.path.join(snapshots_dir, "model.onnx")
data_symlink_path = os.path.join(snapshots_dir, "data.bin")

try:
os.symlink(model_blob_path, model_symlink_path)
os.symlink(data_blob_path, data_symlink_path)
except (OSError, NotImplementedError) as e:
self.skipTest(f"Skipping symlink test: symlink creation is not supported in this environment: {e}")

print(f"Model symlink: {model_symlink_path} -> {os.readlink(model_symlink_path)}")
print(f"Data symlink: {data_symlink_path} -> {os.readlink(data_symlink_path)}")

Comment thread
tianleiwu marked this conversation as resolved.
Outdated
sess = ort.InferenceSession(model_symlink_path, providers=["CPUExecutionProvider"])

input_data = np.zeros(10, dtype=np.float32)
res = sess.run(["output"], {"input": input_data})
expected = np.array([float(i) for i in range(10)], dtype=np.float32)
np.testing.assert_allclose(res[0], expected)

finally:
shutil.rmtree(self.temp_dir)

def test_symlink_with_data_in_model_sub_dir(self):
# working directory structure (data is in model sub directory):
# temp_dir/
# blobs/
# guid1
# data/guid2
# snapshots/version/onnx/
# model.onnx -> ../../blobs/guid1
# data.bin -> ../../blobs/data/guid2

self.temp_dir = tempfile.mkdtemp()
try:
blobs_dir = os.path.join(self.temp_dir, "blobs")
os.makedirs(blobs_dir)
data_dir = os.path.join(blobs_dir, "data")
os.makedirs(data_dir)

snapshots_dir = os.path.join(self.temp_dir, "snapshots", "version", "onnx")
os.makedirs(snapshots_dir)

# Create real files in blobs
# We'll use the helper to create the model, but we need to control where files end up.
# Let's manually create the data file in blobs
data_blob_path = os.path.join(data_dir, "guid2")
vals = [float(i) for i in range(10)]
with open(data_blob_path, "wb") as f:
f.writelines(struct.pack("f", v) for v in vals)

# Create model in blobs (referencing "data.bin" as external data)
# Note: The model proto will just say "data.bin".
# When loaded from snapshots/onnx/model.onnx, ORT looks for snapshots/onnx/data.bin

input_ = helper.make_tensor_value_info("input", TensorProto.FLOAT, [10])
output = helper.make_tensor_value_info("output", TensorProto.FLOAT, [10])
tensor = helper.make_tensor("external_data", TensorProto.FLOAT, [10], vals)
tensor.data_location = TensorProto.EXTERNAL
tensor.ClearField("float_data")
tensor.ClearField("raw_data")

k = tensor.external_data.add()
k.key = "location"
k.value = "data.bin" # Relative path

offset = tensor.external_data.add()
offset.key = "offset"
offset.value = "0"

length = tensor.external_data.add()
length.key = "length"
length.value = str(len(vals) * 4)

const_node = helper.make_node("Constant", [], ["const_out"], value=tensor)
add_node = helper.make_node("Add", ["input", "const_out"], ["output"])
graph = helper.make_graph([const_node, add_node], "test_graph", [input_], [output])
model = helper.make_model(graph)

model_blob_path = os.path.join(blobs_dir, "guid1")
onnx.save(model, model_blob_path)

# Now create symlinks in snapshots
model_symlink_path = os.path.join(snapshots_dir, "model.onnx")
data_symlink_path = os.path.join(snapshots_dir, "data.bin")

try:
os.symlink(model_blob_path, model_symlink_path)
os.symlink(data_blob_path, data_symlink_path)
except (OSError, NotImplementedError) as e:
self.skipTest(f"Skipping symlink test: symlink creation is not supported in this environment: {e}")

print(f"Model symlink: {model_symlink_path} -> {os.readlink(model_symlink_path)}")
print(f"Data symlink: {data_symlink_path} -> {os.readlink(data_symlink_path)}")

Comment thread
tianleiwu marked this conversation as resolved.
Outdated
sess = ort.InferenceSession(model_symlink_path, providers=["CPUExecutionProvider"])

input_data = np.zeros(10, dtype=np.float32)
res = sess.run(["output"], {"input": input_data})
expected = np.array([float(i) for i in range(10)], dtype=np.float32)
np.testing.assert_allclose(res[0], expected)

finally:
shutil.rmtree(self.temp_dir)

def test_symlink_with_data_not_in_model_sub_dir(self):
# working directory structure (data is not in model directory or its sub directories):
# temp_dir/
# model/
# guid1
# data/
# guid2
# snapshots/version/onnx/
# model.onnx -> ../../model/guid1
# data.bin -> ../../data/guid2

self.temp_dir = tempfile.mkdtemp()
Comment thread
tianleiwu marked this conversation as resolved.
try:
model_dir = os.path.join(self.temp_dir, "model")
os.makedirs(model_dir)
data_dir = os.path.join(self.temp_dir, "data")
os.makedirs(data_dir)

snapshots_dir = os.path.join(self.temp_dir, "snapshots", "version", "onnx")
os.makedirs(snapshots_dir)

# Create real files in blobs
# We'll use the helper to create the model, but we need to control where files end up.
# Let's manually create the data file in blobs
data_blob_path = os.path.join(data_dir, "guid2")
vals = [float(i) for i in range(10)]
with open(data_blob_path, "wb") as f:
f.writelines(struct.pack("f", v) for v in vals)

# Create model in blobs (referencing "data.bin" as external data)
# Note: The model proto will just say "data.bin".
# When loaded from snapshots/onnx/model.onnx, ORT looks for snapshots/onnx/data.bin

input_ = helper.make_tensor_value_info("input", TensorProto.FLOAT, [10])
output = helper.make_tensor_value_info("output", TensorProto.FLOAT, [10])
tensor = helper.make_tensor("external_data", TensorProto.FLOAT, [10], vals)
tensor.data_location = TensorProto.EXTERNAL
tensor.ClearField("float_data")
tensor.ClearField("raw_data")

k = tensor.external_data.add()
k.key = "location"
k.value = "data.bin" # Relative path

offset = tensor.external_data.add()
offset.key = "offset"
offset.value = "0"

length = tensor.external_data.add()
length.key = "length"
length.value = str(len(vals) * 4)

const_node = helper.make_node("Constant", [], ["const_out"], value=tensor)
add_node = helper.make_node("Add", ["input", "const_out"], ["output"])
graph = helper.make_graph([const_node, add_node], "test_graph", [input_], [output])
model = helper.make_model(graph)

model_blob_path = os.path.join(model_dir, "guid1")
onnx.save(model, model_blob_path)

# Now create symlinks in snapshots
model_symlink_path = os.path.join(snapshots_dir, "model.onnx")
data_symlink_path = os.path.join(snapshots_dir, "data.bin")

try:
os.symlink(model_blob_path, model_symlink_path)
os.symlink(data_blob_path, data_symlink_path)
except (OSError, NotImplementedError) as e:
self.skipTest(f"Skipping symlink test: symlink creation is not supported in this environment: {e}")

print(f"Model symlink: {model_symlink_path} -> {os.readlink(model_symlink_path)}")
print(f"Data symlink: {data_symlink_path} -> {os.readlink(data_symlink_path)}")

Comment thread
tianleiwu marked this conversation as resolved.
Outdated
with self.assertRaises(Exception) as cm:
ort.InferenceSession(model_symlink_path, providers=["CPUExecutionProvider"])

# We expect an error about external data not being in whitelisted directories
self.assertIn("External data path validation failed", str(cm.exception))
finally:
shutil.rmtree(self.temp_dir)


if __name__ == "__main__":
unittest.main()
Loading