diff --git a/onnxscript/_framework_apis/torch_2_5.py b/onnxscript/_framework_apis/torch_2_5.py index 4fc6fda247..2f8601c7c6 100644 --- a/onnxscript/_framework_apis/torch_2_5.py +++ b/onnxscript/_framework_apis/torch_2_5.py @@ -19,7 +19,6 @@ from onnxscript import ir, optimizer, version_converter from onnxscript.function_libs.torch_lib import registration -from onnxscript.ir import _external_data @dataclasses.dataclass(frozen=True) @@ -68,32 +67,16 @@ def save_model_with_external_data(model: ir.Model, model_path: str | os.PathLike """Save the model with external data. The model is unchanged after saving.""" # TODO(#1835): Decide if we want to externalize large attributes as well - initializer_values = tuple(model.graph.initializers.values()) - tensors = [v.const_value for v in initializer_values] - for tensor in tensors: - if tensor is None: + for value in model.graph.initializers.values(): + if value.const_value is None: raise ValueError( "The model contains uninitialized initializer values. " "Please make sure all initializer values are initialized." ) destination_path = pathlib.Path(model_path) - base_dir = destination_path.parent data_path = f"{destination_path.name}.data" - external_tensors = _external_data.convert_tensors_to_external( - tensors, # type: ignore[arg-type] - base_dir, - data_path, - ) - - # Replace the initializer values with external tensors and save the model - for initializer, external_tensor in zip(initializer_values, external_tensors): - initializer.const_value = external_tensor - ir.save(model, model_path) - - # Restore the original initializer values so the model is unchanged - for initializer, tensor in zip(initializer_values, tensors): - initializer.const_value = tensor + ir.save(model, model_path, external_data=data_path) def get_torchlib_ops() -> list[_OnnxFunctionMeta]: diff --git a/onnxscript/ir/__init__.py b/onnxscript/ir/__init__.py index b50cf77ad0..a9918e9713 100644 --- a/onnxscript/ir/__init__.py +++ b/onnxscript/ir/__init__.py @@ -5,7 +5,9 @@ __all__ = [ # Modules "serde", + "traversal", "convenience", + "external_data", # IR classes "Tensor", "ExternalTensor", @@ -72,13 +74,12 @@ "tensor", # Pass infrastructure "passes", - "traversal", # IO "load", "save", ] -from onnxscript.ir import convenience, passes, serde, traversal +from onnxscript.ir import convenience, external_data, passes, serde, traversal from onnxscript.ir._convenience import tensor from onnxscript.ir._core import ( Attr, diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py index 14d07cb9f4..fb113ee835 100644 --- a/onnxscript/ir/_core.py +++ b/onnxscript/ir/_core.py @@ -22,12 +22,12 @@ import sys import textwrap import typing +from collections.abc import Hashable from typing import ( AbstractSet, Any, Collection, Generic, - Hashable, Iterable, Iterator, NamedTuple, @@ -516,6 +516,7 @@ class ExternalTensor(TensorBase, _protocols.TensorProtocol): # pylint: disable= "_metadata_props", "_offset", "_shape", + "_valid", "doc_string", "name", "raw", @@ -568,6 +569,7 @@ def __init__( self.raw: mmap.mmap | None = None self._metadata_props = metadata_props self._metadata: _metadata.MetadataStore | None = None + self._valid = True @property def base_dir(self) -> str | os.PathLike: @@ -609,6 +611,7 @@ def shape(self) -> Shape: return self._shape def _load(self): + self._check_validity() assert self._array is None, "Bug: The array should be loaded only once." 
if self.size == 0: # When the size is 0, mmap is impossible and meaningless @@ -647,6 +650,7 @@ def _load(self): self._array = self._array.reshape(shape) def __array__(self, dtype: Any = None) -> np.ndarray: + self._check_validity() if self._array is None: self._load() assert self._array is not None @@ -675,6 +679,7 @@ def numpy(self) -> np.ndarray: The data will be memory mapped into memory and will not taken up physical memory space. """ + self._check_validity() if self._array is None: self._load() assert self._array is not None @@ -685,6 +690,7 @@ def tobytes(self) -> bytes: This will load the tensor into memory. """ + self._check_validity() if self.raw is None: self._load() assert self.raw is not None @@ -692,6 +698,26 @@ def tobytes(self) -> bytes: length = self._length or self.nbytes return self.raw[offset : offset + length] + def valid(self) -> bool: + """Check if the tensor is valid. + + The external tensor is valid if it has not been invalidated. + """ + return self._valid + + def _check_validity(self) -> None: + if not self.valid(): + raise ValueError( + f"The external tensor '{self!r}' is invalidated. The data may be corrupted or deleted." + ) + + def invalidate(self) -> None: + """Invalidate the tensor. + + The external tensor is invalidated when the data is known to be corrupted or deleted. + """ + self._valid = False + def release(self) -> None: """Delete all references to the memory buffer and close the memory-mapped file.""" self._array = None diff --git a/onnxscript/ir/_external_data.py b/onnxscript/ir/_external_data.py deleted file mode 100644 index 75a7e34bc1..0000000000 --- a/onnxscript/ir/_external_data.py +++ /dev/null @@ -1,323 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -"""External data related utilities.""" - -from __future__ import annotations - -__all__ = ["set_base_dir"] - -import dataclasses -import os -from typing import Iterator, Sequence - -from onnxscript.ir import _core, _enums, _protocols, traversal - -# Note: If needed in future, add these as parameters to the function calls -# align_offset: Offset will always be page aligned and alloction granularity aligned for mmap support. This is done by padding previous tensor data with zeros keeping same length. Tensor data will be aligned if > align_threshold -_ALIGN_OFFSET = True -# align_threshold: Alignment threshold for size of data. Having a low threshold will waste file space for small initializers. Only when tensor's data is > the page_align_threshold it will be force aligned. -_ALIGN_THRESHOLD = 1048576 # 1MB -# allocation_granularity: The allocation Granularity for mmap() support. Typically 64KB for Windows & 4KB for other OSes. -_ALLOCATION_GRANULARITY = 65536 # 64KB - - -@dataclasses.dataclass -class _ExternalDataInfo: - """ - A class that stores information about a tensor that is to be stored as external data. - - Attributes: - name: The name of the tensor that is to be stored as external data. - offset: The offset is used to determine where exactly in the file the external data is written to. - length: Stores the size of the tensor. - """ - - name: str | None - offset: int - length: int - - -def _all_tensors( - graph: _core.Graph | _core.GraphView, include_attributes: bool = False -) -> Iterator[_protocols.TensorProtocol]: - """Iterate over all tensors in the graph. - - Args: - graph: The graph to traverse tensors on. - include_attributes: Whether to include tensors in attributes. - - Yields: - Tensors in the graph. 
- """ - # Yield all tensors in initializers - for value in graph.initializers.values(): - if value.const_value is not None: - yield value.const_value - if not include_attributes: - return - # Look at constant attributes in nodes - for node in traversal.RecursiveGraphIterator(graph): - for attr in node.attributes.values(): - if isinstance(attr, _core.RefAttr): - continue - if attr.type == _enums.AttributeType.TENSOR and attr.value is not None: - yield attr.value - elif attr.type == _enums.AttributeType.TENSORS and attr.value is not None: - yield from attr.value - - -def set_base_dir(graph: _core.Graph | _core.GraphView, base_dir: str | os.PathLike) -> None: - """Set the base directory for external data in a graph. - - Args: - graph: The graph to traverse tensors on. - base_dir: The base directory. This is the directory where the ONNX file is. - """ - for tensor in _all_tensors(graph, include_attributes=True): - if isinstance(tensor, _core.ExternalTensor): - tensor.base_dir = base_dir - - -def _load_external_data_file( - tensors: Sequence[_protocols.TensorProtocol], - base_path: str | os.PathLike, - relative_path: str | os.PathLike, -) -> list[_protocols.TensorProtocol]: - """Load all external data that is at relative_path into memory for the provided model. - - Args: - tensors: Tensors to be converted to external tensors. They can be external tensors themselves. - base_path: Path of base directory. - relative_path: Path to which external data is to be stored, relative to the ONNX file. - - Returns: - A list of ir.Tensor values. - """ - updated_tensors: list[_protocols.TensorProtocol] = [] - for tensor in tensors: - if isinstance(tensor, _core.ExternalTensor): - external_tensor = tensor - if os.path.samefile(tensor.path, os.path.join(base_path, relative_path)): - # Copy the data as the .numpy() call references data from a file whose data is eventually modified - tensor_data = external_tensor.numpy().copy() - external_tensor.release() - tensor = _core.Tensor( - tensor_data, name=external_tensor.name, dtype=external_tensor.dtype - ) - updated_tensors.append(tensor) - return updated_tensors - - -def _compute_new_offset( - current_offset: int, - tensor_size: int, - align_offset: bool = _ALIGN_OFFSET, - align_threshold: int = _ALIGN_THRESHOLD, - allocation_granularity: int = _ALLOCATION_GRANULARITY, -) -> int: - """Compute the offset to align the tensor data based on the current offset. - - Args: - current_offset: Current location in the file at which tensor data will be written to. - tensor_size: Size of the tensor data to be written to file. - align_offset: Offset will always be page aligned and alloction granularity aligned for mmap support. This is done by padding previous tensor data with zeros keeping same length. Tensor data will be aligned if > align_threshold - align_threshold: Alignment threshold for size of data. Having a low threshold will waste file space for small initializers. Only when tensor's data is > the page_align_threshold it will be force aligned. - allocation_granularity: The allocation Granularity for mmap() support. Typically 64KB for Windows & 4KB for other OSes. - - Returns: - The updated offset value. 
- """ - if align_offset and tensor_size > align_threshold: - alignment_factor = max(4096, allocation_granularity) - # Align to the next page or alloc granularity - return (current_offset + alignment_factor - 1) // alignment_factor * alignment_factor - return current_offset - - -def _compute_external_data_info( - tensor: _protocols.TensorProtocol, - current_offset: int, -) -> _ExternalDataInfo: - """Capture information about a tensor that is to be stored as external data.""" - tensor_size = tensor.nbytes - # Calculate updated offset and align tensors - current_offset = _compute_new_offset(current_offset, tensor_size) - # Store offset and tensor size as ExternalDataInfo - external_data_info = _ExternalDataInfo( - tensor.name, - current_offset, - tensor_size, - ) - return external_data_info - - -def _save_external_data( - external_data_info: list[tuple[_protocols.TensorProtocol, _ExternalDataInfo]], - file_path: str | os.PathLike, -) -> None: - """Write tensor data to an external file according to information stored in ExternalDataInfo objects. - - Args: - external_data_info: A collection of external data information stored for each tensor to be written as external data. - file_path: Location to which external data is to be stored. - """ - with open(file_path, "wb") as data_file: - for tensor, tensor_info in external_data_info: - current_offset = tensor_info.offset - assert tensor is not None - raw_data = tensor.tobytes() - if isinstance(tensor, _core.ExternalTensor): - tensor.release() - # Pad file to required offset if needed - file_size = data_file.tell() - if current_offset > file_size: - data_file.write(b"\0" * (current_offset - file_size)) - data_file.write(raw_data) - - -def _convert_as_external_tensors( - external_data_info: list[tuple[_protocols.TensorProtocol, _ExternalDataInfo]], - base_path: str | os.PathLike, - relative_path: str | os.PathLike, -) -> list[_core.ExternalTensor]: - """Convert the tensors (stored within the values) written as external data to _core.ExternalTensor types. - - Args: - external_data_info: A collection of external data information stored for each tensor to be written as external data. - base_path: Path of base directory. - relative_path: Path to which external data is to be stored, relative to the ONNX file. - - Returns: - A list of external tensors. - """ - external_tensors: list[_core.ExternalTensor] = [] - for tensor, tensor_info in external_data_info: - assert tensor is not None - external_tensor = _core.ExternalTensor( - os.path.normpath(relative_path), - tensor_info.offset, - tensor_info.length, - tensor.dtype, # type: ignore[arg-type] - shape=tensor.shape, # type: ignore[arg-type] - name=tensor.name, # type: ignore[arg-type] - base_dir=os.path.normpath(base_path), - ) - external_tensors.append(external_tensor) - return external_tensors - - -def convert_tensors_to_external( - tensors: Sequence[_protocols.TensorProtocol], - base_path: str | os.PathLike, - relative_path: str | os.PathLike, - load_external_to_memory: bool = False, -) -> list[_core.ExternalTensor]: - """Convert a sequence of any TensorProtocol tensors to external tensors. - - Args: - tensors: Tensors to be converted to external tensors. They can be external tensors themselves. - base_path: Path of base directory. - relative_path: Path to which external data is to be stored, relative to the ONNX file. - load_external_to_memory: If set to true, loads external tensors present in the same file path as destination path to memory. 
- - Returns: - A list of external tensors derived from a list of input tensors. - """ - path = os.path.join(base_path, relative_path) - # Check if file path is valid, and create subsequent subdirectories within the path if they don't exist - os.makedirs(os.path.dirname(path), exist_ok=True) - tmp_file_created = False - # Check if file exists. Load pre-existing external data if it does. - if os.path.exists(path): - # Check if any tensor in the model is using the destination file - file_used = False - for tensor in tensors: - if isinstance(tensor, _core.ExternalTensor) and os.path.samefile( - path, tensor.path - ): - # FIXME(shubhambhokare1): If there is a non-initializer tensor that is referring to this file, that tensor is now invalid. This is a special case we are ok not handling right now. - file_used = True - if file_used: - if load_external_to_memory: - tensors = _load_external_data_file(tensors, base_path, relative_path) - else: - tmp_path = os.path.join(base_path, "tmp") - os.makedirs(tmp_path, exist_ok=True) - # If exisiting external tensors are not loaded to memory, copy the external data to a temporary location - os.rename(path, os.path.join(tmp_path, relative_path)) - tmp_file_created = True - for tensor in tensors: - if ( - isinstance(tensor, _core.ExternalTensor) - and tensor.location == relative_path - ): - tensor.base_dir = tmp_path - - external_data_info: list[tuple[_protocols.TensorProtocol, _ExternalDataInfo]] = [] - # Sort all tensors based on tensor sizes, in order to avoid unneccesarry alignment. - # All the smaller tensors are written earlier and alignment is performed for the larger tensors. - sorted_indices = sorted(range(len(tensors)), key=lambda i: tensors[i].nbytes) - sorted_tensors = [tensors[i] for i in sorted_indices] - - current_offset = 0 - for tensor in sorted_tensors: - tensor_info = _compute_external_data_info(tensor, current_offset) - external_data_info.append((tensor, tensor_info)) - current_offset = tensor_info.offset + tensor_info.length - _save_external_data(external_data_info, path) - - # Convert initializers to ExternalTensors - external_tensors = _convert_as_external_tensors( - external_data_info, base_path, relative_path - ) - # Sort external_tensors based on original key order - external_tensors = [ - external_tensors[i] - for i in sorted(range(len(external_tensors)), key=lambda i: sorted_indices[i]) - ] - - # Clean-up temporary file if it is created - tmp_path = os.path.join(base_path, "tmp", relative_path) - if os.path.exists(tmp_path) and tmp_file_created: - os.remove(tmp_path) - - return external_tensors - - -def to_external_data( - model: _core.Model, - base_path: str | os.PathLike, - relative_path: str | os.PathLike, - load_external_to_memory: bool = False, -) -> _core.Model: - """Set all tensors with raw data as external data. - - Args: - model: Model to process. - base_path: Path of base directory. - relative_path: Path to which external data is to be stored, relative to the ONNX file. - load_external_to_memory: If set to true, loads external tensors present in the same file path as destination path to memory. Otherwise, the external tensors are appended to file. - - Returns: - An ir.Model with all tensors with raw data converted to external tensors. - """ - - # Get all the tensors in the graph which are to be stored as external data. - # Iterate through all the tensors, and extract the external data information such as - # name, offset and length. 
- # TODO: Currently attributes not handled, eventually try to use _all_tensors to include attrs - tensors: list[_protocols.TensorProtocol] = [] - for value in model.graph.initializers.values(): - if value.const_value is not None: - tensors.append(value.const_value) - - external_tensors = convert_tensors_to_external( - tensors, - base_path, - relative_path, - load_external_to_memory=load_external_to_memory, - ) - - for value, external_tensor in zip(model.graph.initializers.values(), external_tensors): - value.const_value = external_tensor - return model diff --git a/onnxscript/ir/_io.py b/onnxscript/ir/_io.py index a9c867f3fb..e05ebb478d 100644 --- a/onnxscript/ir/_io.py +++ b/onnxscript/ir/_io.py @@ -10,7 +10,9 @@ import onnx -from onnxscript.ir import _core, _external_data, serde +from onnxscript.ir import _core, serde +from onnxscript.ir import external_data as _external_data +from onnxscript.ir._polyfill import zip def load(path: str | os.PathLike, format: str | None = None) -> _core.Model: @@ -35,16 +37,61 @@ def load(path: str | os.PathLike, format: str | None = None) -> _core.Model: return model -def save(model: _core.Model, path: str | os.PathLike, format: str | None = None) -> None: +def save( + model: _core.Model, + path: str | os.PathLike, + format: str | None = None, + external_data: str | os.PathLike | None = None, + size_threshold_bytes: int = 256, +) -> None: """Save an ONNX model to a file. + The model remains unchanged after the call. If any existing external tensor + references the provided :param:`external_data` path, it will be invalidated + after the external data is overwritten. To obtain a valid model, use :func:`load` + to load the newly saved model, or provide a different external data path that + is not currently referenced by any tensors in the model. + Args: model: The model to save. - path: The path to save the model to. - format: The format of the file (e.g. protobuf, textproto, json, etc.). + path: The path to save the model to. E.g. "model.onnx". + format: The format of the file (e.g. ``protobuf``, ``textproto``, ``json``, etc.). If None, the format is inferred from the file extension. + external_data: The relative path to save external data to. When specified, + initializers larger than :param:`size_threshold_bytes` will be converted to + external data and saved to the specified path. If None, all tensors will be saved unmodified. + That is, if a tensor in the model is already external, it will be saved + with the same external information; if the tensor is not external, + it will be serialized in the ONNX Proto message. + size_threshold_bytes: Save to external data if the tensor size in bytes is larger than this threshold. + Effective only when :param:`external_data` is set. + + Raises: + ValueError: If the external data path is an absolute path. + """ - proto = serde.serialize_model(model) - onnx.save(proto, path, format=format) - # TODO(justinchuby): Handle external data when the relative path has changed - # TODO(justinchuby): Handle off loading external data to disk when saving + if external_data is not None: + if os.path.isabs(external_data): + raise ValueError( + f"The external data path must be relative to the ONNX file path, not '{external_data}'."
+ ) + base_dir = os.path.dirname(path) + + # Store the original initializer values so they can be restored after saving + initializer_values = tuple(model.graph.initializers.values()) + tensors = [v.const_value for v in initializer_values] + + try: + model = _external_data.unload_from_model( + model, base_dir, external_data, size_threshold_bytes=size_threshold_bytes + ) + proto = serde.serialize_model(model) + onnx.save(proto, path, format=format) + + finally: + # Restore the original initializer values so the model is unchanged + for initializer, tensor in zip(initializer_values, tensors, strict=True): + initializer.const_value = tensor + + else: + proto = serde.serialize_model(model) + onnx.save(proto, path, format=format) diff --git a/onnxscript/ir/_io_test.py b/onnxscript/ir/_io_test.py new file mode 100644 index 0000000000..be3ef2b647 --- /dev/null +++ b/onnxscript/ir/_io_test.py @@ -0,0 +1,143 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""Unit tests for the _io module.""" + +import os +import tempfile +import unittest + +import numpy as np + +from onnxscript import ir +from onnxscript.ir import _io + + +def _create_initializer(tensor: ir.TensorProtocol) -> ir.Value: + return ir.Value( + name=tensor.name, + shape=tensor.shape, + type=ir.TensorType(tensor.dtype), + const_value=tensor, + ) + + +def _create_simple_model_with_initializers() -> ir.Model: + tensor_0 = ir.tensor([0.0], dtype=ir.DataType.FLOAT, name="initializer_0") + initializer = _create_initializer(tensor_0) + tensor_1 = ir.tensor([1.0], dtype=ir.DataType.FLOAT) + identity_node = ir.Node("", "Identity", inputs=(initializer,)) + identity_node.outputs[0].shape = ir.Shape([1]) + identity_node.outputs[0].dtype = ir.DataType.FLOAT + identity_node.outputs[0].name = "identity_0" + const_node = ir.Node( + "", + "Constant", + inputs=(), + outputs=( + ir.Value(name="const_0", shape=tensor_1.shape, type=ir.TensorType(tensor_1.dtype)), + ), + attributes=ir.convenience.convert_attributes(dict(value=tensor_1)), + ) + graph = ir.Graph( + inputs=[initializer], + outputs=[*identity_node.outputs, *const_node.outputs], + nodes=[identity_node, const_node], + initializers=[initializer], + name="test_graph", + ) + return ir.Model(graph, ir_version=10) + + +class IOFunctionsTest(unittest.TestCase): + def test_load(self): + model = _create_simple_model_with_initializers() + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "model.onnx") + _io.save(model, path) + loaded_model = _io.load(path) + self.assertEqual(loaded_model.ir_version, model.ir_version) + self.assertEqual(loaded_model.graph.name, model.graph.name) + self.assertEqual(len(loaded_model.graph.initializers), 1) + self.assertEqual(len(loaded_model.graph), 2) + np.testing.assert_array_equal( + loaded_model.graph.initializers["initializer_0"].const_value.numpy(), + np.array([0.0]), + ) + np.testing.assert_array_equal( + loaded_model.graph.node(1).attributes["value"].as_tensor().numpy(), np.array([1.0]) + ) + self.assertEqual(loaded_model.graph.inputs[0].name, "initializer_0") + self.assertEqual(loaded_model.graph.outputs[0].name, "identity_0") + self.assertEqual(loaded_model.graph.outputs[1].name, "const_0") + + def test_save_with_external_data_does_not_modify_model(self): + model = _create_simple_model_with_initializers() + self.assertIsInstance(model.graph.initializers["initializer_0"].const_value, ir.Tensor) + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "model.onnx") +
external_data_file = "model.data" + _io.save(model, path, external_data=external_data_file, size_threshold_bytes=0) + self.assertTrue(os.path.exists(path)) + external_data_path = os.path.join(tmpdir, external_data_file) + self.assertTrue(os.path.exists(external_data_path)) + loaded_model = _io.load(path) + + # The loaded model contains external data + initializer_tensor = loaded_model.graph.initializers["initializer_0"].const_value + self.assertIsInstance(initializer_tensor, ir.ExternalTensor) + # The attribute is not externalized + const_attr_tensor = loaded_model.graph.node(1).attributes["value"].as_tensor() + self.assertIsInstance(const_attr_tensor, ir.TensorProtoTensor) + np.testing.assert_array_equal(initializer_tensor.numpy(), np.array([0.0])) + np.testing.assert_array_equal(const_attr_tensor.numpy(), np.array([1.0])) + + # The original model is not changed and can be accessed even if the + # external data file is deleted + initializer_tensor = model.graph.initializers["initializer_0"].const_value + self.assertIsInstance(initializer_tensor, ir.Tensor) + const_attr_tensor = model.graph.node(1).attributes["value"].as_tensor() + self.assertIsInstance(const_attr_tensor, ir.Tensor) + np.testing.assert_array_equal(initializer_tensor.numpy(), np.array([0.0])) + np.testing.assert_array_equal(const_attr_tensor.numpy(), np.array([1.0])) + + def test_save_raise_when_external_data_is_not_relative_path(self): + model = _create_simple_model_with_initializers() + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "model.onnx") + external_data_file = os.path.join(tmpdir, "model.data") + with self.assertRaises(ValueError): + _io.save(model, path, external_data=external_data_file) + + def test_save_with_external_data_invalidates_obsolete_external_tensors(self): + model = _create_simple_model_with_initializers() + self.assertIsInstance(model.graph.initializers["initializer_0"].const_value, ir.Tensor) + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "model.onnx") + external_data_file = "model.data" + _io.save(model, path, external_data=external_data_file, size_threshold_bytes=0) + loaded_model = _io.load(path) + # Now if we load the model back, create a different initializer and save + # the model to the same external data file, the existing external tensor + # should be invalidated + tensor_2 = ir.tensor([2.0], dtype=ir.DataType.FLOAT, name="initializer_2") + initializer_2 = _create_initializer(tensor_2) + loaded_model.graph.initializers["initializer_2"] = initializer_2 + _io.save( + loaded_model, path, external_data=external_data_file, size_threshold_bytes=0 + ) + initializer_0_tensor = loaded_model.graph.initializers["initializer_0"].const_value + self.assertIsInstance(initializer_0_tensor, ir.ExternalTensor) + self.assertFalse(initializer_0_tensor.valid()) + with self.assertRaisesRegex(ValueError, "is invalidated"): + # The existing model has to be modified to use in memory tensors + # for the values to stay correct. Saving again should raise an error + _io.save( + loaded_model, + path, + external_data=external_data_file, + size_threshold_bytes=0, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxscript/ir/_polyfill.py b/onnxscript/ir/_polyfill.py new file mode 100644 index 0000000000..fb6008db37 --- /dev/null +++ b/onnxscript/ir/_polyfill.py @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+"""Polyfill for Python builtin functions.""" + +import sys +from typing import Any, Sequence + +if sys.version_info >= (3, 10): + zip = zip # pylint: disable=self-assigning-variable +else: + # zip(..., strict=True) was added in Python 3.10 + # TODO: Remove this polyfill when we drop support for Python 3.9 + _python_zip = zip + + def zip(a: Sequence[Any], b: Sequence[Any], strict: bool = False): + """Polyfill for Python's zip function. + + This is a special version which only supports two Sequence inputs. + + Raises: + ValueError: If the iterables have different lengths and strict is True. + """ + if len(a) != len(b) and strict: + raise ValueError("zip() argument lengths must be equal") + return _python_zip(a, b) diff --git a/onnxscript/ir/external_data.py b/onnxscript/ir/external_data.py new file mode 100644 index 0000000000..6e89951e71 --- /dev/null +++ b/onnxscript/ir/external_data.py @@ -0,0 +1,398 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""External data related utilities.""" + +from __future__ import annotations + +__all__ = [ + "set_base_dir", + "unload_from_model", + "load_to_model", + "convert_tensors_to_external", + "convert_tensors_from_external", +] + +import dataclasses +import logging +import os +from typing import Iterator, Sequence + +from onnxscript.ir import _core, _enums, _protocols +from onnxscript.ir import traversal as _traversal +from onnxscript.ir._polyfill import zip + +# Note: If needed in future, add these as parameters to the function calls +# align_offset: Offset will always be page aligned and alloction granularity aligned for mmap support. This is done by padding previous tensor data with zeros keeping same length. Tensor data will be aligned if > align_threshold +_ALIGN_OFFSET = True +# align_threshold: Alignment threshold for size of data. Having a low threshold will waste file space for small initializers. Only when tensor's data is > the page_align_threshold it will be force aligned. +_ALIGN_THRESHOLD = 1048576 # 1MB +# allocation_granularity: The allocation Granularity for mmap() support. Typically 64KB for Windows & 4KB for other OSes. +_ALLOCATION_GRANULARITY = 65536 # 64KB + + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class _ExternalDataInfo: + """ + A class that stores information about a tensor that is to be stored as external data. + + Attributes: + name: The name of the tensor that is to be stored as external data. + offset: The offset is used to determine where exactly in the file the external data is written to. + length: Stores the size of the tensor. + """ + + name: str | None + offset: int + length: int + + +def _all_tensors( + graph: _core.Graph | _core.GraphView, include_attributes: bool = False +) -> Iterator[_protocols.TensorProtocol]: + """Iterate over all tensors in the graph. + + Args: + graph: The graph to traverse tensors on. + include_attributes: Whether to include tensors in attributes. + + Yields: + Tensors in the graph. 
+ """ + # Yield all tensors in initializers + for value in graph.initializers.values(): + if value.const_value is not None: + yield value.const_value + if not include_attributes: + return + # Look at constant attributes in nodes + for node in _traversal.RecursiveGraphIterator(graph): + for attr in node.attributes.values(): + if isinstance(attr, _core.RefAttr): + continue + if attr.type == _enums.AttributeType.TENSOR and attr.value is not None: + yield attr.value + elif attr.type == _enums.AttributeType.TENSORS and attr.value is not None: + yield from attr.value + + +def set_base_dir(graph: _core.Graph | _core.GraphView, base_dir: str | os.PathLike) -> None: + """Set the base directory for external data in a graph. + + Args: + graph: The graph to traverse tensors on. + base_dir: The base directory. This is the directory where the ONNX file is. + """ + for tensor in _all_tensors(graph, include_attributes=True): + if isinstance(tensor, _core.ExternalTensor): + tensor.base_dir = base_dir + + +def _external_tensor_to_memory_tensor( + tensor: _protocols.TensorProtocol, +) -> _protocols.TensorProtocol: + """Convert an external tensor to an in memory tensor. + + Args: + tensor: An external tensor to load. + base_dir: Path of base directory. + relative_path: Path to which external data is to be stored, relative to the ONNX file. + + Returns: + An ir.Tensor object with the data loaded into memory. + """ + if not isinstance(tensor, _core.ExternalTensor): + raise TypeError(f"Expected ExternalTensor, got {type(tensor)}") + # Copy the data as the .numpy() call references data from a file whose data is eventually modified + tensor_data = tensor.numpy().copy() + tensor.release() + return _core.Tensor(tensor_data, name=tensor.name, dtype=tensor.dtype) + + +def _compute_new_offset( + current_offset: int, + tensor_size: int, + align_offset: bool = _ALIGN_OFFSET, + align_threshold: int = _ALIGN_THRESHOLD, + allocation_granularity: int = _ALLOCATION_GRANULARITY, +) -> int: + """Compute the offset to align the tensor data based on the current offset. + + Args: + current_offset: Current location in the file at which tensor data will be written to. + tensor_size: Size of the tensor data to be written to file. + align_offset: Offset will always be page aligned and alloction granularity aligned for mmap support. This is done by padding previous tensor data with zeros keeping same length. Tensor data will be aligned if > align_threshold + align_threshold: Alignment threshold for size of data. Having a low threshold will waste file space for small initializers. Only when tensor's data is > the page_align_threshold it will be force aligned. + allocation_granularity: The allocation Granularity for mmap() support. Typically 64KB for Windows & 4KB for other OSes. + + Returns: + The updated offset value. 
+ """ + if align_offset and tensor_size > align_threshold: + alignment_factor = max(4096, allocation_granularity) + # Align to the next page or alloc granularity + return (current_offset + alignment_factor - 1) // alignment_factor * alignment_factor + return current_offset + + +def _compute_external_data_info( + tensor: _protocols.TensorProtocol, + current_offset: int, +) -> _ExternalDataInfo: + """Capture information about a tensor that is to be stored as external data.""" + tensor_size = tensor.nbytes + # Calculate updated offset and align tensors + current_offset = _compute_new_offset(current_offset, tensor_size) + # Store offset and tensor size as ExternalDataInfo + external_data_info = _ExternalDataInfo( + tensor.name, + current_offset, + tensor_size, + ) + return external_data_info + + +def _write_external_data( + tensors: Sequence[_protocols.TensorProtocol], + external_data_infos: Sequence[_ExternalDataInfo], + file_path: str | os.PathLike, +) -> None: + """Write tensor data to an external file according to information stored in ExternalDataInfo objects. + + Args: + tensors: Tensors to be written as external data. + external_data_infos: External data information stored for each tensor to be written as external data. + file_path: Location to which external data is to be stored. + """ + assert len(tensors) == len(external_data_infos), ( + "Number of tensors and external data infos should match" + ) + with open(file_path, "wb") as data_file: + for tensor, tensor_info in zip(tensors, external_data_infos, strict=True): + current_offset = tensor_info.offset + assert tensor is not None + raw_data = tensor.tobytes() + if isinstance(tensor, _core.ExternalTensor): + tensor.release() + # Pad file to required offset if needed + file_size = data_file.tell() + if current_offset > file_size: + data_file.write(b"\0" * (current_offset - file_size)) + data_file.write(raw_data) + + +def _create_external_tensor( + tensor: _protocols.TensorProtocol, + external_data_info: _ExternalDataInfo, + base_dir: str | os.PathLike, + relative_path: str | os.PathLike, +) -> _core.ExternalTensor: + """Create external tensors from external data information. + + Args: + tensor: Tensor to be converted to external tensor. + external_data_info: External data information stored for the tensor to be written as external data. + base_dir: Path of base directory. + relative_path: Path to which external data is to be stored, relative to the ONNX file. + + Returns: + External tensor created from the information. + """ + return _core.ExternalTensor( + os.path.normpath(relative_path), + external_data_info.offset, + external_data_info.length, + tensor.dtype, # type: ignore[arg-type] + shape=tensor.shape, # type: ignore[arg-type] + name=tensor.name, # type: ignore[arg-type] + base_dir=os.path.normpath(base_dir), + ) + + +def convert_tensors_from_external( + tensors: Sequence[_protocols.TensorProtocol], +) -> list[_protocols.TensorProtocol]: + """Convert a sequence of external tensors to in-memory tensors. + + Args: + tensors: External tensors to be converted to in-memory tensors. + + Returns: + A list of in-memory tensors derived from a list of external tensors. + """ + return [_external_tensor_to_memory_tensor(tensor) for tensor in tensors] + + +def convert_tensors_to_external( + tensors: Sequence[_protocols.TensorProtocol], + base_dir: str | os.PathLike, + relative_path: str | os.PathLike, +) -> list[_core.ExternalTensor]: + """Convert a sequence of any TensorProtocol tensors to external tensors. 
+ + Existing external tensors are loaded to memory if they refer to the + same file path as the destination path. + + Args: + tensors: Tensors to be converted to external tensors. They can be external tensors themselves. + base_dir: Path of base directory. + relative_path: Path to which external data is to be stored, relative to the ONNX file. + + Returns: + A list of external tensors derived from a list of input tensors. The order + should match the input tensor order. + """ + path = os.path.join(base_dir, relative_path) + # Create intermediate directories within the path if they don't exist + os.makedirs(os.path.dirname(path), exist_ok=True) + + # Check if output path exists. Load pre-existing external data if it does. + if os.path.exists(path): + # Check if any tensor provided is using the destination file + new_tensors = [] + for tensor in tensors: + if ( + isinstance(tensor, _core.ExternalTensor) + and os.path.exists(tensor.path) + and os.path.samefile(path, tensor.path) + ): + # FIXME(shubhambhokare1): If there is a non-initializer tensor that + # is referring to this file, that tensor is now invalid. + # This is a special case we are ok not handling right now. + new_tensors.append(_external_tensor_to_memory_tensor(tensor)) + # Mark the original external tensor as invalid because it is now pointing + # to a file that is going to be overwritten. + tensor.invalidate() + logger.warning( + "External tensor %s refers to the same file as the destination path. " + "It has been invalidated because the data file has changed. To avoid this, " + "save the external data to a different path or load the newly saved model back " + "with ir.load().", + tensor, + ) + else: + new_tensors.append(tensor) + tensors = new_tensors + + external_data_infos: list[_ExternalDataInfo] = [] + # Sort all tensors based on tensor sizes, in order to avoid unnecessary alignment. + # All the smaller tensors are written earlier and alignment is performed for the larger tensors. + sorted_indices = sorted(range(len(tensors)), key=lambda i: tensors[i].nbytes) + sorted_tensors = [tensors[i] for i in sorted_indices] + + # Compute external data information for each tensor and write to disk + current_offset = 0 + for tensor in sorted_tensors: + external_info = _compute_external_data_info(tensor, current_offset) + external_data_infos.append(external_info) + current_offset = external_info.offset + external_info.length + _write_external_data(sorted_tensors, external_data_infos, path) + + # Create external tensor objects + external_tensors: list[_core.ExternalTensor] = [ + _create_external_tensor(tensor, external_info, base_dir, relative_path) + for tensor, external_info in zip(sorted_tensors, external_data_infos, strict=True) + ] + + # Sort external_tensors back into the original key order so that the output matches the input tensor order + external_tensors = [ + external_tensors[i] + for i in sorted(range(len(external_tensors)), key=lambda i: sorted_indices[i]) + ] + + return external_tensors + + +def load_to_model(model: _core.Model) -> _core.Model: + """Convert all external model initializers to in-memory tensors in place. + + Args: + model: Model to process.
+ """ + # TODO(justinchuby): Load attributes and initializers in subgraphs + values_to_convert = [] + for value in model.graph.initializers.values(): + if value.const_value is None: + # Filter out the uninitialized initializer values + continue + if isinstance(value.const_value, _core.ExternalTensor): + values_to_convert.append(value) + loaded_tensors = convert_tensors_from_external( + [v.const_value for v in values_to_convert] # type: ignore[misc] + ) + for value, tensor in zip(values_to_convert, loaded_tensors, strict=True): + value.const_value = tensor + + # Return the model because we may change the implementation to an out of place one + # to keep the input unchanged + return model + + +def unload_from_model( + model: _core.Model, + base_dir: str | os.PathLike, + relative_path: str | os.PathLike, + *, + size_threshold_bytes: int = 0, +) -> _core.Model: + """Convert all initializers equal or above size_threshold_bytes to external tensors in-place and save data to a single data file. + + It should only replace the initializers in the model with external tensors + and not make any other modifications to the model. + + If any existing external tensor + references the provided :param:`external_data` path, it will be invalidated + after the external data is overwritten. To obtain a valid model, use :func:`load` + to load the newly saved model, or provide a different external data path that + is not currently referenced by any tensors in the model. + + Args: + model: Model to process. + base_dir: Path the directory where the ONNX model file is. + relative_path: Path to which external data is to be stored, relative to the ONNX file. + E.g. "model.data" + size_threshold_bytes: Save to external data if the tensor size in bytes is larger than this threshold. + + Returns: + An ir.Model with all initializer data equal or above :param:`size_threshold_bytes` + converted to external tensors. 
+ """ + # In-memory or external tensors, if equal to or above the threshold, should be converted to or re-saved as external tensors + initializers_to_become_external = [] + # Existing external tensors, if below the threshold, should be loaded to memory + initializers_to_load_to_memory = [] + for value in model.graph.initializers.values(): + if value.const_value is None: + # Filter out the uninitialized initializer values + continue + if value.const_value.nbytes > size_threshold_bytes: + initializers_to_become_external.append(value) + elif isinstance(value.const_value, _core.ExternalTensor): + initializers_to_load_to_memory.append(value) + + # Load to memory first, then convert to external tensors, because + # the existing external tensors may be overwritten by the new external data + memory_tensors = convert_tensors_from_external( + [v.const_value for v in initializers_to_load_to_memory] # type: ignore[misc] + ) + external_tensors = convert_tensors_to_external( + [v.const_value for v in initializers_to_become_external], # type: ignore[misc] + base_dir=base_dir, + relative_path=relative_path, + ) + + # Replace the initializer values with external tensors and save the model + for value, external_tensor in zip( + initializers_to_become_external, external_tensors, strict=True + ): + value.const_value = external_tensor + for value, memory_tensor in zip( + initializers_to_load_to_memory, memory_tensors, strict=True + ): + value.const_value = memory_tensor + + # Return the model because we may change the implementation to an out of place one + # to keep the input unchanged + return model diff --git a/onnxscript/ir/_external_data_test.py b/onnxscript/ir/external_data_test.py similarity index 81% rename from onnxscript/ir/_external_data_test.py rename to onnxscript/ir/external_data_test.py index afcf32b200..53ef2af3ed 100644 --- a/onnxscript/ir/_external_data_test.py +++ b/onnxscript/ir/external_data_test.py @@ -11,7 +11,7 @@ import onnx.external_data_helper from onnxscript import ir -from onnxscript.ir import _external_data +from onnxscript.ir import external_data class ExternalDataTest(unittest.TestCase): @@ -51,7 +51,7 @@ def test_set_base_dir_sets_base_dir_for_all_external_tensors(self): ) model = ir.serde.deserialize_model(model_proto) expected_dir = "something_else" - _external_data.set_base_dir(model.graph, expected_dir) + external_data.set_base_dir(model.graph, expected_dir) initializer_tensor = model.graph.initializers["test_tensor"].const_value assert isinstance(initializer_tensor, ir.ExternalTensor) @@ -67,7 +67,7 @@ def test_align_offset_false(self): # Tensor size > Align Threshold current_offset = 20000 tensor_size = 1048 - new_offset = _external_data._compute_new_offset( # pylint: disable=protected-access + new_offset = external_data._compute_new_offset( # pylint: disable=protected-access current_offset, tensor_size, align_offset=False ) self.assertEqual(current_offset, new_offset) @@ -76,7 +76,7 @@ def test_align_with_small_align_threshold(self): # Tensor size < Align Threshold current_offset = 20000 tensor_size = 1048 - new_offset = _external_data._compute_new_offset( # pylint: disable=protected-access + new_offset = external_data._compute_new_offset( # pylint: disable=protected-access current_offset, tensor_size, align_threshold=1000, @@ -87,7 +87,7 @@ def test_align_with_large_align_threshold(self): # Tensor size > Align Threshold current_offset = 20000 tensor_size = 1048 - new_offset = _external_data._compute_new_offset( # pylint: disable=protected-access + new_offset = 
external_data._compute_new_offset( # pylint: disable=protected-access current_offset, tensor_size, ) @@ -97,12 +97,12 @@ def test_allocation_granularity_diff(self): # Tensor size > Align Threshold current_offset = 20000 tensor_size = 1048577 - new_offset_1 = _external_data._compute_new_offset( # pylint: disable=protected-access + new_offset_1 = external_data._compute_new_offset( # pylint: disable=protected-access current_offset, tensor_size, allocation_granularity=4000, ) - new_offset_2 = _external_data._compute_new_offset( # pylint: disable=protected-access + new_offset_2 = external_data._compute_new_offset( # pylint: disable=protected-access current_offset, tensor_size, ) @@ -335,7 +335,7 @@ def _model_with_mixed_external_data(self) -> ir.Model: return model def test_external_data_simple(self): - model_with_external_data = _external_data.to_external_data( + model_with_external_data = external_data.unload_from_model( self.model, self.base_path, self.external_data_name ) external_tensor = model_with_external_data.graph.initializers["tensor1"].const_value @@ -347,29 +347,8 @@ def test_external_data_simple(self): self.assertEqual(external_tensor.numpy().tobytes(), self.data.tobytes()) self.assertEqual(external_tensor2.numpy().tobytes(), self.data_float16.tobytes()) - def test_same_path_external_data_written_to_memory(self): - model_with_external_data = _external_data.to_external_data( - self.model_with_external_data_same_path, - self.base_path, - self.external_data_name, - load_external_to_memory=True, - ) - external_tensor = model_with_external_data.graph.initializers["tensor1"].const_value - external_tensor2 = model_with_external_data.graph.initializers["tensor2"].const_value - external_tensor3 = model_with_external_data.graph.initializers[ - "tensor_same_file" - ].const_value - - self.assertEqual(external_tensor.numpy().tobytes(), self.data.tobytes()) - self.assertEqual(external_tensor2.numpy().tobytes(), self.data_float16.tobytes()) - self.assertEqual(external_tensor3.numpy().tobytes(), self.data_other.tobytes()) - # Ensure repeated reads are consistent - self.assertEqual(external_tensor.numpy().tobytes(), self.data.tobytes()) - self.assertEqual(external_tensor2.numpy().tobytes(), self.data_float16.tobytes()) - self.assertEqual(external_tensor3.numpy().tobytes(), self.data_other.tobytes()) - - def test_same_path_external_data_written_to_disk(self): - model_with_external_data = _external_data.to_external_data( + def test_same_path_external_data(self): + model_with_external_data = external_data.unload_from_model( self.model_with_external_data_same_path, self.base_path, self.external_data_name, @@ -389,7 +368,7 @@ def test_same_path_external_data_written_to_disk(self): self.assertEqual(external_tensor3.numpy().tobytes(), self.data_other.tobytes()) def test_external_data_diff_paths(self): - model_with_external_data = _external_data.to_external_data( + model_with_external_data = external_data.unload_from_model( self.model_with_external_data_diff_path, self.base_path, self.external_data_name, @@ -419,7 +398,7 @@ def test_external_data_diff_paths(self): self.assertEqual(external_tensor5.numpy().tobytes(), self.data_ext2_1.tobytes()) def test_custom_tensor_in_initializers(self): - model_with_external_data = _external_data.to_external_data( + model_with_external_data = external_data.unload_from_model( self.model_with_custom_tensor_class, self.base_path, self.external_data_name, @@ -438,52 +417,9 @@ def test_custom_tensor_in_initializers(self): self.assertEqual(external_tensor2.numpy().tobytes(), 
self.data_float16.tobytes()) self.assertEqual(external_tensor3.numpy().tobytes(), self.custom_data.tobytes()) - def test_mixed_external_data_to_disk(self): - model_with_external_data = _external_data.to_external_data( - self.model_with_mixed_external_data, - self.base_path, - self.external_data_name, - ) - external_tensor = model_with_external_data.graph.initializers["tensor1"].const_value - external_tensor2 = model_with_external_data.graph.initializers["tensor2"].const_value - external_tensor3 = model_with_external_data.graph.initializers[ - "tensor_same_file" - ].const_value - external_tensor4 = model_with_external_data.graph.initializers[ - "custom_tensor" - ].const_value - external_tensor5 = model_with_external_data.graph.initializers[ - "tensor_ext1_1" - ].const_value - external_tensor6 = model_with_external_data.graph.initializers[ - "tensor_ext1_2" - ].const_value - external_tensor7 = model_with_external_data.graph.initializers[ - "tensor_ext2_1" - ].const_value - - self.assertEqual(external_tensor.numpy().tobytes(), self.data.tobytes()) - self.assertEqual(external_tensor2.numpy().tobytes(), self.data_float16.tobytes()) - self.assertEqual(external_tensor3.numpy().tobytes(), self.data_other.tobytes()) - self.assertEqual(external_tensor4.numpy().tobytes(), self.custom_data.tobytes()) - self.assertEqual(external_tensor5.numpy().tobytes(), self.data_ext1_1.tobytes()) - self.assertEqual(external_tensor6.numpy().tobytes(), self.data_ext1_2.tobytes()) - self.assertEqual(external_tensor7.numpy().tobytes(), self.data_ext2_1.tobytes()) - # Ensure repeated reads are consistent - self.assertEqual(external_tensor.numpy().tobytes(), self.data.tobytes()) - self.assertEqual(external_tensor2.numpy().tobytes(), self.data_float16.tobytes()) - self.assertEqual(external_tensor3.numpy().tobytes(), self.data_other.tobytes()) - self.assertEqual(external_tensor4.numpy().tobytes(), self.custom_data.tobytes()) - self.assertEqual(external_tensor5.numpy().tobytes(), self.data_ext1_1.tobytes()) - self.assertEqual(external_tensor6.numpy().tobytes(), self.data_ext1_2.tobytes()) - self.assertEqual(external_tensor7.numpy().tobytes(), self.data_ext2_1.tobytes()) - - def test_mixed_external_data_to_memory(self): - model_with_external_data = _external_data.to_external_data( - self.model_with_mixed_external_data, - self.base_path, - self.external_data_name, - load_external_to_memory=True, + def test_mixed_external_data(self): + model_with_external_data = external_data.unload_from_model( + self.model_with_mixed_external_data, self.base_path, self.external_data_name ) external_tensor = model_with_external_data.graph.initializers["tensor1"].const_value external_tensor2 = model_with_external_data.graph.initializers["tensor2"].const_value @@ -520,7 +456,7 @@ def test_mixed_external_data_to_memory(self): self.assertEqual(external_tensor7.numpy().tobytes(), self.data_ext2_1.tobytes()) def test_external_data_sorted(self): - model_with_external_data = _external_data.to_external_data( + model_with_external_data = external_data.unload_from_model( self.model_with_mixed_external_data, self.base_path, self.external_data_name,
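
Usage note: the sketch below shows the new external-data path through ir.save() and ir.load() introduced in this diff. It is a minimal example assuming only the signatures added above; the model path, temporary directory, and data file name are illustrative.

import os
import tempfile

from onnxscript import ir

model = ir.load("model.onnx")  # any existing model; the path is illustrative

with tempfile.TemporaryDirectory() as tmpdir:
    model_path = os.path.join(tmpdir, "model.onnx")
    # Initializers larger than size_threshold_bytes are offloaded to
    # "<tmpdir>/model.data"; `model` itself is restored and left unchanged.
    ir.save(model, model_path, external_data="model.data", size_threshold_bytes=256)

    # Overwriting the same data file in a second save() would invalidate any
    # ExternalTensor objects still pointing at it, so reload a valid model first.
    model = ir.load(model_path)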
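
The relocated external_data module can also be used directly when the conversion should happen without serializing the model. A sketch under the same assumptions (file names are illustrative):

from onnxscript import ir
from onnxscript.ir import external_data

model = ir.load("model.onnx")

# Offload every initializer larger than 1024 bytes to "./model.onnx.data",
# replacing its const_value with an ExternalTensor in place.
external_data.unload_from_model(
    model, base_dir=".", relative_path="model.onnx.data", size_threshold_bytes=1024
)

# The reverse direction: load all external initializers back into memory.
external_data.load_to_model(model)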