diff --git a/BUILD.md b/BUILD.md index bb09229e112e5..72a936b6b911b 100644 --- a/BUILD.md +++ b/BUILD.md @@ -396,13 +396,13 @@ Note that OpenVINO is built as a [shared provider library](#Execution-Provider-S ##### Windows ``` -.\build.bat --config RelWithDebInfo --use_openvino +.\build.bat --config RelWithDebInfo --use_openvino --build_shared_lib ``` *Note: The default Windows CMake Generator is Visual Studio 2017, but you can also use the newer Visual Studio 2019 by passing `--cmake_generator "Visual Studio 16 2019"` to `.\build.bat`* ##### Linux ``` -./build.sh --config RelWithDebInfo --use_openvino +./build.sh --config RelWithDebInfo --use_openvino --build_shared_lib ``` --use_openvino: Builds the OpenVINO Execution Provider in ONNX Runtime. diff --git a/LICENSE b/LICENSE index 506ab97e56e2b..48bc6bb4996ac 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2018 Microsoft Corporation +Copyright (c) Microsoft Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index b03aafe95ede2..15bc4af753b82 100644 --- a/README.md +++ b/README.md @@ -1,242 +1,53 @@

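A quick way to sanity-check a wheel produced by the `--use_openvino --build_shared_lib` builds shown above is to query the available execution providers from Python. This is a minimal sketch, not part of the change itself; `model.onnx` and the provider preference list are illustrative placeholders.

```python
# Minimal sketch: confirm an OpenVINO-enabled build exposes the expected execution providers.
# Assumes the locally built Python wheel is installed; "model.onnx" is a placeholder model path.
import onnxruntime as ort

print(ort.get_available_providers())
# Expected to include 'OpenVINOExecutionProvider' alongside 'CPUExecutionProvider'.

# Ask for OpenVINO first and fall back to the default CPU provider if it is unavailable.
session = ort.InferenceSession(
    "model.onnx",
    providers=["OpenVINOExecutionProvider", "CPUExecutionProvider"],
)
print(session.get_providers())
```
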
-[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20CPU%20CI%20Pipeline?label=Windows+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=9) -[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20CI%20Pipeline?label=Windows+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=10) -[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20CI%20Pipeline?label=Linux+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=11) -[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20CI%20Pipeline?label=Linux+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12) -[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/MacOS%20CI%20Pipeline?label=MacOS+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=13) -[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86) -[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84) +**ONNX Runtime** is a cross-platform **inference and training machine-learning accelerator** compatible with deep learning frameworks, PyTorch and TensorFlow/Keras, as well as classical machine learning libraries such as scikit-learn, and more. -**ONNX Runtime** is a cross-platform **inferencing and training accelerator** compatible with many popular ML/DNN frameworks, including PyTorch, TensorFlow/Keras, scikit-learn, and more. **[aka.ms/onnxruntime](https://aka.ms/onnxruntime)** +ONNX Runtime uses the portable [ONNX](https://onnx.ai) computation graph format, backed by execution providers optimized for operating systems, drivers and hardware. +Common use cases for ONNX Runtime: -Many users can benefit from ONNX Runtime, including those looking to: * Improve inference performance for a wide variety of ML models * Reduce time and cost of training large models * Train in Python but deploy into a C#/C++/Java app -* Run on different hardware and operating systems +* Run with optimized performance on different hardware and operating systems * Support models created in several different frameworks -[ONNX Runtime inferencing](./onnxruntime) APIs are stable and production-ready since the [1.0 release](https://github.com/microsoft/onnxruntime/releases/tag/v1.0.0) in October 2019 and can enable faster customer experiences and lower costs. +[ONNX Runtime inference](https://www.onnxruntime.ai/docs/get-started/inference.html) APIs are stable and production-ready since the [1.0 release](https://github.com/microsoft/onnxruntime/releases/tag/v1.0.0) in October 2019 and can enable faster customer experiences and lower costs. -[ONNX Runtime training](./orttraining) feature was introduced in May 2020 in preview. This feature supports acceleration of PyTorch training on multi-node NVIDIA GPUs for transformer models. Additional updates for this feature are coming soon. +[ONNX Runtime training](https://www.onnxruntime.ai/docs/get-started/training.html) feature was introduced in May 2020 in preview. 
This feature supports acceleration of PyTorch training on multi-node NVIDIA GPUs for transformer models. Additional updates for this feature are coming soon. -*** +## Get Started -# Table of Contents +**http://onnxruntime.ai/** +* [Install](https://www.onnxruntime.ai/docs/get-started/install.html) +* [Inference](https://www.onnxruntime.ai/docs/get-started/inference.html) +* [Training](https://www.onnxruntime.ai/docs/get-started/training.html) +* [Documentation](https://www.onnxruntime.ai/docs/) +* [Samples and Tutorials](https://www.onnxruntime.ai/docs/tutorials/) +* [Frequently Asked Questions](./docs/FAQ.md) -* **[Get Started](#get-started)** - * [ONNX Runtime Inferencing](#inferencing-start) - * [ONNX Runtime Training](#training-start) -* **[Data/Telemetry](#DataTelemetry)** -* **[Contributions and Feedback](#contributions-and-feedback)** -* **[License](#license)** - -*** - -# Get Started - -[Frequently Asked Questions](./docs/FAQ.md) - -## Inferencing: Start - -To use ONNX Runtime, refer to the table on [aka.ms/onnxruntime](https://aka.ms/onnxruntime) for instructions for different build combinations. - -* [Compatibility](#compatibility) -* [Binaries](#binaries) -* [Build from source (includes additional combinations)](#build-from-source) -* [Docker images](#docker-images) -* [API documentation](#api-documentation) -* [Hardware accelerators](#supported-accelerators) -* [Deploy ONNX Runtime inferencing](#deploying-onnx-runtime) -* [Samples](./samples) -* [High level architectural design](docs/InferenceHighLevelDesign.md) -* [Performance Tuning](./docs/ONNX_Runtime_Perf_Tuning.md) -* [Extensibility: Add a new graph transform](include/onnxruntime/core/optimizer/graph_transformer.h) -* [Extensibility: Add a new rewrite rule](include/onnxruntime/core/optimizer/rewrite_rule.h) - -### Compatibility - -Supporting models based on the standard [ONNX](https://onnx.ai) format, the runtime is compatible with PyTorch, scikit-learn, TensorFlow, Keras, and all other frameworks and tools that support the interoperable format. - -* [Getting ONNX models - tutorials](https://github.com/onnx/tutorials#getting-onnx-models) - -ONNX Runtime is up to date and backwards compatible with all operators (both DNN and traditional ML) since ONNX v1.2.1+. [(ONNX compatibility details)](docs/Versioning.md). Newer versions of ONNX Runtime support all models that worked with prior versions, so updates should not break integrations. - -* [Supported operators/types](./docs/OperatorKernels.md) - * *Operators not supported in the current ONNX spec may be available as a [Contrib Operator](./docs/ContribOperators.md)* -* [Extensibility: Add a custom operator/kernel](docs/AddingCustomOp.md) - -### Binaries - -Official builds are available on PyPi (Python), Nuget (C#/C/C++), Maven Central (Java), and npm (node.js). - -* Default CPU Provider (Eigen + MLAS) -* GPU Provider - NVIDIA CUDA -* GPU Provider - DirectML (Windows) - * *On Windows, the [DirectML execution provider](./docs/execution_providers/DirectML-ExecutionProvider.md) is recommended for optimal performance and compatibility with a broad set of GPUs.* - -Dev builds created from the master branch are available for testing newer changes between official releases. Please use these at your own risk. We strongly advise against deploying these to production workloads as support is limited for dev builds. - -|Repository|Details| -|---|---| -|Pypi (Python)|*If using pip, run `pip install --upgrade pip` prior to downloading.*
<br>CPU: [**onnxruntime**](https://pypi.org/project/onnxruntime) / [ort-nightly (dev)](https://test.pypi.org/project/ort-nightly)
<br>GPU: [**onnxruntime-gpu**](https://pypi.org/project/onnxruntime-gpu) / [ort-gpu-nightly (dev)](https://test.pypi.org/project/ort-gpu-nightly)| -|Nuget (C#/C/C++)|CPU: [**Microsoft.ML.OnnxRuntime**](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime) / [ort-nightly (dev)](https://aiinfra.visualstudio.com/PublicPackages/_packaging?_a=feed&feed=ORT-Nightly)
<br>GPU: [**Microsoft.ML.OnnxRuntime.Gpu**](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.gpu) / [ort-nightly (dev)](https://aiinfra.visualstudio.com/PublicPackages/_packaging?_a=feed&feed=ORT-Nightly)|
<br>GPU: [**com.microsoft.onnxruntime/onnxruntime_gpu**](https://search.maven.org/artifact/com.microsoft.onnxruntime/onnxruntime_gpu)|
*These are not maintained by the core ONNX Runtime team and may have limited support; use at your discretion.*| - - -#### System Requirements - -The following are required for usage of the official published packages. - -* Visual C++ Runtime (for Windows packages) - * Requires [Visual C++ 2019 runtime](https://support.microsoft.com/en-us/help/2977003/the-latest-supported-visual-c-downloads) -* System language - * Installation of the **English language package** and configuring `en_US.UTF-8 locale` is required, as certain operators makes use of system locales. - * For Ubuntu, install [language-pack-en package](https://packages.ubuntu.com/search?keywords=language-pack-en) - * Run the following commands: - `locale-gen en_US.UTF-8` - `update-locale LANG=en_US.UTF-8` - * Follow similar procedure to configure other locales on other platforms. - -* Default CPU - * ONNX Runtime binaries in the CPU packages use OpenMP and depend on the library being available at runtime in the system. - * For Windows, **OpenMP** support comes as part of VC runtime. It is also available as redist packages: - [vc_redist.x64.exe](https://aka.ms/vs/16/release/vc_redist.x64.exe) and [vc_redist.x86.exe](https://aka.ms/vs/16/release/vc_redist.x86.exe) - * For Linux, the system must have **libgomp.so.1** which can be installed using `apt-get install libgomp1`. - * For Mac OS X, the system must have **libomp.dylib** which can be installed using `brew install libomp`. - -* Default GPU (CUDA) - * The default GPU build requires CUDA runtime libraries being installed on the system: - * Version: **CUDA 10.2** and **cuDNN 8.0.3** - * Version dependencies from older ONNX Runtime releases can be found in [prior release notes](https://github.com/microsoft/onnxruntime/releases). - -### Build from Source - -For production scenarios, it's strongly recommended to build only from an [official release branch](https://github.com/microsoft/onnxruntime/releases). - -* [Instructions for additional build flavors](./BUILD.md) - -### Docker Images - -* [ONNX-Ecosystem](https://github.com/onnx/onnx-docker/tree/master/onnx-ecosystem): includes ONNX Runtime (CPU, Python), dependencies, tools to convert from various frameworks, and Jupyter notebooks to help get started -* [Additional dockerfiles](./dockerfiles) - -### API Documentation - -|API|Supported Versions|Samples| -|---|---|---| -[Python](https://aka.ms/onnxruntime-python)| 3.6, 3.7, 3.8, 3.9 (3.8/3.9 excludes Win GPU and Linux ARM)
[Python Dev Notes](./docs/Python_Dev_Notes.md)| [Samples](./samples#python)| -|[C#](docs/CSharp_API.md)| | [Samples](./samples#C)| -|[C++](./include/onnxruntime/core/session/onnxruntime_cxx_api.h)| |[Samples](./samples#CC)| -|[C](docs/C_API.md)| | [Samples](./samples#CC)| -|[WinRT](docs/WinRT_API.md) | [Windows.AI.MachineLearning](https://docs.microsoft.com/en-us/windows/ai/windows-ml/api-reference)| [Samples](https://github.com/microsoft/windows-Machine-Learning)| -|[Java](docs/Java_API.md)|8+|[Samples](./samples#Java)| -[Ruby](https://github.com/ankane/onnxruntime) (external project)| 2.4-2.7| [Samples](https://ankane.org/tensorflow-ruby)| -|[Javascript (node.js)](./nodejs) |12.x | [Samples](./samples/nodejs) | - -### Supported Accelerators - -[Execution Providers](./docs/execution_providers) - -|CPU|GPU|IoT/Edge/Mobile|Other| +## Build Pipeline Status +|System|CPU|GPU|EPs| |---|---|---|---| -|
  • Default CPU - *MLAS (Microsoft Linear Algebra Subprograms) + Eigen*
  • [Intel DNNL](./docs/execution_providers/DNNL-ExecutionProvider.md)
  • Intel MKL-ML *(build option)*
|
  • NVIDIA CUDA
  • [NVIDIA TensorRT](./docs/execution_providers/TensorRT-ExecutionProvider.md)
  • [DirectML](./docs/execution_providers/DirectML-ExecutionProvider.md)
  • [AMD MIGraphX](./docs/execution_providers/MIGraphX-ExecutionProvider.md) (*preview*)
|
  • [Intel OpenVINO](./docs/execution_providers/OpenVINO-ExecutionProvider.md)
  • [ARM Compute Library](./docs/execution_providers/ACL-ExecutionProvider.md) (*preview*)
  • [Android Neural Networks API](./docs/execution_providers/NNAPI-ExecutionProvider.md) (*preview*)
  • [ARM-NN](./docs/execution_providers/ArmNN-ExecutionProvider.md) (*preview*)
  • [Rockchip NPU](./docs/execution_providers/RKNPU-ExecutionProvider.md) (*preview*)
|
  • [Nuphar Model Compiler](./docs/execution_providers/Nuphar-ExecutionProvider.md) - (*preview*)
  • [Xilinx Vitis-AI](./docs/execution_providers/Vitis-AI-ExecutionProvider.md) (*preview*)
| - -* [Roadmap: Upcoming accelerators](./docs/Roadmap.md#accelerators-and-execution-providers) -* [Extensibility: Add an execution provider](docs/AddingExecutionProvider.md) - -### Deploying ONNX Runtime - -#### Cloud - -* ONNX Runtime can be deployed to any cloud for model inferencing, including [Azure Machine Learning Services](https://azure.microsoft.com/en-us/services/machine-learning-service). - * [Detailed instructions](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-build-deploy-onnx) - * [AzureML sample notebooks](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/deployment/onnx) - -* **ONNX Runtime Server (beta)** is a hosting application for serving ONNX models using ONNX Runtime, providing a REST API for prediction. - * [Usage details](./docs/ONNX_Runtime_Server_Usage.md) - * [Image installation instructions](./dockerfiles#onnx-runtime-server-preview) - -#### IoT and edge devices - -* [Reference implementations](https://github.com/Azure-Samples/onnxruntime-iot-edge) - -The expanding focus and selection of IoT devices with sensors and consistent signal streams introduces new opportunities to move AI workloads to the edge. -This is particularly important when there are massive volumes of incoming data/signals that may not be efficient or useful to push to the cloud due to storage or latency considerations. Consider: surveillance tapes where 99% of footage is uneventful, or real-time person detection scenarios where immediate action is required. In these scenarios, directly executing model inferencing on the target device is crucial for optimal assistance. - -#### Client applications - -* Install or build the package you need to use in your application. ([sample implementations](https://github.com/microsoft/onnxruntime/tree/master/samples/c_cxx) using the C++ API) - -* On newer Windows 10 devices (1809+), ONNX Runtime is available by default as part of the OS and is accessible via the [Windows Machine Learning APIs](https://docs.microsoft.com/en-us/windows/ai/windows-ml/). ([Tutorials for Windows Desktop or UWP app](https://docs.microsoft.com/en-us/windows/ai/windows-ml/get-started-desktop)) - -*** +|Windows|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20CPU%20CI%20Pipeline?label=Windows+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=9)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20CI%20Pipeline?label=Windows+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=10)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20TensorRT%20CI%20Pipeline?label=Windows+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=47)| +|Linux|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20CI%20Pipeline?label=Linux+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=11)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20Minimal%20Build%20E2E%20CI%20Pipeline?label=Linux+CPU+Minimal+Build)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=64)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20x64%20NoContribops%20CI%20Pipeline?label=Linux+CPU+x64+No+Contrib+Ops)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=110)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/centos7_cpu?label=Linux+CentOS7)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=78)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20CI%20Pipeline?label=Linux+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20TensorRT%20CI%20Pipeline?label=Linux+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=45)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-distributed?label=Distributed+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=140)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20NUPHAR%20CI%20Pipeline?label=Linux+NUPHAR)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=110)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20OpenVINO%20CI%20Pipeline%20v2?label=Linux+OpenVINO)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=108)| +|Mac|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/MacOS%20CI%20Pipeline?label=MacOS+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=13)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/MacOS%20NoContribops%20CI%20Pipeline?label=MacOS+NoContribops)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=65)||| +|Android|||[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Android%20CI%20Pipeline?label=Android)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=53)| +|iOS|||[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/iOS%20CI%20Pipeline?label=iOS)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=134)| -## Training: Start -The ONNX Runtime training feature enables easy integration with existing Pytorch trainer code to accelerate the exection. With a few lines of code, you can add ONNX Runtime into your existing training scripts and start seeing acceleration. The current preview version supports training acceleration for transformer models on NVIDIA GPUs. - -**[ONNX Runtime pre-training sample](https://github.com/microsoft/onnxruntime-training-examples)**: This sample is setup to pre-train the BERT-Large model to show how ONNX Runtime training can be used to accelerate training execution. - -### Train PyTorch model with ONNX Runtime -ONNX Runtime (ORT) has the capability to train existing PyTorch models through its optimized backend. For this, we have introduced an python API for PyTorch, called ORTTrainer, which can be used to switch the training backend for PyTorch models (instance of `torch.nn.Module`) to `orttrainer`. This requires some changes in the trainer code, such as replacing the PyTorch optimizer, and optionally, setting flags to enable additional features such as mixed-precision training. Here is a sample code fragment to integrate ONNX Runtime Training in your PyTorch pre-training script: - -_NOTE: The current API is experimental and expected to see significant changes in the near future. Our goal is to improve the interface to provide a seamless integration with PyTorch training that requires minimal changes in users’ training code._ - - ```python - import torch - ... - import onnxruntime - from onnxruntime.training import ORTTrainer, optim - - # Model definition - class NeuralNet(torch.nn.Module): - def __init__(self, input_size, hidden_size, num_classes): - ... - def forward(self, data): - ... - - model = NeuralNet(input_size=784, hidden_size=500, num_classes=10) - criterion = torch.nn.Functional.cross_entropy - model_description = {'inputs': [('data', ['in', 'batch_size']), - ('target', ['label_x_batch_size'])], - 'outputs': [('loss', [], True), - ('output', ['out', 'batch_size'])]} - - optimizer_config = optim.AdamConfig(lr=learning_rate) - - trainer = ORTTrainer(model, # model - model_description, # model description - optimizer_config, # optimizer configuration - criterion) # loss function - - # Training Loop - for t in range(1000): - # forward + backward + weight update - loss, y_pred = trainer.train_step(input_data, target_labels, learning_rate) - total_loss += loss.item() - ... - ``` - -### Build ONNX Runtime Training from source -To use ONNX Runtime training in a custom environment, like on-prem NVIDIA DGX-2 clusters, you can use these [build instructions](BUILD.md#training) to generate the Python package to integrate into existing trainer code. - - - -# Data/Telemetry +## Data/Telemetry This project may collect usage data and send it to Microsoft to help improve our products and services. 
See the [privacy statement](docs/Privacy.md) for more details. -# Contributions and Feedback +## Contributions and Feedback We welcome contributions! Please see the [contribution guidelines](CONTRIBUTING.md). -For any feedback or to report a bug, please file a [GitHub Issue](https://github.com/Microsoft/onnxruntime/issues). +For feature requests or bug reports, please file a [GitHub Issue](https://github.com/Microsoft/onnxruntime/issues). + +For general discussion or questions, please use [Github Discussions](https://github.com/microsoft/onnxruntime/discussions). ## Code of Conduct @@ -244,6 +55,6 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -# License +## License This project is licensed under the [MIT License](LICENSE). diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 176da7f186ca5..50a692aa24f67 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -2168,38 +2168,6 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h _____ -jemalloc - -Unless otherwise specified, files in the jemalloc source distribution are -subject to the following license: --------------------------------------------------------------------------------- -Copyright (C) 2002-2018 Jason Evans . -All rights reserved. -Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved. -Copyright (C) 2009-2018 Facebook, Inc. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright notice(s), - this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice(s), - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS -OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT, -INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE -OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
--------------------------------------------------------------------------------- - -_____ - FreeBSD: getopt.c file Copyright (c) 1987, 1993, 1994 diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json index fde4e5ef97854..e12b046af2752 100644 --- a/cgmanifests/cgmanifest.json +++ b/cgmanifests/cgmanifest.json @@ -45,15 +45,6 @@ } } }, - { - "component": { - "type": "git", - "git": { - "commitHash": "e02b83cc5e3c4d30f93dba945162e3aa58d962d6", - "repositoryUrl": "https://github.com/jemalloc/jemalloc.git" - } - } - }, { "component": { "type": "git", diff --git a/cgmanifests/submodules/cgmanifest.json b/cgmanifests/submodules/cgmanifest.json index 4d61c1f44a0b8..3f894e468903a 100644 --- a/cgmanifests/submodules/cgmanifest.json +++ b/cgmanifests/submodules/cgmanifest.json @@ -242,7 +242,7 @@ "component": { "type": "git", "git": { - "commitHash": "174de7d086a768cba29374a56a7461eff87cfdb3", + "commitHash": "237926eab41de21fb9addc4b03b751fd6a3343ec", "repositoryUrl": "https://github.com/onnx/onnx" }, "comments": "git submodule at cmake/external/onnx" diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index c031fc6830cf7..a07893771c5b7 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -59,7 +59,6 @@ option(onnxruntime_ENABLE_MEMLEAK_CHECKER "Experimental: Enable memory leak chec option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF) option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) -option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON) option(onnxruntime_USE_COREML "Build with CoreML support" OFF) option(onnxruntime_USE_NNAPI_BUILTIN "Build with builtin NNAPI lib for Android NNAPI support" OFF) option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) @@ -124,6 +123,7 @@ option(onnxruntime_DISABLE_RTTI "Disable RTTI" OFF) option(onnxruntime_DISABLE_EXCEPTIONS "Disable exception handling. Requires onnxruntime_MINIMAL_BUILD currently." OFF) option(onnxruntime_MINIMAL_BUILD "Exclude as much as possible from the build. Support ORT format models. No support for ONNX format models." OFF) option(onnxruntime_EXTENDED_MINIMAL_BUILD "onnxruntime_MINIMAL_BUILD with support for execution providers that compile kernels." OFF) +option(onnxruntime_MINIMAL_BUILD_CUSTOM_OPS "Add custom operator kernels support to a minimal build." OFF) option(onnxruntime_REDUCED_OPS_BUILD "Reduced set of kernels are registered in build via modification of the kernel registration source files." OFF) option(onnxruntime_DISABLE_ORT_FORMAT_LOAD "Disable loading an ORT format model when onnxruntime_MINIMAL_BUILD=OFF (i.e. in a full build)." 
OFF) @@ -161,6 +161,10 @@ if(onnxruntime_USE_VALGRIND AND NOT WIN32) add_definitions(-DRE2_ON_VALGRIND=1) endif() +if(WIN32) + string(APPEND CMAKE_CXX_FLAGS " /W3") +endif() + if (onnxruntime_ENABLE_NVTX_PROFILE) add_definitions(-DENABLE_NVTX_PROFILE=1) endif() @@ -233,6 +237,10 @@ if (onnxruntime_MINIMAL_BUILD) add_compile_definitions(ORT_EXTENDED_MINIMAL_BUILD) endif() + if (onnxruntime_MINIMAL_BUILD_CUSTOM_OPS) + add_compile_definitions(ORT_MINIMAL_BUILD_CUSTOM_OPS) + endif() + set(onnxruntime_REDUCED_OPS_BUILD ON) if (NOT onnxruntime_ENABLE_PYTHON) @@ -643,18 +651,6 @@ if (onnxruntime_USE_CUDA AND "${onnxruntime_CUDNN_HOME}" STREQUAL "") message(FATAL_ERROR "onnxruntime_CUDNN_HOME required for onnxruntime_USE_CUDA") endif() -if (onnxruntime_USE_EIGEN_FOR_BLAS) - add_definitions(-DUSE_EIGEN_FOR_BLAS) -endif() - -if (onnxruntime_USE_OPENBLAS AND "${onnxruntime_OPENBLAS_HOME}" STREQUAL "" AND WIN32) - # On linux we assume blas is installed via 'apt-get install libopenblas-dev' - message(FATAL_ERROR "onnxruntime_OPENBLAS_HOME required for onnxruntime_USE_OPENBLAS") -endif() - -if (onnxruntime_USE_OPENBLAS AND onnxruntime_USE_EIGEN_FOR_BLAS) - message(FATAL_ERROR "use one of onnxruntime_USE_OPENBLAS, onnxruntime_USE_EIGEN_FOR_BLAS") -endif() get_filename_component(ONNXRUNTIME_ROOT "${ONNXRUNTIME_ROOT}" ABSOLUTE) get_filename_component(ORTTRAINING_ROOT "${ORTTRAINING_ROOT}" ABSOLUTE) @@ -701,11 +697,81 @@ set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} -DGSL_UNENFORCED_O include(eigen) -#onnxruntime_EXTERNAL_LIBRARIES could contain onnx, onnx_proto,libprotobuf, cuda/cudnn, jemalloc, -# dnnl/mklml, openblas, onnxruntime_codegen_tvm, tvm, nnvm_compiler and pthread +#onnxruntime_EXTERNAL_LIBRARIES could contain onnx, onnx_proto,libprotobuf, cuda/cudnn, +# dnnl/mklml, onnxruntime_codegen_tvm, tvm, nnvm_compiler and pthread # pthread is always at the last set(onnxruntime_EXTERNAL_LIBRARIES onnx onnx_proto ${PROTOBUF_LIB} re2::re2) + +set(onnxruntime_LINK_DIRS ) +if(onnxruntime_USE_CUDA) + #TODO: combine onnxruntime_CUDNN_HOME and onnxruntime_CUDA_HOME, assume they are the same + if (WIN32) + list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib/x64 ${onnxruntime_CUDA_HOME}/x64/lib64) + else() + list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib64 ${onnxruntime_CUDA_HOME}/lib64) + endif() +endif() + +function(onnxruntime_add_shared_library target_name) + add_library(${target_name} SHARED ${ARGN}) + target_link_directories(${target_name} PRIVATE ${onnxruntime_LINK_DIRS}) + if (MSVC) + target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /utf-8>" "$<$>:/utf-8>") + target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /sdl>" "$<$>:/sdl>") + set_target_properties(${target_name} PROPERTIES VS_CA_EXCLUDE_PATH "${CMAKE_CURRENT_SOURCE_DIR}") + else() + target_compile_definitions(${target_name} PUBLIC -DNSYNC_ATOMIC_CPP11) + target_include_directories(${target_name} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/external/nsync/public") + endif() + target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) + if(onnxruntime_ENABLE_LTO) + set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE) + set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO TRUE) + set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL TRUE) + endif() +endfunction() + +#For plugins that are not linked into other targets 
but may be loaded dynamically at runtime using dlopen-like functionality. +function(onnxruntime_add_shared_library_module target_name) + add_library(${target_name} MODULE ${ARGN}) + target_link_directories(${target_name} PRIVATE ${onnxruntime_LINK_DIRS}) + if (MSVC) + target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /utf-8>" "$<$>:/utf-8>") + target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /sdl>" "$<$>:/sdl>") + set_target_properties(${target_name} PROPERTIES VS_CA_EXCLUDE_PATH "${CMAKE_CURRENT_SOURCE_DIR}") + else() + target_compile_definitions(${target_name} PUBLIC -DNSYNC_ATOMIC_CPP11) + target_include_directories(${target_name} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/external/nsync/public") + endif() + target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) + if(onnxruntime_ENABLE_LTO) + set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE) + set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO TRUE) + set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL TRUE) + endif() +endfunction() + +#almost the same as the above function, except the first line of the body +function(onnxruntime_add_executable target_name) + add_executable(${target_name} ${ARGN}) + target_link_directories(${target_name} PRIVATE ${onnxruntime_LINK_DIRS}) + if (MSVC) + target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /utf-8>" "$<$>:/utf-8>") + target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /sdl>" "$<$>:/sdl>") + set_target_properties(${target_name} PROPERTIES VS_CA_EXCLUDE_PATH "${CMAKE_CURRENT_SOURCE_DIR}") + else() + target_compile_definitions(${target_name} PUBLIC -DNSYNC_ATOMIC_CPP11) + target_include_directories(${target_name} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/external/nsync/public") + endif() + target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) + if(onnxruntime_ENABLE_LTO) + set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE) + set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO TRUE) + set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL TRUE) + endif() +endfunction() + function(onnxruntime_add_include_to_target dst_target) foreach(src_target ${ARGN}) target_include_directories(${dst_target} PRIVATE $) @@ -890,6 +956,7 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Android" AND Onnxruntime_GCOV_COVERAGE) string(APPEND CMAKE_C_FLAGS " -g -O0 --coverage ") endif() +set(ORT_WARNING_FLAGS ) #Adjust warning flags if (WIN32) add_definitions(-DPLATFORM_WINDOWS -DNOGDI -DNOMINMAX -D_USE_MATH_DEFINES) @@ -898,27 +965,30 @@ if (WIN32) endif() # parallel build # These compiler opitions cannot be forwarded to NVCC, so cannot use add_compiler_options - string(APPEND CMAKE_CXX_FLAGS " /MP /W4") - string(APPEND CMAKE_C_FLAGS " /MP /W4") + string(APPEND CMAKE_CXX_FLAGS " /MP") + if(onnxruntime_USE_CUDA) + list(APPEND ORT_WARNING_FLAGS "/W3") + else() + list(APPEND ORT_WARNING_FLAGS "/W4") + endif() #Compiler bug, we should get such warnings. 
It will be fixed in a new VC release - string(APPEND CMAKE_CXX_FLAGS " /wd4127") + list(APPEND ORT_WARNING_FLAGS "/wd4127") # class needs to have dll-interface to be used by clients - string(APPEND CMAKE_CXX_FLAGS " /wd4251") + list(APPEND ORT_WARNING_FLAGS "/wd4251") # issued by thrust nonstandard extension used: nameless struct/union - string(APPEND CMAKE_CXX_FLAGS " /wd4201") + list(APPEND ORT_WARNING_FLAGS "/wd4201") if (onnxruntime_ENABLE_STATIC_ANALYSIS) - string(APPEND CMAKE_CXX_FLAGS - " /analyze:stacksize 131072" - # disable warning because there are many occurrences from test macros - " /wd6326 " # potential comparison of a constant with another constant - ) + list(APPEND ORT_WARNING_FLAGS "/analyze:stacksize 131072") + list(APPEND ORT_WARNING_FLAGS "/wd6326") # potential comparison of a constant with another constant + if(onnxruntime_USE_OPENMP) + list(APPEND ORT_WARNING_FLAGS "/wd6993") # Code analysis ignores OpenMP constructs + endif() endif() - # Treat warning as error if onnxruntime_DEV_MODE is ON # For cross-compiled ARM64 binaries, there are too many warnings to fix, hence ignore warnings for now if (onnxruntime_DEV_MODE AND NOT CMAKE_CROSSCOMPILING) # treat warnings as errors - string(APPEND CMAKE_CXX_FLAGS " /WX") + list(APPEND ORT_WARNING_FLAGS "/WX") foreach(type EXE STATIC SHARED) set(CMAKE_${type}_LINKER_FLAGS "${CMAKE_${type}_LINKER_FLAGS} /WX") endforeach() @@ -946,6 +1016,10 @@ if (WIN32) endif() endforeach() endif() + foreach(ORT_FLAG ${ORT_WARNING_FLAGS}) + string(APPEND CMAKE_CXX_FLAGS " ${ORT_FLAG}") + string(APPEND CMAKE_C_FLAGS " ${ORT_FLAG}") + endforeach() else() add_definitions(-DPLATFORM_POSIX) # Enable warning and enable space optimization in Linux @@ -991,19 +1065,6 @@ else() endif() endif() set(onnxruntime_DELAYLOAD_FLAGS "") -if (onnxruntime_USE_JEMALLOC) - if (onnxruntime_USE_MIMALLOC_STL_ALLOCATOR OR onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR) - message( FATAL_ERROR "You cannot specify both jemalloc and mimalloc." ) - endif() - - if (Win32) - message( FATAL_ERROR "Jemalloc is not supported on Windows." 
) - endif() - include(jemalloc) - add_definitions(-DUSE_JEMALLOC=1) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${JEMALLOC_STATIC_LIBRARIES}) - list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES jemalloc) -endif() include_directories( ${ONNXRUNTIME_INCLUDE_DIR} @@ -1087,17 +1148,6 @@ if (onnxruntime_USE_VITISAI) endif() endif() -if (onnxruntime_USE_OPENBLAS) - add_definitions(-DUSE_OPENBLAS=1) - if (WIN32) - include_directories(${onnxruntime_OPENBLAS_HOME}) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_OPENBLAS_HOME}/lib/libopenblas.lib) - else() - # on linux we assume blas is installed via 'apt-get install libopenblas-dev' - list(APPEND onnxruntime_EXTERNAL_LIBRARIES openblas) - endif() -endif() - configure_file(onnxruntime_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime_config.h) if (onnxruntime_USE_CUDA) @@ -1129,25 +1179,6 @@ if (onnxruntime_USE_CUDA) list(APPEND ONNXRUNTIME_CUDA_LIBRARIES cublas cudnn curand cufft) endif() - # CUDA_HOME and CUDNN_HOME may differ, so need to add both to the link directories - if (WIN32) - link_directories(${onnxruntime_CUDA_HOME}/lib/x64) - link_directories(${onnxruntime_CUDNN_HOME}/lib/x64) - - # delayload causes crash on exit, so disable for now - # please update cudaDelayLoadedLibs in Microsoft.ML.OnnxRuntime/SessionOptions.cs if you change delayload - #file(GLOB cuda_dll_paths "${onnxruntime_CUDA_HOME}/bin/cublas64_*" "${onnxruntime_CUDA_HOME}/bin/cudart64_*" "${onnxruntime_CUDA_HOME}/bin/curand64_*" "${onnxruntime_CUDA_HOME}/bin/cufft64_*") - #set(onnxruntime_DELAYLOAD_FLAGS "${onnxruntime_DELAYLOAD_FLAGS} /DELAYLOAD:cudnn64_8.dll") - #foreach(cuda_dll_path ${cuda_dll_paths}) - # get_filename_component(cuda_dll_file_name ${cuda_dll_path} NAME) - # set(onnxruntime_DELAYLOAD_FLAGS "${onnxruntime_DELAYLOAD_FLAGS} /DELAYLOAD:${cuda_dll_file_name}") - #endforeach(cuda_dll_path) - - else() - link_directories(${onnxruntime_CUDA_HOME}/lib64) - link_directories(${onnxruntime_CUDNN_HOME}/lib64) - endif() - list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${ONNXRUNTIME_CUDA_LIBRARIES}) if(NOT CMAKE_CUDA_ARCHITECTURES) if(CMAKE_LIBRARY_ARCHITECTURE STREQUAL "aarch64-linux-gnu") @@ -1172,9 +1203,12 @@ if (onnxruntime_USE_CUDA) endif() endif() endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream legacy") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") + if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror default-stream-launch") + endif() if (NOT WIN32) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --expt-relaxed-constexpr --compiler-options -fPIC") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --compiler-options -fPIC") endif() # Options passed to cudafe set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=bad_friend_decl\"") @@ -1368,6 +1402,7 @@ foreach(target_name onnxruntime_common onnxruntime_graph onnxruntime_framework o if (MSVC) target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /utf-8>" "$<$>:/utf-8>") target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /sdl>" "$<$>:/sdl>") + set_target_properties(${target_name} PROPERTIES VS_CA_EXCLUDE_PATH "${CMAKE_CURRENT_SOURCE_DIR}") else() target_compile_definitions(${target_name} PUBLIC -DNSYNC_ATOMIC_CPP11) target_include_directories(${target_name} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/external/nsync/public") diff --git a/cmake/ConfigureVisualStudioCodeAnalysis.props b/cmake/ConfigureVisualStudioCodeAnalysis.props deleted file 
mode 100644 index bad2a87b930e7..0000000000000 --- a/cmake/ConfigureVisualStudioCodeAnalysis.props +++ /dev/null @@ -1,14 +0,0 @@ - - - - true - NativeRecommendedRules.ruleset - false - - $(SolutionDir);$(SolutionDir)..\..\..\cmake; - - diff --git a/cmake/EnableVisualStudioCodeAnalysis.props b/cmake/EnableVisualStudioCodeAnalysis.props deleted file mode 100644 index 9da9f0be46560..0000000000000 --- a/cmake/EnableVisualStudioCodeAnalysis.props +++ /dev/null @@ -1,24 +0,0 @@ - - - - true - NativeRecommendedRules.ruleset - - $(SolutionDir);$(MSBuildThisFileDirectory) - - - true - - - diff --git a/cmake/external/dml.cmake b/cmake/external/dml.cmake index 49a2d7a1448da..7ccb40a21da18 100644 --- a/cmake/external/dml.cmake +++ b/cmake/external/dml.cmake @@ -20,7 +20,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML) set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config) set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config) get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE) - set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.4.0) + set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.4.1) set(DML_SHARED_LIB DirectML.dll) # Restore nuget packages, which will pull down the DirectML redist package diff --git a/cmake/external/onnx b/cmake/external/onnx index 174de7d086a76..237926eab41de 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit 174de7d086a768cba29374a56a7461eff87cfdb3 +Subproject commit 237926eab41de21fb9addc4b03b751fd6a3343ec diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 64a4b6b64aea3..3b283767c589c 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -33,16 +33,15 @@ add_custom_command(OUTPUT ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_s add_custom_target(onnxruntime_generate_def ALL DEPENDS ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c) if(WIN32) - add_library(onnxruntime SHARED + onnxruntime_add_shared_library(onnxruntime ${SYMBOL_FILE} "${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc" "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc" ) else() - add_library(onnxruntime SHARED ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c) + onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c) endif() -set_target_properties(onnxruntime PROPERTIES VERSION ${ORT_VERSION}) add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEPENDENCIES}) target_include_directories(onnxruntime PRIVATE ${ONNXRUNTIME_ROOT}) onnxruntime_add_include_to_target(onnxruntime) @@ -129,11 +128,10 @@ endif() set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS ${ONNXRUNTIME_SO_LINK_FLAG} ${onnxruntime_DELAYLOAD_FLAGS}) set_target_properties(onnxruntime PROPERTIES LINK_DEPENDS ${SYMBOL_FILE}) -if(onnxruntime_ENABLE_LTO) - set_target_properties(onnxruntime PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE) - set_target_properties(onnxruntime PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO TRUE) - set_target_properties(onnxruntime PROPERTIES INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL TRUE) -endif() + + +set_target_properties(onnxruntime PROPERTIES VERSION ${ORT_VERSION}) + install(TARGETS onnxruntime ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake index 06542f7437183..f3d9c119b1c98 100644 --- a/cmake/onnxruntime_framework.cmake +++ b/cmake/onnxruntime_framework.cmake @@ -14,6 +14,14 @@ if 
(onnxruntime_MINIMAL_BUILD) "${ONNXRUNTIME_ROOT}/core/framework/fallback_cpu_capability.cc" ) + # custom ops support must be explicitly enabled in a minimal build. exclude if not. + if (NOT onnxruntime_MINIMAL_BUILD_CUSTOM_OPS) + list(APPEND onnxruntime_framework_src_exclude + "${ONNXRUNTIME_INCLUDE_DIR}/core/framework/customregistry.h" + "${ONNXRUNTIME_ROOT}/core/framework/customregistry.cc" + ) + endif() + list(REMOVE_ITEM onnxruntime_framework_srcs ${onnxruntime_framework_src_exclude}) endif() @@ -48,8 +56,3 @@ endif() install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/framework DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core) -if (WIN32) - # Add Code Analysis properties to enable C++ Core checks. Have to do it via a props file include. - set_target_properties(onnxruntime_framework PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/ConfigureVisualStudioCodeAnalysis.props) -endif() - diff --git a/cmake/onnxruntime_fuzz_test.cmake b/cmake/onnxruntime_fuzz_test.cmake index 381a08573bac0..b7cd95cb8693f 100644 --- a/cmake/onnxruntime_fuzz_test.cmake +++ b/cmake/onnxruntime_fuzz_test.cmake @@ -36,7 +36,7 @@ if (onnxruntime_FUZZ_ENABLED) "${SEC_FUZZ_ROOT}/src/test.cpp") # compile the executables - add_executable(onnxruntime_security_fuzz ${SEC_FUZ_SRC}) + onnxruntime_add_executable(onnxruntime_security_fuzz ${SEC_FUZ_SRC}) # compile with c++17 target_compile_features(onnxruntime_security_fuzz PUBLIC cxx_std_17) diff --git a/cmake/onnxruntime_graph.cmake b/cmake/onnxruntime_graph.cmake index f2ff9f9afec8a..315a66edd1904 100644 --- a/cmake/onnxruntime_graph.cmake +++ b/cmake/onnxruntime_graph.cmake @@ -98,6 +98,10 @@ if (onnxruntime_ENABLE_TRAINING) source_group(TREE ${ORTTRAINING_ROOT} FILES ${orttraining_graph_src}) endif() +if (onnxruntime_BUILD_MS_EXPERIMENTAL_OPS) + target_compile_definitions(onnxruntime_graph PRIVATE BUILD_MS_EXPERIMENTAL_OPS=1) +endif() + if (WIN32) set(onnxruntime_graph_static_library_flags -IGNORE:4221 # LNK4221: This object file does not define any previously undefined public symbols, so it will not be used by any link operation that consumes this library diff --git a/cmake/onnxruntime_java.cmake b/cmake/onnxruntime_java.cmake index afc28602ea9bb..54eb81f83306f 100644 --- a/cmake/onnxruntime_java.cmake +++ b/cmake/onnxruntime_java.cmake @@ -160,8 +160,11 @@ endif() # On Windows TARGET_LINKER_FILE_NAME is the .lib, TARGET_FILE_NAME is the .dll if (WIN32) - add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink $ ${JAVA_PACKAGE_LIB_DIR}/$) - add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink $ ${JAVA_PACKAGE_JNI_DIR}/$) + #Our static analysis plugin set /p:LinkCompiled=false + if(NOT onnxruntime_ENABLE_STATIC_ANALYSIS) + add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink $ ${JAVA_PACKAGE_LIB_DIR}/$) + add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink $ ${JAVA_PACKAGE_JNI_DIR}/$) + endif() else() add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink $ ${JAVA_PACKAGE_LIB_DIR}/$) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink $ ${JAVA_PACKAGE_JNI_DIR}/$) diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 3c6cd8fbe3d21..a34b9f9f97d9a 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -171,6 +171,10 @@ 
if (MSVC) endif() onnxruntime_add_include_to_target(onnxruntime_providers onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf flatbuffers) +if (onnxruntime_BUILD_MS_EXPERIMENTAL_OPS) + target_compile_definitions(onnxruntime_providers PRIVATE BUILD_MS_EXPERIMENTAL_OPS=1) +endif() + if (onnxruntime_USE_FEATURIZERS) add_dependencies(onnxruntime_providers onnxruntime_featurizers) onnxruntime_add_include_to_target(onnxruntime_providers onnxruntime_featurizers) @@ -265,13 +269,25 @@ if (onnxruntime_USE_CUDA) endif() add_library(onnxruntime_providers_cuda ${onnxruntime_providers_cuda_src}) - + + #target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$:SHELL:-Xcompiler \"/analyze:stacksize 131072\">") + if (HAS_GUARD_CF) + target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$:SHELL:-Xcompiler /guard:cf>") + endif() + if (HAS_QSPECTRE) + target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$:SHELL:-Xcompiler /Qspectre>") + endif() + foreach(ORT_FLAG ${ORT_WARNING_FLAGS}) + target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$:SHELL:-Xcompiler \"${ORT_FLAG}\">") + endforeach() if (UNIX) target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$:SHELL:-Xcompiler -Wno-reorder>" "$<$>:-Wno-reorder>") target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$:SHELL:-Xcompiler -Wno-error=sign-compare>" "$<$>:-Wno-error=sign-compare>") else() + #mutex.cuh(91): warning C4834: discarding return value of function with 'nodiscard' attribute + target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$:SHELL:-Xcompiler /wd4834>") target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$:SHELL:-Xcompiler /wd4127>") endif() onnxruntime_add_include_to_target(onnxruntime_providers_cuda onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf flatbuffers) @@ -335,7 +351,7 @@ if (onnxruntime_USE_TENSORRT OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO ) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_shared_cc_srcs}) - add_library(onnxruntime_providers_shared SHARED ${onnxruntime_providers_shared_cc_srcs}) + onnxruntime_add_shared_library(onnxruntime_providers_shared ${onnxruntime_providers_shared_cc_srcs}) set_target_properties(onnxruntime_providers_shared PROPERTIES FOLDER "ONNXRuntime") set_target_properties(onnxruntime_providers_shared PROPERTIES LINKER_LANGUAGE CXX) @@ -366,7 +382,7 @@ if (onnxruntime_USE_DNNL) ) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_dnnl_cc_srcs}) - add_library(onnxruntime_providers_dnnl SHARED ${onnxruntime_providers_dnnl_cc_srcs}) + onnxruntime_add_shared_library_module(onnxruntime_providers_dnnl ${onnxruntime_providers_dnnl_cc_srcs}) target_link_directories(onnxruntime_providers_dnnl PRIVATE ${DNNL_LIB_DIR}) onnxruntime_add_include_to_target(onnxruntime_providers_dnnl onnxruntime_common onnx) # onnx needed for stl_backports.h add_dependencies(onnxruntime_providers_dnnl onnxruntime_providers_shared project_dnnl ${onnxruntime_EXTERNAL_DEPENDENCIES}) @@ -452,14 +468,9 @@ if (onnxruntime_USE_TENSORRT) ) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_tensorrt_cc_srcs}) - add_library(onnxruntime_providers_tensorrt SHARED ${onnxruntime_providers_tensorrt_cc_srcs}) + onnxruntime_add_shared_library_module(onnxruntime_providers_tensorrt ${onnxruntime_providers_tensorrt_cc_srcs}) onnxruntime_add_include_to_target(onnxruntime_providers_tensorrt onnxruntime_common onnx flatbuffers) 
add_dependencies(onnxruntime_providers_tensorrt onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) - if(WIN32) - target_link_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDA_HOME}/x64/lib64) - else() - target_link_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDA_HOME}/lib64) - endif() target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart onnxruntime_providers_shared protobuf::libprotobuf flatbuffers) target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found @@ -614,7 +625,7 @@ if (onnxruntime_USE_OPENVINO) endif() source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs}) - add_library(onnxruntime_providers_openvino SHARED ${onnxruntime_providers_openvino_cc_srcs}) + onnxruntime_add_shared_library_module(onnxruntime_providers_openvino ${onnxruntime_providers_openvino_cc_srcs}) onnxruntime_add_include_to_target(onnxruntime_providers_openvino onnxruntime_common onnx) install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/openvino DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers) set_target_properties(onnxruntime_providers_openvino PROPERTIES LINKER_LANGUAGE CXX) @@ -1064,7 +1075,8 @@ if (onnxruntime_USE_ROCM) if (HAS_NO_UNDEFINED_VAR_TEMPLATE) target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-undefined-var-template) endif() - target_include_directories(onnxruntime_providers_rocm PRIVATE ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/include/hipcub ${onnxruntime_ROCM_HOME}/include/hiprand ${onnxruntime_ROCM_HOME}/include/rocrand) + # During transition to separate hipFFT repo, put hipfft/include early + target_include_directories(onnxruntime_providers_rocm PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/include/hipcub ${onnxruntime_ROCM_HOME}/include/hiprand ${onnxruntime_ROCM_HOME}/include/rocrand) target_include_directories(onnxruntime_providers_rocm PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${MPI_INCLUDE_DIRS} ${SAFEINT_INCLUDE_DIR} ${ONNXRUNTIME_ROOT}/../cmake/external/eigen) if (onnxruntime_ENABLE_TRAINING) diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index a211cb4431efb..9a3fdba3aad48 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -47,7 +47,7 @@ file(GLOB onnxruntime_pybind_srcs CONFIGURE_DEPENDS ${onnxruntime_pybind_srcs_pattern} ) -add_library(onnxruntime_pybind11_state MODULE ${onnxruntime_pybind_srcs}) +onnxruntime_add_shared_library_module(onnxruntime_pybind11_state ${onnxruntime_pybind_srcs}) if(MSVC) target_compile_options(onnxruntime_pybind11_state PRIVATE "$<$:SHELL:--compiler-options /utf-8>" "$<$>:/utf-8>") endif() diff --git a/cmake/onnxruntime_training.cmake b/cmake/onnxruntime_training.cmake index 8776f9113270a..128b66d2fd875 100644 --- a/cmake/onnxruntime_training.cmake +++ b/cmake/onnxruntime_training.cmake @@ -88,7 +88,7 @@ file(GLOB_RECURSE training_mnist_src "${ORTTRAINING_SOURCE_DIR}/models/mnist/mnist_data_provider.cc" "${ORTTRAINING_SOURCE_DIR}/models/mnist/main.cc" ) -add_executable(onnxruntime_training_mnist ${training_mnist_src}) 
+onnxruntime_add_executable(onnxruntime_training_mnist ${training_mnist_src}) onnxruntime_add_include_to_target(onnxruntime_training_mnist onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training flatbuffers) target_include_directories(onnxruntime_training_mnist PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner) @@ -128,7 +128,7 @@ file(GLOB_RECURSE training_squeezene_src "${ORTTRAINING_SOURCE_DIR}/models/squeezenet/*.h" "${ORTTRAINING_SOURCE_DIR}/models/squeezenet/*.cc" ) -add_executable(onnxruntime_training_squeezenet ${training_squeezene_src}) +onnxruntime_add_executable(onnxruntime_training_squeezenet ${training_squeezene_src}) onnxruntime_add_include_to_target(onnxruntime_training_squeezenet onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training flatbuffers) target_include_directories(onnxruntime_training_squeezenet PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner) if(UNIX AND NOT APPLE) @@ -143,7 +143,7 @@ file(GLOB_RECURSE training_bert_src "${ORTTRAINING_SOURCE_DIR}/models/bert/*.h" "${ORTTRAINING_SOURCE_DIR}/models/bert/*.cc" ) -add_executable(onnxruntime_training_bert ${training_bert_src}) +onnxruntime_add_executable(onnxruntime_training_bert ${training_bert_src}) if(UNIX AND NOT APPLE) if (HAS_NO_MAYBE_UNINITIALIZED) @@ -162,7 +162,7 @@ file(GLOB_RECURSE training_pipeline_poc_src "${ORTTRAINING_SOURCE_DIR}/models/pipeline_poc/*.h" "${ORTTRAINING_SOURCE_DIR}/models/pipeline_poc/*.cc" ) -add_executable(onnxruntime_training_pipeline_poc ${training_pipeline_poc_src}) +onnxruntime_add_executable(onnxruntime_training_pipeline_poc ${training_pipeline_poc_src}) if(UNIX AND NOT APPLE) if (HAS_NO_MAYBE_UNINITIALIZED) @@ -181,7 +181,7 @@ file(GLOB_RECURSE training_gpt2_src "${ORTTRAINING_SOURCE_DIR}/models/gpt2/*.h" "${ORTTRAINING_SOURCE_DIR}/models/gpt2/*.cc" ) -add_executable(onnxruntime_training_gpt2 ${training_gpt2_src}) +onnxruntime_add_executable(onnxruntime_training_gpt2 ${training_gpt2_src}) if(UNIX AND NOT APPLE) if (HAS_NO_MAYBE_UNINITIALIZED) target_compile_options(onnxruntime_training_gpt2 PUBLIC "-Wno-maybe-uninitialized") diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 8678be2594474..e73397b2dc168 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -19,9 +19,9 @@ function(AddTest) list(REMOVE_DUPLICATES _UT_SOURCES) if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - add_executable(${_UT_TARGET} ${TEST_SRC_DIR}/xctest/orttestmain.m) + onnxruntime_add_executable(${_UT_TARGET} ${TEST_SRC_DIR}/xctest/orttestmain.m) else() - add_executable(${_UT_TARGET} ${_UT_SOURCES}) + onnxruntime_add_executable(${_UT_TARGET} ${_UT_SOURCES}) endif() if (_UT_DEPENDS) @@ -718,7 +718,7 @@ if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS) list(APPEND onnx_test_libs onnxruntime_language_interop onnxruntime_pyop) endif() -add_executable(onnx_test_runner ${onnx_test_runner_src_dir}/main.cc) +onnxruntime_add_executable(onnx_test_runner ${onnx_test_runner_src_dir}/main.cc) if(MSVC) target_compile_options(onnx_test_runner PRIVATE "$<$:SHELL:--compiler-options /utf-8>" "$<$>:/utf-8>") @@ -741,7 +741,7 
@@ install(TARGETS onnx_test_runner if(onnxruntime_BUILD_BENCHMARKS) SET(BENCHMARK_DIR ${TEST_SRC_DIR}/onnx/microbenchmark) - add_executable(onnxruntime_benchmark + onnxruntime_add_executable(onnxruntime_benchmark ${BENCHMARK_DIR}/main.cc ${BENCHMARK_DIR}/modeltest.cc ${BENCHMARK_DIR}/pooling.cc @@ -798,7 +798,7 @@ endif() file(GLOB onnxruntime_perf_test_src CONFIGURE_DEPENDS ${onnxruntime_perf_test_src_patterns} ) -add_executable(onnxruntime_perf_test ${onnxruntime_perf_test_src} ${ONNXRUNTIME_ROOT}/core/platform/path_lib.cc) +onnxruntime_add_executable(onnxruntime_perf_test ${onnxruntime_perf_test_src} ${ONNXRUNTIME_ROOT}/core/platform/path_lib.cc) if(MSVC) target_compile_options(onnxruntime_perf_test PRIVATE "$<$:SHELL:--compiler-options /utf-8>" "$<$>:/utf-8>") @@ -923,20 +923,20 @@ endif(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS) #some ETW tools if(WIN32 AND onnxruntime_ENABLE_INSTRUMENT) - add_executable(generate_perf_report_from_etl ${ONNXRUNTIME_ROOT}/tool/etw/main.cc + onnxruntime_add_executable(generate_perf_report_from_etl ${ONNXRUNTIME_ROOT}/tool/etw/main.cc ${ONNXRUNTIME_ROOT}/tool/etw/eparser.h ${ONNXRUNTIME_ROOT}/tool/etw/eparser.cc ${ONNXRUNTIME_ROOT}/tool/etw/TraceSession.h ${ONNXRUNTIME_ROOT}/tool/etw/TraceSession.cc) target_compile_definitions(generate_perf_report_from_etl PRIVATE "_CONSOLE" "_UNICODE" "UNICODE") target_link_libraries(generate_perf_report_from_etl PRIVATE tdh Advapi32) - add_executable(compare_two_sessions ${ONNXRUNTIME_ROOT}/tool/etw/compare_two_sessions.cc + onnxruntime_add_executable(compare_two_sessions ${ONNXRUNTIME_ROOT}/tool/etw/compare_two_sessions.cc ${ONNXRUNTIME_ROOT}/tool/etw/eparser.h ${ONNXRUNTIME_ROOT}/tool/etw/eparser.cc ${ONNXRUNTIME_ROOT}/tool/etw/TraceSession.h ${ONNXRUNTIME_ROOT}/tool/etw/TraceSession.cc) target_compile_definitions(compare_two_sessions PRIVATE "_CONSOLE" "_UNICODE" "UNICODE") target_link_libraries(compare_two_sessions PRIVATE ${GETOPT_LIB_WIDE} tdh Advapi32) endif() -add_executable(onnxruntime_mlas_test ${TEST_SRC_DIR}/mlas/unittest.cpp) +onnxruntime_add_executable(onnxruntime_mlas_test ${TEST_SRC_DIR}/mlas/unittest.cpp) if(MSVC) target_compile_options(onnxruntime_mlas_test PRIVATE "$<$:SHELL:--compiler-options /utf-8>" "$<$>:/utf-8>") diff --git a/cmake/onnxruntime_util.cmake b/cmake/onnxruntime_util.cmake index 54280348848cf..6b22ac9693957 100644 --- a/cmake/onnxruntime_util.cmake +++ b/cmake/onnxruntime_util.cmake @@ -11,7 +11,7 @@ file(GLOB_RECURSE onnxruntime_util_srcs CONFIGURE_DEPENDS source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_util_srcs}) add_library(onnxruntime_util ${onnxruntime_util_srcs}) -target_include_directories(onnxruntime_util PRIVATE ${ONNXRUNTIME_ROOT} ${MKLML_INCLUDE_DIR} PUBLIC ${eigen_INCLUDE_DIRS}) +target_include_directories(onnxruntime_util PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC ${eigen_INCLUDE_DIRS}) if (onnxruntime_USE_CUDA) target_include_directories(onnxruntime_util PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) endif() diff --git a/cmake/winml.cmake b/cmake/winml.cmake index 20f25d62340f2..436ef5ca45629 100644 --- a/cmake/winml.cmake +++ b/cmake/winml.cmake @@ -541,8 +541,16 @@ endif(onnxruntime_USE_DML) # Add static library that will be archived/linked for both static/dynamic library add_library(winml_lib_api_experimental STATIC - ${winml_lib_api_experimental_dir}/Dummy.cpp - ${winml_lib_api_experimental_dir}/Dummy.h + ${winml_lib_api_experimental_dir}/LearningModelBuilder.cpp + ${winml_lib_api_experimental_dir}/LearningModelBuilder.h + 
${winml_lib_api_experimental_dir}/LearningModelInputs.cpp + ${winml_lib_api_experimental_dir}/LearningModelInputs.h + ${winml_lib_api_experimental_dir}/LearningModelOutputs.cpp + ${winml_lib_api_experimental_dir}/LearningModelOutputs.h + ${winml_lib_api_experimental_dir}/LearningModelOperator.cpp + ${winml_lib_api_experimental_dir}/LearningModelOperator.h + ${winml_lib_api_experimental_dir}/LearningModelOperatorSet.cpp + ${winml_lib_api_experimental_dir}/LearningModelOperatorSet.h ${winml_lib_api_experimental_dir}/LearningModelSessionExperimental.cpp ${winml_lib_api_experimental_dir}/LearningModelSessionExperimental.h ${winml_lib_api_experimental_dir}/LearningModelSessionOptionsExperimental.cpp @@ -568,7 +576,7 @@ target_precompiled_header(winml_lib_api_experimental pch.h) # Includes target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/winml_api) # windows machine learning generated component headers target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/winml_api/comp_generated) # windows machine learning generated component headers -target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/winml_api_experimental) # windows machine learning generated component headers +target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/winml_api_experimental) # windows machine learning generated component headers target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/winml_api_experimental/comp_generated) # windows machine learning generated component headers target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/winml/sdk/cppwinrt/include) # sdk cppwinrt headers diff --git a/cmake/winml_cppwinrt.cmake b/cmake/winml_cppwinrt.cmake index 3c8056cc4abc1..8061033fd251d 100644 --- a/cmake/winml_cppwinrt.cmake +++ b/cmake/winml_cppwinrt.cmake @@ -246,4 +246,4 @@ function(add_generate_cppwinrt_sdk_headers_target set_target_properties(${target_name} PROPERTIES FOLDER ${folder_name}) endif() -endfunction() +endfunction() \ No newline at end of file diff --git a/cmake/winml_unittests.cmake b/cmake/winml_unittests.cmake index 70198bf1190cc..a92503c64f433 100644 --- a/cmake/winml_unittests.cmake +++ b/cmake/winml_unittests.cmake @@ -43,7 +43,7 @@ function(add_winml_test) list(REMOVE_DUPLICATES _UT_DEPENDS) endif() - add_executable(${_UT_TARGET} ${_UT_SOURCES}) + onnxruntime_add_executable(${_UT_TARGET} ${_UT_SOURCES}) onnxruntime_add_include_to_target(${_UT_TARGET} onnx_proto) source_group(TREE ${WINML_TEST_SRC_DIR} FILES ${_UT_SOURCES}) set_winml_target_properties(${_UT_TARGET}) @@ -61,6 +61,10 @@ function(add_winml_test) target_compile_definitions(${_UT_TARGET} PRIVATE "BUILD_INBOX=1") endif() + if (onnxruntime_BUILD_MS_EXPERIMENTAL_OPS) + target_compile_definitions(${_UT_TARGET} PRIVATE "BUILD_MS_EXPERIMENTAL_OPS=1") + endif() + add_test(NAME ${_UT_TARGET} COMMAND ${_UT_TARGET} WORKING_DIRECTORY $ @@ -279,6 +283,9 @@ if(NOT onnxruntime_ENABLE_MEMLEAK_CHECKER) SOURCES ${winml_test_model_src} LIBS winml_test_common ${winml_test_model_libs} ) + if (EXISTS ${dxcore_header}) + target_delayload(winml_test_model ext-ms-win-dxcore-l1-*.dll) + endif() target_precompiled_header(winml_test_model testPch.h) endif() diff --git a/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/README.md b/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/README.md deleted file mode 100644 index 
5b042da0c63a1..0000000000000 --- a/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/README.md +++ /dev/null @@ -1,172 +0,0 @@ -# C# Sample: Faster R-CNN - -The sample walks through how to run a pretrained Faster R-CNN object detection ONNX model using the ONNX Runtime C# API. - -The source code for this sample is available [here](Program.cs). - -## Prerequisites - -To run this sample, you'll need the following things: - -1. Install [.NET Core 3.1](https://dotnet.microsoft.com/download/dotnet-core/3.1) or higher for you OS (Mac, Windows or Linux). -2. Download the [Faster R-CNN](https://github.com/onnx/models/blob/master/vision/object_detection_segmentation/faster-rcnn/model/FasterRCNN-10.onnx) ONNX model to your local system. -3. Download [this demo image](demo.jpg) to test the model. You can also use any image you like. - -## Getting Started - -Now we have everything set up, we can start adding code to run the model on the image. We'll do this in the main method of the program for simplicity. - -### Read paths - -Firstly, let's read the path to the model, path to the image we want to test, and path to the output image: - -```cs -string modelFilePath = args[0]; -string imageFilePath = args[1]; -string outImageFilePath = args[2]; -``` - -### Read image - -Next, we will read the image in using the cross-platform image library [ImageSharp](https://www.nuget.org/packages/SixLabors.ImageSharp): - -```cs -using Image image = Image.Load(imageFilePath, out IImageFormat format); -``` - -Note, we're specifically reading the `Rgb24` type so we can efficiently preprocess the image in a later step. - -### Resize image - -Next, we will resize the image to the appropriate size that the model is expecting; it is recommended to resize the image such that both height and width are within the range of [800, 1333]. - -```cs -float ratio = 800f / Math.Min(image.Width, image.Height); -using Stream imageStream = new MemoryStream(); -image.Mutate(x => x.Resize((int)(ratio * image.Width), (int)(ratio * image.Height))); -image.Save(imageStream, format); -``` - -### Preprocess image - -Next, we will preprocess the image according to the [requirements of the model](https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/faster-rcnn#preprocessing-steps): - -```cs -var paddedHeight = (int)(Math.Ceiling(image.Height / 32f) * 32f); -var paddedWidth = (int)(Math.Ceiling(image.Width / 32f) * 32f); -Tensor input = new DenseTensor(new[] { 3, paddedHeight, paddedWidth }); -var mean = new[] { 102.9801f, 115.9465f, 122.7717f }; -for (int y = paddedHeight - image.Height; y < image.Height; y++) -{ - Span pixelSpan = image.GetPixelRowSpan(y); - for (int x = paddedWidth - image.Width; x < image.Width; x++) - { - input[0, y, x] = pixelSpan[x].B - mean[0]; - input[1, y, x] = pixelSpan[x].G - mean[1]; - input[2, y, x] = pixelSpan[x].R - mean[2]; - } -} -``` - -Here, we're creating a Tensor of the required size `(channels, paddedHeight, paddedWidth)`, accessing the pixel values, preprocessing them and finally assigning them to the tensor at the appropriate indicies. - -### Setup inputs - -Next, we will create the inputs to the model: - -```cs -var inputs = new List -{ - NamedOnnxValue.CreateFromTensor("image", input) -}; -``` - -To check the input node names for an ONNX model, you can use [Netron](https://github.com/lutzroeder/netron) to visualise the model and see input/output names. In this case, this model has `image` as the input node name. 
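The input name can also be confirmed in code rather than in Netron. A minimal sketch (run separately from the main pipeline, and assuming the `InferenceSession.InputMetadata` property of the C# API) might look like:

```cs
// Quick check of the model's input node names and shapes without opening Netron.
// Assumes the Microsoft.ML.OnnxRuntime and System namespaces are imported.
using var inspectSession = new InferenceSession(modelFilePath);
foreach (var pair in inspectSession.InputMetadata)
{
    Console.WriteLine($"Input: {pair.Key}, dims: [{string.Join(", ", pair.Value.Dimensions)}]");
}
```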
- -### Run inference - -Next, we will create an inference session and run the input through it: - -```cs -using var session = new InferenceSession(modelFilePath); -using IDisposableReadOnlyCollection results = session.Run(inputs); -``` - -### Postprocess output - -Next, we will need to postprocess the output to get boxes and associated label and confidence scores for each box: - -```cs -var resultsArray = results.ToArray(); -float[] boxes = resultsArray[0].AsEnumerable().ToArray(); -long[] labels = resultsArray[1].AsEnumerable().ToArray(); -float[] confidences = resultsArray[2].AsEnumerable().ToArray(); -var predictions = new List(); -var minConfidence = 0.7f; -for (int i = 0; i < boxes.Length - 4; i += 4) -{ - var index = i / 4; - if (confidences[index] >= minConfidence) - { - predictions.Add(new Prediction - { - Box = new Box(boxes[i], boxes[i + 1], boxes[i + 2], boxes[i + 3]), - Label = LabelMap.Labels[labels[index]], - Confidence = confidences[index] - }); - } -} -``` - -Note, we're only taking boxes that have a confidence above 0.7 to remove false positives. - -### View prediction - -Next, we'll draw the boxes and associated labels and confidence scores on the image to see how the model went: - -```cs -using var outputImage = File.OpenWrite(outImageFilePath); -Font font = SystemFonts.CreateFont("Arial", 16); -foreach (var p in predictions) -{ - image.Mutate(x => - { - x.DrawLines(Color.Red, 2f, new PointF[] { - - new PointF(p.Box.Xmin, p.Box.Ymin), - new PointF(p.Box.Xmax, p.Box.Ymin), - - new PointF(p.Box.Xmax, p.Box.Ymin), - new PointF(p.Box.Xmax, p.Box.Ymax), - - new PointF(p.Box.Xmax, p.Box.Ymax), - new PointF(p.Box.Xmin, p.Box.Ymax), - - new PointF(p.Box.Xmin, p.Box.Ymax), - new PointF(p.Box.Xmin, p.Box.Ymin) - }); - x.DrawText($"{p.Label}, {p.Confidence:0.00}", font, Color.White, new PointF(p.Box.Xmin, p.Box.Ymin)); - }); -} -image.Save(outputImage, format); -``` - -For each box prediction, we're using ImageSharp to draw red lines to create the boxes, and drawing the label and confidence text. - -## Running the program - -Now the program is created, we can run it will the following command: - -``` -dotnet run [path-to-model] [path-to-image] [path-to-output-image] -``` - -e.g. running: - -``` -dotnet run ~/Downloads/FasterRCNN-10.onnx ~/Downloads/demo.jpg ~/Downloads/out.jpg -``` - -detects the following objects in the image: - -![](out.jpg) \ No newline at end of file diff --git a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/README.md b/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/README.md deleted file mode 100644 index 7e72547624ec5..0000000000000 --- a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/README.md +++ /dev/null @@ -1,169 +0,0 @@ -# C# Sample: ResNet50 v2 - -The sample walks through how to run a pretrained ResNet50 v2 ONNX model using the Onnx Runtime C# API. - -The source code for this sample is available [here](Program.cs). - -## Prerequisites - -To run this sample, you'll need the following things: - -1. Install [.NET Core 3.1](https://dotnet.microsoft.com/download/dotnet-core/3.1) or higher for you OS (Mac, Windows or Linux). -2. Download the [ResNet50 v2](https://github.com/onnx/models/blob/master/vision/classification/resnet/model/resnet50-v2-7.onnx) ONNX model to your local system. -3. Download [this picture of a dog](dog.jpeg) to test the model. You can also use any image you like. - -## Getting Started - -Now we have everything set up, we can start adding code to run the model on the image. 
We'll do this in the main method of the program for simplicity. - -### Read paths - -Firstly, let's read the path to the model and path to the image we want to test in through program arguments: - -```cs -string modelFilePath = args[0]; -string imageFilePath = args[1]; -``` - -### Read image - -Next, we will read the image in using the cross-platform image library [ImageSharp](https://www.nuget.org/packages/SixLabors.ImageSharp): - -```cs -using Image image = Image.Load(imageFilePath, out IImageFormat format); -``` - -Note, we're specifically reading the `Rgb24` type so we can efficiently preprocess the image in a later step. - -### Resize image - -Next, we will resize the image to the appropriate size that the model is expecting; 224 pixels by 224 pixels: - -```cs -using Stream imageStream = new MemoryStream(); -image.Mutate(x => -{ - x.Resize(new ResizeOptions - { - Size = new Size(224, 224), - Mode = ResizeMode.Crop - }); -}); -image.Save(imageStream, format); -``` - -Note, we're doing a centered crop resize to preserve aspect ratio. - -### Preprocess image - -Next, we will preprocess the image according to the [requirements of the model](https://github.com/onnx/models/tree/master/vision/classification/resnet#preprocessing): - -```cs -Tensor input = new DenseTensor(new[] { 1, 3, 224, 224 }); -var mean = new[] { 0.485f, 0.456f, 0.406f }; -var stddev = new[] { 0.229f, 0.224f, 0.225f }; -for (int y = 0; y < image.Height; y++) -{ - Span pixelSpan = image.GetPixelRowSpan(y); - for (int x = 0; x < image.Width; x++) - { - input[0, 0, y, x] = ((pixelSpan[x].R / 255f) - mean[0]) / stddev[0]; - input[0, 1, y, x] = ((pixelSpan[x].G / 255f) - mean[1]) / stddev[1]; - input[0, 2, y, x] = ((pixelSpan[x].B / 255f) - mean[2]) / stddev[2]; - } -} -``` - -Here, we're creating a Tensor of the required size `(batch-size, channels, height, width)`, accessing the pixel values, preprocessing them and finally assigning them to the tensor at the appropriate indicies. - -### Setup inputs - -Next, we will create the inputs to the model: - -```cs -var inputs = new List -{ - NamedOnnxValue.CreateFromTensor("data", input) -}; -``` - -To check the input node names for an ONNX model, you can use [Netron](https://github.com/lutzroeder/netron) to visualise the model and see input/output names. In this case, this model has `data` as the input node name. - -### Run inference - -Next, we will create an inference session and run the input through it: - -```cs -using var session = new InferenceSession(modelFilePath); -using IDisposableReadOnlyCollection results = session.Run(inputs); -``` - -### Postprocess output - -Next, we will need to postprocess the output to get the softmax vector, as this is not handled by the model itself: - -```cs -IEnumerable output = results.First().AsEnumerable(); -float sum = output.Sum(x => (float)Math.Exp(x)); -IEnumerable softmax = output.Select(x => (float)Math.Exp(x) / sum); -``` - -Other models may apply a Softmax node before the output, in which case you won't need this step. Again, you can use Netron to see the model outputs. 
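The direct `exp(x) / sum(exp(x))` form can overflow if the raw scores are large. A numerically safer variant, shown below as a small helper sketch, subtracts the maximum score before exponentiating; the result is mathematically identical:

```cs
// Numerically safer softmax: subtracting the max score is mathematically a
// no-op but keeps Math.Exp from overflowing for large logits.
// Assumes System and System.Linq are imported.
static float[] StableSoftmax(float[] scores)
{
    float max = scores.Max();
    float[] exps = scores.Select(s => (float)Math.Exp(s - max)).ToArray();
    float sum = exps.Sum();
    return exps.Select(e => e / sum).ToArray();
}
```

Calling `StableSoftmax(output.ToArray())` produces the same probabilities as the LINQ expression above and can be fed to the next step unchanged.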
- -### Extract top 10 - -Next, we will extract the top 10 class predictions: - -```cs -IEnumerable top10 = softmax.Select((x, i) => new Prediction { Label = LabelMap.Labels[i], Confidence = x }) - .OrderByDescending(x => x.Confidence) - .Take(10); -``` - -### Print results - -Next, we will print the top 10 results to the console: - -```cs -Console.WriteLine("Top 10 predictions for ResNet50 v2..."); -Console.WriteLine("--------------------------------------------------------------"); -foreach (var t in top10) -{ - Console.WriteLine($"Label: {t.Label}, Confidence: {t.Confidence}"); -} -``` - -## Running the program - -Now the program is created, we can run it will the following command: - -``` -dotnet run [path-to-model] [path-to-image] -``` - -e.g. - -``` -dotnet run ~/Downloads/resnet50-v2-7.onnx ~/Downloads/dog.jpeg -``` - -Running this on the following image: - -![](dog.jpeg) - -We get the following output: - -``` -Top 10 predictions for ResNet50 v2... --------------------------------------------------------------- -Label: Golden Retriever, Confidence: 0.9212826 -Label: Kuvasz, Confidence: 0.026514154 -Label: Clumber Spaniel, Confidence: 0.012455719 -Label: Labrador Retriever, Confidence: 0.004103844 -Label: Saluki, Confidence: 0.0033182495 -Label: Flat-Coated Retriever, Confidence: 0.0032045357 -Label: English Setter, Confidence: 0.002513516 -Label: Brittany, Confidence: 0.0023459378 -Label: Cocker Spaniels, Confidence: 0.0019343802 -Label: Sussex Spaniel, Confidence: 0.0019247672 -``` diff --git a/csharp/src/Microsoft.AI.MachineLearning.Interop/Microsoft.AI.MachineLearning.Interop.csproj b/csharp/src/Microsoft.AI.MachineLearning.Interop/Microsoft.AI.MachineLearning.Interop.csproj index ca31e658f8db8..689d281d025fd 100644 --- a/csharp/src/Microsoft.AI.MachineLearning.Interop/Microsoft.AI.MachineLearning.Interop.csproj +++ b/csharp/src/Microsoft.AI.MachineLearning.Interop/Microsoft.AI.MachineLearning.Interop.csproj @@ -30,6 +30,7 @@ + diff --git a/csharp/src/Microsoft.AI.MachineLearning/Microsoft.AI.MachineLearning.targets b/csharp/src/Microsoft.AI.MachineLearning/Microsoft.AI.MachineLearning.targets index bfecff8c983ee..807cf89e303ef 100644 --- a/csharp/src/Microsoft.AI.MachineLearning/Microsoft.AI.MachineLearning.targets +++ b/csharp/src/Microsoft.AI.MachineLearning/Microsoft.AI.MachineLearning.targets @@ -20,6 +20,9 @@ $(WindowsAIBinary) + + $(WindowsAIBinary) + diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj index 803e1911811e2..18e00f1470a4c 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj @@ -146,11 +146,6 @@ Pack="true" Visible="false" /> - GetUpperTriangle(int offset) return GetTriangle(offset, upper: true); } + /// + /// Implementation method for GetTriangle, GetLowerTriangle, GetUpperTriangle + /// + /// Offset of diagonal to set in returned tensor. 
+ /// true for upper triangular and false otherwise + /// public Tensor GetTriangle(int offset, bool upper) { if (Rank < 2) @@ -1158,8 +1164,16 @@ object IList.this[int index] } } + /// + /// Always fixed size Tensor + /// + /// always true public bool IsFixedSize => true; + /// + /// Tensor is not readonly + /// + /// always false public bool IsReadOnly => false; int IList.Add(object value) @@ -1566,6 +1580,11 @@ public virtual DenseTensor ToDenseTensor() #endregion + /// + /// Get a string representation of Tensor + /// + /// + /// public string GetArrayString(bool includeWhitespace = true) { var builder = new StringBuilder(); diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker-gpu.sh b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker-gpu.sh index 64dab04ae356d..18260eba94108 100755 --- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker-gpu.sh +++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker-gpu.sh @@ -28,6 +28,8 @@ docker run --gpus all --rm \ -e "PackageName=$PackageName" \ -e "RunTestCsharp=$RunTestCsharp" \ -e "RunTestNative=$RunTestNative" \ + -e "BUILD_BINARIESDIRECTORY=/home/onnxruntimedev" \ + -e "BUILD_SOURCESDIRECTORY=/onnxruntime_src" \ "$DockerImage" \ /bin/bash /onnxruntime_src/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh \ /home/onnxruntimedev/$NUGET_REPO_DIRNAME /onnxruntime_src /home/onnxruntimedev $CurrentOnnxRuntimeVersion diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker.sh b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker.sh index 87991b6fff449..262a73363a5cb 100755 --- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker.sh +++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker.sh @@ -36,6 +36,8 @@ docker run --rm \ -e "DisableMlOps=$DISABLEMLOPS" \ -e "RunTestCsharp=$RunTestCsharp" \ -e "RunTestNative=$RunTestNative" \ + -e "BUILD_BINARIESDIRECTORY=/home/onnxruntimedev" \ + -e "BUILD_SOURCESDIRECTORY=/onnxruntime_src" \ "$DockerImage" \ /bin/bash /onnxruntime_src/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh \ /home/onnxruntimedev/$NUGET_REPO_DIRNAME /onnxruntime_src /home/onnxruntimedev $CurrentOnnxRuntimeVersion diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh index c64bc54014605..15a65db835bae 100755 --- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh +++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
LocalNuGetRepo=$1 -SourceRoot=$2 BuildDir=$3 export CurrentOnnxRuntimeVersion=$4 IsMacOS=${5:-false} @@ -12,22 +11,26 @@ PackageName=${PackageName:-Microsoft.ML.OnnxRuntime} RunTestCsharp=${RunTestCsharp:-true} RunTestNative=${RunTestNative:-true} -set -x +set -x -e OldDir=`pwd` -cd $SourceRoot +cd $BUILD_SOURCESDIRECTORY echo "Current NuGet package version is $CurrentOnnxRuntimeVersion" if [ $RunTestCsharp = "true" ]; then + if [[ $IsMacOS == "True" || $IsMacOS == "true" ]]; then + mkdir -p $BUILD_BINARIESDIRECTORY/models + ln -s $BUILD_SOURCESDIRECTORY/cmake/external/onnx/onnx/backend/test/data/node $BUILD_BINARIESDIRECTORY/models/opset14 + fi # Run C# tests - dotnet restore $SourceRoot/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj -s $LocalNuGetRepo -s https://api.nuget.org/v3/index.json + dotnet restore $BUILD_SOURCESDIRECTORY/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj -s $LocalNuGetRepo -s https://api.nuget.org/v3/index.json if [ $? -ne 0 ]; then echo "Failed to restore nuget packages for the test project" exit 1 fi - dotnet test $SourceRoot/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj --no-restore --verbosity detailed + dotnet test $BUILD_SOURCESDIRECTORY/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj --no-restore --verbosity detailed if [ $? -ne 0 ]; then echo "Failed to build or execute the end-to-end test" exit 1 @@ -45,23 +48,23 @@ if [ $RunTestNative = "true" ]; then inc="-I build/native/include" - if [ $IsMacOS = "true" ]; then + if [[ $IsMacOS == "True" || $IsMacOS == "true" ]]; then export DYLD_FALLBACK_LIBRARY_PATH=$LocalNuGetRepo/_tmp:${DYLD_FALLBACK_LIBRARY_PATH} libs="-L runtimes/osx-x64/native -l onnxruntime" - g++ -std=c++11 $SourceRoot/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp $inc $libs -Wunused-result -Wformat=0 -o sampletest + g++ -std=c++11 $BUILD_SOURCESDIRECTORY/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp $inc $libs -Wunused-result -Wformat=0 -o sampletest libName=$(otool -L ./sampletest | grep onnxruntime | xargs | cut -d' ' -f1 | cut -d'/' -f2) ln -sf runtimes/osx-x64/native/libonnxruntime.dylib $libName else export LD_LIBRARY_PATH=$LocalNuGetRepo/_tmp:${LD_LIBRARY_PATH} libs="-L runtimes/linux-x86/native -L runtimes/linux-x64/native -l onnxruntime" - g++ -std=c++11 $SourceRoot/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp $inc $libs -Wunused-result -o sampletest + g++ -std=c++11 $BUILD_SOURCESDIRECTORY/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp $inc $libs -Wunused-result -o sampletest # Create link to versioned shared object required at runtime libname=`ldd sampletest | grep onnxruntime | xargs | cut -d" " -f1` ln -sf runtimes/linux-x64/native/libonnxruntime.so $libname fi # Copy Sample Model - cp $SourceRoot/csharp/testdata/squeezenet.onnx . + cp $BUILD_SOURCESDIRECTORY/csharp/testdata/squeezenet.onnx . 
# Run the sample model ./sampletest diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs index 54188f26472b1..c8c4d1f9b6644 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs @@ -719,6 +719,70 @@ private static Dictionary GetSkippedModels(DirectoryInfo modelsD { "tf_resnet_v1_152", "result mismatch when Conv BN Fusion is applied" }, { "coreml_Imputer-LogisticRegression_sklearn_load_breast_cancer", "Can't determine model file name" }, { "mask_rcnn_keras", "Model should be edited to remove the extra outputs" }, + { "test_strnormalizer_export_monday_casesensintive_lower", "ElementType not currently supported"}, + { "test_max_float64", "node test error"}, + { "test_min_uint8", "node test error"}, + { "test_mod_mixed_sign_float64", "node test error"}, + { "test_einsum_transpose", "node test error"}, + { "test_momentum", "node test error"}, + { "test_max_uint16", "node test error"}, + { "test_resize_downsample_scales_linear_align_corners", "node test error"}, + { "test_strnormalizer_nostopwords_nochangecase", "node test error"}, + { "test_cast_STRING_to_FLOAT", "node test error"}, + { "test_cumsum_2d_negative_axis", "node test error"}, + { "test_cast_FLOAT16_to_DOUBLE", "node test error"}, + { "test_adagrad_multiple", "node test error"}, + { "test_einsum_inner_prod", "node test error"}, + { "test_clip_default_int8_min", "node test error"}, + { "test_max_int8", "node test error"}, + { "test_sequence_insert_at_back", "node test error"}, + { "test_mod_mixed_sign_int8", "node test error"}, + { "test_maxunpool_export_with_output_shape", "node test error"}, + { "test_strnormalizer_export_monday_empty_output", "node test error"}, + { "test_strnormalizer_export_monday_insensintive_upper_twodim", "ElementType not currently supported"}, + { "test_clip_default_int8_max", "node test error"}, + { "test_einsum_sum", "node test error"}, + { "test_min_int16", "node test error"}, + { "test_cast_FLOAT_to_DOUBLE", "node test error"}, + { "test_adagrad", "node test error"}, + { "test_min_float64", "node test error"}, + { "test_max_int16", "node test error"}, + { "test_einsum_batch_diagonal", "node test error"}, + { "test_sequence_insert_at_front", "node test error"}, + { "test_cumsum_1d_exclusive", "node test error"}, + { "test_training_dropout_default", "node test error"}, + { "test_cast_BFLOAT16_to_FLOAT", "node test error"}, + { "test_training_dropout", "node test error"}, + { "test_adam", "node test error"}, + { "test_training_dropout_mask", "node test error"}, + { "test_clip_default_int8_inbounds", "node test error"}, + { "test_eyelike_with_dtype", "node test error"}, + { "test_cumsum_1d", "node test error"}, + { "test_conv_with_autopad_same", "node test error"}, + { "test_cumsum_1d_reverse_exclusive", "node test error"}, + { "test_cast_FLOAT_to_BFLOAT16", "node test error"}, + { "test_bitshift_right_uint16", "node test error"}, + { "test_bitshift_left_uint16", "node test error"}, + { "test_pow_types_float32_uint64", "node test error"}, + { "test_cumsum_2d_axis_0", "node test error"}, + { "test_max_uint8", "node test error"}, + { "test_strnormalizer_export_monday_casesensintive_nochangecase", "ElementType not currently supported"}, + { "test_momentum_multiple", "node test error"}, + { "test_cumsum_1d_reverse", "node test error"}, + { "test_pow_types_float32_uint32", "node test error"}, + { "test_if_seq", "node test error"}, + { 
"test_resize_downsample_scales_cubic_align_corners", "node test error"}, + { "test_einsum_batch_matmul", "node test error"}, + { "test_nesterov_momentum", "node test error"}, + { "test_cumsum_2d_axis_1", "node test error"}, + { "test_strnormalizer_export_monday_casesensintive_upper", "node test error"}, + { "test_min_uint16", "node test error"}, + { "test_adam_multiple", "node test error"}, + { "test_loop13_seq", "node test error"}, + { "test_convtranspose_autopad_same", "node test error"}, + { "test_training_dropout_default_mask", "node test error"}, + { "test_min_int8", "node test error"}, + { "test_cast_FLOAT_to_STRING", "node test error"}, }; // The following models fails on nocontribops win CI diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino index 01ac9baab25bf..03625511e362c 100644 --- a/dockerfiles/Dockerfile.openvino +++ b/dockerfiles/Dockerfile.openvino @@ -84,7 +84,7 @@ RUN apt update && apt -y install --no-install-recommends apt-transport-https ca- git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \ /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh && \ cd onnxruntime/cmake/external/onnx && python3 setup.py install && \ - cd ${MY_ROOT}/onnxruntime && ./build.sh --config Release --update --build --parallel --use_openvino ${DEVICE} --build_wheel && \ + cd ${MY_ROOT}/onnxruntime && ./build.sh --config Release --update --build --parallel --use_openvino ${DEVICE} --build_shared_lib --build_wheel && \ pip install build/Linux/Release/dist/*-linux_x86_64.whl && \ cd ${MY_ROOT}/ && rm -rf onnxruntime && cd /opt && rm -rf v1.0.22.zip && cd ${MY_ROOT} &&\ apt remove -y cmake && cd /usr/share/python-wheels/ && rm -rf *.whl &&\ diff --git a/dockerfiles/Dockerfile.openvino-csharp b/dockerfiles/Dockerfile.openvino-csharp index deb8be1b7c686..b84d89a0b70ea 100644 --- a/dockerfiles/Dockerfile.openvino-csharp +++ b/dockerfiles/Dockerfile.openvino-csharp @@ -102,7 +102,7 @@ RUN apt update && \ git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \ /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh && \ cd onnxruntime/cmake/external/onnx && python3 setup.py install && \ - cd ${MY_ROOT}/onnxruntime && ./build.sh --config Release --update --build --parallel --use_openvino ${DEVICE} --build_nuget && \ + cd ${MY_ROOT}/onnxruntime && ./build.sh --config Release --update --build --parallel --use_openvino ${DEVICE} --build_nuget --build_shared_lib && \ mv ${MY_ROOT}/onnxruntime/build/Linux/Release/nuget-artifacts ${MY_ROOT} && \ # Clean-up unnecessary files rm -rf ${MY_ROOT}/cmake* /opt/cmake ${MY_ROOT}/onnxruntime && \ @@ -111,4 +111,4 @@ RUN apt update && \ apt remove -y git && apt autoremove -y && apt remove -y cmake && \ cd /usr/lib/ && rm -rf python2.7 python3.6 python3.8 && cd && rm -rf .cache && \ cd /usr/share/python-wheels/ && rm -rf *.whl - \ No newline at end of file + diff --git a/docs/AddingCustomOp.md b/docs/AddingCustomOp.md deleted file mode 100644 index 952aad34125ca..0000000000000 --- a/docs/AddingCustomOp.md +++ /dev/null @@ -1,31 +0,0 @@ -Adding a new op -=============== - -## A new op can be written and registered with ONNXRuntime in the following 3 ways -### 1. 
Using the custom op API in the C/C++ APIs (onnxruntime_c_api.h) -* Create an OrtCustomOpDomain with the domain name used by the custom ops -* Create an OrtCustomOp structure for each op and add them to the OrtCustomOpDomain with OrtCustomOpDomain_Add -* Call OrtAddCustomOpDomain to add the custom domain of ops to the session options -See [this](../onnxruntime/test/shared_lib/test_inference.cc) for examples of MyCustomOp and SliceCustomOp that use the C++ helper API (onnxruntime_cxx_api.h). -You can also compile the custom ops into a shared library and use that to run a model via the C++ API. The same test file contains an example. -The source code for a sample custom op shared library containing two custom kernels is [here](../onnxruntime/test/testdata/custom_op_library/custom_op_library.cc). -See [this](../onnxruntime/test/python/onnxruntime_test_python.py) for an example called testRegisterCustomOpsLibrary that uses the Python API -to register a shared library that contains custom op kernels. -Currently, the only supported Execution Providers (EPs) for custom ops registered via this approach are the `CUDA` and the `CPU` EPs. - -Note that when a model being inferred on gpu, onnxruntime will insert MemcpyToHost op before a cpu custom op and append MemcpyFromHost after to make sure tensor(s) are accessible throughout calling, meaning there are no extra efforts required from custom op developer for the case. - -To facilitate the custom operator development, sharing and release, please check the [onnxruntime custom operator library](https://github.com/microsoft/ort-customops) project for the more information. - -### 2. Using RegisterCustomRegistry API -* Implement your kernel and schema (if required) using the OpKernel and OpSchema APIs (headers are in the include folder). -* Create a CustomRegistry object and register your kernel and schema with this registry. -* Register the custom registry with ONNXRuntime using RegisterCustomRegistry API. - -See -[this](../onnxruntime/test/framework/local_kernel_registry_test.cc) for an example. - -### 3. Contributing the op to ONNXRuntime -This is mostly meant for ops that are in the process of being proposed to ONNX. This way you don't have to wait for an approval from the ONNX team -if the op is required in production today. -See [this](../onnxruntime/contrib_ops) for an example. diff --git a/docs/AddingExecutionProvider.md b/docs/AddingExecutionProvider.md deleted file mode 100644 index 3cadc45f909a7..0000000000000 --- a/docs/AddingExecutionProvider.md +++ /dev/null @@ -1,37 +0,0 @@ -# Adding a new execution provider - -* Create a folder under onnxruntime/core/providers -* Create a folder under include/onnxruntime/core/providers, it should has the same name as the first step. -* Create a new class, which must inherit from [IExecutionProvider](../include/onnxruntime/core/framework/execution_provider.h). The source code should be put in 'onnxruntime/core/providers/[your_provider_name]' -* Create a new header file under include/onnxruntime/core/providers/[your_provider_name]. The file should provide one function for creating an OrtProviderFactoryInterface. You may use 'include/onnxruntime/core/providers/cpu/cpu_provider_factory.h' as a template. You don't need to provide a function for creating MemoryInfo. -* Put a symbols.txt under 'onnxruntime/core/providers/[your_provider_name]'. The file should contain all the function names that would be exported from you provider. Usually, just a single function for creating provider factory is enough. 
-* Add your provider in onnxruntime_providers.cmake. Build it as a static lib. -* Add one line in cmake/onnxruntime.cmake, to the 'target_link_libraries' function call. Put your provider there. - - -Examples: - - * [CPU Execution - Provider](../onnxruntime/core/providers/cpu/cpu_execution_provider.h) - * [CUDA Execution - Provider](../onnxruntime/core/providers/cuda/cuda_execution_provider.h) - * [DNNL Execution - Provider](../onnxruntime/core/providers/dnnl/dnnl_execution_provider.h) - - -# Using the execution provider -1. Create a factory for that provider, by using the c function you exported in 'symbols.txt' -2. Put the provider factory into session options -3. Create session from that session option -e.g. - -```c - OrtEnv* env; - OrtInitialize(ORT_LOGGING_LEVEL_WARNING, "test", &env) - OrtSessionOptions* session_option = OrtCreateSessionOptions(); - OrtProviderFactoryInterface** factory; - OrtCreateCUDAExecutionProviderFactory(0, &factory); - OrtSessionOptionsAppendExecutionProvider(session_option, factory); - OrtReleaseObject(factory); - OrtCreateSession(env, model_path, session_option, &session); -``` diff --git a/docs/CSharp_API.md b/docs/CSharp_API.md deleted file mode 100644 index 38ba2a1678514..0000000000000 --- a/docs/CSharp_API.md +++ /dev/null @@ -1,309 +0,0 @@ -# ONNX Runtime C# API -The ONNX runtime provides a C# .Net binding for running inference on ONNX models in any of the .Net standard platforms. The API is .Net standard 1.1 compliant for maximum portability. This document describes the API. - -## NuGet Package -The Microsoft.ML.OnnxRuntime Nuget package includes the precompiled binaries for ONNX runtime, and includes libraries for Windows and Linux platforms with X64 CPUs. The APIs conform to .Net Standard 1.1. - -## Sample Code - -The unit tests contain several examples of loading models, inspecting input/output node shapes and types, as well as constructing tensors for scoring. - -* [../csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs#L166](../csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs#L166) - -## Getting Started -Here is simple tutorial for getting started with running inference on an existing ONNX model for a given input data. The model is typically trained using any of the well-known training frameworks and exported into the ONNX format. To start scoring using the model, open a session using the `InferenceSession` class, passing in the file path to the model as a parameter. - -```cs -var session = new InferenceSession("model.onnx"); -``` - -Once a session is created, you can execute queries using the `Run` method of the `InferenceSession` object. Currently, only `Tensor` type of input and outputs are supported. The results of the `Run` method are represented as a collection of .Net `Tensor` objects (as defined in [System.Numerics.Tensor](https://www.nuget.org/packages/System.Numerics.Tensors)). - -```cs -Tensor t1, t2; // let's say data is fed into the Tensor objects -var inputs = new List() - { - NamedOnnxValue.CreateFromTensor("name1", t1), - NamedOnnxValue.CreateFromTensor("name2", t2) - }; -using (var results = session.Run(inputs)) -{ - // manipulate the results -} -``` - -You can load your input data into Tensor objects in several ways. A simple example is to create the Tensor from arrays. 
- -```cs -float[] sourceData; // assume your data is loaded into a flat float array -int[] dimensions; // and the dimensions of the input is stored here -Tensor t1 = new DenseTensor(sourceData, dimensions); -``` - -Here is a [complete sample code](../csharp/sample/Microsoft.ML.OnnxRuntime.InferenceSample) that runs inference on a pretrained model. - -## Reuse input/output tensor buffers - -In some scenarios, you may want to reuse input/output tensors. This often happens when you want to chain 2 models (ie. feed one's output as input to another), or want to accelerate inference speed during multiple inference runs. - -### Chaining: Feed model A's output(s) as input(s) to model B - -```cs -InferenceSession session1, session2; // let's say 2 sessions are initialized - -Tensor t1; // let's say data is fed into the Tensor objects -var inputs1 = new List() - { - NamedOnnxValue.CreateFromTensor("name1", t1) - }; -// session1 inference -using (var outputs1 = session1.Run(inputs1)) -{ - // get intermediate value - var input2 = outputs1.First(); - - // modify the name of the ONNX value - input2.Name = "name2"; - - // create input list for session2 - var inputs2 = new List() { input2 }; - - // session2 inference - using (var results = session2.Run(inputs2)) - { - // manipulate the results - } -} -``` - -### Multiple inference runs with fixed sized input(s) and output(s) - -If the model have fixed sized inputs and outputs of numeric tensors, you can use "FixedBufferOnnxValue" to accelerate the inference speed. By using "FixedBufferOnnxValue", the container objects only need to be allocated/disposed one time during multiple InferenceSession.Run() calls. This avoids some overhead which may be beneficial for smaller models where the time is noticeable in the overall running time. - -An example can be found at `TestReusingFixedBufferOnnxValueNonStringTypeMultiInferences()`: -* [../csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs#L1047](../csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs#L1047) - -## Running on GPU (Optional) -If using the GPU package, simply use the appropriate SessionOptions when creating an InferenceSession. - -```cs -int gpuDeviceId = 0; // The GPU device ID to execute on -var session = new InferenceSession("model.onnx", SessionOptions.MakeSessionOptionWithCudaProvider(gpuDeviceId)); -``` - -## API Reference - -### OrtEnv -```cs -class OrtEnv -``` -Holds some methods which can be used to tune the ONNX Runtime's runime environment - -#### Constructor -No public constructor available. - -#### Methods -```cs -static OrtEnv Instance(); -``` -Returns an instance of the singlton class `OrtEnv`. - -```cs -void EnableTelemetryEvents(); -``` -Enables platform-specific telemetry collection where applicable. Please see [Privacy](./Privacy.md) for more details. - -```cs -void DisableTelemetryEvents(); -``` -Disables platform-specific telemetry collection. Please see [Privacy](./Privacy.md) for more details. - -### InferenceSession -```cs -class InferenceSession: IDisposable -``` - -The runtime representation of an ONNX model - -#### Constructor -```cs -InferenceSession(string modelPath); -InferenceSession(string modelPath, SessionOptions options); -``` - -#### Properties -```cs -IReadOnlyDictionary InputMetadata; -``` -Data types and shapes of the input nodes of the model. - -```cs -IReadOnlyDictionary OutputMetadata; -``` -Data types and shapes of the output nodes of the model. 
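For example, a minimal sketch that inspects a model's inputs and outputs before running it (names, element types and shapes come from the two metadata dictionaries above):

```cs
// Print every input and output node with its element type and shape.
using Microsoft.ML.OnnxRuntime;
using System;

using var session = new InferenceSession("model.onnx");
foreach (var input in session.InputMetadata)
{
    NodeMetadata meta = input.Value;
    Console.WriteLine($"input {input.Key}: {meta.ElementType}, dims [{string.Join(", ", meta.Dimensions)}]");
}
foreach (var output in session.OutputMetadata)
{
    Console.WriteLine($"output {output.Key}: {output.Value.ElementType}");
}
```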
- -#### Methods -```cs -IDisposableReadOnlyCollection<DisposableNamedOnnxValue> Run(IReadOnlyCollection<NamedOnnxValue> inputs); -``` -Runs the model with the given input data to compute all the output nodes and returns the output node values. Both input and output are collections of NamedOnnxValue, which in turn is a name-value pair of string names and Tensor values. The outputs are the IDisposable variant of NamedOnnxValue, since they wrap some unmanaged objects. - -```cs -IDisposableReadOnlyCollection<DisposableNamedOnnxValue> Run(IReadOnlyCollection<NamedOnnxValue> inputs, IReadOnlyCollection<string> desiredOutputNodes); -``` -Runs the model on given inputs for the given output nodes only. - -### System.Numerics.Tensor -The primary .Net object that is used for holding input-output of the model inference. Details on this newly introduced data type can be found in its [open-source implementation](https://github.com/dotnet/corefx/tree/master/src/System.Numerics.Tensors). The binaries are available as a [.Net NuGet package](https://www.nuget.org/packages/System.Numerics.Tensors). - -### NamedOnnxValue -```cs -class NamedOnnxValue; -``` -Represents a name-value pair of string names and any type of value that ONNX runtime supports as input-output data. Currently, only Tensor objects are supported as input-output values. - -#### Constructor -No public constructor available. - -#### Properties -```cs -string Name; // get or set the name -``` - -#### Methods -```cs -static NamedOnnxValue CreateFromTensor<T>(string name, Tensor<T>); -``` -Creates a NamedOnnxValue from a name and a Tensor object. - -```cs -Tensor<T> AsTensor<T>(); -``` -Accesses the value as a Tensor. Returns null if the value is not a Tensor. - -### DisposableNamedOnnxValue -```cs -class DisposableNamedOnnxValue: NamedOnnxValue, IDisposable; -``` -This is a disposable variant of NamedOnnxValue, used for holding output values which contain objects allocated in unmanaged memory. - -### FixedBufferOnnxValue -```cs -class FixedBufferOnnxValue: IDisposable; -``` -Class `FixedBufferOnnxValue` makes it possible to pin the tensor buffer. This helps to minimize overhead within each inference run. - -`FixedBufferOnnxValue` can be used as either input or output. However, if used as output, it has to be a numeric tensor. - -`FixedBufferOnnxValue` implements `IDisposable`, so make sure it gets disposed after use. -#### Methods -```cs -static FixedBufferOnnxValue CreateFromTensor<T>(Tensor<T>); -``` -Creates a FixedBufferOnnxValue from a Tensor object. - - -### IDisposableReadOnlyCollection -```cs -interface IDisposableReadOnlyCollection<T>: IReadOnlyCollection<T>, IDisposable -``` -Collection interface to hold disposable values. Used for output of the Run method. - -### SessionOptions -```cs -class SessionOptions: IDisposable; -``` -A collection of properties to be set for configuring the OnnxRuntime session. - -#### Constructor -```cs -SessionOptions(); -``` -Constructs a SessionOptions with all options at default/unset values. - -#### Properties -```cs -static SessionOptions Default; //read-only -``` -Accessor to the default static option object. - -#### Methods -```cs -SetSessionGraphOptimizationLevel(GraphOptimizationLevel graph_transformer_level); -``` -See [ONNX_Runtime_Graph_Optimizations.md] for more details. - -```cs -SetSessionExecutionMode(ExecutionMode execution_mode); -``` - * ORT_SEQUENTIAL - execute operators in the graph sequentially. - * ORT_PARALLEL - execute operators in the graph in parallel. -See [ONNX_Runtime_Perf_Tuning.md] for more details.
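Taken together, a typical configuration sketch looks roughly like this (the `GraphOptimizationLevel` member name below is an assumption and may differ between releases):

```cs
// Configure the session before creating it: enable graph optimizations and
// keep the default sequential execution of operators.
using var options = new SessionOptions();
options.SetSessionGraphOptimizationLevel(GraphOptimizationLevel.ORT_ENABLE_ALL); // assumed member name
options.SetSessionExecutionMode(ExecutionMode.ORT_SEQUENTIAL);
using var session = new InferenceSession("model.onnx", options);
```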
- -### NodeMetadata -Container of metadata for a model graph node, used for communicating the shape and type of the input and output nodes. - -#### Properties -```cs -int[] Dimensions; -``` -Read-only shape of the node, when the node is a Tensor. Undefined if the node is not a Tensor. - -```cs -System.Type ElementType; -``` -Type of the elements of the node, when node is a Tensor. Undefined for non-Tensor nodes. - -```cs -bool IsTensor; -``` -Whether the node is a Tensor - -### Exceptions -```cs -class OnnxRuntimeException: Exception; -``` - -The type of Exception that is thrown in most of the error conditions related to Onnx Runtime. - -### ModelMetadata -```cs -class ModelMetadata -``` -Encapsulates some metadata about the ONNX model. - -#### Constructor -No public constructor available. - -The `ModelMetadata` instance for an ONNX model may be obtained by querying the `ModelMetadata` property of an `InferenceSession` instance. - -#### Properties -```cs -string ProducerName; -``` -Holds the producer name of the ONNX model. - -```cs -string GraphName; -``` -Holds the graph name of the ONNX model. - -```cs -string Domain; -``` -Holds the opset domain of the ONNX model. - -```cs -string Description; -``` -Holds the description of the ONNX model. - -```cs -long Version; -``` -Holds the version of the ONNX model. - -```cs -Dictionary CustomMetadataMap; -``` -Holds a dictionary containing key-value pairs of custom metadata held by the ONNX model. diff --git a/docs/C_API.md b/docs/C_API.md deleted file mode 100644 index 3d6999046c874..0000000000000 --- a/docs/C_API.md +++ /dev/null @@ -1,77 +0,0 @@ -# C API - -## Features - -* Creating an InferenceSession from an on-disk model file and a set of SessionOptions. -* Registering customized loggers. -* Registering customized allocators. -* Registering predefined providers and set the priority order. ONNXRuntime has a set of predefined execution providers, like CUDA, DNNL. User can register providers to their InferenceSession. The order of registration indicates the preference order as well. -* Running a model with inputs. These inputs must be in CPU memory, not GPU. If the model has multiple outputs, user can specify which outputs they want. -* Converting an in-memory ONNX Tensor encoded in protobuf format to a pointer that can be used as model input. -* Setting the thread pool size for each session. -* Setting graph optimization level for each session. -* Dynamically loading custom ops. [Instructions](/docs/AddingCustomOp.md) -* Ability to load a model from a byte array. See ```OrtCreateSessionFromArray``` in [onnxruntime_c_api.h](/include/onnxruntime/core/session/onnxruntime_c_api.h). -* **Global/shared threadpools:** By default each session creates its own set of threadpools. In situations where multiple -sessions need to be created (to infer different models) in the same process, you end up with several threadpools created -by each session. In order to address this inefficiency we introduce a new feature called global/shared threadpools. -The basic idea here is to share a set of global threadpools across multiple sessions. Typical usage of this feature -is as follows - * Populate ```ThreadingOptions```. Use the value of 0 for ORT to pick the defaults. 
- * Create env using ```CreateEnvWithGlobalThreadPools()``` - * Create session and call ```DisablePerSessionThreads()``` on the session options object - * Call ```Run()``` as usual -* **Share allocator(s) between sessions:** - * *Description*: This feature allows multiple sessions in the same process to use the same allocator(s). - * *Scenario*: You've several sessions in the same process and see high memory usage. One of the reasons for this is as follows. Each session creates its own CPU allocator which is arena based by default. [ORT implements](onnxruntime/core/framework/bfc_arena.h) a simplified version of an arena allocator that is based on [Doug Lea's best-first with coalescing algorithm](http://gee.cs.oswego.edu/dl/html/malloc.html). Each allocator lives in its own session. It allocates a large region of memory during init time and thereafter it chunks, coalesces and extends this initial region as per allocation/deallocation demands. Overtime the arena ends up with unused chunks of memory per session. Moreover, the memory allocated by the arena is never returned to the system; once allocated it always remains allocated. All these factors add up when using multiple sessions (each with its own arena) thereby increasing the overall memory consumption of the process. Hence it becomes important to share the arena allocator between sessions. - * *Usage*: - * Create and register a shared allocator with the env using the ```CreateAndRegisterAllocator``` API. This allocator is then reused by all sessions that use the same env instance unless a session -chooses to override this by setting ```session_state.use_env_allocators``` to "0". - * Set ```session.use_env_allocators``` to "1" for each session that wants to use the env registered allocators. - * See test ```TestSharedAllocatorUsingCreateAndRegisterAllocator``` in - onnxruntime/test/shared_lib/test_inference.cc for an example. - * Configuring *OrtArenaCfg*: - * Default values for these configs can be found in the [BFCArena class](onnxruntime/core/framework/bfc_arena.h). - * ```initial_chunk_size_bytes```: This is the size of the region that the arena allocates first. Chunks are handed over to allocation requests from this region. If the logs show that the arena is getting extended a lot more than expected, you're better off choosing a big enough initial size for this. - * ```max_mem```: This is the maximum amount of memory the arena allocates. If a chunk cannot be serviced by any existing region, the arena extends itself by allocating one more region depending on available memory (max_mem - allocated_so_far). An error is returned if available memory is less than the requested extension. - * ```arena_extend_strategy```: This can take only 2 values currently: kSameAsRequested or kNextPowerOfTwo. As the name suggests kNextPowerOfTwo (the default) extends the arena by a power of 2, while kSameAsRequested extends by a size that is the same as the allocation request each time. kSameAsRequested is suited for more advanced configurations where you know the expected memory usage in advance. - * ```max_dead_bytes_per_chunk```: This controls whether a chunk is split to service an allocation request. Currently if the difference between the chunk size and requested size is less than this value, the chunk is not split. This has the potential to waste memory by keeping a part of the chunk unused (hence called dead bytes) throughout the process thereby increasing the memory usage (until this chunk is returned to the arena). 
- -* **Share initializer(s) between sessions:** - * *Description*: This feature allows a user to share the same instance of an initializer across -multiple sessions. - * *Scenario*: You've several models that use the same set of initializers except the last few layers of the model and you load these models in the same process. When every model (session) creates a separate instance of the same initializer, it leads to excessive and wasteful memory usage since in this case it's the same initializer. You want to optimize memory usage while having the flexibility to allocate the initializers (possibly even store them in shared memory). - * *Example Usage*: Use the ```AddInitializer``` API to add a pre-allocated initializer to session options before calling ```CreateSession```. Use the same instance of session options to create several sessions allowing the initializer(s) to be shared between the sessions. See [C API sample usage (TestSharingOfInitializer)](../onnxruntime/test/shared_lib/test_inference.cc) and [C# API sample usage (TestWeightSharingBetweenSessions)](../csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs). - -## Usage Overview - -1. Include [onnxruntime_c_api.h](/include/onnxruntime/core/session/onnxruntime_c_api.h). -2. Call OrtCreateEnv -3. Create Session: OrtCreateSession(env, model_uri, nullptr,...) - - Optionally add more execution providers (e.g. for CUDA use OrtSessionOptionsAppendExecutionProvider_CUDA) -4. Create Tensor - 1) OrtCreateMemoryInfo - 2) OrtCreateTensorWithDataAsOrtValue -5. OrtRun - -## Sample code - -The example below shows a sample run using the SqueezeNet model from ONNX model zoo, including dynamically reading model inputs, outputs, shape and type information, as well as running a sample vector and fetching the resulting class probabilities for inspection. - -* [../csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp](../csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp) - -## Deployment - -### Windows 10 - -Your installer should put the onnxruntime.dll into the same folder as your application. Your application can either use [load-time dynamic linking](https://docs.microsoft.com/en-us/windows/win32/dlls/using-load-time-dynamic-linking) or [run-time dynamic linking](https://docs.microsoft.com/en-us/windows/win32/dlls/using-run-time-dynamic-linking) to bind to the dll. - -#### Dynamic Link Library Search Order - -This is an important article on how Windows finds supporting dlls: [Dynamic Link Library Search Order](https://docs.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order). - -There are some cases where the app is not directly consuming the onnxruntime but instead calling into a DLL that is consuming the onnxruntime. People building these DLLs that consume the onnxruntime need to take care about folder structures. Do not modify the system %path% variable to add your folders. This can conflict with other software on the machine that is also using the onnxruntme. Instead place your DLL and the onnxruntime DLL in the same folder and use [run-time dynamic linking](https://docs.microsoft.com/en-us/windows/win32/dlls/using-run-time-dynamic-linking) to bind explicity to that copy. You can use code like this sample does in [GetModulePath()](https://github.com/microsoft/Windows-Machine-Learning/blob/master/Samples/SampleSharedLib/SampleSharedLib/FileHelper.cpp) to find out what folder your dll is loaded from. 
- -## Telemetry - -To turn on/off telemetry collection on official Windows builds, please use Enable/DisableTelemetryEvents() in the C API. See the [Privacy](./Privacy.md) page for more information on telemetry collection and Microsoft's privacy policy. diff --git a/docs/ExportPyTorchCustomOps.md b/docs/ExportPyTorchCustomOps.md deleted file mode 100644 index 20835ad76f825..0000000000000 --- a/docs/ExportPyTorchCustomOps.md +++ /dev/null @@ -1,186 +0,0 @@ - -## Export of Custom Ops for ONNX Runtime - -This document explains the process of exporting PyTorch models with custom ONNX Runtime ops. -The aim is to export a PyTorch model with operators that are not supported in ONNX, and extend ONNX Runtime to support these custom ops. - -Currently, a torch op can be exported as a custom operator using our custom op (symbolic) registration API. We can -use this API to register custom ONNX Runtime ops under "com.microsoft" domain. - - -### 1. Exporting a Custom Op - -In this example, we take Inverse operator as an example. To enable export of ```torch.inverse```, a symbolic function -can be created and registered as part of custom ops: - -```python -from torch.onnx import register_custom_op_symbolic - -def my_inverse(g, self): - return g.op("com.microsoft::Inverse", self) - -# register_custom_op_symbolic('::inverse', my_inverse, ) -register_custom_op_symbolic('::inverse', my_inverse, 1) - -``` -`````` is a part of the torch operator name. For standard torch operators, namespace can be omitted. - -```com.microsoft``` should be used as the custom opset domain for ONNX Runtime ops. You can choose the custom opset -version during op registration. - -All symbolics for ONNX Runtime custom ops are defined in ``tools/python/register_custom_ops_pytorch_exporter.py``. -If you are adding a symbolic function for a new custom op, add the function to this file. - - -### 2. Extending ONNX Runtime with Custom Ops -The next step is to add op schema and kernel implementation in ONNX Runtime. -Consider the Inverse custom op as an example added in: -https://github.com/microsoft/onnxruntime/pull/3485 - - -Custom op schema and shape inference function should be added in ```onnxruntime/core/graph/contrib_ops/contrib_defs.cc ``` -using ```ONNX_CONTRIB_OPERATOR_SCHEMA```. - -```c++ -ONNX_CONTRIB_OPERATOR_SCHEMA(Inverse) - .SetDomain(kMSDomain) // kMSDomain = "com.microsoft" - .SinceVersion(1) // Same version used at op (symbolic) registration - ... -``` - -To comply with ONNX guideline for new operators, a new operator should have complete reference implementation tests and -shape inference tests. - -Reference implementation python tests should be added in: -``onnxruntime/test/python/contrib_ops`` -E.g.: ``onnxruntime/test/python/contrib_ops/onnx_test_trilu.py`` - -Shape inference C++ tests should be added in: -``onnxruntime/test/contrib_ops`` -E.g.: ``onnxruntime/test/contrib_ops/trilu_shape_inference_test.cc`` - -The operator kernel should be implemented using ```Compute``` function -under contrib namespace in ```onnxruntime/contrib_ops/cpu/.cc``` -for CPU and ```onnxruntime/contrib_ops/cuda/.cc``` for CUDA. - -```c -namespace onnxruntime { -namespace contrib { - -class Inverse final : public OpKernel { - public: - explicit Inverse(const OpKernelInfo& info) : OpKernel(info) {} - Status Compute(OpKernelContext* ctx) const override; - - private: - ... 
-}; - -ONNX_OPERATOR_KERNEL_EX( - Inverse, - kMSDomain, - 1, - kCpuExecutionProvider, - KernelDefBuilder() - .TypeConstraint("T", BuildKernelDefConstraints()), - Inverse); - -Status Inverse::Compute(OpKernelContext* ctx) const { -... // kernel implementation -} - -} // namespace contrib -} // namespace onnxruntime - -``` - -Operator kernel should be registered in ```onnxruntime/contrib_ops/cpu_contrib_kernels.cc``` -for CPU and ```onnxruntime/contrib_ops/cuda_contrib_kernels.cc``` for CUDA. - -Now you should be able to build and install ONNX Runtime to start using your custom op. - -##### ONNX Runtime Tests - -ONNX Runtime custom op kernel tests should be added in: ```onnxruntime/test/contrib_ops/_test.cc ``` - -```c++ -namespace onnxruntime { -namespace test { - -// Add a comprehensive set of unit tests for custom op kernel implementation - -TEST(InverseContribOpTest, two_by_two_float) { - OpTester test("Inverse", 1, kMSDomain); // custom opset version and domain - test.AddInput("X", {2, 2}, {4, 7, 2, 6}); - test.AddOutput("Y", {2, 2}, {0.6f, -0.7f, -0.2f, 0.4f}); - test.Run(); -} - -... - -} // namespace test -} // namespace onnxruntime - -``` - - -### 3. Test model Export End to End - -Once the custom op is registered in the exporter and implemented in ONNX Runtime, you should be able to -export it as part of you ONNX model and run it with ONNX Runtime. - -Below you can find a sample script for exporting and running the inverse operator as part of a model. -The exported model includes a combination of ONNX standard ops and the custom ops. - -This test also compares the output of PyTorch model with ONNX Runtime outputs to test both the operator export and -implementation. - -```python -import torch -import onnxruntime -import io -import numpy - - -class CustomInverse(torch.nn.Module): - def forward(self, x): - return torch.inverse(x) + x - -x = torch.randn(3, 3) - -# Export model to ONNX -f = io.BytesIO() -torch.onnx.export(CustomInverse(), (x,), f) - -model = CustomInverse() -pt_outputs = model(x) - -# Run the exported model with ONNX Runtime -ort_sess = onnxruntime.InferenceSession(f.getvalue()) -ort_inputs = dict((ort_sess.get_inputs()[i].name, input.cpu().numpy()) for i, input in enumerate((x,))) -ort_outputs = ort_sess.run(None, ort_inputs) - -# Validate PyTorch and ONNX Runtime results -numpy.testing.assert_allclose(pt_outputs.cpu().numpy(), ort_outputs[0], rtol=1e-03, atol=1e-05) -``` - -By default, the opset version will be set to ``1`` for custom opsets. If you'd like to export your -custom op to a higher opset version, you can specify the custom opset domain and version using -the ``custom_opsets argument`` when calling the export API. Note that this is different than the opset -version associated with default ```ONNX``` domain. - -``` -torch.onnx.export(CustomInverse(), (x,), f, custom_opsets={"com.microsoft": 5}) -``` - -Note that you can export a custom op to any version >= the opset version used at registration. - -We have a set of tests for export and output validation of ONNX models with ONNX Runtime custom ops in -``tools/test/test_test_custom_ops_pytorch_exporter.py``. If you're adding a new custom operator, please -make sure to include tests in this file. 
- -You can run these tests using the command: - -``` -PYTHONPATH= pytest -v test_custom_ops_pytorch_exporter.py -``` diff --git a/docs/InferenceHighLevelDesign.md b/docs/InferenceHighLevelDesign.md deleted file mode 100644 index d5f0e73c1c632..0000000000000 --- a/docs/InferenceHighLevelDesign.md +++ /dev/null @@ -1,135 +0,0 @@ -# ONNX Runtime High Level Design - -This document outlines the high level design of -ONNX Runtime - a high performance, cross platform engine. - -## Key objectives -* Maximally and automatically leverage the custom accelerators and runtimes -available on disparate platforms. -* Provide the right abstraction and runtime support for custom accelerators and -runtimes. We call this abstraction an [execution -provider](../include/onnxruntime/core/framework/execution_provider.h). It defines and exposes a set of -its capabilities to ONNXRuntime: a set of single or fused nodes it can -execute, its memory allocator, and more. Custom accelerators and runtimes are -instances of execution providers. -* We don't expect that an execution provider can always run an ONNX model fully -on its device. This means that ONNXRuntime must be able to execute a single -model in a heterogeneous environment involving multiple execution providers. -* Provide support for high-level optimizations that can be expressed as -model-to-model transformations via a [graph-transformation -API](../include/onnxruntime/core/optimizer/graph_transformer.h). Such -transformations fall into two categories: global transformations, those that -require analysis and transformation of the entire graph, and local -transformations, which can be captured as simple (algebraic) [rewriting -rules](../include/onnxruntime/core/optimizer/rewrite_rule.h). - -## High-level system architecture -The flow is quite simple. Starting from an ONNX model, ONNXRuntime first -converts the model graph into its in-memory graph representation. It then -applies a number of graph transformations that a) perform a set of provider -independent optimizations such cast transformations between float16 and float32, and b) partition the -graph into a set of subgraphs based on the available execution providers. Each -subgraph is assigned to an execution provider. We ensure that a subgraph can be -executed by an execution provider by querying the capability of the execution -provider using the GetCapability() API. - -![ONNXRuntime high level system architecture](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/228d22d3-6e3e-48b1-811c-1d48353f031c.png) - -### More about partitioning -ONNXRuntime partitions a model graph into subgraphs based on the available execution providers, one for each distinct provider. ONNXRuntime provides -a default execution provider that is used as the fallback execution for the -operators that cannot be pushed onto the more specialized but more efficient -execution providers. Intuitively we want to push computation to more -specialized execution providers whenever possible. - -We use a simple graph partitioning technique. The available execution providers -will be considered in a specific order, and each will be assigned the maximal -subgraphs (possibly more than one) that it is able to handle. The -ONNXRuntime-provided default execution provider will be the last one -considered, and it ensures completeness. More sophisticated optimizations can be -considered in the future (or can even be implemented as a composite execution -provider). 
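The provider order is ultimately something the application controls when it registers execution providers on a session. As a small illustration, here is a sketch using the Python bindings (assuming a CUDA-enabled build and a placeholder model path): the registered providers are consulted in list order during partitioning, with the default CPU provider as the final fallback.

```python
# Sketch: providers are considered for partitioning in registration order.
import onnxruntime as rt

sess = rt.InferenceSession("model.onnx")  # placeholder path
print(sess.get_providers())               # currently registered providers, highest priority first

# Ask for CUDA first; nodes it cannot claim fall back to the CPU provider.
sess.set_providers(["CUDAExecutionProvider", "CPUExecutionProvider"])
```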
- -Conceptually, each partition is reduced to a single fused operator. It is -created by invoking the execution provider's Compile() method and wraps it as a -custom operator. Currently we support only synchronous mode of execution. An execution -provider exposes its memory allocator, which is used to allocate the input -tensors for the execution provider. The rewriting and partitioning transform the -initial model graph into a new graph composed of operators assigned to either -the default execution provider or other registered execution -providers. The ONNXRuntime execution engine is responsible for running this graph. - -## Key design decisions -* Multiple threads can invoke the Run() method on the same -inference session object. See [API doc](C_API.md) for more details. -* To facilitate this, the Compute() function of all kernels is const -implying the kernels are stateless. -* Implementations of the operators by execution providers are called -kernels. Each execution provider supports a subset of the (ONNX) -operators/kernels. -* The ONNX Runtime guarantees that all operators are supported by the default -execution provider. -* Tensor representation: ONNXRuntime will utilize a standard representation for -the tensor runtime values. The execution providers can internally use a -different representation if they choose to, but it is their responsibility to -convert the values from/to the standard representation at the boundaries of -their subgraph. - -## Extensibility Options -* [Add a custom operator/kernel](AddingCustomOp.md) -* [Add an execution provider](AddingExecutionProvider.md) -* [Add a new graph -transform](../include/onnxruntime/core/optimizer/graph_transformer.h) -* [Add a new rewrite rule](../include/onnxruntime/core/optimizer/rewrite_rule.h) - -## The ONNX Runtime and Windows OS integration - -The ONNX runtime shipped with the Windows operating system in build 1809 (RS5). The runtime was embedded inside the Windows.AI.MachineLearning.dll and was exposed via that WinRT API (WinML for short). It includes CPU support and a DirectML execution provider for GPU support. Since then it has continued to ship in every version of Windows. - -Starting with the ONNX Runtime 1.2 release we are bringing a new layered architecture to the ONNX Runtime and Windows ML. -*Note: This feature is preview as of the 1.2 release* - -The high level design looks like this - -![ONNX + WinML layered architecture](images/layered-architecture.png) - -You can see we replaced the embedded ONNX runtime with the new ONNXRuntime.dll. With this new approach customers have flexibility on which API they choose to use and on how they want to distribute the binaries. - -### API choice - -Developers can now choose which API works best for their scenario. - -||WinRT|C API| -|--|--|--| -|Type system| Integration with Windows RT types| Platform neutral types| -|Language support| Language support via WinRT Projections| Language support via per language projections| -|Tensorization| Accepts VideoFrames and converts to tensors (support for CPU and GPU)| Accepts tensors| - -### Distribution choice - -You can also choose to use runtimes included in the Windows OS, or use the redist nuget to ship the runtime with the app. 
|Distribution|Inbox|App NuGet|
|--|--|--|
|Disk footprint| Included in the OS| Included in the App|
|Servicing fixes| Serviced by OS updates| Serviced by the App|
|Execution Providers| CPU & DirectML EP | App chosen EP|
|Compatibility testing| Tested with OS flights against supported GPUs and CPUs | App performs compatibility testing|
|Opset| Refreshed in OS updates| App chooses|


### Using the NuGet WinRT API with other C-API distributions
The WinRT API NuGet is distributed with a curated build of the OnnxRuntime engine. App developers may wish to use the WinRT API but find themselves limited to the functionality provided by the curated OnnxRuntime engine that ships as part of the WinRT API NuGet package. This can happen because the OnnxRuntime engine shipped with the WinRT API NuGet package only contains the CPU and DML execution providers.

App developers may additionally wish to use a custom built-from-source version of the OnnxRuntime engine, or a prebuilt version of the OnnxRuntime engine from another distribution source such as the Microsoft.ML.OnnxRuntime.MKLML distribution.

To enable this, the WinRT API NuGet has been made compatible with a set of OnnxRuntime engines that ship in different NuGet packages.

The following distributions contain compatible OnnxRuntime engines:
- [Microsoft.ML.OnnxRuntime](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime)
- [Microsoft.ML.OnnxRuntime.DirectML](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.DirectML/)
- [Microsoft.ML.OnnxRuntime.MKLML](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.MKLML)

Note that compatible distributions must match in release version.

In order to use compatible engines, replace onnxruntime.dll with the desired engine binary and its associated binaries.

diff --git a/docs/Java_API.md b/docs/Java_API.md
deleted file mode 100644
index 72978e07700c7..0000000000000
--- a/docs/Java_API.md
+++ /dev/null
@@ -1,79 +0,0 @@
# ONNX Runtime Java API
The ONNX Runtime provides a Java binding for running inference on ONNX models on a JVM, using Java 8 or newer.

Release artifacts are published to Maven Central for use as a dependency in most Java build tools. The artifacts are built with support for some popular platforms.

![Version Shield](https://img.shields.io/maven-central/v/com.microsoft.onnxruntime/onnxruntime)

| Artifact | Description | Supported Platforms |
|-----------|-------------|---------------------|
| [com.microsoft.onnxruntime:onnxruntime](https://search.maven.org/artifact/com.microsoft.onnxruntime/onnxruntime) | CPU | Windows x64, Linux x64, macOS x64 |
| [com.microsoft.onnxruntime:onnxruntime_gpu](https://search.maven.org/artifact/com.microsoft.onnxruntime/onnxruntime_gpu) | GPU (CUDA) | Windows x64, Linux x64 |

For building locally, please see the [Java API development documentation](../java/README.md) for more details.

For customization of the loading mechanism of the shared library, please see [advanced loading instructions](../java/README.md#advanced-loading).

## API Reference

The Javadoc is available [here](https://javadoc.io/doc/com.microsoft.onnxruntime/onnxruntime).

## Sample Code

An example implementation is located in [src/test/java/sample/ScoreMNIST.java](../java/src/test/java/sample/ScoreMNIST.java). Once compiled, the sample code expects a path to the model, a path to the MNIST test data, and an optional scikit-learn flag as arguments. MNIST is expected to be in libsvm format.
If the optional scikit-learn flag is supplied the model -is expected to be produced by skl2onnx (so expects a flat feature vector, and -produces a structured output), otherwise the model is expected to be a CNN from -pytorch (expecting a `[1][1][28][28]` input, producing a vector of -probabilities). Two example models are provided in [testdata](../java/testdata), -`cnn_mnist_pytorch.onnx` and `lr_mnist_scikit.onnx`. The first is a LeNet5 style -CNN trained using PyTorch, the second is a logistic regression trained using scikit-learn. - -The unit tests contain several examples of loading models, inspecting input/output node shapes and types, as well as constructing tensors for scoring. - -* [../java/src/test/java/ai/onnxruntime/InferenceTest.java#L66](../java/src/test/java/ai/onnxruntime/InferenceTest.java#L66) - -## Getting Started -Here is simple tutorial for getting started with running inference on an existing ONNX model for a given input data. The model is typically trained using any of the well-known training frameworks and exported into the ONNX format. -Note the code presented below uses syntax available from Java 10 onwards. The Java 8 syntax is similar but more verbose. -To start a scoring session, first create the `OrtEnvironment`, then open a session using the `OrtSession` class, passing in the file path to the model as a parameter. - - var env = OrtEnvironment.getEnvironment(); - var session = env.createSession("model.onnx",new OrtSession.SessionOptions()); - -Once a session is created, you can execute queries using the `run` method of the `OrtSession` object. -At the moment we support `OnnxTensor` inputs, and models can produce `OnnxTensor`, `OnnxSequence` or `OnnxMap` outputs. The latter two are more likely when scoring models produced by frameworks like scikit-learn. -The run call expects a `Map` where the keys match input node names stored in the model. These can be viewed by calling `session.getInputNames()` or `session.getInputInfo()` on an instantiated session. -The run call produces a `Result` object, which contains a `Map` representing the output. The `Result` object is `AutoCloseable` and can be used in a try-with-resources statement to -prevent references from leaking out. Once the `Result` object is closed, all it's child `OnnxValue`s are closed too. - - OnnxTensor t1,t2; - var inputs = Map.of("name1",t1,"name2",t2); - try (var results = session.run(inputs)) { - // manipulate the results - } - -You can load your input data into OnnxTensor objects in several ways. The most efficient way is to use a `java.nio.Buffer`, but it's possible to use multidimensional arrays too. If constructed using arrays the arrays must not be ragged. - - FloatBuffer sourceData; // assume your data is loaded into a FloatBuffer - long[] dimensions; // and the dimensions of the input are stored here - var tensorFromBuffer = OnnxTensor.createTensor(env,sourceData,dimensions); - - float[][] sourceArray = new float[28][28]; // assume your data is loaded into a float array - var tensorFromArray = OnnxTensor.createTensor(env,sourceArray); - -Here is a [complete sample program](../java/src/test/java/sample/ScoreMNIST.java) that runs inference on a pretrained MNIST model. - -## Running on a GPU or with another provider (Optional) -To enable other execution providers like GPUs simply turn on the appropriate flag on SessionOptions when creating an OrtSession. 
- - int gpuDeviceId = 0; // The GPU device ID to execute on - var sessionOptions = new OrtSession.SessionOptions(); - sessionOptions.addCUDA(gpuDeviceId); - var session = environment.createSession("model.onnx", sessionOptions); - -The execution providers are preferred in the order they were enabled. - diff --git a/docs/ONNX_Runtime_Graph_Optimizations.md b/docs/ONNX_Runtime_Graph_Optimizations.md deleted file mode 100644 index 9b23d30fed051..0000000000000 --- a/docs/ONNX_Runtime_Graph_Optimizations.md +++ /dev/null @@ -1,148 +0,0 @@ -# Graph Optimizations in ONNX Runtime - -ONNX Runtime provides various graph optimizations to improve model performance. Graph optimizations are essentially graph-level transformations, ranging from small graph simplifications and node eliminations to more complex node fusions and layout optimizations. - -Graph optimizations are divided in several categories (or *levels*) based on their complexity and functionality. They can be performed either *online* or *offline*. In online mode, the optimizations are done before performing the inference, while in offline mode, the runtime saves the optimized graph to disk. ONNX Runtime provides Python, C#, C++, and C APIs to enable different optimization levels and to choose between offline vs. online mode. - -Below we provide details on the optimization levels, the online/offline mode, and the various APIs to control them. - -## Graph Optimization Levels - -Graph optimizations are divided in three levels: -* Basic -* Extended -* Layout Optimizations - -The optimizations belonging to one level are performed after the optimizations of the previous level have been applied (e.g., extended optimizations are applied after basic optimizations have been applied). - -**All optimizations are enabled by default.** - -### Basic Graph Optimizations - -These are semantics-preserving graph rewrites which remove redundant nodes and redundant computation. They run before graph partitioning and thus apply to all the execution providers. Available basic graph optimizations are as follows: - -* Constant Folding: Statically computes parts of the graph that rely only on constant initializers. This eliminates the need to compute them during runtime. - -* Redundant node eliminations: Remove all redundant nodes without changing the graph structure. The following such optimizations are currently supported: - * Identity Elimination - * Slice Elimination - * Unsqueeze Elimination - * Dropout Elimination - -* Semantics-preserving node fusions : Fuse/fold multiple nodes into a single node. For example, Conv Add fusion folds the Add operator as the bias of the Conv operator. The following such optimizations are currently supported: - * Conv Add Fusion - * Conv Mul Fusion - * Conv BatchNorm Fusion - * Relu Clip Fusion - * Reshape Fusion - -### Extended Graph Optimizations - -These optimizations include complex node fusions. They are run after graph partitioning and are only applied to the nodes assigned to the CPU or CUDA execution provider. 
Available extended graph optimizations are as follows: - -| Optimization | Execution Provider | Comment | -|---------------------------------|--------------------|-----------------------------------------------------------------------------| -| GEMM Activation Fusion | cpu | | -| Matmul Add Fusion | cpu | | -| Conv Activation Fusion | cpu | | -| GELU Fusion | cpu or cuda | | -| Layer Normalization Fusion | cpu or cuda | | -| BERT Embedding Layer Fusion | cpu or cuda | Fuse BERT embedding layer, layer normalization and attention mask length | -| Attention Fusion | cpu or cuda | Attention mask has approximation in cuda execution provider | -| Skip Layer Normalization Fusion | cpu or cuda | Fuse bias of fully connected layer, skip connection and layer normalization | -| Bias GELU Fusion | cpu or cuda | Fuse bias of fully connected layer and GELU activation | -| GELU Approximation | cuda | Erf is approximated by a formula using tanh function | - -To optimize inference performance of BERT model, approximation is used in GELU approximation and Attention fusion for cuda execution provider. There might be slight difference in result. The impact on accuracy could be neglected based on our evaluation: F1 score for a BERT model on SQuAD v1.1 is almost same (87.05 vs 87.03). - -GELU approximation is disabled by default. - -### Layout Optimizations - -These optimizations change the data layout for applicable nodes to achieve higher performance improvements. They are run after graph partitioning and are only applied to nodes assigned to CPU execution provider. Available layout optimizations are as follows: - -* NCHWc Optimizer: Optimizes the graph by using NCHWc layout instead of NCHW layout. - -## Online/Offline Mode - -All optimizations can be performed either online or offline. In online mode, when initializing an inference session, we also apply all enabled graph optimizations before performing model inference. Applying all optimizations each time we initiate a session can add overhead to the model startup time (especially for complex models), which can be critical in production scenarios. This is where the offline mode can bring a lot of benefit. In offline mode, after performing graph optimizations, ONNX Runtime serializes the resulting model to disk. Subsequently, when new inference sessions are created for this model, we can instead use the already optimized model to reduce startup time. - -**Notes**: - -* When running in offline mode, make sure to use the exact same options (e.g., execution providers, optimization level) and hardware as the target machine that the model inference will run on (e.g., you cannot run a model pre-optimized for a GPU execution provider on a machine that is equipped only with CPU). -* When layout optimizations are enabled, the offline mode can only be used on compatible hardware to the environment when the offline model is saved. For example, if model has layout optimized for AVX2, the offline model would require CPUs that support AVX2. - -## Usage - -### General Note -**Levels**: -ONNX Runtime defines the `GraphOptimizationLevel` enum to determine which of the aforementioned optimization levels will be enabled. Choosing a level enables the optimizations of that level, as well as the optimizations of all preceding levels. For example, enabling Extended optimizations, also enables Basic optimizations. 
The mapping of these levels to the enum is as follows:

* GraphOptimizationLevel::ORT_DISABLE_ALL -> Disables all optimizations
* GraphOptimizationLevel::ORT_ENABLE_BASIC -> Enables basic optimizations
* GraphOptimizationLevel::ORT_ENABLE_EXTENDED -> Enables basic and extended optimizations
* GraphOptimizationLevel::ORT_ENABLE_ALL -> Enables all available optimizations, including layout optimizations

**Online/Offline Mode**:
To enable serialization of the optimized model to disk, set the corresponding SessionOptions option (for example, `optimized_model_filepath` in Python) to the path where the optimized model should be stored.

### Python API Usage
```python
import onnxruntime as rt

sess_options = rt.SessionOptions()

# Set graph optimization level
sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

# To enable model serialization after graph optimization set this
sess_options.optimized_model_filepath = "<model_output_path>"

session = rt.InferenceSession("<model_path>", sess_options)
```

### C API Example:
```c
  const OrtApi* g_ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);
  OrtEnv* env;
  g_ort->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "test", &env);
  OrtSessionOptions* session_options;
  g_ort->CreateSessionOptions(&session_options);

  // Set graph optimization level
  g_ort->SetSessionGraphOptimizationLevel(session_options, ORT_ENABLE_EXTENDED);

  // To enable model serialization after graph optimization set this
  const wchar_t* optimized_model_path = L"optimized_model_path";
  g_ort->SetOptimizedModelFilePath(session_options, optimized_model_path);

  OrtSession* session;
  const wchar_t* model_path = L"model_path";
  g_ort->CreateSession(env, model_path, session_options, &session);
```

### C# API Example:
```c#
SessionOptions so = new SessionOptions();

// Set graph optimization level
so.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_EXTENDED;

// To enable model serialization after graph optimization set this
so.OptimizedModelFilePath = @"model_output_path\optimized_model.onnx";

var session = new InferenceSession(modelPath, so);
```

### C++ API Example:
```c++
Ort::SessionOptions session_options;

// Set graph optimization level
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);

// To enable model serialization after graph optimization set this
session_options.SetOptimizedModelFilePath("optimized_file_path");

auto session_ = Ort::Session(env, "model_file_path", session_options);
```

diff --git a/docs/ONNX_Runtime_Perf_Tuning.md b/docs/ONNX_Runtime_Perf_Tuning.md
deleted file mode 100644
index c7040b8a0e390..0000000000000
--- a/docs/ONNX_Runtime_Perf_Tuning.md
+++ /dev/null
@@ -1,179 +0,0 @@
# ONNX Runtime Performance Tuning

ONNX Runtime gives high performance across a range of hardware options by providing "Execution Providers" to interface with different execution environments. See: [design overview](./InferenceHighLevelDesign.md), [supported execution providers](../README.md#supported-accelerators).

Along with this flexibility come decisions about tuning and usage. For each model running with each execution provider, there are settings that can be tuned (e.g. thread count, wait policy, etc.) to improve performance.

This document covers basic tools and knobs that can be leveraged to find the best performance for your model and hardware.
- -**Topics** -* [Performance Tuning Tools](#Performance-Tuning-Tools) -* [Using different Execution Providers](#Using-different-Execution-Providers) -* [Which Execution Provider will provide the best performance?](#Which-Execution-Provider-will-provide-the-best-performance) -* [Tuning performance for specific Execution Providers](#Tuning-performance-for-specific-Execution-Providers) -* [Troubleshooting model performance issues](#Troubleshooting-model-performance-issues) -*** - -## Performance Tuning Tools -The [ONNX Go Live "OLive" tool](https://github.com/microsoft/OLive) is an easy-to-use pipeline for converting models to ONNX and optimizing performance with ONNX Runtime. The tool can help identify the optimal runtime configuration to get the best performance on the target hardware for the model. -As a quickstart, please see the notebooks: [Python](https://github.com/microsoft/OLive/blob/master/notebook/Convert_Models_and_Tune_Performance_with_OLive_Python_SDK.ipynb), [Docker images](https://github.com/microsoft/OLive/blob/master/notebook/Convert_Models_and_Tune_Performance_with_OLive_Docker_Images.ipynb) - - -### Profiling and Performance Report - -The onnxruntime_perf_test.exe tool (available from the build drop) can be used to test various knobs. Please find the usage instructions using `onnxruntime_perf_test.exe -h`. - -You can enable ONNX Runtime latency profiling in code: - -```python -import onnxruntime as rt - -sess_options = rt.SessionOptions() -sess_options.enable_profiling = True -``` -If you are using the onnxruntime_perf_test.exe tool, you can add `-p [profile_file]` to enable performance profiling. - -In both cases, you will get a JSON file which contains the detailed performance data (threading, latency of each operator, etc). This file is a standard performance tracing file, and to view it in a user friendly way, you can open it by using chrome://tracing: -* Open chrome browser -* Type chrome://tracing in the address bar -* Load the generated JSON file - -## Using different Execution Providers -To learn more about different Execution Providers, see [docs/exeuction_providers](./execution_providers). - -### Python API -Official Python packages on Pypi only support the default CPU (MLAS) and default GPU (CUDA) execution providers. For other execution providers, you need to build from source. Please refer to the [build instructions](../BUILD.md). The recommended instructions build the wheel with debug info in parallel. - -For example: - -`DNNL: ./build.sh --config RelWithDebInfo --use_dnnl --build_wheel --parallel` - -` CUDA: ./build.sh --config RelWithDebInfo --use_cuda --build_wheel --parallel` - - -### C and C# API -Official release (nuget package) supports default (MLAS) and MKL-ML for CPU, and CUDA for GPU. For other execution providers, you need to build from source. Append `--build_csharp` to the instructions to build both C# and C packages. - -For example: - -`DNNL: ./build.sh --config RelWithDebInfo --use_dnnl --build_csharp --parallel` - -`CUDA: ./build.sh --config RelWithDebInfo --use_cuda --build_csharp --parallel` - -In order to use DNNL, CUDA, or TensorRT execution provider, you need to call the C API OrtSessionOptionsAppendExecutionProvider. 
Here is an example for the CUDA execution provider:

C API Example:
```c
  const OrtApi* g_ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);
  OrtEnv* env;
  g_ort->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "test", &env);
  OrtSessionOptions* session_options;
  g_ort->CreateSessionOptions(&session_options);
  // Declared in the CUDA provider factory header
  OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);
  OrtSession* session;
  g_ort->CreateSession(env, model_path, session_options, &session);
```

C# API Example:
```c#
SessionOptions so = new SessionOptions();
so.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_EXTENDED;
so.AppendExecutionProvider_CUDA(0);
var session = new InferenceSession(modelPath, so);
```

Python API Example:
```python
import onnxruntime as rt

so = rt.SessionOptions()
so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
session = rt.InferenceSession(model, sess_options=so)
session.set_providers(['CUDAExecutionProvider'])
```

## Which Execution Provider will provide the best performance?
Performance depends on the specific model you're trying to run, the session and run options you've selected, and, of course, your specific hardware target. Below you'll find some information that may help you select the right Execution Provider.

### CUDA (Default GPU) or CPU?
The CPU version of ONNX Runtime provides a complete implementation of all operators in the ONNX spec. This ensures that your ONNX-compliant model can execute successfully. In order to keep the binary size small, common data types are supported for the ops. If you are using an uncommon data type that is not supported, you can file an issue and/or contribute a PR (see examples - [PR #2112](https://github.com/microsoft/onnxruntime/pull/2112), [PR #2034](https://github.com/microsoft/onnxruntime/pull/2034), [PR #1565](https://github.com/microsoft/onnxruntime/pull/1565)). Please make sure you provide details on the usage justification.

Additionally, not all CUDA kernels are implemented, as these have been prioritized on an as-needed basis. This means that if your model contains operators that do not have a CUDA implementation, it will fall back to CPU. Switching between CPU and GPU can cause a significant performance impact. If you require a specific operator that is not currently supported, please consider [contributing](./../CONTRIBUTING.md) and/or [filing an issue](https://github.com/microsoft/onnxruntime/issues) clearly describing your use case, and share your model if possible.

### TensorRT or CUDA?
TensorRT and CUDA are separate execution providers for ONNX Runtime. On the same hardware, TensorRT will generally provide better performance; however, this depends on the specific model and whether the operators in the model can be supported by TensorRT. In cases where TensorRT cannot handle the subgraph(s), it will fall back to CUDA. Note that the TensorRT EP may depend on a different version of CUDA than the CUDA EP.

### TensorRT/CUDA or DirectML?
DirectML is the hardware-accelerated DirectX 12 library for machine learning on Windows and supports all DirectX 12 capable devices (NVIDIA, Intel, AMD). This means that if you are targeting Windows GPUs, using the DirectML Execution Provider is likely your best bet. It can be used with both the ONNX Runtime APIs and the [WinML APIs](./WinRT_API.md).

## Tuning performance for specific Execution Providers

### Thread management
* If ORT is built with OpenMP, use the OpenMP env variable to control the number of intra op num threads.
-* If ORT is not built with OpenMP, use the appropriate ORT API to control intra op num threads. -* Inter op num threads (used only when parallel execution is enabled) is not affected by OpenMP settings and should -always be set using the ORT APIs. - -### Default CPU Execution Provider (MLAS) -The default execution provider uses different knobs to control the thread number. - -For the default CPU execution provider, you can try following knobs in the Python API: -```python -import onnxruntime as rt - -sess_options = rt.SessionOptions() - -sess_options.intra_op_num_threads = 2 -sess_options.execution_mode = rt.ExecutionMode.ORT_SEQUENTIAL -sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL -``` - -* Thread Count - * `sess_options.intra_op_num_threads = 2` controls the number of threads to use to run the model -* Sequential vs Parallel Execution - * `sess_options.execution_mode = rt.ExecutionMode.ORT_SEQUENTIAL` controls whether the operators in the graph run sequentially or in parallel. Usually when a model has many branches, setting this option to false will provide better performance. - * When `sess_options.execution_mode = rt.ExecutionMode.ORT_PARALLEL`, you can set `sess_options.inter_op_num_threads` to control the -number of threads used to parallelize the execution of the graph (across nodes). - -* sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL. Default is already ORT_ENABLE_ALL(99). Please see [onnxruntime_c_api.h](../include/onnxruntime/core/session/onnxruntime_c_api.h#L241) (enum GraphOptimizationLevel) for the full list of all optimization levels. For details regarding available optimizations and usage please refer to the [Graph Optimizations Doc](../docs/ONNX_Runtime_Graph_Optimizations.md). - -### MKL_DNN/MKL_ML Execution Provider -MKL_DNN and MKL_ML depend on openmp for parallelization. For those execution providers, we need to use the openmp environment variable to tune the performance. - -The most widely used environment variables are: - -* OMP_NUM_THREADS=n - * Controls the thread pool size - -* OMP_WAIT_POLICY=PASSIVE/ACTIVE - * Controls whether thread spinning is enabled - * PASSIVE is also called throughput mode and will yield CPU after finishing current task - * ACTIVE will not yield CPU, instead it will have a while loop to check whether the next task is ready - * Use PASSIVE if your CPU usage already high, and use ACTIVE when you want to trade CPU with latency - -## Using and configuring shared arena based allocator to reduce memory consumption between multiple sessions -See `Share allocator(s) between sessions` section in [C API documentation](C_API.md). - -## Troubleshooting model performance issues -The answers below are troubleshooting suggestions based on common previous user-filed issues and questions. This list is by no means exhaustive and there is a lot of case-by-case fluctuation depending on the model and specific usage scenario. Please use this information to guide your troubleshooting, search through previously filed issues for related topics, and/or file a new issue if your problem is still not resolved. - -### Performance Troubleshooting Checklist -Here is a list of things to check through when assessing performance issues. -* Are you using OpenMP? OpenMP will parallelize some of the code for potential performance improvements. This is not recommended for running on single threads. -* Have you enabled all [graph optimizations](./ONNX_Runtime_Graph_Optimizations.md)? 
The official published packages do enable all by default, but when building from source, check that these are enabled in your build. -* Have you searched through prior filed [Github issues](https://github.com/microsoft/onnxruntime/issues) to see if your problem has been discussed previously? Please do this before filing new issues. -* If using CUDA or TensorRT, do you have the right versions of the dependent libraries installed? - -### I need help performance tuning for BERT models. -For BERT models, sometimes ONNX Runtime cannot apply the best optimization due to reasons such as framework version updates. We recommend trying out the [BERT optimization tool](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/bert), which reflects the latest changes in graph pattern matching and model conversions, and a set of [notebooks](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/bert/notebooks) to help get started. - -### Why is the model graph not optimized even with graph_optimization_level set to ORT_ENABLE_ALL? -The ONNX model from IR_VERSION 4 only treats initializers that appear in graph input as non-constant. This may fail some of the graph optimizations, like const folding, operator fusion and etc. Move initializers out of graph inputs if there is no need to override them, by either re-generating the model with latest exporter/converter or with the tool [remove_initializer_from_input.py](./../tools/python/remove_initializer_from_input.py). - -### Why is my model running slower on GPU than CPU? -Depending on which execution provider you're using, it may not have full support for all the operators in your model. Fallback to CPU ops can cause hits in performance speed. Moreover even if an op is implemented by the CUDA execution provider, it may not necessarily assign/place the op to the CUDA EP due to performance reasons. To see the placement decided by ORT, turn on verbose logging and look at the console output. - -### My converted Tensorflow model is slow - why? -NCHW and NHWC are two different memory layout for 4-D tensors. - -Most TensorFlow operations used by a CNN support both NHWC and NCHW data format. The Tensorflow team suggests that on GPU NCHW is faster but on CPU NHWC is sometimes faster in Tensorflow. However, ONNX only supports NCHW. As a result, if the original model is in NHWC format, when the model is converted extra transposes may be added. The [tensorflow-onnx](https://github.com/onnx/tensorflow-onnx) and [keras-onnx](https://github.com/onnx/keras-onnx) converters do remove many of these transposes, but if this doesn't help sufficiently, consider retraining the model using NCHW. diff --git a/docs/PyOp.md b/docs/PyOp.md deleted file mode 100644 index a494362205b6d..0000000000000 --- a/docs/PyOp.md +++ /dev/null @@ -1,136 +0,0 @@ -# Python Operator - -**Deprecation Note: This feature is deprecated and no longer supported, please refer to [onnxruntime_customops](https://github.com/microsoft/ort-customops) project for this function.** - -The Python Operator provides the capability to easily invoke any custom Python code within a single node of an ONNX graph using ONNX Runtime. This can be useful for quicker experimentation when a model requires operators that are not officially supported in ONNX and ONNX Runtime, particularly if there is already a Python implementation for the required functionality. This should be used with discretion in production scenarios, and all security or other risks should be considered beforehand. 
- -## Design Overview -The feature can be found under [onnxruntime/core/language_interop_ops](../onnxruntime/core/language_interop_ops). -Here is a chart of calling sequence: -
-onnxruntime                        python capi                         script
-     |                                  |                                 |
-     | ------------------------------>  |                                 |
-     |       call with tensor(s)        | ------------------------------> |
-     |                                  |         call with numpy(s)      | 
-     |                                  |                                 | compute
-     |                                  | <------------------------------ |
-     | <------------------------------  |           return numpy(s)       |
-     |         return tensor(s)         |                                 |
-
- -## How to Use -### Step 1 -Build onnxruntime with `--config Release --enable_language_interop_ops --build_wheel` and pip install the latest wheel file. - -### Step 2 -Create an onnx model containing Python operator nodes: -```python -ad1_node = helper.make_node('Add', ['A','B'], ['S']) -mul_node = helper.make_node('Mul', ['C','D'], ['P']) -py1_node = helper.make_node(op_type = 'PyOp', #required, must be 'PyOp' - inputs = ['S','P'], #required - outputs = ['L','M','N'], #required - domain = 'pyopmulti_1', #required, must be unique - input_types = [TensorProto.FLOAT, TensorProto.FLOAT], #required - output_types = [TensorProto.FLOAT, TensorProto.FLOAT, TensorProto.FLOAT], #required - module = 'mymodule', #required - class_name = 'Multi_1', #required - compute = 'compute', #optional, 'compute' by default - W1 = '5', W2 = '7', W3 = '9') #optional, must all be strings -ad2_node = helper.make_node('Add', ['L','M'], ['H']) -py2_node = helper.make_node('PyOp',['H','N','E'],['O','W'], domain = 'pyopmulti_2', - input_types = [TensorProto.FLOAT, TensorProto.FLOAT, TensorProto.FLOAT], - output_types = [TensorProto.FLOAT, TensorProto.FLOAT], - module = 'mymodule', class_name = 'Multi_2') -sub_node = helper.make_node('Sub', ['O','W'], ['F']) -graph = helper.make_graph([ad1_node,mul_node,py1_node,ad2_node,py2_node,sub_node], 'multi_pyop_graph', [A,B,C,D,E], [F]) -model = helper.make_model(graph, producer_name = 'pyop_model') -onnx.save(model, './model.onnx') -``` -### Step 3 -Implement mymodule.py: -```python -class Multi_1: - def __init__(self, W1, W2, W3): - self.W1 = int(W1) - self.W2 = int(W2) - self.W3 = int(W3) - def compute(self, S, P): - ret = S + P - return ret + self.W1, ret + self.W2, ret + self.W3 -class Multi_2: - def compute(self, *kwargs): - return sum(kwargs[0:-1]), sum(kwargs[1:]) -``` -### Step 4 -Copy mymodule.py into Python sys.path, then run the model with onnxruntime python API. On Windows, please set PYTHONHOME beforehand. It should point to directory where the python is installed, such as C:\Python37 or C:\ProgramData\Anaconda3\envs\myconda1 if it is in conda. - -## Supported Data Types -* TensorProto.BOOL -* TensorProto.UINT8 -* TensorProto.UINT16 -* TensorProto.UINT32 -* TensorProto.INT16 -* TensorProto.INT32 -* TensorProto.FLOAT -* TensorProto.DOUBLE - -## Limitations -* Inferencing and compiling environments must be installed with same version of python. -* On Windows, `--config Debug` has known issues. Please build with `--config RelWithDebInfo` if debugging symbols are needed. -* Due to Python C API restrictions, multi-threading is disabled so Python operators will run sequentially. 
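As a complement to Step 4 above, here is a hedged sketch of scoring the model built in Steps 2 and 3 with the onnxruntime Python API. It assumes the wheel from Step 1 is installed, mymodule.py is on sys.path, and that `A`–`E` were declared as `[2, 2]` float tensors; the shapes are an assumption since the declarations of `A`–`E` are not shown above.

```python
# Sketch: run the multi-PyOp model from Step 2 (input shapes are assumed).
import numpy as np
import onnxruntime as rt

sess = rt.InferenceSession("./model.onnx")
feeds = {name: np.ones((2, 2), dtype=np.float32)
         for name in ("A", "B", "C", "D", "E")}
(f_out,) = sess.run(None, feeds)   # F is the single graph output
print(f_out)
```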
- -## Test Coverage -The operator has been tested on multiple platforms, with or without conda: - -Platform | Python 3.5 | Python 3.6 | Python 3.7 ------------ | ------------| ----------- | ----------- -Windows | (conda) passed | (conda) passed | passed -Linux | (conda) passed | (conda) passed | passed -Mac | (conda) passed | (conda) passed | (conda) passed - -## Example -Developers could resort to PyOp during model conversion for missing operators: -```python -import os -import numpy as np -from onnx import * -from skl2onnx import convert_sklearn -from skl2onnx.common.data_types import FloatTensorType -from skl2onnx.common.utils import check_input_and_output_numbers - -X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]],dtype=np.single) -nmf = NMF(n_components=2, init='random', random_state=0) -W = np.array(nmf.fit_transform(X), dtype=np.single) - -def calculate_sklearn_nmf_output_shapes(operator): - check_input_and_output_numbers(operator, output_count_range=1, input_count_range=1) - operator.outputs[0].type.shape = operator.inputs[0].type.shape - -def convert_nmf(scope, operator, container): - ws = [str(w) for w in W.flatten()] - attrs = {'W':'|'.join(ws)} - container.add_node(op_type='PyOp', name='nmf', inputs=['X'], outputs=['variable'], - op_version=10, op_domain='MyDomain', module='mymodule', class_name='MyNmf', - input_types=[TensorProto.FLOAT], output_types=[TensorProto.FLOAT], **attrs) - -custom_shape_calculators = {type(nmf): calculate_sklearn_nmf_output_shapes} -custom_conversion_functions = {type(nmf): convert_nmf} -initial_types = [('X', FloatTensorType([6,2]))] -onx = convert_sklearn(nmf, '', initial_types, '', None, custom_conversion_functions, custom_shape_calculators) -with th open("model.onnx", "wb") as f: - f.write(onx.SerializeToString()) -``` -mymodule.py: -```python -import numpy as np -class MyNmf: - def __init__(self,W): - A = [] - for w in W.split('|'): - A.append(float(w)) - self.__W = np.array(A,dtype=np.single).reshape(6,2) - def compute(self,X): - return self.__W -``` diff --git a/docs/WinRT_API.md b/docs/WinRT_API.md deleted file mode 100644 index 566381bd9f820..0000000000000 --- a/docs/WinRT_API.md +++ /dev/null @@ -1,37 +0,0 @@ -# Windows Machine Learning WinRT API - -New in the ONNX Runtime Nuget package is the ability to use the full [WinML API](https://docs.microsoft.com/en-us/windows/ai/windows-ml/api-reference). - -This allows scenarios such as passing a [Windows.Media.VideoFrame](https://docs.microsoft.com/en-us/uwp/api/Windows.Media.VideoFrame) from your connected camera directly into the runtime for realtime inference. - -The WinML API is a WinRT API that shipped inside the Windows OS starting with build 1809 (RS5) in the Windows.AI.MachineLearning namespace. It embedded a version of the ONNX Runtime. - -Many customers have asked for a way to use this offering as an application redistributable package. - -With our new [layered architecture](InferenceHighLevelDesign.md#the-onnx-runtime-and-windows-os-integration) you can now do this, with some limitations. The WinML APIs have been lifted and mirrored into the Microsoft.AI.MachineLearning namespace in the redistributable. - -## NuGet Package - -The Microsoft.AI.MachineLearning [Nuget package](https://www.nuget.org/packages/Microsoft.AI.MachineLearning/) includes the precompiled binaries for using the ONNX runtime with the WinRT API. Support is compiled directly into *onnxruntime.dll* - -Note: As of the 1.3 release, you can use all of the CPU and GPU functionality from these binaries. 
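As a quick orientation, here is a hedged C# sketch of what using the redistributable looks like. The model path, input name, and input tensor are placeholders; the types shown are the Microsoft.AI.MachineLearning counterparts of the inbox Windows.AI.MachineLearning types.

```c#
// Sketch: evaluate an ONNX model with the redistributable WinML API.
// "model.onnx" and "data_0" are placeholders for your model and its input name.
using Microsoft.AI.MachineLearning;

class Demo
{
    static void Run(TensorFloat inputTensor)
    {
        var model = LearningModel.LoadFromFilePath("model.onnx");
        var device = new LearningModelDevice(LearningModelDeviceKind.DirectXHighPerformance);
        var session = new LearningModelSession(model, device);

        var binding = new LearningModelBinding(session);
        binding.Bind("data_0", inputTensor);

        var results = session.Evaluate(binding, "run-1");
        // Inspect results.Outputs here.
    }
}
```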
- -## Sample Code - -Any code already written for the Windows.AI.MachineLearning API can be easily modified to run against the Microsoft.ML.OnnxRuntime package. All types originally referenced by inbox customers via the Windows namespace will need to be updated to now use the Microsoft namespace. Check out these [existing samples](https://github.com/microsoft/Windows-Machine-Learning/tree/master/Samples/SqueezeNetObjectDetection/Desktop/cpp) in github. - -## Deciding on whether to use WinML in the Windows SDK or the Redist -To detect if a particular OS version of Windows has the WinML APIs, use the [IsApiContractPresent](https://docs.microsoft.com/en-us/uwp/api/windows.foundation.metadata.apiinformation.isapicontractpresent) method. This can be called from either UWP or native apps. - -If the OS does not have the runtime you need you can switch to use the redist binaries instead. - -|Release|API contract version| -|--|--| -|Windows OS 1809| 1| -|Windows OS 1903| 2| -|Windows OS 1909| 2| -|ORT release 1.2| 3| -|ORT release 1.3| 3| -|ORT release 1.4| 3| - -See [here](https://docs.microsoft.com/en-us/windows/ai/windows-ml/onnx-versions) for more about opsets and ONNX version details in Windows OS distributions. diff --git a/docs/execution_providers/ACL-ExecutionProvider.md b/docs/execution_providers/ACL-ExecutionProvider.md deleted file mode 100644 index 40b9a4969345f..0000000000000 --- a/docs/execution_providers/ACL-ExecutionProvider.md +++ /dev/null @@ -1,21 +0,0 @@ -## ACL Execution Provider - -[Arm Compute Library](https://github.com/ARM-software/ComputeLibrary) is an open source inference engine maintained by Arm and Linaro companies. The integration of ACL as an execution provider (EP) into ONNX Runtime accelerates performance of ONNX model workloads across Armv8 cores. - -### Build ACL execution provider -For build instructions, please see the [BUILD page](../../BUILD.md#ARM-Compute-Library). - -### Using the ACL execution provider -#### C/C++ -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -bool enable_cpu_mem_arena = true; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ACL(sf, enable_cpu_mem_arena)); -``` -The C API details are [here](../C_API.md#c-api). - -### Performance Tuning -For performance tuning, please see guidance on this page: [ONNX Runtime Perf Tuning](../ONNX_Runtime_Perf_Tuning.md) - -When/if using [onnxruntime_perf_test](../../onnxruntime/test/perftest), use the flag -e acl diff --git a/docs/execution_providers/ArmNN-ExecutionProvider.md b/docs/execution_providers/ArmNN-ExecutionProvider.md deleted file mode 100644 index 35f2fc2da8c4e..0000000000000 --- a/docs/execution_providers/ArmNN-ExecutionProvider.md +++ /dev/null @@ -1,22 +0,0 @@ -## ArmNN Execution Provider - -[ArmNN](https://github.com/ARM-software/armnn) is an open source inference engine maintained by Arm and Linaro companies. The integration of ArmNN as an execution provider (EP) into ONNX Runtime accelerates performance of ONNX model workloads across Armv8 cores. - -### Build ArmNN execution provider -For build instructions, please see the [BUILD page](../../BUILD.md#ArmNN). - -### Using the ArmNN execution provider -#### C/C++ -To use ArmNN as execution provider for inferencing, please register it as below. 
-``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -bool enable_cpu_mem_arena = true; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ArmNN(sf, enable_cpu_mem_arena)); -``` -The C API details are [here](../C_API.md#c-api). - -### Performance Tuning -For performance tuning, please see guidance on this page: [ONNX Runtime Perf Tuning](../ONNX_Runtime_Perf_Tuning.md) - -When/if using [onnxruntime_perf_test](../../onnxruntime/test/perftest), use the flag -e armnn diff --git a/docs/execution_providers/DNNL-ExecutionProvider.md b/docs/execution_providers/DNNL-ExecutionProvider.md deleted file mode 100644 index 0952cccbe1368..0000000000000 --- a/docs/execution_providers/DNNL-ExecutionProvider.md +++ /dev/null @@ -1,35 +0,0 @@ -# DNNL Execution Provider - -Intel® Math Kernel Library for Deep Neural Networks (Intel® DNNL) is an open-source performance library for deep-learning applications. The library accelerates deep-learning applications and frameworks on Intel® architecture and Intel® Processor Graphics Architecture. Intel DNNL contains vectorized and threaded building blocks that you can use to implement deep neural networks (DNN) with C and C++ interfaces. For more, please see the DNNL documentation on (https://intel.github.io/mkl-dnn/). - -Intel and Microsoft have developed DNNL Execution Provider (EP) for ONNX Runtime to accelerate performance of ONNX Runtime using Intel® Math Kernel Library for Deep Neural Networks (Intel® DNNL) optimized primitives. - -For information on how DNNL optimizes subgraphs, see [Subgraph Optimization](./MKL-DNN-Subgraphs.md) - -## Build -For build instructions, please see the [BUILD page](../../BUILD.md#dnnl-and-mklml). - -## Supported OS -* Ubuntu 16.04 -* Windows 10 -* Mac OS X - -## Supported backend -* CPU - -## Using the DNNL Execution Provider -### C/C++ -The DNNLExecutionProvider execution provider needs to be registered with ONNX Runtime to enable in the inference session. -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -bool enable_cpu_mem_arena = true; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Dnnl(sf, enable_cpu_mem_arena)); -``` -The C API details are [here](../C_API.md#c-api). - -### Python -When using the python wheel from the ONNX Runtime built with DNNL execution provider, it will be automatically prioritized over the CPU execution provider. Python APIs details are [here](https://aka.ms/onnxruntime-python). - -## Performance Tuning -For performance tuning, please see guidance on this page: [ONNX Runtime Perf Tuning](../ONNX_Runtime_Perf_Tuning.md) diff --git a/docs/execution_providers/DirectML-ExecutionProvider.md b/docs/execution_providers/DirectML-ExecutionProvider.md deleted file mode 100644 index 3a1c7b529237a..0000000000000 --- a/docs/execution_providers/DirectML-ExecutionProvider.md +++ /dev/null @@ -1,126 +0,0 @@ -# DirectML Execution Provider - -DirectML is a high-performance, hardware-accelerated DirectX 12 library for machine learning on Windows. DirectML provides GPU acceleration for common machine learning tasks across a broad range of supported hardware and drivers. - -When used standalone, the DirectML API is a low-level DirectX 12 library and is suitable for high-performance, low-latency applications such as frameworks, games, and other real-time applications. 
The seamless interoperability of DirectML with Direct3D 12 as well as its low overhead and conformance across hardware makes DirectML ideal for accelerating machine learning when both high performance is desired, and the reliability and predictability of results across hardware is critical. - -The *DirectML Execution Provider* is an optional component of ONNX Runtime that uses DirectML to accelerate inference of ONNX models. The DirectML execution provider is capable of greatly improving evaluation time of models using commodity GPU hardware, without sacrificing broad hardware support or requiring vendor-specific extensions to be installed. - -The DirectML Execution Provider currently uses DirectML version 1.4.0. - -## Table of contents - -- [DirectML Execution Provider](#directml-execution-provider) - - [Table of contents](#table-of-contents) - - [Minimum requirements](#minimum-requirements) - - [Building from source](#building-from-source) - - [Using the DirectML execution provider](#using-the-directml-execution-provider) - - [`OrtSessionOptionsAppendExecutionProvider_DML` function](#ortsessionoptionsappendexecutionprovider_dml-function) - - [`OrtSessionOptionsAppendExecutionProviderEx_DML` function](#ortsessionoptionsappendexecutionproviderex_dml-function) - - [ONNX opset support](#onnx-opset-support) - - [Multi-threading and supported session options](#multi-threading-and-supported-session-options) - - [Samples](#samples) - - [Performance best practices](#performance-best-practices) - - [See also](#see-also) - -## Minimum requirements - -The DirectML execution provider requires any DirectX 12 capable device. Almost all commercially-available graphics cards released in the last several years support DirectX 12. Examples of compatible hardware include: - -* NVIDIA Kepler (GTX 600 series) and above -* AMD GCN 1st Gen (Radeon HD 7000 series) and above -* Intel Haswell (4th-gen core) HD Integrated Graphics and above - -DirectML is compatible with Windows 10, version 1709 (10.0.16299; RS3, "Fall Creators Update") and newer. - - - -## Building from source - -For general information about building onnxruntime, see [BUILD.md](../../BUILD.md). - -Requirements for building the DirectML execution provider: -1. Visual Studio 2017 toolchain (see [cmake configuration instructions](../../BUILD.md)) -2. [The Windows 10 SDK (10.0.18362.0) for Windows 10, version 1903](https://developer.microsoft.com/en-us/windows/downloads/windows-10-sdk) (or newer) - -To build onnxruntime with the DML EP included, supply the `--use_dml` parameter to `build.bat`. e.g. - - build.bat --config RelWithDebInfo --build_shared_lib --parallel --use_dml - -The DirectML execution provider supports building for both x64 (default) and x86 architectures. - -Note that building onnxruntime with the DirectML execution provider enabled causes the the DirectML redistributable package to be automatically downloaded as part of the build. Its use is governed by a license whose text may be found as part of the NuGet package. - - - -## Using the DirectML execution provider - -When using the [C API](../C_API.md) with a DML-enabled build of onnxruntime (see [Building from source](#building-from-source)), the DirectML execution provider can be enabled using one of the two factory functions included in `include/onnxruntime/core/providers/dml/dml_provider_factory.h`. 
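Before the per-function reference below, here is a hedged C sketch that pulls the pieces together. Error handling is omitted, the model path is a placeholder, and the session options shown are the ones this execution provider requires, as described in the sections that follow.

```c
// Sketch: enable the DirectML EP on adapter 0 with the required session options.
#include "onnxruntime_c_api.h"
#include "dml_provider_factory.h"  // from include/onnxruntime/core/providers/dml/

void CreateDmlSession(void) {
  const OrtApi* g_ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  OrtEnv* env;
  g_ort->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "dml", &env);

  OrtSessionOptions* session_options;
  g_ort->CreateSessionOptions(&session_options);
  g_ort->DisableMemPattern(session_options);                        /* required by the DML EP */
  g_ort->SetSessionExecutionMode(session_options, ORT_SEQUENTIAL);  /* required by the DML EP */

  /* device_id 0 = default adapter */
  OrtSessionOptionsAppendExecutionProvider_DML(session_options, 0);

  OrtSession* session;
  g_ort->CreateSession(env, L"model.onnx", session_options, &session);
}
```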
- -### `OrtSessionOptionsAppendExecutionProvider_DML` function - - Creates a DirectML Execution Provider which executes on the hardware adapter with the given `device_id`, also known as the adapter index. The device ID corresponds to the enumeration order of hardware adapters as given by [IDXGIFactory::EnumAdapters](https://docs.microsoft.com/windows/win32/api/dxgi/nf-dxgi-idxgifactory-enumadapters). A `device_id` of 0 always corresponds to the default adapter, which is typically the primary display GPU installed on the system. A negative `device_id` is invalid. - - OrtStatus* OrtSessionOptionsAppendExecutionProvider_DML( - _In_ OrtSessionOptions* options, - int device_id - ); - -### `OrtSessionOptionsAppendExecutionProviderEx_DML` function - -Creates a DirectML Execution Provider using the given DirectML device, and which executes work on the supplied D3D12 command queue. The DirectML device and D3D12 command queue must have the same parent [ID3D12Device](https://docs.microsoft.com/windows/win32/api/d3d12/nn-d3d12-id3d12device), or an error will be returned. The D3D12 command queue must be of type `DIRECT` or `COMPUTE` (see [D3D12_COMMAND_LIST_TYPE](https://docs.microsoft.com/windows/win32/api/d3d12/ne-d3d12-d3d12_command_list_type)). If this function succeeds, the inference session once created will maintain a strong reference on both the `dml_device` and `command_queue` objects. - - OrtStatus* OrtSessionOptionsAppendExecutionProviderEx_DML( - _In_ OrtSessionOptions* options, - _In_ IDMLDevice* dml_device, - _In_ ID3D12CommandQueue* cmd_queue - ); - -**See Also** - -[DMLCreateDevice function](https://docs.microsoft.com/windows/win32/api/directml/nf-directml-dmlcreatedevice) -[ID3D12Device::CreateCommandQueue method](https://docs.microsoft.com/windows/win32/api/d3d12/nf-d3d12-id3d12device-createcommandqueue) -[Direct3D 12 programming guide](https://docs.microsoft.com/windows/win32/direct3d12/directx-12-programming-guide) - -### ONNX opset support - -The DirectML execution provider currently supports ONNX opset 11 ([ONNX v1.6](https://github.com/onnx/onnx/releases/tag/v1.6.0)). Evaluating models which require a higher opset version is not supported, and may produce unexpected results. - -### Multi-threading and supported session options - -The DirectML execution provider does not support the use of memory pattern optimizations or parallel execution in onnxruntime. When supplying session options during InferenceSession creation, these options must be disabled or an error will be returned. - -If using the onnxruntime C API, you must call `DisableMemPattern` and `SetSessionExecutionMode` functions to set the options required by the DirectML execution provider. - -See [onnxruntime\include\onnxruntime\core\session\onnxruntime_c_api.h](../../include/onnxruntime/core/session/onnxruntime_c_api.h). - - OrtStatus*(ORT_API_CALL* DisableMemPattern)(_Inout_ OrtSessionOptions* options)NO_EXCEPTION; - - OrtStatus*(ORT_API_CALL* SetSessionExecutionMode)(_Inout_ OrtSessionOptions* options, ExecutionMode execution_mode)NO_EXCEPTION; - -If creating the onnxruntime InferenceSession object directly, you must set the appropriate fields on the `onnxruntime::SessionOptions` struct. Specifically, `execution_mode` must be set to `ExecutionMode::ORT_SEQUENTIAL`, and `enable_mem_pattern` must be `false`. - -Additionally, as the DirectML execution provider does not support parallel execution, it does not support multi-threaded calls to `Run` on the same inference session. 
That is, if an inference session using the DirectML execution provider, only one thread may call `Run` at a time. Multiple threads are permitted to call `Run` simultaneously if they operate on different inference session objects. - -## Samples - -A complete sample of onnxruntime using the DirectML execution provider can be found under [samples/c_cxx/fns_candy_style_transfer](../../samples/c_cxx/fns_candy_style_transfer). - -## Performance best practices -The DirectML execution provider works most efficiently when tensor shapes are known at the time a session is created. This provides a few performance benefits: -1) Because constant folding can occur more often, there may be fewer CPU / GPU copies and stalls during evaluations. -2) More initialization work occurs when sessions are created rather than during the first evaluation. -3) Weights may be pre-processed within DirectML, enabling more efficient algorithms to be used. -4) Graph optimization occurs within DirectML. For example, Concat operators may be removed, and more optimal tensor layouts may be used for the input and output of operators. - -Normally when the shapes of model inputs are known during session creation, the shapes for the rest of the model are inferred by OnnxRuntime when a session is created. However if a model input contains a free dimension (such as for batch size), steps must be taken to retain the above performance benefits. - -In this case, there are three options: -- Edit the model to replace an input's free dimension (specified through ONNX using "dim_param") with a fixed size (specified through ONNX using "dim_value"). -- Specify values of named dimensions within model inputs when creating the session using the OnnxRuntime *AddFreeDimensionOverrideByName* ABI. -- Edit the model to ensure that an input's free dimension has a [denotation](https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md) (such as "DATA_BATCH," or a custom denotation). Then when creating the session, specify the dimension size for each denotation. This can be done using the OnnxRuntime *AddFreeDimensionOverride* ABI. - - -## See also - -[DirectML documentation \(docs.microsoft.com\)](https://docs.microsoft.com/en-us/windows/win32/direct3d12/dml) diff --git a/docs/execution_providers/MIGraphX-ExecutionProvider.md b/docs/execution_providers/MIGraphX-ExecutionProvider.md deleted file mode 100644 index 4d4974eb5060d..0000000000000 --- a/docs/execution_providers/MIGraphX-ExecutionProvider.md +++ /dev/null @@ -1,35 +0,0 @@ -# MIGraphX Execution Provider - -ONNX Runtime's [MIGraphX](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/) execution provider uses AMD's Deep Learning graph optimization engine to accelerate ONNX model on AMD GPUs. - -## Build -For build instructions, please see the [BUILD page](../../BUILD.md#AMD-MIGraphX). - -## Using the MIGraphX execution provider -### C/C++ -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -int device_id = 0; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MiGraphX(sf, device_id)); -``` -You can check [here](https://github.com/scxiao/ort_test/tree/master/char_rnn) for a specific c/c++ program. - -The C API details are [here](../C_API.md#c-api). - -### Python -When using the Python wheel from the ONNX Runtime build with MIGraphX execution provider, it will be automatically -prioritized over the default GPU or CPU execution providers. There is no need to separately register the execution -provider. 
Python APIs details are [here](../python/api_summary.rst#api-summary). - -You can check [here](https://github.com/scxiao/ort_test/tree/master/python/run_onnx) for a Python script that runs a model on either the CPU or MIGraphX Execution Provider. - -## Performance Tuning -For performance tuning, please see guidance on this page: [ONNX Runtime Perf Tuning](../ONNX_Runtime_Perf_Tuning.md) - -When/if using [onnxruntime_perf_test](../../onnxruntime/test/perftest#onnxruntime-performance-test), use the flag `-e migraphx` - -## Configuring environment variables -MIGraphX provides an environment variable, ORT_MIGRAPHX_FP16_ENABLE, to enable FP16 mode. - diff --git a/docs/execution_providers/MKL-DNN-Subgraphs.md b/docs/execution_providers/MKL-DNN-Subgraphs.md deleted file mode 100644 index 8bb451ddcb4c6..0000000000000 --- a/docs/execution_providers/MKL-DNN-Subgraphs.md +++ /dev/null @@ -1,65 +0,0 @@ -# Subgraph Optimization - -DNNL uses a blocked layout (example: nhwc with channels blocked by 16 – nChw16c) to take advantage of vector operations using AVX512. To get the best performance, we avoid reorders (example: nChw16c to nchw) and propagate the blocked layout to the next primitive. - -Subgraph optimization achieves this in the following steps: -1. Parse the ONNX Runtime graph and create an internal representation (IR) of the subgraph. -2. The subgraph operator (DnnlFuncKernel) iterates through the DNNL nodes and creates a vector of DNNL kernels. -3. The Compute function of DnnlFuncKernel iterates over the vector, binds data to the DNNL primitives, and submits the vector for execution. - - -## Subgraph (IR) Internal Representation -DnnlExecutionProvider::GetCapability() parses the ONNX model graph and creates an IR (internal representation) of subgraphs of DNNL operators. -Each subgraph contains a vector of DnnlNodes, along with the inputs, outputs and attributes for all its DnnlNodes. There can be attributes with the same name, so attribute names are prefixed with the node name and its index. -A unique id for the subgraph is set as an attribute. - -A DnnlNode has indices to its inputs and outputs and a pointer to its parent nodes. A DnnlNode reads blocked memory directly from its parent to avoid data reordering. - -
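The images that originally accompanied this section are not reproduced in this page. As a rough stand-in only, the IR described above can be pictured with the simplified sketch below; the struct and field names are hypothetical and chosen for illustration, not the actual onnxruntime classes.

```
// Illustrative sketch of the subgraph IR described above (hypothetical names).
#include <map>
#include <string>
#include <vector>

struct DnnlNode {
  std::string op_type;              // e.g. "Conv", "MaxPool"
  std::vector<int> input_indices;   // indices of this node's inputs
  std::vector<int> output_indices;  // indices of this node's outputs
  std::vector<DnnlNode*> parents;   // blocked memory is read directly from parents
};

struct DnnlSubgraph {
  std::string subgraph_id;                        // unique id, stored as an attribute
  std::vector<DnnlNode> nodes;
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
  std::map<std::string, std::string> attributes;  // keys prefixed with node name and index
};
```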

- - -## Subgraph Classes -Primitives like DnnlConv, DnnlPool, etc. are derived from the DnnlKernel base class. - -The following UML diagram captures the Subgraph classes. - -
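The UML image itself is likewise not reproduced here. The hierarchy it depicts can be sketched roughly as follows; the signatures are simplified and hypothetical, not the actual onnxruntime declarations.

```
// Simplified, illustrative view of the kernel class hierarchy.
class DnnlKernel {
 public:
  virtual ~DnnlKernel() = default;
  virtual void CreatePrimitives() = 0;  // build the DNNL primitive(s) for this node
  virtual void Bind() = 0;              // bind input/output tensors before execution
};

class DnnlConv : public DnnlKernel { /* Conv-specific primitive setup */ };
class DnnlPool : public DnnlKernel { /* pooling-specific primitive setup */ };
class DnnlBatchNorm : public DnnlKernel { /* BatchNorm(+Relu)-specific setup */ };
```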

- - -## Subgraph Execution - -DnnlExecutionProvicer::Compute() function creates DnnlFuncKernel and call it’s Compute Function. - - -DnnlFuncKernel::Compute function creates SubgraphPrimitve pool and add the object to a map. - -SubgraphPrimitve constructor calls the following member functions -``` -SubgraphPrimitve::CreatePrimitives() - for (auto& mklnode : mklnodes) { - if (mklnode.name == "Conv") { - kernel.reset(new DnnlConv()); - kernels.push_back(kernel); - } else if (mklnode.name == "BatchNormalization-Relu") { - kernel.reset(new DnnlBatchNorm()); - context_.kernels.push_back(kernel); - } else if (mklnode.name == "MaxPool") { - kernel.reset(new DnnlPool()); - context_.kernels.push_back(kernel); - } - . - . - . -``` -In CreatePrimitives method, we iterate DnnlNodes and creates DnnlKernel objects and add DNNL primitive to a vector. It also reads attributes. This is done only once, at first iteration. - -``` -SubgraphPrimitve::Compute() - for (auto& kernel : kernels) { - kernel->Bind(input_tensors, output_tensors); - } - stream->submit(net); -``` - -In SubgraphPrimitve::Compute() method, we iterate thru Dnnl Kernels and bind input data. Then we submit the vector of Primitives to DNNL stream. - diff --git a/docs/execution_providers/NNAPI-ExecutionProvider.md b/docs/execution_providers/NNAPI-ExecutionProvider.md deleted file mode 100644 index 0a96086768839..0000000000000 --- a/docs/execution_providers/NNAPI-ExecutionProvider.md +++ /dev/null @@ -1,21 +0,0 @@ -# NNAPI Execution Provider - -[Android Neural Networks API (NNAPI)](https://developer.android.com/ndk/guides/neuralnetworks) is a unified interface to CPU, GPU, and NN accelerators on Android. - -## Minimum requirements - -The NNAPI EP requires Android devices with Android 8.1 or higher, it is recommended to use Android devices with Android 9 or higher to achieve optimal performance. - -## Build NNAPI EP - -For build instructions, please see the [BUILD page](../../BUILD.md#Android-NNAPI-Execution-Provider). - -## Using NNAPI EP in C/C++ - -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nnapi(sf)); -Ort::Session session(env, model_path, sf); -``` -The C API details are [here](../C_API.md#c-api). diff --git a/docs/execution_providers/Nuphar-ExecutionProvider.md b/docs/execution_providers/Nuphar-ExecutionProvider.md deleted file mode 100644 index 8c7e0a8feb7a2..0000000000000 --- a/docs/execution_providers/Nuphar-ExecutionProvider.md +++ /dev/null @@ -1,170 +0,0 @@ -# Nuphar Execution Provider (preview) - -NUPHAR stands for Neural-network Unified Preprocessing Heterogeneous Architecture. As an execution provider in the ONNX Runtime, it is built on top of [TVM](https://github.com/dmlc/tvm) and [LLVM](https://llvm.org) to accelerate ONNX models by compiling nodes in subgraphs into optimized functions via JIT. It also provides JIT caching to save compilation time at runtime. - -Developers can tap into the power of Nuphar through ONNX Runtime to accelerate inferencing of ONNX models. The Nuphar execution provider comes with a common ONNX to TVM lowering [library](../../onnxruntime/core/codegen) that can potentially be reused by other execution providers to leverage TVM. With the Nuphar execution provider, the ONNX Runtime delivers better inferencing performance on the same hardware compared to generic X64 CPU acceleration, especially for quantized recurrent neural networks. 
Various products at Microsoft have seen up to a 5x improvement in performance with no loss of accuracy, by running quantized LSTMs via the Nuphar execution provider in the ONNX Runtime. - -## Build -For build instructions, please see the [BUILD page](../../BUILD.md#nuphar). - -## Using the Nuphar execution provider -### C/C++ -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nuphar(sf, /*allow_unaligned_buffers*/ 1, "")); -Ort::Session session(env, model_path, sf); - -### Python -You can use the Nuphar execution provider via the python wheel from the ONNX Runtime build. The Nuphar execution provider will be automatically prioritized over the default CPU execution providers, thus no need to separately register the execution provider. Python APIs details are [here](../python/api_summary.rst#api-summary). - -## Performance and Accuracy Testing -You can test your ONNX model's performance with [onnxruntime_perf_test](../../onnxruntime/test/perftest/README.md), or test accuracy with [onnx_test_runner](../../onnxruntime/test/onnx/README.txt). To run these tools with the Nuphar execution provider, please pass `-e nuphar` in command line options. - -Please note that Nuphar uses TVM thread pool and parallel schedule for multi-thread inference performance. When building with OpenMP or MKLML, TVM thread pool would use gomp or iomp as its implementation; otherwise, TVM creates its own thread pool. Because of this, the current default parallel schedule policy is: -- Default to on for USE_OPENMP or USE_MKLML. User can use OMP_NUM_THREADS/MKL_NUM_THREADS to control TVM thread pool, as well as TVM_NUM_THREADS -- Default to off for none of above. User can use TVM_NUM_THREADS to control TVM thread pool. - -This choice is to ensure to get ideal performance with the different build options. When build with USE_OPENMP or USE_MKLML, users would have to avoid thread confliction from OpenMP or MKL with their inference invocations anyway, so parallel schedule is enable to leverage existing thread pool. When not building with gomp or iomp, TVM thread pool is turned off to avoid confliction with user threads. If needed, user can set env or settings with [NUPHAR_PARALLEL_MIN_WORKLOADS](../../onnxruntime/core/providers/nuphar/common/nuphar_settings.cc#L61) to 0 to disable parallel schedule, or to some non-zero value to enable parallel schedule. The non-zero value indicates the minimal number of elements being computed per thread when parallel schedule would be turned on. - -## Model Conversion and Quantization -You may use Python script [model_editor.py](../../onnxruntime/core/providers/nuphar/scripts/model_editor.py) to turn LSTM/GRU/RNN ops to Scan ops for a given model, and then use [model_quantizer.py](../../onnxruntime/core/providers/nuphar/scripts/model_quantizer.py) to quantize MatMul ops into MatMulInteger ops. - -We use dynamic per-row quantization for inputs of LSTM MatMul, so MatMul becomes three parts: quantization, MatMulInteger and dequantization. Weights for MatMulInteger are statically quantized per-column to int8. We have observed good speed-up and no loss of accuracy with this quantization scheme inside Scan for various LSTM models. 
- -To convert models with LSTM/GRU/RNN ops to Scan ops: -``` -python model_editor.py --input /path/to/input/model --output /path/to/output/model --mode to_scan -``` - -To quantize MatMul ops to MatMulInteger ops (use option --only_for_scan to only quantize MatMuls inside Scan): -``` -python model_quantizer.py --input /path/to/input/model --output /path/to/output/model --only_for_scan -``` - -As an experiment, you may test conversion and quantization on [the BiDAF model](https://github.com/onnx/models/tree/master/bidaf) from the ONNX model zoo. This model has 5 bidirectional LSTM ops, and long sequence lengths. Our test shows that the quantized model has comparable accuracy of F1 76.24, EM 68.08, vs. floating point model accuracy of F1 76.20, EM 68.11. - -Speed-up in this model is ~20% on Intel Xeon E5-1620v4 (Note that AVX2 is required for Nuphar int8 GEMV performance), when comparing CPU execution provider with the floating point model with LSTM ops, vs. the Nuphar execution provider with quantized MatMulInteger inside Scan ops. Profile shows that most of the cost is in input projection outside of Scan ops, which uses MKL SGEMM. It's worth noting that MKL int8 GEMM is about the same speed as SGEMM in this model, so quantization of SGEMMs outside of Scan won't help performance. We are looking at ways to speedup int8 GEMM for better performance on quantized models. - -## JIT caching -You may cache JIT binaries to reduce model loading time spent in JIT, using [create_shared.cmd](../../onnxruntime/core/providers/nuphar/scripts/create_shared.cmd) on Windows with Visual Studio 2017, or [create_shared.sh](../../onnxruntime/core/providers/nuphar/scripts/create_shared.sh) on Linux with gcc. - -Windows -``` -REM You need to have Visual Studio 2017 for compile and link. Optionally, you can save model checksum to the output dll with FCIV tool from https://support.microsoft.com/en-us/help/841290 -set NUPHAR_CACHE_PATH=\path\to\jit\cache -REM Then run Nuphar inference from either onnx_test_runner or onnxruntime_perf_test, or whatever inference using C++ or Python -REM JIT object files would be saved to \path\to\jit\cache\ -create_shared.cmd \path\to\jit\cache\NUPHAR_CACHE_VERSION [optional_model_file_for_checksum] [optional_output_dll_name] -REM If checksum is embedded in dll, set NUPHAR_CACHE_MODEL_CHECKSUM to FCIV output for the model to inference to pass checksum verification at runtime -REM Checksum verification failure will cause Nuphar to fallback to JIT instead of loading binary from cache -REM Run Nuphar inference again with cached JIT dll -``` - -Linux -``` -# You need to have GCC of the same version Nuphar is built with, for compile and link. Optionally, you can save model checksum to jit.so with md5sum -export NUPHAR_CACHE_PATH=/path/to/jit/cache -# Then run Nuphar inference from either onnx_test_runner or onnxruntime_perf_test, or whatever inference using C++ or Python -# JIT object files would be saved to /path/to/jit/cache/ -create_shared.sh -c /path/to/jit/cache/NUPHAR_CACHE_VERSION [-m optional_model_file_for_checksum] [-o optional_output_so_name] -# If checksum is embedded in dll, set NUPHAR_CACHE_MODEL_CHECKSUM to md5sum output for the model to inference to pass checksum verification at runtime -# Checksum verification failure will cause Nuphar to fallback to JIT instead of loading binary from cache -# run Nuphar inference again with cached JIT dll -``` - - -## Debugging - -### NGEMM -NGEMM (Nuphar GEMM) is an optimized low-precision GEMM implementation based on compiler techniques. 
-Please refer to our paper for more details of NGEMM: ["NGEMM: Optimizing GEMM for Deep Learning via Compiler-based Techniques"](https://arxiv.org/abs/1910.00178). - -#### NGEMM Tiling / Permutation Configuration -NGEMM has default tiling parameters, but users can overwrite them through environment variables: -* NUPHAR_IGEMM_TILE_M / NUPHAR_IGEMM_TILE_N / NUPHAR_IGEMM_TILE_K - - These 3 parameters are the tiling sizes for the corresponding dimensions of GEMM ([M x K] x [K x N]). - Setting them to different values will generate GEMM with different tiling sizes. - -* NUPHAR_IGEMM_PERMUTE - - This enviornment variable is to control the loop permutation in GEMM. - The default is to not apply any loop permutation. Other options are "inner/outer/all",referring to apply permutations to only inner tile loops / only outer loops / both inner and outer loops, respectively. - - -There are several [environment variables](../../onnxruntime/core/codegen/common/settings.h) to dump debug information during code generation, plus [some more environment variables](../../onnxruntime/core/providers/nuphar/common/nuphar_settings.h) to dump/control the Nuphar execution provider. You can set environment variables prior to inference to dump debug info to the console. To list some most useful ones: -* CODEGEN_DUMP_LOWER - - Dumps the lowered function from TVM. - - Set it to "verbose" to dump all nodes, or node op_type to dump specific nodes. You may use "concise" to dump just the op_type of nodes. - -* CODEGEN_DUMP_MODULE - - Dumps compiled binary. - - Set it to "ll" to dumps LLVM bit code, "asm" to dumps assembly. - -* CODEGEN_DUMP_SCHEDULE - - Dumps the schedule used in TVM nodes, like compute_root/compute_inline/compute_at. - - Set it to "verbose" to dump all nodes, or node op_type to dump specific nodes. You may use "concise" to dump just the op_type of nodes. - -* NUPHAR_DUMP_PARTITION - - Dumps nodes in each partition. - - Set it to "1" to dump partitions. - -## Settings -When there are conflicts of environment variables running Nuphar in multiple processes, user can specify settings string when creating the Nuphar execution provider. The string comprises of comma separated key:value pairs. Keys should be lower cased environment variable names as shown above, and separated from corresponding values with colon. For example, the equivalent string of setting environment variables of NUPHAR_CACHE_PATH/NUPHAR_CACHE_MODEL_CHECKSUM would be "nuphar_cache_path:, nuphar_cache_model_checksum:". - -* Using in C/C++ - -Settings string could be specified when creating execution provider to specify JIT cache path, as well as model checksum: - -``` -OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_Nuphar(session_options, 1, "nuphar_cache_path:/path/to/cache, nuphar_cache_model_checksum:")); -``` - -* Using in C# - -Settings string could be specified when creating session options: - -``` -SessionOptions.MakeSessionOptionWithNupharProvider("nuphar_cache_path:/path/to/cache, nuphar_cache_model_checksum:") -``` - -* Using in Python - -Settings string should be passed in before InferenceSession is created, as providers are not currently exposed yet. 
Here's an example in Python to set cache path and model checksum: - -``` -nuphar_settings = 'nuphar_cache_path:{}, nuphar_cache_model_checksum:{}'.format(cache_dir, model_checksum) -onnxruntime.capi._pybind_state.set_nuphar_settings(nuphar_settings) -sess = onnxruntime.InferenceSession(model_path) -``` - -## Known issues -* ONNX shape inference dependency - - To save runtime JIT cost, Nuphar requires models to have shape inference information from ONNX after model is loaded. Some nodes in ONNX can generate dynamic output tensor shapes from input data value, i.e. ConstantOfShape, Tile, Slice in opset 10, Compress, etc. Those ops may block ONNX shape inference and make the part of graph after such nodes not runnable in Nuphar. - - User may use Python script [symbolic_shape_infer.py](../../onnxruntime/python/tools/symbolic_shape_infer.py) to run symbolic shape inference in ONNX model. This script adds output tensor shapes in the model in graph.value_info field, by doing symbolic dimension computation using sympy when there are Shape ops in model. Besides, running symbolic shape inference on ONNX model would make the graph more readable. Note that when using [model_editor.py](../../onnxruntime/core/providers/nuphar/scripts/model_editor.py) to convert models with LSTM/GRU/RNN to Scan, the resulting model may have incomplete shape inference. Running symbolic_shape_infer.py is needed to get the Scan ops in the model to run in Nuphar. Besides, please note that quantization should be the last step, after verified accuracy and performance of the edited floating point model. - - In addition, user may also manually add shapes to graph.value_info using [onnx.helper.make_tensor_value_info](https://github.com/onnx/onnx/blob/v1.5.0/onnx/helper.py#L290) with model specific knowledge. For example, if you have Hardmax output casted to bool as Compress input condition, then the unknown dimension of the output of Compress is actually 1. - -* Performance benchmark - - Current Nuphar's speed-up in quantized RNNs is optimized for AVX2, when running in single thread and batch size is 1. To help understand RNN performance in different configurations, please use Python script [rnn_benchmark.py](../../onnxruntime/core/providers/nuphar/scripts/rnn_benchmark.py). For older X64 CPUs that do not support AVX2, quantized model may have worse performance than non-quantized ones. - -* Patches to TVM - - There are some changes/bug fixes in TVM for Nuphar to work properly. We are in the process of contributing them back to TVM, but for now patches are used in [our forked TVM](https://github.com/microsoft/onnxruntime-tvm). To build cleanly from scratch, please run following commands before running build.bat or build.sh: -``` -git submodule sync -git submodule foreach --recursive git stash -git submodule foreach --recursive git clean -fd -git submodule update --init --recursive -``` diff --git a/docs/execution_providers/OpenVINO-ExecutionProvider.md b/docs/execution_providers/OpenVINO-ExecutionProvider.md deleted file mode 100644 index 346e694a3f626..0000000000000 --- a/docs/execution_providers/OpenVINO-ExecutionProvider.md +++ /dev/null @@ -1,284 +0,0 @@ -# OpenVINO Execution Provider - -OpenVINO Execution Provider enables deep learning inference on Intel CPUs, Intel integrated GPUs and Intel® MovidiusTM Vision Processing Units (VPUs). Please refer to [this](https://software.intel.com/en-us/openvino-toolkit/hardware) page for details on the Intel hardware supported. 
- -### Build -For build instructions, please see the [BUILD page](../../BUILD.md#openvino). - -## Runtime configuration options ---- - -OpenVINO EP can be configured with certain options at runtime that control the behavior of the EP. These options can be set as key-value pairs as below:- - -### Python API -Key-Value pairs for config options can be set using the Session.set_providers API as follows:- - -``` -session = onnxruntime.InferenceSession(, options) -session.set_providers(['OpenVINOExecutionProvider'], [{Key1 : Value1, Key2 : Value2, ...}]) -``` -*Note that this causes the InferenceSession to be re-initialized, which may cause model recompilation and hardware re-initialization* - -### C/C++ API -All the options shown below are passed to SessionOptionsAppendExecutionProvider_OpenVINO() API and populated in the struct OrtOpenVINOProviderOptions in an example shown below, for example for CPU device type:- - -``` -OrtOpenVINOProviderOptions options; -options.device_type = "CPU_FP32"; -options.enable_vpu_fast_compile = 0; -options.device_id = ""; -options.num_of_threads = 8; -SessionOptionsAppendExecutionProvider_OpenVINO(session_options, &options); -``` - -### Available configuration options -The following table lists all the available configuratoin optoins and the Key-Value pairs to set them:- - -| **Key** | **Key type** | **Allowable Values** | **Value type** | **Description** | -| --- | --- | --- | --- | --- | -| device_type | string | CPU_FP32, GPU_FP32, GPU_FP16, MYRIAD_FP16, VAD-M_FP16, VAD-F_FP32, Any valid Hetero combination, Any valid Multi-Device combination | string | Overrides the accelerator hardware type and precision with these values at runtime. If this option is not explicitly set, default hardware and precision specified during build time is used. | -| device_id | string | Any valid OpenVINO device ID | string | Selects a particular hardware device for inference. The list of valid OpenVINO device ID's available on a platform can be obtained either by Python API (`onnxruntime.capi._pybind_state.get_available_openvino_device_ids()`) or by [OpenVINO C/C++ API](https://docs.openvinotoolkit.org/latest/classInferenceEngine_1_1Core.html#acb212aa879e1234f51b845d2befae41c). If this option is not explicitly set, an arbitrary free device will be automatically selected by OpenVINO runtime.| -| enable_vpu_fast_compile | string | True/False | boolean | This option is only available for MYRIAD_FP16 VPU devices. During initialization of the VPU device with compiled model, Fast-compile may be optionally enabled to speeds up the model's compilation to VPU device specific format. This in-turn speeds up model initialization time. However, enabling this option may slowdown inference due to some of the optimizations not being fully applied, so caution is to be exercised while enabling this option. | -| num_of_threads | string | Any unsigned positive number other than 0 | size_t | Overrides the accelerator default value of number of threads with this value at runtime. If this option is not explicitly set, default value of 8 is used during build time. This option when set actually makes those number of free InferRequests made available in the pool so that each thread has a separate InferRequest available thus enabling Multi-threading during inference. Note: This option is not to set the num_of_threads for inferencing, it is to just set number of free InferRequests that should be made available. | - -Valid Hetero or Multi-Device combination's: -HETERO:,,... -MULTI:,,... 
-The can be any of these devices from this list ['CPU','GPU','MYRIAD','FPGA','HDDL'] - -A minimum of two DEVICE_TYPE'S should be specified for a valid HETERO or Multi-Device Build. - -Example: -HETERO:MYRIAD,CPU HETERO:HDDL,GPU,CPU MULTI:MYRIAD,GPU,CPU - -## Other configuration settings -### Onnxruntime Graph Optimization level -OpenVINO backend performs both hardware dependent as well as independent optimizations to the graph to infer it with on the target hardware with best possible performance. In most of the cases it has been observed that passing in the graph from the input model as is would lead to best possible optimizations by OpenVINO. For this reason, it is advised to turn off high level optimizations performed by ONNX Runtime before handing the graph over to OpenVINO backend. This can be done using Session options as shown below:- - -### Python API -``` -options = onnxruntime.SessionOptions() -options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL -sess = onnxruntime.InferenceSession(, options) -``` - -### C/C++ API -``` -SessionOptions::SetGraphOptimizationLevel(ORT_DISABLE_ALL); -``` - -### Deprecated: Dynamic device type selection -**Note: This API has been deprecated. Please use the mechanism mentioned above to set the 'device-type' option.** -When ONNX Runtime is built with OpenVINO Execution Provider, a target hardware option needs to be provided. This build time option becomes the default target harware the EP schedules inference on. However, this target may be overriden at runtime to schedule inference on a different hardware as shown below. - -Note. This dynamic hardware selection is optional. The EP falls back to the build-time default selection if no dynamic hardware option value is specified. - -### Python API -``` -import onnxruntime -onnxruntime.capi._pybind_state.set_openvino_device("") -# Create session after this -``` -*This property persists and gets applied to new sessions until it is explicity unset. To unset, assign a null string ("").* - -### C/C++ API - -Append the settings string "" to the EP settings string. Example shown below for the CPU_FP32 option: -``` -std::string settings_str; -... -settings_str.append("CPU_FP32"); -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_OpenVINO(sf, settings_str.c_str())); -``` - -## ONNX Layers supported using OpenVINO - -The table below shows the ONNX layers supported and validated using OpenVINO Execution Provider.The below table also lists the Intel hardware support for each of the layers. CPU refers to Intel® -Atom, Core, and Xeon processors. GPU refers to the Intel Integrated Graphics. VPU refers to USB based Intel® MovidiusTM -VPUs as well as Intel® Vision accelerator Design with Intel Movidius TM MyriadX VPU. 
- -| **ONNX Layers** | **CPU** | **GPU** | **VPU** | -| --- | --- | --- | --- | -| Abs | Yes | Yes | No | -| Acos | Yes | No | No | -| Acosh | Yes | No | No | -| Add | Yes | Yes | Yes | -| ArgMax | Yes | Yes | Yes | -| ArgMin | Yes | No | Yes | -| Asin | Yes | Yes | No | -| Asinh | Yes | Yes | No | -| Atan | Yes | Yes | No | -| Atanh | Yes | No | No | -| AveragePool | Yes | Yes | Yes | -| BatchNormalization | Yes | Yes | Yes | -| Cast | Yes | Yes | Yes | -| Ceil | No | Yes | No | -| Clip | Yes | Yes | Yes | -| Concat | Yes | Yes | Yes | -| Constant | Yes | Yes | Yes | -| ConstantOfShape | Yes | Yes | Yes | -| Conv | Yes | Yes | Yes | -| ConvTranspose | Yes | Yes | Yes | -| Cos | Yes | No | No | -| Cosh | Yes | No | No | -| DepthToSpace | Yes | Yes | Yes | -| Div | Yes | Yes | Yes | -| Dropout | Yes | Yes | Yes | -| Elu | Yes | Yes | Yes | -| Equal | Yes | Yes | Yes | -| Erf | Yes | Yes | Yes | -| Exp | Yes | Yes | Yes | -| Expand | No | No | Yes | -| Flatten | Yes | Yes | Yes | -| Floor | Yes | Yes | Yes | -| Gather | Yes | Yes | Yes | -| GatherND | No | No | Yes | -| Gemm | Yes | Yes | Yes | -| GlobalAveragePool | Yes | Yes | Yes | -| GlobalLpPool | Yes | Yes | No | -| HardSigmoid | Yes | Yes | No | -| Identity | Yes | Yes | Yes | -| InstanceNormalization | Yes | Yes | Yes | -| LeakyRelu | Yes | Yes | Yes | -| Less | Yes | Yes | Yes | -| Log | Yes | Yes | Yes | -| LRN | Yes | Yes | Yes | -| MatMul | Yes | Yes | Yes | -| Max | Yes | Yes | Yes | -| MaxPool | Yes | Yes | Yes | -| Mean | Yes | Yes | Yes | -| Min | Yes | Yes | Yes | -| Mul | Yes | Yes | Yes | -| Neg | Yes | Yes | Yes | -| NonMaxSuppression | No | No | Yes | -| NonZero | Yes | No | Yes | -| Not | Yes | Yes | Yes | -| OneHot | Yes | Yes | Yes | -| Pad | Yes | Yes | Yes | -| Pow | Yes | Yes | Yes | -| PRelu | Yes | Yes | Yes | -| Reciprocal | Yes | Yes | Yes | -| Range | No | No | Yes | -| ReduceLogSum | Yes | No | Yes | -| ReduceMax | Yes | Yes | Yes | -| ReduceMean | Yes | Yes | Yes | -| ReduceMin | Yes | Yes | Yes | -| ReduceProd | Yes | No | No | -| ReduceSum | Yes | Yes | Yes | -| ReduceSumSquare | Yes | No | Yes | -| Relu | Yes | Yes | Yes | -| Reshape | Yes | Yes | Yes | -| Resize | Yes | No | Yes | -| RoiAlign | No | No | Yes | -| Scatter | No | No | Yes | -| ScatterElements | No | No | Yes | -| Selu | Yes | Yes | No | -| Shape | Yes | Yes | Yes | -| Sigmoid | Yes | Yes | Yes | -| Sign | Yes | No | No | -| SinFloat | No | No | Yes | -| Sinh | Yes | No | No | -| Slice | Yes | Yes | Yes | -| Softmax | Yes | Yes | Yes | -| Softsign | Yes | No | No | -| SpaceToDepth | Yes | Yes | Yes | -| Split | Yes | Yes | Yes | -| Sqrt | Yes | Yes | Yes | -| Squeeze | Yes | Yes | Yes | -| Sub | Yes | Yes | Yes | -| Sum | Yes | Yes | Yes | -| Tan | Yes | Yes | No | -| Tanh | Yes | Yes | Yes | -| TopK | Yes | Yes | Yes | -| Transpose | Yes | Yes | Yes | -| Unsqueeze | Yes | Yes | Yes | -| Upsample | Yes | No | No | -| Where | No | No | Yes | - -## Topology Support - -Below topologies from ONNX open model zoo are fully supported on OpenVINO Execution Provider and many more are supported through sub-graph partitioning - -## Image Classification Networks - -| **MODEL NAME** | **CPU** | **GPU** | **VPU** | **FPGA** | -| --- | --- | --- | --- | --- | -| bvlc_alexnet | Yes | Yes | Yes | Yes* | -| bvlc_googlenet | Yes | Yes | Yes | Yes* | -| bvlc_reference_caffenet | Yes | Yes | Yes | Yes* | -| bvlc_reference_rcnn_ilsvrc13 | Yes | Yes | Yes | Yes* | -| emotion ferplus | Yes | Yes | Yes | Yes* | -| densenet121 | Yes | Yes | Yes | Yes* | -| 
inception_v1 | Yes | Yes | Yes | Yes* | -| inception_v2 | Yes | Yes | Yes | Yes* | -| mobilenetv2 | Yes | Yes | Yes | Yes* | -| resnet18v1 | Yes | Yes | Yes | Yes* | -| resnet34v1 | Yes | Yes | Yes | Yes* | -| resnet101v1 | Yes | Yes | Yes | Yes* | -| resnet152v1 | Yes | Yes | Yes | Yes* | -| resnet18v2 | Yes | Yes | Yes | Yes* | -| resnet34v2 | Yes | Yes | Yes | Yes* | -| resnet101v2 | Yes | Yes | Yes | Yes* | -| resnet152v2 | Yes | Yes | Yes | Yes* | -| resnet50 | Yes | Yes | Yes | Yes* | -| resnet50v2 | Yes | Yes | Yes | Yes* | -| shufflenet | Yes | Yes | Yes | Yes* | -| squeezenet1.1 | Yes | Yes | Yes | Yes* | -| vgg19 | Yes | Yes | Yes | Yes* | -| vgg16 | Yes | Yes | Yes | Yes* | -| zfnet512 | Yes | Yes | Yes | Yes* | -| arcface | Yes | Yes | Yes | Yes* | - - -## Image Recognition Networks -| **MODEL NAME** | **CPU** | **GPU** | **VPU** | **FPGA** | -| --- | --- | --- | --- | --- | -| mnist | Yes | Yes | Yes | Yes* | - -## Object Detection Networks -| **MODEL NAME** | **CPU** | **GPU** | **VPU** | **FPGA** | -| --- | --- | --- | --- | --- | -| tiny_yolov2 | Yes | Yes | Yes | Yes* | - -## Image Manipulation Networks -| **MODEL NAME** | **CPU** | **GPU** | **VPU** | **FPGA** | -| --- | --- | --- | --- | --- | -| mosaic | Yes | No | No | No* | -| candy | Yes | No | No | No* | -| rain_princess | Yes | No | No | No* | -| pointilism | Yes | No | No | No* | -| udnie | Yes | No | No | No* | - -*FPGA only runs in HETERO mode wherein the layers that are not supported on FPGA fall back to OpenVINO CPU. - -## Inferencing on FP16 Models -FP16 models can be inferenced on a VPU with device_type = "MYRIAD_FP16" and on GPU with -device_type = "GPU_FP16" - -## CSharp API - -To use csharp api for openvino execution provider create a custom nuget package. Follow the instructions [here](../../BUILD.md##build-nuget-packages) to install prerequisites for nuget creation. Once prerequisites are installed follow the instructions to [build openvino](../../BUILD.md#openvino) and add an extra flag `--build_nuget` to create nuget packages. Two nuget packages will be created Microsoft.ML.OnnxRuntime.Managed and Microsoft.ML.OnnxRuntime.Openvino. - -## Multi-threading for OpenVINO EP - -OpenVINO Execution Provider enables thread-safe deep learning inference - -## Heterogeneous Execution for OpenVINO EP - -The heterogeneous Execution enables computing for inference on one network on several devices. Purposes to execute networks in heterogeneous mode - -To utilize accelerators power and calculate heaviest parts of network on accelerator and execute not supported layers on fallback devices like CPU -To utilize all available hardware more efficiently during one inference - -For more information on Heterogeneous plugin of OpenVINO, please refer to the following -[documentation](https://docs.openvinotoolkit.org/latest/openvino_docs_IE_DG_supported_plugins_HETERO.html). - -## Multi-Device Execution for OpenVINO EP - -Multi-Device plugin automatically assigns inference requests to available computational devices to execute the requests in parallel. 
Potential gains are as follows - -Improved throughput that multiple devices can deliver (compared to single-device execution) -More consistent performance, since the devices can now share the inference burden (so that if one device is becoming too busy, another device can take more of the load) - -For more information on Multi-Device plugin of OpenVINO, please refer to the following -[documentation](https://docs.openvinotoolkit.org/latest/openvino_docs_IE_DG_supported_plugins_MULTI.html#introducing_multi_device_execution). diff --git a/docs/execution_providers/README.md b/docs/execution_providers/README.md deleted file mode 100644 index dccaaff747059..0000000000000 --- a/docs/execution_providers/README.md +++ /dev/null @@ -1,64 +0,0 @@ -# Introduction - -ONNX Runtime is capable of working with different HW acceleration libraries to execute the ONNX models on the hardware platform. ONNX Runtime supports an extensible framework, called **Execution Providers** (EP), to integrate with the HW specific libraries. This interface enables flexibility for the AP application developer to deploy their ONNX models in different environments in the cloud and the edge and optimize the execution by taking advantage of the compute capabilities of the platform. - -

*Figure: Executing ONNX models across different HW environments*

- -ONNX Runtime works with the execution provider(s) using the `GetCapability()` interface to allocate specific nodes or sub-graphs for execution by the EP library on supported hardware. The EP libraries that are preinstalled in the execution environment process and execute the ONNX sub-graph on the hardware. This architecture abstracts out the details of the hardware-specific libraries that are essential to optimizing the execution of deep neural networks across hardware platforms such as CPU, GPU, FPGA or specialized NPUs. - -

*Figure: ONNX Runtime GetCapability()*

- -ONNX Runtime supports many different execution providers today. Some of the EPs are in GA and used in live service. Many are in released in preview to enable developers to develop and customize their application using the different options. - -### Adding an Execution Provider - -Developers of specialized HW acceleration solutions can integrate with ONNX Runtime to execute ONNX models on their stack. To create an EP to interface with ONNX Runtime you must first identify a unique name for the EP. Follow the steps outlined [here](../AddingExecutionProvider.md) to integrate your code in the repo. - -### Building ONNX Runtime package with EPs - -The ONNX Runtime package can be built with any combination of the EPs along with the default CPU execution provider. **Note** that if multiple EPs are combined into the same ONNX Runtime package then all the dependent libraries must be present in the execution environment. The steps for producing the ONNX Runtime package with different EPs is documented [here](../../BUILD.md#execution-providers). - -### APIs for Execution Provider - -The same ONNX Runtime API is used across all EPs. This provides the consistent interface for applications to run with different HW acceleration platforms. The APIs to set EP options are available across Python, C/C++/C#, Java and node.js. **Note** we are updating our API support to get parity across all language binding and will update specifics here. - - `get_providers`: Return list of registered execution providers. - `get_provider_options`: Return the registered execution providers' configurations. - `set_providers`: Register the given list of execution providers. The underlying session is re-created. - The list of providers is ordered by Priority. For example ['CUDAExecutionProvider', 'CPUExecutionProvider'] - means execute a node using CUDAExecutionProvider if capable, otherwise execute using CPUExecutionProvider. - -### Using Execution Providers - -``` python -import onnxruntime as rt - -#define the priority order for the execution providers -# prefer CUDA Execution Provider over CPU Execution Provider -EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - -# initialize the model.onnx -sess = rt.InferenceSession("model.onnx", providers=EP_list) - -# get the outputs metadata as a list of :class:`onnxruntime.NodeArg` -output_name = sess.get_outputs()[0].name - -# get the inputs metadata as a list of :class:`onnxruntime.NodeArg` -input_name = sess.get_inputs()[0].name - -# inference run using image_data as the input to the model -detections = sess.run([output_name], {input_name: image_data})[0] - -print("Output shape:", detections.shape) - -# Process the image to mark the inference points -image = post.image_postprocess(original_image, input_size, detections) -image = Image.fromarray(image) -image.save("kite-with-objects.jpg") - -# Update EP priority to only CPUExecutionProvider -sess.set_providers('CPUExecutionProvider') - -cpu_detection = sess.run(...) - -``` diff --git a/docs/execution_providers/RKNPU-ExecutionProvider.md b/docs/execution_providers/RKNPU-ExecutionProvider.md deleted file mode 100644 index 7b1431aa6b922..0000000000000 --- a/docs/execution_providers/RKNPU-ExecutionProvider.md +++ /dev/null @@ -1,70 +0,0 @@ -# RKNPU Execution Provider (preview) -RKNPU DDK is an advanced interface to access Rockchip NPU. RKNPU Execution Provider enables deep learning inference on Rockchip NPU via RKNPU DDK. 
- -## Supported platforms - -* RK1808 Linux - -*Note: RK3399Pro platform is not supported.* - - -## Build -For build instructions, please see the [BUILD page](../../BUILD.md#RKNPU). - -## Usage -### C/C++ -To use RKNPU as execution provider for inferencing, please register it as below. -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_RKNPU(sf)); -Ort::Session session(env, model_path, sf); -``` -The C API details are [here](../C_API.md#c-api). - - -## Supported Operators - -The table below shows the ONNX Ops supported using RKNPU Execution Provider and the mapping between ONNX Ops and RKNPU Ops. - -| **ONNX Ops** | **RKNPU Ops** | -| --- | --- | -| Add | ADD | -| Mul | MULTIPLY | -| Conv | CONV2D | -| QLinearConv | CONV2D | -| Gemm | FULLCONNECT | -| Softmax | SOFTMAX | -| AveragePool | POOL | -| GlobalAveragePool | POOL | -| MaxPool | POOL | -| GlobalMaxPool | POOL | -| LeakyRelu | LEAKY_RELU | -| Concat | CONCAT | -| BatchNormalization | BATCH_NORM | -| Reshape | RESHAPE | -| Flatten | RESHAPE | -| Squeeze | RESHAPE | -| Unsqueeze | RESHAPE | -| Transpose | PERMUTE | -| Relu | RELU | -| Sub | SUBTRACT | -| Clip(0~6)| RELU6 | -| DequantizeLinear | DATACONVERT | -| Clip | CLIP | - - -## Supported Models - -Below Models are supported from ONNX open model zoo using RKNPU Execution Provider - -### Image Classification -- squeezenet -- mobilenetv2-1.0 -- resnet50v1 -- resnet50v2 -- inception_v2 - -### Object Detection -- ssd -- yolov3 \ No newline at end of file diff --git a/docs/execution_providers/TensorRT-ExecutionProvider.md b/docs/execution_providers/TensorRT-ExecutionProvider.md deleted file mode 100644 index 8d26966e3f6c0..0000000000000 --- a/docs/execution_providers/TensorRT-ExecutionProvider.md +++ /dev/null @@ -1,114 +0,0 @@ -# TensorRT Execution Provider - -The TensorRT execution provider in the ONNX Runtime makes use of NVIDIA's [TensorRT](https://developer.nvidia.com/tensorrt) Deep Learning inferencing engine to accelerate ONNX model in their family of GPUs. Microsoft and NVIDIA worked closely to integrate the TensorRT execution provider with ONNX Runtime. - -With the TensorRT execution provider, the ONNX Runtime delivers better inferencing performance on the same hardware compared to generic GPU acceleration. - -## Build -For build instructions, please see the [BUILD page](../../BUILD.md#TensorRT). - -The TensorRT execution provider for ONNX Runtime is built and tested with TensorRT 7.1.3.4. - -## Using the TensorRT execution provider -### C/C++ -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -int device_id = 0; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id)); -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, device_id)); -Ort::Session session(env, model_path, sf); -``` -The C API details are [here](../C_API.md#c-api). - -#### Shape Inference for TensorRT Subgraphs -If some operators in the model are not supported by TensorRT, ONNX Runtime will partition the graph and only send supported subgraphs to TensorRT execution provider. Because TensorRT requires that all inputs of the subgraphs have shape specified, ONNX Runtime will throw error if there is no input shape info. In this case please run shape inference for the entire model first by running script [here](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/symbolic_shape_infer.py). 
- -#### Sample -This example shows how to run Faster R-CNN model on TensorRT execution provider, - -First, download Faster R-CNN onnx model from onnx model zoo [here](https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/faster-rcnn). - -Second, infer shapes in the model by running shape inference script [here](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/symbolic_shape_infer.py), -``` -python symbolic_shape_infer.py --input /path/to/onnx/model/model.onnx --output /path/to/onnx/model/new_model.onnx --auto_merge -``` - -Third, replace original model with the new model and run onnx_test_runner tool under ONNX Runtime build directory, -``` -./onnx_test_runner -e tensorrt /path/to/onnx/model/ -``` - -### Python -When using the Python wheel from the ONNX Runtime build with TensorRT execution provider, it will be automatically prioritized over the default GPU or CPU execution providers. There is no need to separately register the execution provider. Python APIs details are . - -#### Sample -Please see [this Notebook](../python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb) for an example of running a model on GPU using ONNX Runtime through Azure Machine Learning Services. - -## Performance Tuning -For performance tuning, please see guidance on this page: [ONNX Runtime Perf Tuning](../ONNX_Runtime_Perf_Tuning.md) - -When/if using [onnxruntime_perf_test](../../onnxruntime/test/perftest#onnxruntime-performance-test), use the flag `-e tensorrt` - -## Configuring environment variables -There are several environment variables for TensorRT execution provider. - -* ORT_TENSORRT_MAX_WORKSPACE_SIZE: maximum workspace size for TensorRT engine. Default value: 1073741824 (1GB). - -* ORT_TENSORRT_MAX_PARTITION_ITERATIONS: maximum number of iterations allowed in model partitioning for TensorRT. If target model can't be successfully partitioned when the maximum number of iterations is reached, the whole model will fall back to other execution providers such as CUDA or CPU. Default value: 1000. - -* ORT_TENSORRT_MIN_SUBGRAPH_SIZE: minimum node size in a subgraph after partitioning. Subgraphs with smaller size will fall back to other execution providers. Default value: 1. - -* ORT_TENSORRT_FP16_ENABLE: Enable FP16 mode in TensorRT. 1: enabled, 0: disabled. Default value: 0. - -* ORT_TENSORRT_INT8_ENABLE: Enable INT8 mode in TensorRT. 1: enabled, 0: disabled. Default value: 0. - -* ORT_TENSORRT_INT8_CALIBRATION_TABLE_NAME: Specify INT8 calibration table file name. By default the name is "INT8_calibration_table". - -* ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE: Select what calibration table is used. If 1, native TensorRT generated calibration table is used; if 0, ONNXRUNTIME tool generated calibration table is used. Default value: 0. -**Note: Please copy up-to-date calibration table file to ORT_TENSORRT_CACHE_PATH before inference. Calibration table is specific to models and calibration data sets. Whenever new calibration table is generated, old file in the path should be cleaned up or be replaced. - -* ORT_TENSORRT_ENGINE_CACHE_ENABLE: Enable TensorRT engine caching. The purpose of using engine caching is to save engine build time in the cases that TensorRT may take long time to optimize and build engine. Engine will be cached after it's built at the first time so that next time when inference session is created the engine can be loaded directly from cache. 
In order to validate that the loaded engine is usable for current inference, engine profile is also cached and loaded along with engine. If current input shapes are in the range of the engine profile, that means the loaded engine can be safely used. Otherwise if input shapes are out of range, profile cache will be updated to cover the new shape and engine will be recreated based on the new profile (and also refreshed in the engine cache). Note each engine is created for specific settings such as precision (FP32/FP16/INT8 etc), workspace, profiles etc, and specific GPUs and it's not portable, so it's essential to make sure those settings are not changing, otherwise the engines need to be rebuilt and cached again. 1: enabled, 0: disabled. Default value: 0. -**Warning: Please clean up any old engine and profile cache files (.engine and .profile) if any of the following changes:** - - Model changes (if there are any changes to the model topology, opset version etc.) - - ORT version changes (i.e. moving from ORT version 1.4 to 1.5) - - TensorRT version changes (i.e. moving from TensorRT 7.0 to 7.1) - - Hardware changes. (Engine and profile files are not portable and optimized for specific Nvidia hardware) - -* ORT_TENSORRT_ENGINE_CACHE_PATH: This variable is deprecated. Please use ORT_TENSORRT_CACHE_PATH instead. - -* ORT_TENSORRT_CACHE_PATH: Specify path for TensorRT engine and profile files if ORT_TENSORRT_ENGINE_CACHE_ENABLE is 1, or path for INT8 calibration table file if ORT_TENSORRT_INT8_ENABLE is 1. - -* ORT_TENSORRT_DUMP_SUBGRAPHS: Dumps the subgraphs that are transformed into TRT engines in onnx format to the filesystem. This can help debugging subgraphs, e.g. by using `trtexec --onnx my_model.onnx` and check the outputs of the parser. 1: enabled, 0: disabled. Default value: 0. - -One can override default values by setting environment variables ORT_TENSORRT_MAX_WORKSPACE_SIZE, ORT_TENSORRT_MAX_PARTITION_ITERATIONS, ORT_TENSORRT_MIN_SUBGRAPH_SIZE, ORT_TENSORRT_FP16_ENABLE, ORT_TENSORRT_INT8_ENABLE, ORT_TENSORRT_INT8_CALIBRATION_TABLE_NAME, ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE, ORT_TENSORRT_ENGINE_CACHE_ENABLE, ORT_TENSORRT_CACHE_PATH and ORT_TENSORRT_DUMP_SUBGRAPHS. -e.g. on Linux - -### override default max workspace size to 2GB -export ORT_TENSORRT_MAX_WORKSPACE_SIZE=2147483648 - -### override default maximum number of iterations to 10 -export ORT_TENSORRT_MAX_PARTITION_ITERATIONS=10 - -### override default minimum subgraph node size to 5 -export ORT_TENSORRT_MIN_SUBGRAPH_SIZE=5 - -### Enable FP16 mode in TensorRT -export ORT_TENSORRT_FP16_ENABLE=1 - -### Enable INT8 mode in TensorRT -export ORT_TENSORRT_INT8_ENABLE=1 - -### Use native TensorRT calibration table -export ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE=1 - -### Enable TensorRT engine caching -export ORT_TENSORRT_ENGINE_CACHE_ENABLE=1 -* Please Note warning above. This feature is experimental. Engine cache files must be invalidated if there are any changes to the model, ORT version, TensorRT version or if the -underlying hardware changes. Engine files are not portable across devices. 
- -### Specify TensorRT cache path -export ORT_TENSORRT_CACHE_PATH="/path/to/cache" - -### Dump out subgraphs to run on TensorRT -export ORT_TENSORRT_DUMP_SUBGRAPHS=1 diff --git a/docs/execution_providers/Vitis-AI-ExecutionProvider.md b/docs/execution_providers/Vitis-AI-ExecutionProvider.md deleted file mode 100644 index 0063d6bfb6209..0000000000000 --- a/docs/execution_providers/Vitis-AI-ExecutionProvider.md +++ /dev/null @@ -1,118 +0,0 @@ -

- -

- -# Vitis-AI Execution Provider - -[Vitis-AI](https://github.com/Xilinx/Vitis-AI) is Xilinx's development stack for hardware-accelerated AI inference on Xilinx platforms, including both edge devices and Alveo cards. It consists of optimized IP, tools, libraries, models, and example designs. It is designed with high efficiency and ease of use in mind, unleashing the full potential of AI acceleration on Xilinx FPGA and ACAP. - -The current Vitis-AI execution provider inside ONNXRuntime enables acceleration of Neural Network model inference using DPUv1. DPUv1 is a hardware accelerator for Convolutional Neural Networks (CNN) on top of the Xilinx [Alveo](https://www.xilinx.com/products/boards-and-kits/alveo.html) platform and targets U200 and U250 accelerator cards. - -On this page you will find information on how to [build](#Build) ONNXRuntime with Vitis-AI and on how to [get started](#Getting-started) with an example. - -## Build - -For building ONNXRuntime with the Vitis-AI execution provider, you will have to setup the hardware environment and build the docker, see [build steps](#Hardware-setup-and-docker-build). - -### System requirements - -The following table lists system requirements for running docker containers as well as Alveo cards. - - -| **Component** | **Requirement** | -|-----------------------------------------------------|------------------------------------------------------------| -| Motherboard | PCI Express 3\.0\-compliant with one dual\-width x16 slot | -| System Power Supply | 225W | -| Operating System | Ubuntu 16\.04, 18\.04 | -| | CentOS 7\.4, 7\.5 | -| | RHEL 7\.4, 7\.5 | -| CPU | Intel i3/i5/i7/i9/Xeon 64-bit CPU | -| GPU \(Optional to accelerate quantization\) | NVIDIA GPU with a compute capability > 3.0 | -| CUDA Driver \(Optional to accelerate quantization\) | nvidia\-410 | -| FPGA | Xilinx Alveo U200 or U250 | -| Docker Version | 19\.03\.1 | - -### Hardware setup and docker build - -1. Clone the Vitis AI repository: - ``` - git clone https://github.com/xilinx/vitis-ai - ``` -2. Install the Docker, and add the user to the docker group. Link the user to docker installation instructions from the following docker's website: - * https://docs.docker.com/install/linux/docker-ce/ubuntu/ - * https://docs.docker.com/install/linux/docker-ce/centos/ - * https://docs.docker.com/install/linux/linux-postinstall/ -3. Any GPU instructions will have to be separated from Vitis AI. -4. Set up Vitis AI to target Alveo cards. To target Alveo cards with Vitis AI for machine learning workloads, you must install the following software components: - * Xilinx Runtime (XRT) - * Alveo Deployment Shells (DSAs) - * Xilinx Resource Manager (XRM) (xbutler) - * Xilinx Overlaybins (Accelerators to Dynamically Load - binary programming files) - - While it is possible to install all of these software components individually, a script has been provided to automatically install them at once. To do so: - * Run the following commands: - ``` - cd Vitis-AI/alveo/packages - sudo su - ./install.sh - ``` - * Power cycle the system. -5. Build and start the ONNXRuntime Vitis-AI Docker Container. - ``` - cd {onnxruntime-root}/dockerfiles - docker build -t onnxruntime-vitisai -f Dockerfile.vitisai . 
- ./scripts/docker_run_vitisai.sh - ``` - - Setup inside container - ``` - source /opt/xilinx/xrt/setup.sh - conda activate vitis-ai-tensorflow - ``` - -## Getting started - -### On-the-fly quantization - -Usually, to be able to accelerate inference of Neural Network models with Vitis-AI DPU accelerators, those models need to be quantized upfront. In the ONNXRuntime Vitis-AI execution provider we make use of on-the-fly quantization to remove this additional preprocessing step. In this flow, one doesn't need to quantize the model upfront but can make use of the typical inference execution calls (InferenceSession.run) to quantize the model on-the-fly using the first N inputs that are provided (see more information below). This will set up and calibrate the Vitis-AI DPU and from that point onwards inference will be accelerated for all subsequent inputs. - -### Config/Settings - -A couple of environment variables can be used to customize the Vitis-AI execution provider. - -| **Environment Variable** | **Default if unset** | **Explanation** | -|----------------------------|---------------------------|---------------------------------------------------------| -| PX_QUANT_SIZE | 128 | The number of inputs that will be used for quantization (necessary for Vitis-AI acceleration) | -| PX_BUILD_DIR | Use the on-the-fly quantization flow | Loads the quantization and compilation information from the provided build directory and immediately starts Vitis-AI hardware acceleration. This configuration can be used if the model has been executed before using on-the-fly quantization during which the quantization and compilation information was cached in a build directory. | - -### Samples - -When using Python, you can use the following example as a starting point: - -``` -# Import pyxir before onnxruntime -import pyxir -import pyxir.frontend.onnx -import pyxir.contrib.dpuv1.dpuv1 - -import onnxruntime - -# Add other imports -# ... - -# Load inputs and do preprocessing -# ... - -# Create an inference session using the Vitis-AI execution provider -session = onnxruntime.InferenceSession('[model_file].onnx', None, ["VitisAIExecutionProvider"]) - -# First N (default = 128) inputs are used for quantization calibration and will -# be executed on the CPU -# This config can be changed by setting the 'PX_QUANT_SIZE' environment variable (e.g. export PX_QUANT_SIZE=64) -input_name = [...] -outputs = [session.run([], {input_name: calib_inputs[i]})[0] for i in range(128)] - -# Afterwards, computations will be accelerated on the FPGA -input_data = [...] -result = session.run([], {input_name: input_data}) -``` diff --git a/docs/python/examples/plot_dl_keras.py b/docs/python/examples/plot_dl_keras.py deleted file mode 100644 index 949ee895e5912..0000000000000 --- a/docs/python/examples/plot_dl_keras.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -""" - -.. _l-example-backend-api-tensorflow: - -ONNX Runtime for Keras -====================== - -The following demonstrates how to compute the predictions -of a pretrained deep learning model obtained from -`keras `_ -with *onnxruntime*. The conversion requires -`keras `_, -`tensorflow `_, -`keras-onnx `_, -`onnxmltools `_ -but then only *onnxruntime* is required -to compute the predictions.
-""" -import os -if not os.path.exists('dense121.onnx'): - from keras.applications.densenet import DenseNet121 - model = DenseNet121(include_top=True, weights='imagenet') - - from keras2onnx import convert_keras - onx = convert_keras(model, 'dense121.onnx') - with open("dense121.onnx", "wb") as f: - f.write(onx.SerializeToString()) - -################################## -# Let's load an image (source: wikipedia). - -from keras.preprocessing.image import array_to_img, img_to_array, load_img -img = load_img('Sannosawa1.jpg') -ximg = img_to_array(img) - -import matplotlib.pyplot as plt -plt.imshow(ximg / 255) -plt.axis('off') - -############################################# -# Let's load the model with onnxruntime. -import onnxruntime as rt -from onnxruntime.capi.onnxruntime_pybind11_state import InvalidGraph - -try: - sess = rt.InferenceSession('dense121.onnx') - ok = True -except (InvalidGraph, TypeError, RuntimeError) as e: - # Probably a mismatch between onnxruntime and onnx version. - print(e) - ok = False - -if ok: - print("The model expects input shape:", sess.get_inputs()[0].shape) - print("image shape:", ximg.shape) - -####################################### -# Let's resize the image. - -if ok: - from skimage.transform import resize - import numpy - - ximg224 = resize(ximg / 255, (224, 224, 3), anti_aliasing=True) - ximg = ximg224[numpy.newaxis, :, :, :] - ximg = ximg.astype(numpy.float32) - - print("new shape:", ximg.shape) - -################################## -# Let's compute the output. - -if ok: - input_name = sess.get_inputs()[0].name - res = sess.run(None, {input_name: ximg}) - prob = res[0] - print(prob.ravel()[:10]) # Too big to be displayed. - - -################################## -# Let's get more comprehensive results. - -if ok: - from keras.applications.densenet import decode_predictions - decoded = decode_predictions(prob) - - import pandas - df = pandas.DataFrame(decoded[0], columns=["class_id", "name", "P"]) - print(df) - - diff --git a/docs/python/requirements.txt b/docs/python/requirements.txt index da66f486e8a0d..8bfb4e15bdea5 100644 --- a/docs/python/requirements.txt +++ b/docs/python/requirements.txt @@ -1,5 +1,3 @@ -keras -keras-onnx sphinx sphinx-gallery pyquickhelper diff --git a/include/onnxruntime/core/common/code_location.h b/include/onnxruntime/core/common/code_location.h index ff6506c9a7abb..2fdb2d3a41630 100644 --- a/include/onnxruntime/core/common/code_location.h +++ b/include/onnxruntime/core/common/code_location.h @@ -19,7 +19,7 @@ struct CodeLocation { */ CodeLocation(const char* file_path, const int line, const char* func) : file_and_path{file_path}, line_num{line}, function{func} { - } + } /** @param file_path Usually the value of __FILE__ @@ -29,7 +29,7 @@ struct CodeLocation { */ CodeLocation(const char* file_path, const int line, const char* func, const std::vector& stacktrace) : file_and_path{file_path}, line_num{line}, function{func}, stacktrace(stacktrace) { - } + } std::string FileNoPath() const { // assuming we always have work to do, so not trying to avoid creating a new string if diff --git a/include/onnxruntime/core/common/common.h b/include/onnxruntime/core/common/common.h index 2d9dc34e348da..6394c8f387e00 100644 --- a/include/onnxruntime/core/common/common.h +++ b/include/onnxruntime/core/common/common.h @@ -193,19 +193,16 @@ void LogRuntimeError(uint32_t session_id, const common::Status& status, const ch ::onnxruntime::MakeString(__VA_ARGS__)) // Check condition. if met, return status. -#define ORT_RETURN_IF(condition, ...) 
\ - if (condition) { \ - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, \ - "Satisfied, but should not be: " #condition "\n", \ - ORT_WHERE.ToString(), ::onnxruntime::MakeString(__VA_ARGS__)); \ +#define ORT_RETURN_IF(condition, ...) \ + if (condition) { \ + return ::onnxruntime::common::Status(::onnxruntime::common::ONNXRUNTIME, \ + ::onnxruntime::common::FAIL, \ + ::onnxruntime::MakeString(ORT_WHERE.ToString(), " ", __VA_ARGS__)); \ } // Check condition. if not met, return status. -#define ORT_RETURN_IF_NOT(condition, ...) \ - if (!(condition)) { \ - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Not satisfied: " #condition "\n", \ - ORT_WHERE.ToString(), ::onnxruntime::MakeString(__VA_ARGS__)); \ - } +#define ORT_RETURN_IF_NOT(condition, ...) \ + ORT_RETURN_IF(!(condition), __VA_ARGS__) // Macros to disable the copy and/or move ctor and assignment methods // These are usually placed in the private: declarations for a class. diff --git a/include/onnxruntime/core/framework/data_types.h b/include/onnxruntime/core/framework/data_types.h index 4e7f3c6e60343..c71376d3164b3 100644 --- a/include/onnxruntime/core/framework/data_types.h +++ b/include/onnxruntime/core/framework/data_types.h @@ -59,50 +59,10 @@ struct MLFloat16 { explicit MLFloat16(uint16_t x) : val(x) {} explicit MLFloat16(float f); - // Taken from https://stackoverflow.com/a/60047308/12627730 - float AsFloat(uint32_t x) const { - float out = 0.0f; - std::memcpy(&out, &x, sizeof(x)); - return out; - } - - // Taken from https://stackoverflow.com/a/60047308/12627730 - uint32_t AsUint(float x) const { - uint32_t out = 0; - std::memcpy(&out, &x, sizeof(x)); - return out; - } - - float HalfToFloat(const uint16_t x) const { - uint16_t half = x; - if (endian::native == endian::big) { - // Taken from https://stackoverflow.com/a/2182184/12627730 - half = (x >> 8) | (x << 8); - } - - // Taken from https://stackoverflow.com/a/60047308/12627730 - // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, - // +-5.9604645E-8, 3.311 digits - const uint32_t e = (half & 0x7C00) >> 10; // exponent - const uint32_t m = (half & 0x03FF) << 13; // mantissa - // evil log2 bit hack to count leading zeros in denormalized format - const uint32_t v = AsUint(static_cast(m)) >> 23; - uint32_t full = (half & 0x8000) << 16 | (e != 0) * ((e + 112) << 23 | m) | - ((e == 0) & (m != 0)) * ((v - 37) << 23 | ((m << (150 - v)) & 0x007FE000)); // sign : normalized : denormalized - - if (endian::native == endian::big) { - // Taken from https://stackoverflow.com/a/2182184/12627730 - full = ((full >> 24) & 0xff) | // move byte 3 to byte 0 - ((full << 8) & 0xff0000) | // move byte 1 to byte 2 - ((full >> 8) & 0xff00) | // move byte 2 to byte 1 - ((full << 24) & 0xff000000); // byte 0 to byte 3 - } - - return AsFloat(full); - } + float ToFloat() const; operator float() const { - return HalfToFloat(val); + return ToFloat(); } }; diff --git a/include/onnxruntime/core/framework/data_types_internal.h b/include/onnxruntime/core/framework/data_types_internal.h index c195deaee0329..8e1649c5523ea 100644 --- a/include/onnxruntime/core/framework/data_types_internal.h +++ b/include/onnxruntime/core/framework/data_types_internal.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "boost/mp11.hpp" @@ -16,11 +17,6 @@ #include "core/framework/data_types.h" #include "core/graph/onnx_protobuf.h" -#ifdef _MSC_VER -#pragma warning(push) -//TODO: fix the warning in CallableDispatchableRetHelper -#pragma warning(disable : 4702) -#endif 
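To put the rewritten macros in context, here is a small hypothetical usage sketch; the validation function and its limits are invented for illustration, while `ORT_RETURN_IF`, `ORT_RETURN_IF_NOT`, and `common::Status` come from the headers shown above:

```
#include "core/common/common.h"
#include "core/common/status.h"

namespace onnxruntime {

// Hypothetical validation helper, written only to illustrate the macros.
common::Status ValidateBatchSize(int64_t batch_size) {
  // Returns a FAIL Status (the message includes ORT_WHERE plus the extra
  // arguments, concatenated via MakeString) when the condition is false.
  ORT_RETURN_IF_NOT(batch_size > 0, "batch_size must be positive, got ", batch_size);

  // Returns a FAIL Status when the condition is true.
  ORT_RETURN_IF(batch_size > 1024, "batch_size too large: ", batch_size);

  return common::Status::OK();
}

}  // namespace onnxruntime
```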
namespace onnxruntime { namespace utils { @@ -116,7 +112,7 @@ constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorProtoElementType(__VA_ARGS__); \ break; \ case ONNX_NAMESPACE::TensorProto_DataType_UINT8: \ - function(__VA_ARGS__); \ + function(__VA_ARGS__); \ break; \ case ONNX_NAMESPACE::TensorProto_DataType_INT16: \ function(__VA_ARGS__); \ @@ -223,11 +219,13 @@ inline bool IsPrimitiveDataType(const PrimitiveDataTypeBase* prim_type) { // This implementation contains a workaround for GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47226 // GCC until very recently does not support template parameter pack expansion within lambda context. namespace mltype_dispatcher_internal { + // T - type handled by this helper -struct CallableDispatchableHelper { +class CallableDispatchableHelper { int32_t dt_type_; // Type currently dispatched size_t called_; + public: explicit CallableDispatchableHelper(int32_t dt_type) noexcept : dt_type_(dt_type), called_(0) {} // Must return integer to be in a expandable context @@ -239,6 +237,10 @@ struct CallableDispatchableHelper { } return 0; } + + void CheckCalledOnce() { + ORT_ENFORCE(called_ == 1, "Unsupported data type: ", dt_type_); + } }; // Default policy is to throw with no return type. @@ -250,17 +252,16 @@ struct UnsupportedTypeDefaultPolicy { }; // Helper with the result type -template > -struct CallableDispatchableRetHelper { +template +class CallableDispatchableRetHelper { int32_t dt_type_; // Type currently dispatched size_t called_; Ret result_; + public: explicit CallableDispatchableRetHelper(int32_t dt_type) noexcept : dt_type_(dt_type), called_(0), result_() {} Ret Get() { - // See if there were multiple invocations.It is a bug. - ORT_ENFORCE(called_ < 2, "Check for duplicate types in MLTypeCallDispatcherRet"); // No type was invoked if (called_ == 0) { result_ = UnsupportedPolicy()(dt_type_); @@ -279,119 +280,142 @@ struct CallableDispatchableRetHelper { } }; +template +using TensorProtoElementTypeConstant = + std::integral_constant()>; + +using UndefinedTensorProtoElementTypeConstant = + std::integral_constant; + } // namespace mltype_dispatcher_internal -// This class helps to efficiently dispatch calls for templated -// kernel implementation functions that has no return value. -// If your implementation function must return a value such as Status -// Use MLTypeCallDispatcherRet class. -// -// The first template parameter is a template struct/class functor -// that must implement operator() with arbitrary number of arguments -// and void return turn. It must return Ret type if you are using MLTypeCallDispatcherRet. -// Fn must be default constructible. -// -// Types is a type list that are supported by this kernel implementation. -// There should be no duplicate types. An exception will be thrown if there -// a duplicate. -// -// The constructor accepts an enum that is obtained from -// input_tensor->DataType()->AsPrimitiveType()->GetDataType(). -// Fn will be called only once the type designated by dt_type value. -// If current dt_type is not handled, the Dispatcher will throw an exception. -// -template
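To make the dispatcher contract described in these comments concrete, here is a minimal hypothetical sketch. The per-type functor, the listed types, and the element-type value are invented, and the exact `MLTypeCallDispatcher` template signature and `Invoke` method are assumed from the comments above rather than taken verbatim from the header:

```
#include "core/framework/data_types_internal.h"

// Hypothetical per-type functor: writes sizeof(T) for the dispatched type T.
// It is default constructible and exposes operator(), as the comments require.
template <typename T>
struct ElementSizeFn {
  void operator()(size_t& out) const { out = sizeof(T); }
};

size_t ElementSize(int32_t elem_type) {  // a TensorProto_DataType value
  size_t size = 0;
  // Dispatch on the runtime element type; an exception is thrown if
  // elem_type is not one of the listed types.
  onnxruntime::utils::MLTypeCallDispatcher<ElementSizeFn, float, double, int64_t>
      dispatcher(elem_type);
  dispatcher.Invoke(size);
  return size;
}
```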