Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .pipelines/nuget-publishing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ parameters:
- name: ort_version
displayName: 'OnnxRuntime version'
type: string
default: '1.22.0'
default: '1.23.0'

- name: ort_winml_version
displayName: 'Microsoft.WindowsAppSDK.ML Version (should match CMakeList.txt)'
Expand All @@ -76,12 +76,12 @@ parameters:
- name: ort_cuda_version
displayName: 'OnnxRuntime GPU version'
type: string
default: '1.22.0'
default: '1.23.0'

- name: ort_dml_version
displayName: 'OnnxRuntime DML version'
type: string
default: '1.22.0'
default: '1.23.0'

- name: cuda_version
displayName: 'CUDA version'
Expand Down
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ if(MSVC)
"$<$<COMPILE_LANGUAGE:C>:/wd4100>"
"$<$<COMPILE_LANGUAGE:CXX>:/wd4100>"

# Suppress warning C4819: file contains character that cannot be represented in current code page
"$<$<COMPILE_LANGUAGE:C>:/wd4819>"
"$<$<COMPILE_LANGUAGE:CXX>:/wd4819>"

# Enable warning level 4 (more aggressive than default /W3)
# Captures more potential bugs or code smells
"$<$<COMPILE_LANGUAGE:C>:/W4>"
Expand Down
8 changes: 4 additions & 4 deletions cmake/ortlib.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,16 @@ if(ORT_HOME)
endif()
else()
# If ORT_HOME is not specified, download the onnxruntime headers and libraries from the nightly feed
set(ORT_VERSION "1.22.0")
set(ORT_VERSION "1.23.0")
set(ORT_FEED_ORG_NAME "aiinfra")
set(ORT_FEED_PROJECT "2692857e-05ef-43b4-ba9c-ccf1c22c437c")
set(ORT_NIGHTLY_FEED_ID "7982ae20-ed19-4a35-a362-a96ac99897b7")

if (USE_DML)
set(ORT_VERSION "1.22.0")
set(ORT_VERSION "1.23.0")
set(ORT_PACKAGE_NAME "Microsoft.ML.OnnxRuntime.DirectML")
elseif(USE_CUDA)
set(ORT_VERSION "1.22.0")
set(ORT_VERSION "1.23.0")
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set(ORT_PACKAGE_NAME "Microsoft.ML.OnnxRuntime.Gpu.Linux")
elseif(WIN32)
Expand All @@ -99,7 +99,7 @@ else()
message(FATAL_ERROR "Unsupported platform for CUDA")
endif()
elseif(USE_ROCM)
set(ORT_VERSION "1.22.0")
set(ORT_VERSION "1.23.0")
set(ORT_PACKAGE_NAME "Microsoft.ML.OnnxRuntime.Rocm")
else()
set(ORT_PACKAGE_NAME "Microsoft.ML.OnnxRuntime")
Expand Down
6 changes: 3 additions & 3 deletions examples/slm_engine/build_scripts/build_deps.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,9 +577,9 @@ def main():
ort_home = None
if args.build_ort_from_source:
if args.ort_version_to_use is None:
# If not Windows then use 1.22.0
# If not Windows then use 1.23.0
if platform.system() != "Windows":
args.ort_version_to_use = "v1.22.0"
args.ort_version_to_use = "v1.23.0"
else:
args.ort_version_to_use = "main"
ort_home = build_ort(args, dep_src_dir, artifacts_dir)
Expand All @@ -590,7 +590,7 @@ def main():
# The ORT binaries are available as they were downloaded during the GenAI build
# This is the supported version for most platforms
if args.ort_version_to_use is None:
ORT_VERSION = "1.22.0"
ORT_VERSION = "1.23.0"
else:
ORT_VERSION = args.ort_version_to_use
# Copy the ORT artifacts to the artifacts directory.
Expand Down
7 changes: 7 additions & 0 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,13 @@ bool IsGraphCaptureEnabled(const Config::SessionOptions& session_options) {
}
} else if (provider_options->name == "DML") {
return true;
} else if (provider_options->name == "WebGPU") {
for (const auto& value : provider_options->options) {
if (value.first == "enableGraphCapture" && value.second == "1") {
return true;
}
}
return false;
} else if (provider_options->name == "NvTensorRtRtx") {
for (const auto& value : provider_options->options) {
if (value.first == "enable_cuda_graph" && value.second == "1") {
Expand Down
4 changes: 3 additions & 1 deletion src/models/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,9 @@ Model::Model(std::unique_ptr<Config> config) : config_{std::move(config)} {
EnsureDeviceOrtInit(*p_device_, *config_, arena_cfg_);

// Only CUDA, TRT-RTX and DML does every input on the device
if (p_device_->GetType() == DeviceType::CUDA || p_device_->GetType() == DeviceType::DML || p_device_->GetType() == DeviceType::NvTensorRtRtx)
// For WebGPU, use device memory only if graph capture is enabled, otherwise use CPU
if (p_device_->GetType() == DeviceType::CUDA || p_device_->GetType() == DeviceType::DML || p_device_->GetType() == DeviceType::NvTensorRtRtx ||
(p_device_->GetType() == DeviceType::WEBGPU && IsGraphCaptureEnabled(config_->model.decoder.session_options)))
p_device_inputs_ = p_device_;
else
p_device_inputs_ = GetDeviceInterface(DeviceType::CPU);
Expand Down
28 changes: 28 additions & 0 deletions src/models/onnxruntime_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,14 @@ struct OrtEnv {

OrtEnv& CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info, const OrtArenaCfg& arena_cfg); ///< Wraps OrtApi::CreateAndRegisterAllocator

/// \brief Copy tensors between devices. Wraps OrtApi::CopyTensors
/// \param src_tensors Array of source OrtValue tensors
/// \param dst_tensors Array of destination OrtValue tensors (must be pre-allocated)
/// \param stream Optional sync stream for asynchronous copy (can be nullptr for synchronous)
void CopyTensors(const std::vector<const OrtValue*>& src_tensors,
const std::vector<OrtValue*>& dst_tensors,
OrtSyncStream* stream = nullptr) const;

std::vector<const OrtEpDevice*> GetEpDevices();

static void operator delete(void* p) { Ort::api->ReleaseEnv(reinterpret_cast<OrtEnv*>(p)); }
Expand Down Expand Up @@ -848,6 +856,26 @@ struct OrtShape {
size_t shape_len;
};

/** \brief Wrapper around ::OrtSyncStream
*
* Used for asynchronous operations like CopyTensors.
* Requires ONNX Runtime 1.23.0 or later.
*/
struct OrtSyncStream {
  /// \brief Create a sync stream for a specific execution provider device.
  /// \param ep_device The execution provider device (from OrtEnv::GetEpDevices)
  /// \param stream_options Optional stream configuration options (nullptr for defaults)
  /// \return Owning pointer; the underlying ORT stream is released via operator delete below.
  static std::unique_ptr<OrtSyncStream> Create(const OrtEpDevice* ep_device, const OrtKeyValuePairs* stream_options = nullptr);

  /// \brief Get the native stream handle (e.g., cudaStream_t for CUDA).
  /// NOTE(review): the concrete handle type depends on the owning EP — confirm per provider before casting.
  void* GetHandle() const;

  // Custom operator delete so that unique_ptr destruction releases the ORT stream.
  static void operator delete(void* p) {
    if (p) Ort::api->ReleaseSyncStream(reinterpret_cast<OrtSyncStream*>(p));
  }
  Ort::Abstract make_abstract;
};

/** \brief Wrapper around ::OrtValue
*
*/
Expand Down
19 changes: 19 additions & 0 deletions src/models/onnxruntime_inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,16 @@ inline std::unique_ptr<OrtMemoryInfo> OrtMemoryInfo::Create(const char* name, Or
return std::unique_ptr<OrtMemoryInfo>{p};
}

inline std::unique_ptr<OrtSyncStream> OrtSyncStream::Create(const OrtEpDevice* ep_device, const OrtKeyValuePairs* stream_options) {
OrtSyncStream* p_stream = nullptr;
Ort::ThrowOnError(Ort::api->CreateSyncStreamForEpDevice(ep_device, stream_options, &p_stream));
return std::unique_ptr<OrtSyncStream>(p_stream);
}

inline void* OrtSyncStream::GetHandle() const {
return Ort::api->SyncStream_GetHandle(const_cast<OrtSyncStream*>(this));
}

inline std::unique_ptr<OrtIoBinding> OrtIoBinding::Create(OrtSession& session) {
OrtIoBinding* p;
Ort::ThrowOnError(Ort::api->CreateIoBinding(&session, &p));
Expand Down Expand Up @@ -398,6 +408,15 @@ inline OrtEnv& OrtEnv::CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info,
return *this;
}

/// Pairwise copy: src_tensors[i] -> dst_tensors[i]. Destinations must be pre-allocated.
/// \throws std::runtime_error if the two arrays differ in length; Ort exception on API failure.
inline void OrtEnv::CopyTensors(const std::vector<const OrtValue*>& src_tensors,
                                const std::vector<OrtValue*>& dst_tensors,
                                OrtSyncStream* stream) const {
  if (src_tensors.size() != dst_tensors.size()) {
    throw std::runtime_error("Number of source and destination tensors must match");
  }
  // Nothing to copy; also avoids handing nullptr data() pointers to the C API
  // when both vectors are empty.
  if (src_tensors.empty()) {
    return;
  }
  Ort::ThrowOnError(Ort::api->CopyTensors(this, src_tensors.data(), dst_tensors.data(), stream, src_tensors.size()));
}

inline std::vector<const OrtEpDevice*> OrtEnv::GetEpDevices() {
size_t num_devices = 0;
const OrtEpDevice* const* device_ptrs = nullptr;
Expand Down
121 changes: 114 additions & 7 deletions src/webgpu/interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,29 +14,136 @@ const char* device_label = "WebGPU";
struct WebGPUMemory final : DeviceBuffer {
WebGPUMemory(size_t size) : owned_{true} {
size_in_bytes_ = size;
p_cpu_ = p_device_ = static_cast<uint8_t*>(ort_allocator_->Alloc(size_in_bytes_));
p_device_ = static_cast<uint8_t*>(ort_allocator_->Alloc(size_in_bytes_));
}

WebGPUMemory(void* p, size_t size) : owned_{false} {
size_in_bytes_ = size;
p_cpu_ = p_device_ = static_cast<uint8_t*>(p);
p_device_ = static_cast<uint8_t*>(p);
}

~WebGPUMemory() override {
if (owned_)
ort_allocator_->Free(p_device_);
if (p_cpu_)
free(p_cpu_);
}

const char* GetType() const override { return device_label; }
void AllocateCpu() override { throw std::runtime_error("CPU can't access WebGPU memory"); }
void CopyDeviceToCpu() override { throw std::runtime_error("CPU can't access WebGPU memory"); }
void CopyCpuToDevice() override { throw std::runtime_error("CPU can't access WebGPU memory"); }

// Lazily allocates the host-side staging buffer used by CopyDeviceToCpu/CopyCpuToDevice.
// Uses malloc (not new[]) to match the free() in the destructor.
void AllocateCpu() override {
  if (!p_cpu_) {
    p_cpu_ = static_cast<uint8_t*>(malloc(size_in_bytes_));
    // malloc can fail; an unchecked nullptr would otherwise flow into tensor creation.
    if (!p_cpu_)
      throw std::runtime_error("Failed to allocate CPU staging buffer for WebGPU memory");
  }
}

// Reads the WebGPU device buffer back into the CPU staging buffer (p_cpu_).
// The CPU cannot map WebGPU memory directly, so the transfer goes through ORT's
// CopyTensors API using two non-owning tensor views over the existing buffers.
void CopyDeviceToCpu() override {
  if (!ort_allocator_) {
    throw std::runtime_error("WebGPU allocator not initialized");
  }

  // Ensure the staging buffer exists before using it as the copy destination.
  AllocateCpu();

  // Get WebGPU allocator's memory info
  const OrtMemoryInfo* webgpu_mem_info = nullptr;
  Ort::ThrowOnError(Ort::api->AllocatorGetInfo(ort_allocator_, &webgpu_mem_info));

  // Create source tensor (WebGPU device memory) - treat as 1D uint8 array.
  // Both OrtValues wrap user-provided pointers; they do not take ownership.
  int64_t shape_val = static_cast<int64_t>(size_in_bytes_);
  std::span<const int64_t> shape{&shape_val, 1};
  auto src_tensor = OrtValue::CreateTensor(*webgpu_mem_info, p_device_, size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);

  // Create CPU memory info and destination tensor
  auto cpu_mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  auto dst_tensor = OrtValue::CreateTensor(*cpu_mem_info, p_cpu_, size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);

  // Use ORT C API's CopyTensors (stream = nullptr; assumed synchronous —
  // TODO confirm against the ORT 1.23 CopyTensors contract).
  OrtValue* src_ptrs[] = {src_tensor.get()};
  OrtValue* dst_ptrs[] = {dst_tensor.get()};
  Ort::ThrowOnError(Ort::api->CopyTensors(&GetOrtEnv(), src_ptrs, dst_ptrs, nullptr, 1));
}

// Uploads the CPU staging buffer (p_cpu_) into the WebGPU device buffer through
// ORT's CopyTensors API, using non-owning tensor views over the existing buffers.
void CopyCpuToDevice() override {
  if (!ort_allocator_) {
    throw std::runtime_error("WebGPU allocator not initialized");
  }
  // An assert() would compile out in release builds and let a null pointer reach
  // the tensor creation below, so validate the precondition explicitly.
  if (!p_cpu_) {
    throw std::runtime_error("CopyCpuToDevice called before CPU buffer was allocated");
  }

  // Get WebGPU allocator's memory info
  const OrtMemoryInfo* webgpu_mem_info = nullptr;
  Ort::ThrowOnError(Ort::api->AllocatorGetInfo(ort_allocator_, &webgpu_mem_info));

  // Create source tensor (CPU memory) - treat as 1D uint8 array
  int64_t shape_val = static_cast<int64_t>(size_in_bytes_);
  std::span<const int64_t> shape{&shape_val, 1};
  auto cpu_mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  auto src_tensor = OrtValue::CreateTensor(*cpu_mem_info, p_cpu_, size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);

  // Create destination tensor (WebGPU device memory)
  auto dst_tensor = OrtValue::CreateTensor(*webgpu_mem_info, p_device_, size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);

  // Use ORT C API's CopyTensors (synchronous copy, stream = nullptr)
  OrtValue* src_ptrs[] = {src_tensor.get()};
  OrtValue* dst_ptrs[] = {dst_tensor.get()};
  Ort::ThrowOnError(Ort::api->CopyTensors(&GetOrtEnv(), src_ptrs, dst_ptrs, nullptr, 1));
}

// Copies size_in_bytes from source (at begin_source) into this buffer (at begin_dest).
// Device-to-device full-buffer copies go through ORT's CopyTensors; everything else
// falls back to a staged copy through CPU memory.
void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override {
  // The unconditional throw left over from the old stub has been removed: it was the
  // first statement of the function and made the entire implementation below dead code.
  if (!ort_allocator_) {
    throw std::runtime_error("WebGPU allocator not initialized");
  }

  // Fast path: WebGPU-to-WebGPU copy with zero offsets.
  // NOTE: p_device_ is a WGPUBuffer handle (cast to uint8_t*), not a memory pointer.
  // We cannot use pointer arithmetic (p_device_ + offset) to create sub-buffer views;
  // OrtValue::CreateTensor expects the actual buffer handle, not an offset pointer.
  if (source.GetType() == device_label && begin_source == 0 && begin_dest == 0) {
    // Get WebGPU allocator's memory info
    const OrtMemoryInfo* webgpu_mem_info = nullptr;
    Ort::ThrowOnError(Ort::api->AllocatorGetInfo(ort_allocator_, &webgpu_mem_info));

    // Full buffer copy using CopyTensors (no offsets)
    int64_t shape_val = static_cast<int64_t>(size_in_bytes);
    std::span<const int64_t> shape{&shape_val, 1};
    auto src_tensor = OrtValue::CreateTensor(*webgpu_mem_info, source.p_device_, size_in_bytes, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
    auto dst_tensor = OrtValue::CreateTensor(*webgpu_mem_info, p_device_, size_in_bytes, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);

    // Use ORT C API's CopyTensors for GPU-to-GPU copy
    OrtValue* src_ptrs[] = {src_tensor.get()};
    OrtValue* dst_ptrs[] = {dst_tensor.get()};
    Ort::ThrowOnError(Ort::api->CopyTensors(&GetOrtEnv(), src_ptrs, dst_ptrs, nullptr, 1));
  } else {
    // Fallback: Copy through CPU for:
    // - WebGPU-to-WebGPU copies with non-zero offsets (buffer handles don't support offset arithmetic)
    // - Cross-device copies (e.g., CPU to WebGPU or vice versa)
    CopyThroughCpu(*this, begin_dest, source, begin_source, size_in_bytes);
  }
}

// Fills the WebGPU device buffer with zeros by uploading a zero-initialized CPU
// buffer via ORT's CopyTensors (there is no direct device-side memset available here).
void Zero() override {
  // The unconditional "Zeroing not implemented" throw left over from the old stub has
  // been removed: it preceded the implementation below and made it unreachable.
  if (!ort_allocator_) {
    throw std::runtime_error("WebGPU allocator not initialized");
  }

  // std::vector value-initializes its elements, so this buffer is all zeros.
  std::vector<uint8_t> zero_buffer(size_in_bytes_, 0);

  // Get WebGPU allocator's memory info
  const OrtMemoryInfo* webgpu_mem_info = nullptr;
  Ort::ThrowOnError(Ort::api->AllocatorGetInfo(ort_allocator_, &webgpu_mem_info));

  // Create source tensor (CPU memory with zeros) - treat as 1D uint8 array
  int64_t shape_val = static_cast<int64_t>(size_in_bytes_);
  std::span<const int64_t> shape{&shape_val, 1};
  auto cpu_mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  auto src_tensor = OrtValue::CreateTensor(*cpu_mem_info, zero_buffer.data(), size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);

  // Create destination tensor (WebGPU device memory)
  auto dst_tensor = OrtValue::CreateTensor(*webgpu_mem_info, p_device_, size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);

  // Use ORT C API's CopyTensors to copy zeros to GPU (synchronous copy, stream = nullptr)
  OrtValue* src_ptrs[] = {src_tensor.get()};
  OrtValue* dst_ptrs[] = {dst_tensor.get()};
  Ort::ThrowOnError(Ort::api->CopyTensors(&GetOrtEnv(), src_ptrs, dst_ptrs, nullptr, 1));
}

bool owned_;
Expand Down
2 changes: 1 addition & 1 deletion test/python/cpu/ort/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
onnxruntime==1.22.0
onnxruntime==1.23.0
2 changes: 1 addition & 1 deletion test/python/cuda/ort/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
onnxruntime-gpu==1.22.0
onnxruntime-gpu==1.23.0
2 changes: 1 addition & 1 deletion test/python/directml/ort/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
onnxruntime-directml==1.22.0
onnxruntime-directml==1.23.0
2 changes: 1 addition & 1 deletion test/python/macos/ort/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
onnxruntime==1.22.0
onnxruntime==1.23.0
Loading