Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmake/global_variables.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ file(GLOB generator_srcs CONFIGURE_DEPENDS
"${GENERATORS_ROOT}/webgpu/*.cpp"
"${GENERATORS_ROOT}/openvino/*.h"
"${GENERATORS_ROOT}/openvino/*.cpp"
"${GENERATORS_ROOT}/ryzenai/*.h"
"${GENERATORS_ROOT}/ryzenai/*.cpp"
"${MODELS_ROOT}/*.h"
"${MODELS_ROOT}/*.cpp"
"${ENGINE_ROOT}/*.h"
Expand Down
10 changes: 9 additions & 1 deletion src/generators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "qnn/interface.h"
#include "webgpu/interface.h"
#include "openvino/interface.h"
#include "ryzenai/interface.h"
#include "engine/engine.h"

#if defined(_WIN32)
Expand Down Expand Up @@ -94,6 +95,8 @@ void Shutdown() {
}

GetOrtGlobals().reset(); // Delete now because on process exit is too late

RyzenAIInterface::Shutdown();
}

OrtEnv& GetOrtEnv() {
Expand Down Expand Up @@ -224,6 +227,8 @@ std::string to_string(DeviceType device_type) {
return "OpenVINO";
case DeviceType::NvTensorRtRtx:
return "NvTensorRtRtx";
case DeviceType::RyzenAI:
return "RyzenAI";
default:
throw std::runtime_error("Unknown device type");
}
Expand All @@ -247,6 +252,8 @@ DeviceInterface* GetDeviceInterface(DeviceType type) {
return GetQNNInterface();
case DeviceType::OpenVINO:
return GetOpenVINOInterface();
case DeviceType::RyzenAI:
return GetRyzenAIInterface();
}
}

Expand Down Expand Up @@ -358,7 +365,8 @@ void Generator::AppendTokens(cpu_span<const int32_t> input_ids) {
DeviceType::CUDA,
DeviceType::WEBGPU,
DeviceType::OpenVINO,
DeviceType::NvTensorRtRtx};
DeviceType::NvTensorRtRtx,
DeviceType::RyzenAI};

if (search_->GetSequenceLength() != 0 &&
std::none_of(devices_supporting_continuous_decoding.begin(), devices_supporting_continuous_decoding.end(),
Expand Down
4 changes: 2 additions & 2 deletions src/models/logits.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ Logits::Logits(State& state)

input_sequence_lengths.resize(state_.params_->search.batch_size);

if (IsOpenVINOStatefulModel(state.model_)) {
// In the case of OpenVINO stateful models, they are patched in a way so that they only return the
if (IsOpenVINOStatefulModel(state.model_) || state_.model_.p_device_->GetType() == DeviceType::RyzenAI) {
// In the case of OpenVINO stateful models or RyzenAI models, they are patched in a way so that they only return the
Comment thread
baijumeswani marked this conversation as resolved.
// sliced logits needed for sampling. For example, given 43 prompt tokens, instead of returning
// logits of the shape: [1,43,<vocab_size>]
// they will have shape: [1, 1,<vocab_size>].
Expand Down
12 changes: 10 additions & 2 deletions src/models/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "qwen2_5_vl_image_processor.h"
#include "../dml/interface.h"
#include "../openvino/interface.h"
#include "../ryzenai/interface.h"

#if defined(_WIN32)
#include <direct.h>
Expand Down Expand Up @@ -653,6 +654,12 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options,
} else if (provider_options.name == "OpenVINO") {
p_device = GetDeviceInterface(DeviceType::OpenVINO);
OpenVINO_AppendProviderOptions(session_options, config, provider_options);
} else if (provider_options.name == "RyzenAI") {
p_device = GetDeviceInterface(DeviceType::RyzenAI);

session_options.AddConfigEntry("model_root", config.config_path.string().c_str());

GetRyzenAIInterface()->SetupProvider(session_options, provider_options.options);
} else {
// For providers that go through the extensible AppendExecutionProvider API:
if (provider_options.name == "QNN") {
Expand Down Expand Up @@ -810,7 +817,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device, const Config& config, std::uni
// This ensures memory allocated on-device for model inputs/outputs is valid for the lifetime of GenAI.

// Names for the device types used by 'SetProviderSessionOptions'
static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)", "NvTensorRtRtx"};
static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)", "NvTensorRtRtx", "RyzenAI"};
static_assert(std::size(device_type_names) == static_cast<size_t>(DeviceType::MAX));

// Create an OrtSessionOptions and set the options to use the DeviceType we're using here
Expand All @@ -829,7 +836,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device, const Config& config, std::uni
allocator.session_ = OrtSession::Create(GetOrtEnv(), g_trivial_model, sizeof(g_trivial_model), session_options.get());

// Names for the device memory types used by 'OrtMemoryInfo::Create'
static const char* device_memory_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared", "OpenVINO (Not used, see above)", "Cuda"};
static const char* device_memory_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared", "OpenVINO (Not used, see above)", "Cuda", "Cpu"};
Comment thread
baijumeswani marked this conversation as resolved.
static_assert(std::size(device_memory_type_names) == static_cast<size_t>(DeviceType::MAX));

// Get the allocator from the OrtSession for the DeviceType (it's called 'AllocatorCreate' but it's really 'AllocatorGet')
Expand Down Expand Up @@ -914,6 +921,7 @@ Model::Model(std::unique_ptr<Config> config) : config_{std::move(config)} {
// Only CUDA, TRT-RTX, DML and RyzenAI place every input on the device
// For WebGPU, use device memory only if graph capture is enabled, otherwise use CPU
if (p_device_->GetType() == DeviceType::CUDA || p_device_->GetType() == DeviceType::DML || p_device_->GetType() == DeviceType::NvTensorRtRtx ||
p_device_->GetType() == DeviceType::RyzenAI ||
Comment thread
baijumeswani marked this conversation as resolved.
(p_device_->GetType() == DeviceType::WEBGPU && IsGraphCaptureEnabled(config_->model.decoder.session_options)))
p_device_inputs_ = p_device_;
else
Expand Down
184 changes: 184 additions & 0 deletions src/ryzenai/interface.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
#include "../generators.h"
#include "../search.h"
#include "interface.h"
#include <filesystem>
#include <mutex>
#include <span>

#if !defined(_WIN32)
#include <dlfcn.h>
#endif

namespace Generators {
namespace RyzenAI {

// Name of the environment variable read in the Interface constructor; when it is
// non-empty its value is used as the path of the execution provider library.
static constexpr auto ep_path_env_key_ = "RYZENAI_EP_PATH";
// Name under which the provider library is registered with ONNX Runtime and
// against which each device's EP name is compared in SetupProvider.
static constexpr auto ep_name_ = "RyzenAILightExecutionProvider";
// Platform-specific file name of the execution provider shared library.
#if defined(_WIN32)
static constexpr auto ep_filename_ = "onnxruntime_providers_ryzenai.dll";
#else
static constexpr auto ep_filename_ = "onnxruntime_providers_ryzenai.so";
#endif
// Function name handed to RegisterCustomOpsUsingFunction in SetupProvider.
static constexpr auto func_custom_ops_ = "RyzenAI_RegisterCustomOps";
// Exported function looked up in the provider library and called from ~Interface (Windows only).
static constexpr auto func_shutdown_ = "RyzenAI_Shutdown";

// Allocator stored by Interface::InitOrt and used by Memory to allocate and free
// buffers. Null until InitOrt has been called.
static Ort::Allocator* ort_allocator_{};

// Buffer type for the RyzenAI device. The CPU and device pointers always alias the
// same memory (p_cpu_ == p_device_), so the CPU<->device transfer hooks are no-ops.
struct Memory : DeviceBuffer {
  // Allocates `size` bytes from the allocator registered through InitOrt and owns them.
  // InitOrt must have run first; ort_allocator_ is dereferenced unconditionally.
  explicit Memory(size_t size) : owned_{true} {
    size_in_bytes_ = size;
    p_cpu_ = p_device_ = static_cast<uint8_t*>(ort_allocator_->Alloc(size_in_bytes_));
  }

  // Wraps `size` bytes starting at `p` without taking ownership; the destructor leaves them alone.
  Memory(void* p, size_t size) : owned_{false} {
    size_in_bytes_ = size;
    p_cpu_ = p_device_ = static_cast<uint8_t*>(p);
  }

  // An owning instance frees its allocation in the destructor, so a copy would
  // free the same pointer twice. Instances are only created via std::make_shared.
  Memory(const Memory&) = delete;
  Memory& operator=(const Memory&) = delete;

  // Returns the allocation to the allocator, but only if this instance made it.
  ~Memory() override {
    if (owned_)
      ort_allocator_->Free(p_device_);
  }

  const char* GetType() const override { return "RyzenAI"; }

  // Nothing to do for these three: p_cpu_ and p_device_ point at the same bytes.
  void AllocateCpu() override {}
  void CopyDeviceToCpu() override {}
  void CopyCpuToDevice() override {}

  // Copies `size_in_bytes` bytes from `source` (starting at `begin_source`) into this
  // buffer (starting at `begin_dest`) by delegating to CopyThroughCpu.
  void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override {
    CopyThroughCpu(*this, begin_dest, source, begin_source, size_in_bytes);
  }

  // Fills the whole buffer with zero bytes.
  void Zero() override {
    memset(p_device_, 0, size_in_bytes_);
  }

  bool owned_;  // true when the destructor must free p_device_
};

struct Interface : RyzenAIInterface {
Interface() {
// If already loaded then nothing to do
#if defined(_WIN32)
if (GetModuleHandleA(ep_filename_))
return;
#else
if (dlopen(ep_filename_, RTLD_NOLOAD | RTLD_NOW))
return;
#endif

std::error_code ec;

ep_path_ = GetEnv(ep_path_env_key_);

#if defined(_WIN32)
if (ep_path_.empty()) {
wchar_t buffer[MAX_PATH + 1] = {0};
const auto len = sizeof(buffer) / sizeof(buffer[0]);

if (MEMORY_BASIC_INFORMATION mbi; VirtualQuery(Ort::api->RegisterExecutionProviderLibrary, &mbi, sizeof(mbi)))
if (HMODULE mod = (HMODULE)mbi.AllocationBase; GetModuleFileNameW(mod, buffer, len))
if (const auto dir = std::filesystem::path{buffer}.remove_filename(); !dir.empty())
if (auto path = dir / ep_filename_; std::filesystem::exists(path, ec))
ep_path_ = std::move(path);
}
#endif // _WIN32

if (ep_path_.empty())
ep_path_ = std::filesystem::current_path(ec) / ep_filename_;

Ort::ThrowOnError(Ort::api->RegisterExecutionProviderLibrary(GetOrtGlobals()->env_.get(), ep_name_, ep_path_.native().c_str()));
}

~Interface() {
// TODO: make it linux compatible
#if defined(_WIN32)
if (const auto mod = GetModuleHandleA(ep_filename_))
if (const auto func = reinterpret_cast<void (*)()>(GetProcAddress(mod, func_shutdown_)))
func();
#endif // _WIN32
}

void SetupProvider(OrtSessionOptions& session_options, const ProviderOptions& provider_options) override {
std::vector<const OrtEpDevice*> supported_devices;

{
const OrtEpDevice* const* devices = nullptr;
size_t ndevices = 0;

Ort::ThrowOnError(Ort::api->GetEpDevices(&GetOrtEnv(), &devices, &ndevices));

for (const auto& device : std::span{devices, devices + ndevices})

Check failure on line 112 in src/ryzenai/interface.cpp

View workflow job for this annotation

GitHub Actions / windows-cuda-x64-build

syntax error: missing ';' before ':'

Check failure on line 112 in src/ryzenai/interface.cpp

View workflow job for this annotation

GitHub Actions / windows-cuda-x64-build

'device': a symbol whose type contains 'auto' must have an initializer

Check failure on line 112 in src/ryzenai/interface.cpp

View workflow job for this annotation

GitHub Actions / windows-cuda-x64-build

'device': references must be initialized

Check failure on line 112 in src/ryzenai/interface.cpp

View workflow job for this annotation

GitHub Actions / windows-cuda-x64-build

'std::generators_span::span<T> std::generators_span::span(std::generators_span::span<T>)': expects 1 arguments - 2 provided

Check failure on line 112 in src/ryzenai/interface.cpp

View workflow job for this annotation

GitHub Actions / windows-cuda-x64-build

'std::generators_span::span<T> std::generators_span::span(void)': expects 0 arguments - 2 provided

Check failure on line 112 in src/ryzenai/interface.cpp

View workflow job for this annotation

GitHub Actions / windows-cuda-x64-build

'initializing': cannot convert from 'const OrtEpDevice *const *' to 'size_t'

Check failure on line 112 in src/ryzenai/interface.cpp

View workflow job for this annotation

GitHub Actions / windows-cuda-x64-build

'std::generators_span::span<T> std::generators_span::span(const std::generators_span::span<T> &) noexcept': expects 1 arguments - 2 provided

Check failure on line 112 in src/ryzenai/interface.cpp

View workflow job for this annotation

GitHub Actions / windows-cuda-x64-build

'std::generators_span::span<T> std::generators_span::span(std::vector<T,std::allocator<_Ty>> &) noexcept': expects 1 arguments - 2 provided

Check failure on line 112 in src/ryzenai/interface.cpp

View workflow job for this annotation

GitHub Actions / windows-cuda-x64-build

'std::generators_span::span<T> std::generators_span::span(std::array<_Ty,_Size> &) noexcept': expects 1 arguments - 2 provided

Check failure on line 112 in src/ryzenai/interface.cpp

View workflow job for this annotation

GitHub Actions / windows-cuda-x64-build

cannot deduce template arguments for 'std::generators_span::span'
if (std::string_view{ep_name_} == Ort::api->EpDevice_EpName(device) &&
OrtHardwareDeviceType_NPU == Ort::api->HardwareDevice_Type(Ort::api->EpDevice_Device(device)))
supported_devices.push_back(device);
}

if (supported_devices.empty())
throw std::runtime_error{"No RyzenAI devices detected"};

{
std::vector<const char*> ep_keys, ep_values;

for (auto& option : provider_options) {
ep_keys.emplace_back(option.first.c_str());
ep_values.emplace_back(option.second.c_str());
}

// this call merges provider_options into session_options
Ort::ThrowOnError(Ort::api->SessionOptionsAppendExecutionProvider_V2(&session_options,
&GetOrtEnv(), supported_devices.data(), supported_devices.size(),
ep_keys.data(), ep_values.data(), ep_keys.size()));
}

Ort::ThrowOnError(Ort::api->RegisterCustomOpsUsingFunction(&session_options, func_custom_ops_));
}

DeviceType GetType() const override { return DeviceType::RyzenAI; }

void InitOrt(const OrtApi& /*api*/, Ort::Allocator& allocator) override {
assert(!ort_allocator_);
ort_allocator_ = &allocator;
}

Ort::Allocator& GetAllocator() override {
return *ort_allocator_;
}

std::shared_ptr<DeviceBuffer> AllocateBase(size_t size) override {
return std::make_shared<Memory>(size);
}

std::shared_ptr<DeviceBuffer> WrapMemoryBase(void* p, size_t size) override {
return std::make_shared<Memory>(p, size);
}

std::unique_ptr<Search> CreateGreedy(const GeneratorParams& params) override { return std::make_unique<GreedySearch_Cpu>(params); }
std::unique_ptr<Search> CreateBeam(const GeneratorParams& params) override { return std::make_unique<BeamSearch_Cpu>(params); }

void Synchronize() override {}

private:
std::filesystem::path ep_path_;
};

static std::unique_ptr<Interface> interface_;

} // namespace RyzenAI

void RyzenAIInterface::Shutdown() {
Comment thread
baijumeswani marked this conversation as resolved.
RyzenAI::interface_.reset();
}

// Returns the process-wide RyzenAI interface, constructing it on the first call.
// Construction is attempted through std::call_once, so once it has succeeded it is
// never repeated: after RyzenAIInterface::Shutdown() this returns nullptr.
RyzenAIInterface* GetRyzenAIInterface() {
  static std::once_flag init_flag;

  std::call_once(init_flag, [] { RyzenAI::interface_ = std::make_unique<RyzenAI::Interface>(); });

  return RyzenAI::interface_.get();
}

} // namespace Generators
15 changes: 15 additions & 0 deletions src/ryzenai/interface.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#pragma once

namespace Generators {

// Device interface for the RyzenAI execution provider. Extends DeviceInterface with
// a provider-setup hook and a static shutdown entry point.
struct RyzenAIInterface : DeviceInterface {
// Provider options as a list of string pairs (used as key/value by the implementation).
using ProviderOptions = std::vector<std::pair<std::string, std::string>>;

// Applies the provider configuration to the given session options.
// Pure virtual: supplied by the concrete interface implementation.
virtual void SetupProvider(OrtSessionOptions&, const ProviderOptions&) = 0;

// Static shutdown entry point; declared here and defined out of line.
static void Shutdown();
};

RyzenAIInterface* GetRyzenAIInterface();

} // namespace Generators
1 change: 1 addition & 0 deletions src/smartptrs.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ enum struct DeviceType {
QNN,
OpenVINO,
NvTensorRtRtx,
RyzenAI,
MAX
};

Expand Down
Loading