Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmake/global_variables.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ file(GLOB generator_srcs CONFIGURE_DEPENDS
"${GENERATORS_ROOT}/webgpu/*.cpp"
"${GENERATORS_ROOT}/openvino/*.h"
"${GENERATORS_ROOT}/openvino/*.cpp"
"${GENERATORS_ROOT}/ryzenai/*.h"
"${GENERATORS_ROOT}/ryzenai/*.cpp"
"${MODELS_ROOT}/*.h"
"${MODELS_ROOT}/*.cpp"
"${ENGINE_ROOT}/*.h"
Expand Down
10 changes: 9 additions & 1 deletion src/generators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "qnn/interface.h"
#include "webgpu/interface.h"
#include "openvino/interface.h"
#include "ryzenai/interface.h"
#include "engine/engine.h"

#if defined(_WIN32)
Expand Down Expand Up @@ -94,6 +95,8 @@ void Shutdown() {
}

GetOrtGlobals().reset(); // Delete now because on process exit is too late

RyzenAIInterface::Shutdown();
}

OrtEnv& GetOrtEnv() {
Expand Down Expand Up @@ -224,6 +227,8 @@ std::string to_string(DeviceType device_type) {
return "OpenVINO";
case DeviceType::NvTensorRtRtx:
return "NvTensorRtRtx";
case DeviceType::RyzenAI:
return "RyzenAI";
default:
throw std::runtime_error("Unknown device type");
}
Expand All @@ -247,6 +252,8 @@ DeviceInterface* GetDeviceInterface(DeviceType type) {
return GetQNNInterface();
case DeviceType::OpenVINO:
return GetOpenVINOInterface();
case DeviceType::RyzenAI:
return GetRyzenAIInterface();
}
}

Expand Down Expand Up @@ -358,7 +365,8 @@ void Generator::AppendTokens(cpu_span<const int32_t> input_ids) {
DeviceType::CUDA,
DeviceType::WEBGPU,
DeviceType::OpenVINO,
DeviceType::NvTensorRtRtx};
DeviceType::NvTensorRtRtx,
DeviceType::RyzenAI};

if (search_->GetSequenceLength() != 0 &&
std::none_of(devices_supporting_continuous_decoding.begin(), devices_supporting_continuous_decoding.end(),
Expand Down
4 changes: 2 additions & 2 deletions src/models/logits.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ Logits::Logits(State& state)

input_sequence_lengths.resize(state_.params_->search.batch_size);

if (IsOpenVINOStatefulModel(state.model_)) {
// In the case of OpenVINO stateful models, they are patched in a way so that they only return the
if (IsOpenVINOStatefulModel(state.model_) || state_.model_.p_device_->GetType() == DeviceType::RyzenAI) {
// In the case of OpenVINO stateful models or RyzenAI models, they are patched in a way so that they only return the
Comment thread
baijumeswani marked this conversation as resolved.
// sliced logits needed for sampling. For example, given 43 prompt tokens, instead of returning
// logits of the shape: [1,43,<vocab_size>]
// they will have shape: [1, 1,<vocab_size>].
Expand Down
14 changes: 11 additions & 3 deletions src/models/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "qwen2_5_vl_image_processor.h"
#include "../dml/interface.h"
#include "../openvino/interface.h"
#include "../ryzenai/interface.h"

#if defined(_WIN32)
#include <direct.h>
Expand Down Expand Up @@ -653,6 +654,12 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options,
} else if (provider_options.name == "OpenVINO") {
p_device = GetDeviceInterface(DeviceType::OpenVINO);
OpenVINO_AppendProviderOptions(session_options, config, provider_options);
} else if (provider_options.name == "RyzenAI") {
p_device = GetDeviceInterface(DeviceType::RyzenAI);

session_options.AddConfigEntry("model_root", config.config_path.string().c_str());

GetRyzenAIInterface()->SetupProvider(session_options, provider_options.options);
} else {
// For providers that go through the extensible AppendExecutionProvider API:
if (provider_options.name == "QNN") {
Expand Down Expand Up @@ -810,7 +817,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device, const Config& config, std::uni
// This ensures memory allocated on-device for model inputs/outputs is valid for the lifetime of GenAI.

// Names for the device types used by 'SetProviderSessionOptions'
static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)", "NvTensorRtRtx"};
static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)", "NvTensorRtRtx", "RyzenAI"};
static_assert(std::size(device_type_names) == static_cast<size_t>(DeviceType::MAX));

// Create an OrtSessionOptions and set the options to use the DeviceType we're using here
Expand All @@ -829,7 +836,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device, const Config& config, std::uni
allocator.session_ = OrtSession::Create(GetOrtEnv(), g_trivial_model, sizeof(g_trivial_model), session_options.get());

// Names for the device memory types used by 'OrtMemoryInfo::Create'
static const char* device_memory_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared", "OpenVINO (Not used, see above)", "Cuda"};
static const char* device_memory_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared", "OpenVINO (Not used, see above)", "Cuda", "Cpu"};
Comment thread
baijumeswani marked this conversation as resolved.
static_assert(std::size(device_memory_type_names) == static_cast<size_t>(DeviceType::MAX));

// Get the allocator from the OrtSession for the DeviceType (it's called 'AllocatorCreate' but it's really 'AllocatorGet')
Expand Down Expand Up @@ -911,9 +918,10 @@ Model::Model(std::unique_ptr<Config> config) : config_{std::move(config)} {
CreateSessionOptions();
EnsureDeviceOrtInit(*p_device_, *config_, arena_cfg_);

// Only CUDA, TRT-RTX and DML does every input on the device
// Only CUDA, TRT-RTX, RyzenAI and DML do every input on the device
// For WebGPU, use device memory only if graph capture is enabled, otherwise use CPU
if (p_device_->GetType() == DeviceType::CUDA || p_device_->GetType() == DeviceType::DML || p_device_->GetType() == DeviceType::NvTensorRtRtx ||
p_device_->GetType() == DeviceType::RyzenAI ||
Comment thread
baijumeswani marked this conversation as resolved.
(p_device_->GetType() == DeviceType::WEBGPU && IsGraphCaptureEnabled(config_->model.decoder.session_options)))
p_device_inputs_ = p_device_;
else
Expand Down
210 changes: 210 additions & 0 deletions src/ryzenai/interface.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
#include "../generators.h"
#include "../search.h"
#include "interface.h"
#include <filesystem>
#include <mutex>
#include <span>

#if !defined(_WIN32)
#include <dlfcn.h>
#endif

namespace Generators {
namespace RyzenAI {

// Environment variable that may hold the path of the RyzenAI execution provider library.
static constexpr const char* ep_path_env_key_ = "RYZENAI_EP_PATH";

// Name used when registering the execution provider library and when matching EP devices.
static constexpr const char* ep_name_ = "RyzenAILightExecutionProvider";

// Platform-specific file name of the execution provider library.
#ifdef _WIN32
static constexpr const char* ep_filename_ = "onnxruntime_providers_ryzenai.dll";
#else
static constexpr const char* ep_filename_ = "onnxruntime_providers_ryzenai.so";
#endif

// Function name handed to RegisterCustomOpsUsingFunction when a session is set up.
static constexpr const char* func_custom_ops_ = "RyzenAI_RegisterCustomOps";

// Function looked up in the loaded provider module and called when the interface is destroyed.
static constexpr const char* func_shutdown_ = "RyzenAI_Shutdown";

// Allocator used by Memory to allocate and free its buffers; assigned in Interface::InitOrt()
static Ort::Allocator* ort_allocator_{};

// DeviceBuffer whose storage comes from ort_allocator_ (or is borrowed from the caller).
// p_cpu_ and p_device_ always alias the same pointer, so the CPU<->device copy hooks are no-ops.
struct Memory : DeviceBuffer {
  // Allocates `size` bytes from ort_allocator_; the allocation is released in the destructor.
  explicit Memory(size_t size) : owned_{true} {
    size_in_bytes_ = size;
    p_cpu_ = p_device_ = static_cast<uint8_t*>(ort_allocator_->Alloc(size_in_bytes_));
  }

  // Wraps `size` bytes at `p` without taking ownership; the memory is never freed by this object.
  Memory(void* p, size_t size) : owned_{false} {
    size_in_bytes_ = size;
    p_cpu_ = p_device_ = static_cast<uint8_t*>(p);
  }

  // An owning instance frees its buffer on destruction, so a copy would free it twice.
  Memory(const Memory&) = delete;
  Memory& operator=(const Memory&) = delete;

  ~Memory() override {
    if (owned_)
      ort_allocator_->Free(p_device_);
  }

  const char* GetType() const override { return "RyzenAI"; }

  // Nothing to allocate or transfer: the CPU and device pointers are the same memory
  void AllocateCpu() override {}
  void CopyDeviceToCpu() override {}
  void CopyCpuToDevice() override {}

  // Copies `size_in_bytes` bytes from `source` (starting at `begin_source`) into this buffer at `begin_dest`
  void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override {
    CopyThroughCpu(*this, begin_dest, source, begin_source, size_in_bytes);
  }

  // Fills the whole buffer with zero bytes
  void Zero() override {
    memset(p_device_, 0, size_in_bytes_);
  }

  bool owned_;  // true when p_device_ was obtained from ort_allocator_ and must be freed here
};

struct Interface : RyzenAIInterface {
Interface() {
// If already loaded then nothing to do
#if defined(_WIN32)
if (GetModuleHandleA(ep_filename_))
return;
#else
if (dlopen(ep_filename_, RTLD_NOLOAD | RTLD_NOW))
return;
#endif

std::error_code ec;

ep_path_ = GetEnv(ep_path_env_key_);

#if defined(_WIN32)
const auto get_hmod_for_method = [](LPCVOID func) -> HMODULE {
MEMORY_BASIC_INFORMATION mbi;

if (VirtualQuery(func, &mbi, sizeof(mbi)) && mbi.AllocationBase)
return (HMODULE)mbi.AllocationBase;

return nullptr;
};

const auto find_next_to_module = [&](HMODULE hmod) -> std::filesystem::path {
wchar_t buffer[MAX_PATH + 1] = {0};
const auto len = sizeof(buffer) / sizeof(buffer[0]);

if (GetModuleFileNameW(hmod, buffer, len))
if (const auto dir = std::filesystem::path{buffer}.remove_filename(); !dir.empty())
if (auto path = dir / ep_filename_; std::filesystem::exists(path, ec))
return path;

return {};
};

if (ep_path_.empty())
// check next to onnxruntime-genai.dll
if (const auto hmod = get_hmod_for_method(GetRyzenAIInterface))
ep_path_ = find_next_to_module(hmod);

if (ep_path_.empty())
// check next to onnxruntime.dll
if (const auto hmod = get_hmod_for_method(Ort::api->RegisterExecutionProviderLibrary))
ep_path_ = find_next_to_module(hmod);

if (ep_path_.empty())
// check next to current executable
if (const auto hmod = GetModuleHandleA(NULL))
ep_path_ = find_next_to_module(hmod);
#endif // _WIN32

if (ep_path_.empty())
// fallback to current working directory
ep_path_ = std::filesystem::current_path(ec) / ep_filename_;

Ort::ThrowOnError(Ort::api->RegisterExecutionProviderLibrary(GetOrtGlobals()->env_.get(), ep_name_, ep_path_.native().c_str()));
}

~Interface() {
// TODO: make it linux compatible
#if defined(_WIN32)
if (const auto mod = GetModuleHandleA(ep_filename_))
if (const auto func = reinterpret_cast<void (*)()>(GetProcAddress(mod, func_shutdown_)))
func();
#endif // _WIN32
}

void SetupProvider(OrtSessionOptions& session_options, const ProviderOptions& provider_options) override {
std::vector<const OrtEpDevice*> supported_devices;

{
const OrtEpDevice* const* devices = nullptr;
size_t ndevices = 0;

Ort::ThrowOnError(Ort::api->GetEpDevices(&GetOrtEnv(), &devices, &ndevices));

for (const auto& device : std::span{devices, ndevices})
if (std::string_view{ep_name_} == Ort::api->EpDevice_EpName(device) &&
OrtHardwareDeviceType_NPU == Ort::api->HardwareDevice_Type(Ort::api->EpDevice_Device(device)))
supported_devices.push_back(device);
}

if (supported_devices.empty())
throw std::runtime_error{"No RyzenAI devices detected"};

{
std::vector<const char*> ep_keys, ep_values;

for (auto& option : provider_options) {
ep_keys.emplace_back(option.first.c_str());
ep_values.emplace_back(option.second.c_str());
}

// this call merges provider_options into session_options
Ort::ThrowOnError(Ort::api->SessionOptionsAppendExecutionProvider_V2(&session_options,
&GetOrtEnv(), supported_devices.data(), supported_devices.size(),
ep_keys.data(), ep_values.data(), ep_keys.size()));
}

Ort::ThrowOnError(Ort::api->RegisterCustomOpsUsingFunction(&session_options, func_custom_ops_));
}

DeviceType GetType() const override { return DeviceType::RyzenAI; }

void InitOrt(const OrtApi& /*api*/, Ort::Allocator& allocator) override {
assert(!ort_allocator_);
ort_allocator_ = &allocator;
}

Ort::Allocator& GetAllocator() override {
return *ort_allocator_;
}

std::shared_ptr<DeviceBuffer> AllocateBase(size_t size) override {
return std::make_shared<Memory>(size);
}

std::shared_ptr<DeviceBuffer> WrapMemoryBase(void* p, size_t size) override {
return std::make_shared<Memory>(p, size);
}

std::unique_ptr<Search> CreateGreedy(const GeneratorParams& params) override { return std::make_unique<GreedySearch_Cpu>(params); }
std::unique_ptr<Search> CreateBeam(const GeneratorParams& params) override { return std::make_unique<BeamSearch_Cpu>(params); }

void Synchronize() override {}

private:
std::filesystem::path ep_path_;
};

// Single Interface instance: created by GetRyzenAIInterface(), destroyed by RyzenAIInterface::Shutdown()
static std::unique_ptr<Interface> interface_;

} // namespace RyzenAI

void RyzenAIInterface::Shutdown() {
Comment thread
baijumeswani marked this conversation as resolved.
RyzenAI::interface_.reset();
}

// Returns the RyzenAI interface instance, constructing it on the first call.
// Construction is guarded by std::call_once and therefore happens at most once; once
// RyzenAIInterface::Shutdown() has destroyed a created instance, this returns nullptr.
RyzenAIInterface* GetRyzenAIInterface() {
  static std::once_flag created_flag;

  const auto create_instance = [] { RyzenAI::interface_ = std::make_unique<RyzenAI::Interface>(); };
  std::call_once(created_flag, create_instance);

  return RyzenAI::interface_.get();
}

} // namespace Generators
16 changes: 16 additions & 0 deletions src/ryzenai/interface.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#pragma once

namespace Generators {

// Device interface for the RyzenAI execution provider.
// Note: memory allocated through RyzenAI interface is host/cpu accessible
struct RyzenAIInterface : DeviceInterface {
  // Provider options as (key, value) string pairs
  using ProviderOptions = std::vector<std::pair<std::string, std::string>>;

  // Adds the RyzenAI execution provider to `session_options`, configured with `provider_options`
  virtual void SetupProvider(OrtSessionOptions& session_options, const ProviderOptions& provider_options) = 0;

  // Destroys the instance handed out by GetRyzenAIInterface()
  static void Shutdown();
};

// Returns the RyzenAI interface instance, creating it on the first call
RyzenAIInterface* GetRyzenAIInterface();

} // namespace Generators
1 change: 1 addition & 0 deletions src/smartptrs.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ enum struct DeviceType {
QNN,
OpenVINO,
NvTensorRtRtx,
RyzenAI,
MAX
};

Expand Down
Loading