diff --git a/cmake/global_variables.cmake b/cmake/global_variables.cmake index 76ce188336..5ab359139f 100644 --- a/cmake/global_variables.cmake +++ b/cmake/global_variables.cmake @@ -78,6 +78,8 @@ file(GLOB generator_srcs CONFIGURE_DEPENDS "${GENERATORS_ROOT}/webgpu/*.cpp" "${GENERATORS_ROOT}/openvino/*.h" "${GENERATORS_ROOT}/openvino/*.cpp" + "${GENERATORS_ROOT}/ryzenai/*.h" + "${GENERATORS_ROOT}/ryzenai/*.cpp" "${MODELS_ROOT}/*.h" "${MODELS_ROOT}/*.cpp" "${ENGINE_ROOT}/*.h" diff --git a/src/generators.cpp b/src/generators.cpp index d19751217c..cf24ea200c 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -15,6 +15,7 @@ #include "qnn/interface.h" #include "webgpu/interface.h" #include "openvino/interface.h" +#include "ryzenai/interface.h" #include "engine/engine.h" #if defined(_WIN32) @@ -94,6 +95,8 @@ void Shutdown() { } GetOrtGlobals().reset(); // Delete now because on process exit is too late + + RyzenAIInterface::Shutdown(); } OrtEnv& GetOrtEnv() { @@ -224,6 +227,8 @@ std::string to_string(DeviceType device_type) { return "OpenVINO"; case DeviceType::NvTensorRtRtx: return "NvTensorRtRtx"; + case DeviceType::RyzenAI: + return "RyzenAI"; default: throw std::runtime_error("Unknown device type"); } @@ -247,6 +252,8 @@ DeviceInterface* GetDeviceInterface(DeviceType type) { return GetQNNInterface(); case DeviceType::OpenVINO: return GetOpenVINOInterface(); + case DeviceType::RyzenAI: + return GetRyzenAIInterface(); } } @@ -358,7 +365,8 @@ void Generator::AppendTokens(cpu_span input_ids) { DeviceType::CUDA, DeviceType::WEBGPU, DeviceType::OpenVINO, - DeviceType::NvTensorRtRtx}; + DeviceType::NvTensorRtRtx, + DeviceType::RyzenAI}; if (search_->GetSequenceLength() != 0 && std::none_of(devices_supporting_continuous_decoding.begin(), devices_supporting_continuous_decoding.end(), diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 0ff2db1f0a..522ae711b0 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -15,8 +15,8 @@ 
Logits::Logits(State& state) input_sequence_lengths.resize(state_.params_->search.batch_size); - if (IsOpenVINOStatefulModel(state.model_)) { - // In the case of OpenVINO stateful models, they are patched in a way so that they only return the + if (IsOpenVINOStatefulModel(state.model_) || state_.model_.p_device_->GetType() == DeviceType::RyzenAI) { + // In the case of OpenVINO stateful models or RyzenAI models, they are patched in a way so that they only return the // sliced logits needed for sampling. For example, given 43 prompt tokens, instead of returning // logits of the shape: [1,43,] // they will have shape: [1, 1,]. diff --git a/src/models/model.cpp b/src/models/model.cpp index c4980b255b..517a9b9487 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -23,6 +23,7 @@ #include "qwen2_5_vl_image_processor.h" #include "../dml/interface.h" #include "../openvino/interface.h" +#include "../ryzenai/interface.h" #if defined(_WIN32) #include @@ -653,6 +654,12 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options, } else if (provider_options.name == "OpenVINO") { p_device = GetDeviceInterface(DeviceType::OpenVINO); OpenVINO_AppendProviderOptions(session_options, config, provider_options); + } else if (provider_options.name == "RyzenAI") { + p_device = GetDeviceInterface(DeviceType::RyzenAI); + + session_options.AddConfigEntry("model_root", config.config_path.string().c_str()); + + GetRyzenAIInterface()->SetupProvider(session_options, provider_options.options); } else { // For providers that go through the extensible AppendExecutionProvider API: if (provider_options.name == "QNN") { @@ -810,7 +817,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device, const Config& config, std::uni // This ensures memory allocated on-device for model inputs/outputs is valid for the lifetime of GenAI. 
// Names for the device types used by 'SetProviderSessionOptions' - static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)", "NvTensorRtRtx"}; + static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)", "NvTensorRtRtx", "RyzenAI"}; static_assert(std::size(device_type_names) == static_cast(DeviceType::MAX)); // Create an OrtSessionOptions and set the options to use the DeviceType we're using here @@ -829,7 +836,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device, const Config& config, std::uni allocator.session_ = OrtSession::Create(GetOrtEnv(), g_trivial_model, sizeof(g_trivial_model), session_options.get()); // Names for the device memory types used by 'OrtMemoryInfo::Create' - static const char* device_memory_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared", "OpenVINO (Not used, see above)", "Cuda"}; + static const char* device_memory_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared", "OpenVINO (Not used, see above)", "Cuda", "Cpu"}; static_assert(std::size(device_memory_type_names) == static_cast(DeviceType::MAX)); // Get the allocator from the OrtSession for the DeviceType (it's called 'AllocatorCreate' but it's really 'AllocatorGet') @@ -911,9 +918,10 @@ Model::Model(std::unique_ptr config) : config_{std::move(config)} { CreateSessionOptions(); EnsureDeviceOrtInit(*p_device_, *config_, arena_cfg_); - // Only CUDA, TRT-RTX and DML does every input on the device + // Only CUDA, TRT-RTX, RyzenAI and DML do every input on the device // For WebGPU, use device memory only if graph capture is enabled, otherwise use CPU if (p_device_->GetType() == DeviceType::CUDA || p_device_->GetType() == DeviceType::DML || p_device_->GetType() == DeviceType::NvTensorRtRtx || + p_device_->GetType() == DeviceType::RyzenAI || 
(p_device_->GetType() == DeviceType::WEBGPU && IsGraphCaptureEnabled(config_->model.decoder.session_options))) p_device_inputs_ = p_device_; else diff --git a/src/ryzenai/interface.cpp b/src/ryzenai/interface.cpp new file mode 100644 index 0000000000..837f597428 --- /dev/null +++ b/src/ryzenai/interface.cpp @@ -0,0 +1,210 @@ +#include "../generators.h" +#include "../search.h" +#include "interface.h" +#include +#include +#include + +#if !defined(_WIN32) +#include +#endif + +namespace Generators { +namespace RyzenAI { + +static constexpr auto ep_path_env_key_ = "RYZENAI_EP_PATH"; +static constexpr auto ep_name_ = "RyzenAILightExecutionProvider"; +#if defined(_WIN32) +static constexpr auto ep_filename_ = "onnxruntime_providers_ryzenai.dll"; +#else +static constexpr auto ep_filename_ = "onnxruntime_providers_ryzenai.so"; +#endif +static constexpr auto func_custom_ops_ = "RyzenAI_RegisterCustomOps"; +static constexpr auto func_shutdown_ = "RyzenAI_Shutdown"; + +static Ort::Allocator* ort_allocator_{}; + +struct Memory : DeviceBuffer { + Memory(size_t size) : owned_{true} { + size_in_bytes_ = size; + p_cpu_ = p_device_ = static_cast(ort_allocator_->Alloc(size_in_bytes_)); + } + + Memory(void* p, size_t size) : owned_{false} { + size_in_bytes_ = size; + p_cpu_ = p_device_ = static_cast(p); + } + + ~Memory() override { + if (owned_) + ort_allocator_->Free(p_device_); + } + + const char* GetType() const override { return "RyzenAI"; } + + void AllocateCpu() override {} + void CopyDeviceToCpu() override {} + void CopyCpuToDevice() override {} + + void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override { + CopyThroughCpu(*this, begin_dest, source, begin_source, size_in_bytes); + } + + void Zero() override { + memset(p_device_, 0, size_in_bytes_); + } + + bool owned_; +}; + +struct Interface : RyzenAIInterface { + Interface() { + // If already loaded then nothing to do +#if defined(_WIN32) + if (GetModuleHandleA(ep_filename_)) 
+ return; +#else + if (dlopen(ep_filename_, RTLD_NOLOAD | RTLD_NOW)) + return; +#endif + + std::error_code ec; + + ep_path_ = GetEnv(ep_path_env_key_); + +#if defined(_WIN32) + const auto get_hmod_for_method = [](LPCVOID func) -> HMODULE { + MEMORY_BASIC_INFORMATION mbi; + + if (VirtualQuery(func, &mbi, sizeof(mbi)) && mbi.AllocationBase) + return (HMODULE)mbi.AllocationBase; + + return nullptr; + }; + + const auto find_next_to_module = [&](HMODULE hmod) -> std::filesystem::path { + wchar_t buffer[MAX_PATH + 1] = {0}; + const auto len = sizeof(buffer) / sizeof(buffer[0]); + + if (GetModuleFileNameW(hmod, buffer, len)) + if (const auto dir = std::filesystem::path{buffer}.remove_filename(); !dir.empty()) + if (auto path = dir / ep_filename_; std::filesystem::exists(path, ec)) + return path; + + return {}; + }; + + if (ep_path_.empty()) + // check next to onnxruntime-genai.dll + if (const auto hmod = get_hmod_for_method(GetRyzenAIInterface)) + ep_path_ = find_next_to_module(hmod); + + if (ep_path_.empty()) + // check next to onnxruntime.dll + if (const auto hmod = get_hmod_for_method(Ort::api->RegisterExecutionProviderLibrary)) + ep_path_ = find_next_to_module(hmod); + + if (ep_path_.empty()) + // check next to current executable + if (const auto hmod = GetModuleHandleA(NULL)) + ep_path_ = find_next_to_module(hmod); +#endif // _WIN32 + + if (ep_path_.empty()) + // fallback to current working directory + ep_path_ = std::filesystem::current_path(ec) / ep_filename_; + + Ort::ThrowOnError(Ort::api->RegisterExecutionProviderLibrary(GetOrtGlobals()->env_.get(), ep_name_, ep_path_.native().c_str())); + } + + ~Interface() { + // TODO: make it linux compatible +#if defined(_WIN32) + if (const auto mod = GetModuleHandleA(ep_filename_)) + if (const auto func = reinterpret_cast(GetProcAddress(mod, func_shutdown_))) + func(); +#endif // _WIN32 + } + + void SetupProvider(OrtSessionOptions& session_options, const ProviderOptions& provider_options) override { + std::vector 
supported_devices; + + { + const OrtEpDevice* const* devices = nullptr; + size_t ndevices = 0; + + Ort::ThrowOnError(Ort::api->GetEpDevices(&GetOrtEnv(), &devices, &ndevices)); + + for (const auto& device : std::span{devices, ndevices}) + if (std::string_view{ep_name_} == Ort::api->EpDevice_EpName(device) && + OrtHardwareDeviceType_NPU == Ort::api->HardwareDevice_Type(Ort::api->EpDevice_Device(device))) + supported_devices.push_back(device); + } + + if (supported_devices.empty()) + throw std::runtime_error{"No RyzenAI devices detected"}; + + { + std::vector ep_keys, ep_values; + + for (auto& option : provider_options) { + ep_keys.emplace_back(option.first.c_str()); + ep_values.emplace_back(option.second.c_str()); + } + + // this call merges provider_options into session_options + Ort::ThrowOnError(Ort::api->SessionOptionsAppendExecutionProvider_V2(&session_options, + &GetOrtEnv(), supported_devices.data(), supported_devices.size(), + ep_keys.data(), ep_values.data(), ep_keys.size())); + } + + Ort::ThrowOnError(Ort::api->RegisterCustomOpsUsingFunction(&session_options, func_custom_ops_)); + } + + DeviceType GetType() const override { return DeviceType::RyzenAI; } + + void InitOrt(const OrtApi& /*api*/, Ort::Allocator& allocator) override { + assert(!ort_allocator_); + ort_allocator_ = &allocator; + } + + Ort::Allocator& GetAllocator() override { + return *ort_allocator_; + } + + std::shared_ptr AllocateBase(size_t size) override { + return std::make_shared(size); + } + + std::shared_ptr WrapMemoryBase(void* p, size_t size) override { + return std::make_shared(p, size); + } + + std::unique_ptr CreateGreedy(const GeneratorParams& params) override { return std::make_unique(params); } + std::unique_ptr CreateBeam(const GeneratorParams& params) override { return std::make_unique(params); } + + void Synchronize() override {} + + private: + std::filesystem::path ep_path_; +}; + +static std::unique_ptr interface_; + +} // namespace RyzenAI + +void 
RyzenAIInterface::Shutdown() { + RyzenAI::interface_.reset(); +} + +RyzenAIInterface* GetRyzenAIInterface() { + static std::once_flag once; + + std::call_once(once, []() { + RyzenAI::interface_ = std::make_unique(); + }); + + return RyzenAI::interface_.get(); +} + +} // namespace Generators diff --git a/src/ryzenai/interface.h b/src/ryzenai/interface.h new file mode 100644 index 0000000000..3657ca9779 --- /dev/null +++ b/src/ryzenai/interface.h @@ -0,0 +1,16 @@ +#pragma once + +namespace Generators { + +// Note: memory allocated through RyzenAI interface is host/cpu accessible +struct RyzenAIInterface : DeviceInterface { + using ProviderOptions = std::vector>; + + virtual void SetupProvider(OrtSessionOptions&, const ProviderOptions&) = 0; + + static void Shutdown(); +}; + +RyzenAIInterface* GetRyzenAIInterface(); + +} // namespace Generators diff --git a/src/smartptrs.h b/src/smartptrs.h index 4276a75dca..c0a22b216f 100644 --- a/src/smartptrs.h +++ b/src/smartptrs.h @@ -92,6 +92,7 @@ enum struct DeviceType { QNN, OpenVINO, NvTensorRtRtx, + RyzenAI, MAX };