microsoft · baijumeswani · Jan 8, 2026 · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026
@@ -78,6 +78,8 @@ file(GLOB generator_srcs CONFIGURE_DEPENDS
   "${GENERATORS_ROOT}/webgpu/*.cpp"
   "${GENERATORS_ROOT}/openvino/*.h"
   "${GENERATORS_ROOT}/openvino/*.cpp"
+  "${GENERATORS_ROOT}/ryzenai/*.h"
+  "${GENERATORS_ROOT}/ryzenai/*.cpp"
   "${MODELS_ROOT}/*.h"
   "${MODELS_ROOT}/*.cpp"
   "${ENGINE_ROOT}/*.h"

@@ -15,6 +15,7 @@
 #include "qnn/interface.h"
 #include "webgpu/interface.h"
 #include "openvino/interface.h"
+#include "ryzenai/interface.h"
 #include "engine/engine.h"
 
 #if defined(_WIN32)
@@ -94,6 +95,8 @@ void Shutdown() {
   }
 
   GetOrtGlobals().reset();  // Delete now because on process exit is too late
+
+  RyzenAIInterface::Shutdown();
 }
 
 OrtEnv& GetOrtEnv() {
@@ -224,6 +227,8 @@ std::string to_string(DeviceType device_type) {
       return "OpenVINO";
     case DeviceType::NvTensorRtRtx:
       return "NvTensorRtRtx";
+    case DeviceType::RyzenAI:
+      return "RyzenAI";
     default:
       throw std::runtime_error("Unknown device type");
   }
@@ -247,6 +252,8 @@ DeviceInterface* GetDeviceInterface(DeviceType type) {
       return GetQNNInterface();
     case DeviceType::OpenVINO:
       return GetOpenVINOInterface();
+    case DeviceType::RyzenAI:
+      return GetRyzenAIInterface();
   }
 }
 
@@ -358,7 +365,8 @@ void Generator::AppendTokens(cpu_span<const int32_t> input_ids) {
       DeviceType::CUDA,
       DeviceType::WEBGPU,
       DeviceType::OpenVINO,
-      DeviceType::NvTensorRtRtx};
+      DeviceType::NvTensorRtRtx,
+      DeviceType::RyzenAI};
 
   if (search_->GetSequenceLength() != 0 &&
       std::none_of(devices_supporting_continuous_decoding.begin(), devices_supporting_continuous_decoding.end(),

@@ -15,8 +15,8 @@ Logits::Logits(State& state)
 
   input_sequence_lengths.resize(state_.params_->search.batch_size);
 
-  if (IsOpenVINOStatefulModel(state.model_)) {
-    // In the case of OpenVINO stateful models, they are patched in a way so that they only return the
+  if (IsOpenVINOStatefulModel(state.model_) || state_.model_.p_device_->GetType() == DeviceType::RyzenAI) {
+    // In the case of OpenVINO stateful models or RyzenAI models, they are patched in a way so that they only return the
     // sliced logits needed for sampling. For example, given 43 prompt tokens, instead of returning
     // logits of the shape:  [1,43,<vocab_size>]
     // they will have shape: [1, 1,<vocab_size>].

@@ -23,6 +23,7 @@
 #include "qwen2_5_vl_image_processor.h"
 #include "../dml/interface.h"
 #include "../openvino/interface.h"
+#include "../ryzenai/interface.h"
 
 #if defined(_WIN32)
 #include <direct.h>
@@ -653,6 +654,12 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options,
     } else if (provider_options.name == "OpenVINO") {
       p_device = GetDeviceInterface(DeviceType::OpenVINO);
       OpenVINO_AppendProviderOptions(session_options, config, provider_options);
+    } else if (provider_options.name == "RyzenAI") {
+      p_device = GetDeviceInterface(DeviceType::RyzenAI);
+
+      session_options.AddConfigEntry("model_root", config.config_path.string().c_str());
+
+      GetRyzenAIInterface()->SetupProvider(session_options, provider_options.options);
     } else {
       // For providers that go through the extensible AppendExecutionProvider API:
       if (provider_options.name == "QNN") {
@@ -810,7 +817,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device, const Config& config, std::uni
   // This ensures memory allocated on-device for model inputs/outputs is valid for the lifetime of GenAI.
 
   // Names for the device types used by 'SetProviderSessionOptions'
-  static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)", "NvTensorRtRtx"};
+  static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)", "NvTensorRtRtx", "RyzenAI"};
   static_assert(std::size(device_type_names) == static_cast<size_t>(DeviceType::MAX));
 
   // Create an OrtSessionOptions and set the options to use the DeviceType we're using here
@@ -829,7 +836,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device, const Config& config, std::uni
   allocator.session_ = OrtSession::Create(GetOrtEnv(), g_trivial_model, sizeof(g_trivial_model), session_options.get());
 
   // Names for the device memory types used by 'OrtMemoryInfo::Create'
-  static const char* device_memory_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared", "OpenVINO (Not used, see above)", "Cuda"};
+  static const char* device_memory_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared", "OpenVINO (Not used, see above)", "Cuda", "Cpu"};
   static_assert(std::size(device_memory_type_names) == static_cast<size_t>(DeviceType::MAX));
 
   // Get the allocator from the OrtSession for the DeviceType (it's called 'AllocatorCreate' but it's really 'AllocatorGet')
@@ -911,9 +918,10 @@ Model::Model(std::unique_ptr<Config> config) : config_{std::move(config)} {
   CreateSessionOptions();
   EnsureDeviceOrtInit(*p_device_, *config_, arena_cfg_);
 
-  // Only CUDA, TRT-RTX and DML does every input on the device
+  // Only CUDA, TRT-RTX, RyzenAI and DML put every input on the device
   // For WebGPU, use device memory only if graph capture is enabled, otherwise use CPU
   if (p_device_->GetType() == DeviceType::CUDA || p_device_->GetType() == DeviceType::DML || p_device_->GetType() == DeviceType::NvTensorRtRtx ||
+      p_device_->GetType() == DeviceType::RyzenAI ||
       (p_device_->GetType() == DeviceType::WEBGPU && IsGraphCaptureEnabled(config_->model.decoder.session_options)))
     p_device_inputs_ = p_device_;
   else

@@ -0,0 +1,210 @@
+#include "../generators.h"
+#include "../search.h"
+#include "interface.h"
+#include <filesystem>
+#include <mutex>
+#include <span>
+
+#if !defined(_WIN32)
+#include <dlfcn.h>
+#endif
+
+namespace Generators {
+namespace RyzenAI {
+
+// Key handed to GetEnv() to obtain an explicit path to the EP library.
+static constexpr auto ep_path_env_key_ = "RYZENAI_EP_PATH";
+// Name the EP library is registered under, and the name matched against each OrtEpDevice.
+static constexpr auto ep_name_ = "RyzenAILightExecutionProvider";
+// File name of the EP shared library; only the extension differs between platforms.
+#if defined(_WIN32)
+static constexpr auto ep_filename_ = "onnxruntime_providers_ryzenai.dll";
+#else
+static constexpr auto ep_filename_ = "onnxruntime_providers_ryzenai.so";
+#endif
+// Function name passed to RegisterCustomOpsUsingFunction when a session is configured.
+static constexpr auto func_custom_ops_ = "RyzenAI_RegisterCustomOps";
+// Function looked up in the loaded EP module and called when the interface is destroyed (Windows only for now).
+static constexpr auto func_shutdown_ = "RyzenAI_Shutdown";
+
+// Allocator stored by Interface::InitOrt and used by Memory for Alloc/Free; never freed in this file.
+static Ort::Allocator* ort_allocator_{};
+
+// DeviceBuffer whose storage comes either from ort_allocator_ or from a caller-supplied pointer.
+// p_cpu_ and p_device_ are always set to the same address, so the CPU<->device sync hooks are no-ops.
+struct Memory : DeviceBuffer {
+  // Allocating form: takes `size` bytes from ort_allocator_ and frees them on destruction.
+  Memory(size_t size) : owned_{true} {
+    size_in_bytes_ = size;
+    auto* const block = static_cast<uint8_t*>(ort_allocator_->Alloc(size_in_bytes_));
+    p_device_ = block;
+    p_cpu_ = p_device_;
+  }
+
+  // Wrapping form: views `size` bytes at `p` without taking ownership of them.
+  Memory(void* p, size_t size) : owned_{false} {
+    size_in_bytes_ = size;
+    auto* const wrapped = static_cast<uint8_t*>(p);
+    p_device_ = wrapped;
+    p_cpu_ = p_device_;
+  }
+
+  // Returns allocator-owned storage; wrapped memory is left untouched.
+  ~Memory() override {
+    if (!owned_)
+      return;
+
+    ort_allocator_->Free(p_device_);
+  }
+
+  const char* GetType() const override { return "RyzenAI"; }
+
+  // Nothing to allocate or copy: the CPU pointer and the device pointer are the same address.
+  void AllocateCpu() override {}
+  void CopyDeviceToCpu() override {}
+  void CopyCpuToDevice() override {}
+
+  // Copies `size_in_bytes` bytes from `source` (starting at `begin_source`) into this buffer
+  // (starting at `begin_dest`) by delegating to CopyThroughCpu.
+  void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override {
+    CopyThroughCpu(*this, begin_dest, source, begin_source, size_in_bytes);
+  }
+
+  // Clears the whole buffer to zero bytes.
+  void Zero() override {
+    memset(p_device_, 0, size_in_bytes_);
+  }
+
+  // True when the storage was obtained from ort_allocator_ and must be freed by this object.
+  bool owned_;
+};
+
+// DeviceInterface implementation for the RyzenAI execution provider (EP).
+// Construction locates the EP shared library and registers it with ONNX Runtime. Buffers handed out
+// by this interface are `Memory` objects, and search runs on the CPU search implementations.
+struct Interface : RyzenAIInterface {
+  // Registers the EP library with the global ORT environment, unless a module named ep_filename_ is
+  // already loaded in this process (in which case nothing is registered here).
+  //
+  // The library path is resolved in this order:
+  //   1. GetEnv(ep_path_env_key_)
+  //   2. (Windows only) next to the module containing GetRyzenAIInterface
+  //   3. (Windows only) next to the module containing Ort::api->RegisterExecutionProviderLibrary
+  //   4. (Windows only) next to the current executable
+  //   5. the current working directory
+  //
+  // The registration status is passed to Ort::ThrowOnError.
+  Interface() {
+    // If already loaded then nothing to do
+#if defined(_WIN32)
+    if (GetModuleHandleA(ep_filename_))
+      return;
+#else
+    // RTLD_NOLOAD only probes for an already-resident library, but a successful probe still takes a
+    // reference on it. Release that reference so the probe leaves the load count unchanged, matching
+    // the Windows branch above where GetModuleHandleA takes no reference.
+    if (void* const handle = dlopen(ep_filename_, RTLD_NOLOAD | RTLD_NOW)) {
+      dlclose(handle);
+      return;
+    }
+#endif
+
+    // Shared by the non-throwing std::filesystem calls below; failures simply leave the path empty
+    // (or relative) and the next fallback is tried.
+    std::error_code ec;
+
+    ep_path_ = GetEnv(ep_path_env_key_);
+
+#if defined(_WIN32)
+    // Returns the allocation base of the memory region containing `func`, used here as the handle of
+    // the module that contains it; nullptr if the query fails.
+    const auto get_hmod_for_method = [](LPCVOID func) -> HMODULE {
+      MEMORY_BASIC_INFORMATION mbi;
+
+      if (VirtualQuery(func, &mbi, sizeof(mbi)) && mbi.AllocationBase)
+        return static_cast<HMODULE>(mbi.AllocationBase);
+
+      return nullptr;
+    };
+
+    // Returns the path of ep_filename_ inside the directory of `hmod`'s file if it exists there,
+    // otherwise an empty path.
+    const auto find_next_to_module = [&](HMODULE hmod) -> std::filesystem::path {
+      wchar_t buffer[MAX_PATH + 1] = {0};
+      const auto len = sizeof(buffer) / sizeof(buffer[0]);
+
+      if (GetModuleFileNameW(hmod, buffer, len))
+        if (const auto dir = std::filesystem::path{buffer}.remove_filename(); !dir.empty())
+          if (auto path = dir / ep_filename_; std::filesystem::exists(path, ec))
+            return path;
+
+      return {};
+    };
+
+    if (ep_path_.empty())
+      // check next to onnxruntime-genai.dll
+      if (const auto hmod = get_hmod_for_method(GetRyzenAIInterface))
+        ep_path_ = find_next_to_module(hmod);
+
+    if (ep_path_.empty())
+      // check next to onnxruntime.dll
+      if (const auto hmod = get_hmod_for_method(Ort::api->RegisterExecutionProviderLibrary))
+        ep_path_ = find_next_to_module(hmod);
+
+    if (ep_path_.empty())
+      // check next to current executable
+      if (const auto hmod = GetModuleHandleA(nullptr))
+        ep_path_ = find_next_to_module(hmod);
+#endif  // _WIN32
+
+    if (ep_path_.empty())
+      // fallback to current working directory
+      ep_path_ = std::filesystem::current_path(ec) / ep_filename_;
+
+    Ort::ThrowOnError(Ort::api->RegisterExecutionProviderLibrary(GetOrtGlobals()->env_.get(), ep_name_, ep_path_.native().c_str()));
+  }
+
+  // Calls the EP module's func_shutdown_ export if the module is loaded and exports it.
+  ~Interface() {
+    // TODO: make it linux compatible
+#if defined(_WIN32)
+    if (const auto mod = GetModuleHandleA(ep_filename_))
+      if (const auto func = reinterpret_cast<void (*)()>(GetProcAddress(mod, func_shutdown_)))
+        func();
+#endif  // _WIN32
+  }
+
+  // Configures `session_options` to run on the RyzenAI EP.
+  //   session_options:  session options that receive the EP and its custom ops.
+  //   provider_options: key/value pairs forwarded to the EP.
+  // Throws std::runtime_error when no EP device named ep_name_ with hardware type NPU is found;
+  // ORT call statuses are passed to Ort::ThrowOnError.
+  void SetupProvider(OrtSessionOptions& session_options, const ProviderOptions& provider_options) override {
+    std::vector<const OrtEpDevice*> supported_devices;
+
+    {
+      const OrtEpDevice* const* devices = nullptr;
+      size_t ndevices = 0;
+
+      Ort::ThrowOnError(Ort::api->GetEpDevices(&GetOrtEnv(), &devices, &ndevices));
+
+      // Keep only the devices exposed by this EP that are NPUs.
+      for (const auto& device : std::span{devices, ndevices})
+        if (std::string_view{ep_name_} == Ort::api->EpDevice_EpName(device) &&
+            OrtHardwareDeviceType_NPU == Ort::api->HardwareDevice_Type(Ort::api->EpDevice_Device(device)))
+          supported_devices.push_back(device);
+    }
+
+    if (supported_devices.empty())
+      throw std::runtime_error{"No RyzenAI devices detected"};
+
+    {
+      // Parallel arrays of C strings; they point into `provider_options`, which outlives the call below.
+      std::vector<const char*> ep_keys, ep_values;
+      ep_keys.reserve(provider_options.size());
+      ep_values.reserve(provider_options.size());
+
+      for (const auto& option : provider_options) {
+        ep_keys.emplace_back(option.first.c_str());
+        ep_values.emplace_back(option.second.c_str());
+      }
+
+      // this call merges provider_options into session_options
+      Ort::ThrowOnError(Ort::api->SessionOptionsAppendExecutionProvider_V2(&session_options,
+                                                                           &GetOrtEnv(), supported_devices.data(), supported_devices.size(),
+                                                                           ep_keys.data(), ep_values.data(), ep_keys.size()));
+    }
+
+    Ort::ThrowOnError(Ort::api->RegisterCustomOpsUsingFunction(&session_options, func_custom_ops_));
+  }
+
+  DeviceType GetType() const override { return DeviceType::RyzenAI; }
+
+  // Stores `allocator` for use by Memory and GetAllocator(); asserts that none was stored before.
+  void InitOrt(const OrtApi& /*api*/, Ort::Allocator& allocator) override {
+    assert(!ort_allocator_);
+    ort_allocator_ = &allocator;
+  }
+
+  // Returns the allocator stored by InitOrt(); must not be called before InitOrt().
+  Ort::Allocator& GetAllocator() override {
+    return *ort_allocator_;
+  }
+
+  // Returns a buffer that owns `size` freshly allocated bytes.
+  std::shared_ptr<DeviceBuffer> AllocateBase(size_t size) override {
+    return std::make_shared<Memory>(size);
+  }
+
+  // Returns a buffer that views `size` bytes at `p` without owning them.
+  std::shared_ptr<DeviceBuffer> WrapMemoryBase(void* p, size_t size) override {
+    return std::make_shared<Memory>(p, size);
+  }
+
+  // Search uses the CPU implementations.
+  std::unique_ptr<Search> CreateGreedy(const GeneratorParams& params) override { return std::make_unique<GreedySearch_Cpu>(params); }
+  std::unique_ptr<Search> CreateBeam(const GeneratorParams& params) override { return std::make_unique<BeamSearch_Cpu>(params); }
+
+  void Synchronize() override {}
+
+ private:
+  // Resolved location of the EP library; left empty when the constructor found it already loaded.
+  std::filesystem::path ep_path_;
+};
+
+// Process-wide interface instance: created on first GetRyzenAIInterface() call, destroyed by RyzenAIInterface::Shutdown().
+static std::unique_ptr<Interface> interface_;
+
+}  // namespace RyzenAI
+
+// Destroys the process-wide RyzenAI interface instance, if one was created.
+void RyzenAIInterface::Shutdown() {
+  RyzenAI::interface_ = nullptr;
+}
+
+// Returns the process-wide RyzenAI interface, constructing it on the first call.
+// Construction is attempted through a once-flag, so after RyzenAIInterface::Shutdown() has reset the
+// instance this returns nullptr rather than creating a new one.
+RyzenAIInterface* GetRyzenAIInterface() {
+  static std::once_flag created;
+
+  const auto create_instance = [] { RyzenAI::interface_ = std::make_unique<RyzenAI::Interface>(); };
+  std::call_once(created, create_instance);
+
+  return RyzenAI::interface_.get();
+}
+
+}  // namespace Generators
@@ -0,0 +1,16 @@
+#pragma once
+
+namespace Generators {
+
+// Device interface for the RyzenAI execution provider.
+// Note: memory allocated through RyzenAI interface is host/cpu accessible
+struct RyzenAIInterface : DeviceInterface {
+  // Provider options as an ordered list of (key, value) string pairs.
+  using ProviderOptions = std::vector<std::pair<std::string, std::string>>;
+
+  // Configures `session_options` for the RyzenAI provider using `provider_options`.
+  virtual void SetupProvider(OrtSessionOptions& session_options, const ProviderOptions& provider_options) = 0;
+
+  // Tears down the RyzenAI interface state.
+  static void Shutdown();
+};
+
+// Accessor for the RyzenAI interface instance.
+RyzenAIInterface* GetRyzenAIInterface();
+
+}  // namespace Generators
@@ -92,6 +92,7 @@ enum struct DeviceType {
   QNN,
   OpenVINO,
   NvTensorRtRtx,
+  RyzenAI,
   MAX
 };
-Original file line number
+Diff line change
@@ Expand Up / @@ -92,6 +92,7 @@ enum struct DeviceType { @@
       QNN,
       OpenVINO,
       NvTensorRtRtx,
+      RyzenAI,
       MAX
     };
@@ Expand Down @@