From 2809a3f5274e9456cfa8d83f5a347ec7ad184e52 Mon Sep 17 00:00:00 2001 From: Arthur Islamov Date: Sat, 26 Aug 2023 03:57:06 +0400 Subject: [PATCH] Added DML and CUDA provider support in onnxruntime-node (#16050) ### Description I've added changes to support CUDA and DML (only on Windows, on other platforms it will throw an error) ### Motivation and Context It fixes this feature request https://github.com/microsoft/onnxruntime/issues/14127 which is tracked here https://github.com/microsoft/onnxruntime/issues/14529 I was working on StableDiffusion implementation for node.js and it is very slow on CPU, so GPU support is essential. Here is a working demo with a patched and precompiled version https://github.com/dakenf/stable-diffusion-nodejs --------- --- cmake/onnxruntime_nodejs.cmake | 18 ++++++- js/common/lib/inference-session.ts | 16 ++++++- js/node/CMakeLists.txt | 29 +++++++++++ js/node/README.md | 4 ++ js/node/lib/backend.ts | 1 + js/node/lib/binding.ts | 13 +++-- js/node/lib/index.ts | 8 +++- js/node/script/build.ts | 20 ++++++++ js/node/src/directml_load_helper.cc | 37 ++++++++++++++ js/node/src/directml_load_helper.h | 6 +++ js/node/src/inference_session_wrap.cc | 46 ++++++++++++++++-- js/node/src/inference_session_wrap.h | 6 +++ js/node/src/session_options_helper.cc | 48 ++++++++++++++++++- js/node/src/tensor_helper.cc | 3 +- js/node/src/tensor_helper.h | 2 +- .../c-api-noopenmp-packaging-pipelines.yml | 7 ++- .../nuget/templates/dml-vs-2022.yml | 19 ++++++++ .../azure-pipelines/templates/c-api-cpu.yml | 19 ++++---- .../linux-gpu-tensorrt-packaging-pipeline.yml | 17 ++++++- 19 files changed, 292 insertions(+), 27 deletions(-) create mode 100644 js/node/src/directml_load_helper.cc create mode 100644 js/node/src/directml_load_helper.h diff --git a/cmake/onnxruntime_nodejs.cmake b/cmake/onnxruntime_nodejs.cmake index 7b4ad950dc50a..6053b9d1088cd 100644 --- a/cmake/onnxruntime_nodejs.cmake +++ b/cmake/onnxruntime_nodejs.cmake @@ -60,6 +60,20 @@ else() endif() endif() +# setup providers +if (onnxruntime_USE_CUDA) + set(NODEJS_BINDING_USE_CUDA "--use_cuda") +endif() +if (onnxruntime_USE_DML) + set(NODEJS_BINDING_USE_DML "--use_dml") +endif() +if (onnxruntime_USE_TENSORRT) + set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt") +endif() +if (onnxruntime_USE_COREML) + set(NODEJS_BINDING_USE_COREML "--use_coreml") +endif() + if(NOT onnxruntime_ENABLE_STATIC_ANALYSIS) # add custom target add_custom_target(js_npm_ci ALL @@ -74,7 +88,9 @@ add_custom_target(js_common_npm_ci ALL add_custom_target(nodejs_binding_wrapper ALL COMMAND ${NPM_CLI} ci - COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --arch=${NODEJS_BINDING_ARCH} + COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} + --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_TENSORRT} + ${NODEJS_BINDING_USE_COREML} WORKING_DIRECTORY ${JS_NODE_ROOT} COMMENT "Using cmake-js to build OnnxRuntime Node.js binding") diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index 846b3ea37ab70..834b1f670f167 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -170,12 +170,14 @@ export declare namespace InferenceSession { // Backend React Native: supports 'cpu', 'xnnpack', 'coreml' (iOS), 'nnapi' (Android). 
interface ExecutionProviderOptionMap { cpu: CpuExecutionProviderOption; + coreml: CoreMlExecutionProviderOption; cuda: CudaExecutionProviderOption; + dml: DmlExecutionProviderOption; + tensorrt: TensorRtExecutionProviderOption; wasm: WebAssemblyExecutionProviderOption; webgl: WebGLExecutionProviderOption; xnnpack: XnnpackExecutionProviderOption; webnn: WebNNExecutionProviderOption; - coreml: CoreMLExecutionProviderOption; nnapi: NnapiExecutionProviderOption; } @@ -194,6 +196,18 @@ export declare namespace InferenceSession { readonly name: 'cuda'; deviceId?: number; } + export interface CoreMlExecutionProviderOption extends ExecutionProviderOption { + readonly name: 'coreml'; + coreMlFlags?: number; + } + export interface DmlExecutionProviderOption extends ExecutionProviderOption { + readonly name: 'dml'; + deviceId?: number; + } + export interface TensorRtExecutionProviderOption extends ExecutionProviderOption { + readonly name: 'tensorrt'; + deviceId?: number; + } export interface WebAssemblyExecutionProviderOption extends ExecutionProviderOption { readonly name: 'wasm'; } diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt index 5557440f431af..c3898fbad7401 100644 --- a/js/node/CMakeLists.txt +++ b/js/node/CMakeLists.txt @@ -28,8 +28,29 @@ endif() # include dirs include_directories(${CMAKE_JS_INC}) include_directories(${CMAKE_SOURCE_DIR}/../../include/onnxruntime/core/session) +include_directories(${CMAKE_SOURCE_DIR}/../../include/onnxruntime) +include_directories(${CMAKE_SOURCE_DIR}/../../onnxruntime) include_directories(${CMAKE_SOURCE_DIR}/node_modules/node-addon-api) +# optional providers +option(USE_DML "Build with DirectML support" OFF) +option(USE_CUDA "Build with CUDA support" OFF) +option(USE_TENSORRT "Build with TensorRT support" OFF) +option(USE_COREML "Build with CoreML support" OFF) + +if(USE_DML) + add_compile_definitions(USE_DML=1) +endif() +if(USE_CUDA) + add_compile_definitions(USE_CUDA=1) +endif() +if(USE_TENSORRT) + add_compile_definitions(USE_TENSORRT=1) +endif() +if(USE_COREML) + add_compile_definitions(USE_COREML=1) +endif() + # source files file(GLOB ORT_NODEJS_BINDING_SOURCE_FILES ${CMAKE_SOURCE_DIR}/src/*.cc) @@ -77,6 +98,14 @@ if (WIN32) ${ONNXRUNTIME_BUILD_DIR}/${CMAKE_BUILD_TYPE}/onnxruntime.dll ${dist_folder} ) + if (USE_DML) + add_custom_command( + TARGET onnxruntime_binding POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${ONNXRUNTIME_BUILD_DIR}/${CMAKE_BUILD_TYPE}/DirectML.dll + ${dist_folder} + ) + endif () if (CMAKE_BUILD_TYPE STREQUAL "Debug") add_custom_command( TARGET onnxruntime_binding POST_BUILD diff --git a/js/node/README.md b/js/node/README.md index 24995e0012c45..98b2ea66de2a8 100644 --- a/js/node/README.md +++ b/js/node/README.md @@ -24,6 +24,10 @@ Following platforms are supported with pre-built binaries: To use on platforms without pre-built binaries, you can build Node.js binding from source and consume it by `npm install /js/node/`. See also [instructions](https://www.onnxruntime.ai/docs/how-to/build.html#apis-and-language-bindings) for building ONNX Runtime Node.js binding locally. +# GPU Support + +Right now, the Windows version supports only the DML provider. Linux x64 can use CUDA and TensorRT. + ## License License information can be found [here](https://github.com/microsoft/onnxruntime/blob/main/README.md#license). 
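A minimal usage sketch of how the GPU providers described in the README section above are selected from onnxruntime-node (not part of the patch; it assumes a build compiled with the matching provider, and `model.onnx` plus the `input` feed name are placeholders):

```ts
// Request a GPU execution provider added by this change, keeping CPU as a fallback.
// On Windows DML builds use { name: 'dml', deviceId: 0 }; on Linux x64 CUDA/TensorRT
// builds use { name: 'cuda', deviceId: 0 } or { name: 'tensorrt', deviceId: 0 }.
import { InferenceSession, Tensor } from 'onnxruntime-node';

async function main() {
  const session = await InferenceSession.create('model.onnx', {
    executionProviders: [{ name: 'cuda', deviceId: 0 }, 'cpu'],
  });

  // Feed names must match the model; 'input' is a placeholder.
  const feeds = { input: new Tensor('float32', new Float32Array([1, 2, 3, 4]), [1, 4]) };
  const results = await session.run(feeds);
  console.log(results);
}

main().catch(console.error);
```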
diff --git a/js/node/lib/backend.ts b/js/node/lib/backend.ts index 32f71c6a76661..d3680f9d44236 100644 --- a/js/node/lib/backend.ts +++ b/js/node/lib/backend.ts @@ -69,3 +69,4 @@ class OnnxruntimeBackend implements Backend { } export const onnxruntimeBackend = new OnnxruntimeBackend(); +export const listSupportedBackends = binding.listSupportedBackends; diff --git a/js/node/lib/binding.ts b/js/node/lib/binding.ts index ca6cf51804d19..8a0ce89abfa64 100644 --- a/js/node/lib/binding.ts +++ b/js/node/lib/binding.ts @@ -33,11 +33,18 @@ export declare namespace Binding { export interface InferenceSessionConstructor { new(): InferenceSession; } + + export interface SupportedBackend { + name: string; + bundled: boolean; + } } // export native binding export const binding = // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires - require(`../bin/napi-v3/${process.platform}/${process.arch}/onnxruntime_binding.node`) as - // eslint-disable-next-line @typescript-eslint/naming-convention - {InferenceSession: Binding.InferenceSessionConstructor}; + require(`../bin/napi-v3/${process.platform}/${process.arch}/onnxruntime_binding.node`) as { + // eslint-disable-next-line @typescript-eslint/naming-convention + InferenceSession: Binding.InferenceSessionConstructor; + listSupportedBackends: () => Binding.SupportedBackend[]; +}; diff --git a/js/node/lib/index.ts b/js/node/lib/index.ts index fbea822da943e..9dba44bce43b5 100644 --- a/js/node/lib/index.ts +++ b/js/node/lib/index.ts @@ -2,10 +2,14 @@ // Licensed under the MIT License. export * from 'onnxruntime-common'; +export {listSupportedBackends} from './backend'; import {registerBackend, env} from 'onnxruntime-common'; -import {onnxruntimeBackend} from './backend'; import {version} from './version'; +import {onnxruntimeBackend, listSupportedBackends} from './backend'; -registerBackend('cpu', onnxruntimeBackend, 100); +const backends = listSupportedBackends(); +for (const backend of backends) { + registerBackend(backend.name, onnxruntimeBackend, 100); +} env.versions.node = version; diff --git a/js/node/script/build.ts b/js/node/script/build.ts index 95dacd076d3a3..dfa88821a8d09 100644 --- a/js/node/script/build.ts +++ b/js/node/script/build.ts @@ -25,6 +25,14 @@ if (ARCH !== 'x64' && ARCH !== 'ia32' && ARCH !== 'arm64' && ARCH !== 'arm') { const ONNXRUNTIME_BUILD_DIR = buildArgs['onnxruntime-build-dir']; // --rebuild const REBUILD = !!buildArgs.rebuild; +// --use_dml +const USE_DML = !!buildArgs.use_dml; +// --use_cuda +const USE_CUDA = !!buildArgs.use_cuda; +// --use_tensorrt +const USE_TENSORRT = !!buildArgs.use_tensorrt; +// --use_coreml +const USE_COREML = !!buildArgs.use_coreml; // build path const ROOT_FOLDER = path.join(__dirname, '..'); @@ -47,6 +55,18 @@ const args = [ if (ONNXRUNTIME_BUILD_DIR && typeof ONNXRUNTIME_BUILD_DIR === 'string') { args.push(`--CDONNXRUNTIME_BUILD_DIR=${ONNXRUNTIME_BUILD_DIR}`); } +if (USE_DML) { + args.push('--CDUSE_DML=ON'); +} +if (USE_CUDA) { + args.push('--CDUSE_CUDA=ON'); +} +if (USE_TENSORRT) { + args.push('--CDUSE_TENSORRT=ON'); +} +if (USE_COREML) { + args.push('--CDUSE_COREML=ON'); +} // set CMAKE_OSX_ARCHITECTURES for macOS build if (os.platform() === 'darwin') { diff --git a/js/node/src/directml_load_helper.cc b/js/node/src/directml_load_helper.cc new file mode 100644 index 0000000000000..7017f627fd3d7 --- /dev/null +++ b/js/node/src/directml_load_helper.cc @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. + +#ifdef _WIN32 +#include "common.h" +#include "windows.h" + +void LoadDirectMLDll(Napi::Env env) { + DWORD pathLen = MAX_PATH; + std::wstring path(pathLen, L'\0'); + HMODULE moduleHandle = nullptr; + + GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + reinterpret_cast<LPCSTR>(&LoadDirectMLDll), &moduleHandle); + + DWORD getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast<wchar_t *>(path.c_str()), pathLen); + while (getModuleFileNameResult == 0 || getModuleFileNameResult == pathLen) { + int ret = GetLastError(); + if (ret == ERROR_INSUFFICIENT_BUFFER && pathLen < 32768) { + pathLen *= 2; + path.resize(pathLen); + getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast<wchar_t *>(path.c_str()), pathLen); + } else { + ORT_NAPI_THROW_ERROR(env, "Failed getting path to load DirectML.dll, error code: ", ret); + } + } + + path.resize(path.rfind(L'\\') + 1); + path.append(L"DirectML.dll"); + HMODULE libraryLoadResult = LoadLibraryW(path.c_str()); + + if (!libraryLoadResult) { + int ret = GetLastError(); + ORT_NAPI_THROW_ERROR(env, "Failed loading bundled DirectML.dll, error code: ", ret); + } +} +#endif diff --git a/js/node/src/directml_load_helper.h b/js/node/src/directml_load_helper.h new file mode 100644 index 0000000000000..074a4f95ed476 --- /dev/null +++ b/js/node/src/directml_load_helper.h @@ -0,0 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if defined(USE_DML) && defined(_WIN32) +void LoadDirectMLDll(Napi::Env env); +#endif diff --git a/js/node/src/inference_session_wrap.cc b/js/node/src/inference_session_wrap.cc index 9f235d29d20b7..c409fdc8895f7 100644 --- a/js/node/src/inference_session_wrap.cc +++ b/js/node/src/inference_session_wrap.cc @@ -4,14 +4,19 @@ #include "onnxruntime_cxx_api.h" #include "common.h" +#include "directml_load_helper.h" #include "inference_session_wrap.h" #include "run_options_helper.h" #include "session_options_helper.h" #include "tensor_helper.h" +#include <string> Napi::FunctionReference InferenceSessionWrap::constructor; Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) { +#if defined(USE_DML) && defined(_WIN32) + LoadDirectMLDll(env); +#endif // create ONNX runtime env Ort::InitApi(); ORT_NAPI_THROW_ERROR_IF( @@ -32,6 +37,10 @@ Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) { constructor = Napi::Persistent(func); constructor.SuppressDestruct(); exports.Set("InferenceSession", func); + + Napi::Function listSupportedBackends = Napi::Function::New(env, InferenceSessionWrap::ListSupportedBackends); + exports.Set("listSupportedBackends", listSupportedBackends); + return exports; } @@ -70,7 +79,7 @@ Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo &info) { int64_t bytesOffset = info[1].As<Napi::Number>().Int64Value(); int64_t bytesLength = info[2].As<Napi::Number>().Int64Value(); - ParseSessionOptions(info[1].As<Napi::Object>(), sessionOptions); + ParseSessionOptions(info[3].As<Napi::Object>(), sessionOptions); this->session_.reset(new Ort::Session(*env.GetInstanceData<Ort::Env>(), reinterpret_cast<char *>(buffer) + bytesOffset, bytesLength, sessionOptions)); @@ -154,6 +163,7 @@ Napi::Value InferenceSessionWrap::Run(const Napi::CallbackInfo &info) { std::vector<bool> reuseOutput; size_t inputIndex = 0; size_t outputIndex = 0; + OrtMemoryInfo *memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault).release(); try { for (auto &name : inputNames_) { @@ -161,7 +171,7 @@ Napi::Value
InferenceSessionWrap::Run(const Napi::CallbackInfo &info) { inputIndex++; inputNames_cstr.push_back(name.c_str()); auto value = feed.Get(name); - inputValues.push_back(NapiValueToOrtValue(env, value)); + inputValues.push_back(NapiValueToOrtValue(env, value, memory_info)); } } for (auto &name : outputNames_) { @@ -170,7 +180,7 @@ Napi::Value InferenceSessionWrap::Run(const Napi::CallbackInfo &info) { outputNames_cstr.push_back(name.c_str()); auto value = fetch.Get(name); reuseOutput.push_back(!value.IsNull()); - outputValues.emplace_back(value.IsNull() ? Ort::Value{nullptr} : NapiValueToOrtValue(env, value)); + outputValues.emplace_back(value.IsNull() ? Ort::Value{nullptr} : NapiValueToOrtValue(env, value, memory_info)); } } @@ -198,3 +208,33 @@ Napi::Value InferenceSessionWrap::Run(const Napi::CallbackInfo &info) { ORT_NAPI_THROW_ERROR(env, e.what()); } } + +Napi::Value InferenceSessionWrap::ListSupportedBackends(const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + Napi::EscapableHandleScope scope(env); + Napi::Array result = Napi::Array::New(env); + + auto createObject = [&env](const std::string &name, const bool bundled) -> Napi::Object { + Napi::Object result = Napi::Object::New(env); + result.Set("name", name); + result.Set("bundled", bundled); + return result; + }; + + result.Set(uint32_t(0), createObject("cpu", true)); + +#ifdef USE_DML + result.Set(result.Length(), createObject("dml", true)); +#endif +#ifdef USE_CUDA + result.Set(result.Length(), createObject("cuda", false)); +#endif +#ifdef USE_TENSORRT + result.Set(result.Length(), createObject("tensorrt", false)); +#endif +#ifdef USE_COREML + result.Set(result.Length(), createObject("coreml", true)); +#endif + + return scope.Escape(result); +} diff --git a/js/node/src/inference_session_wrap.h b/js/node/src/inference_session_wrap.h index 1a51a70a836cf..9eee45b72dcb1 100644 --- a/js/node/src/inference_session_wrap.h +++ b/js/node/src/inference_session_wrap.h @@ -15,6 +15,12 @@ class InferenceSessionWrap : public Napi::ObjectWrap { InferenceSessionWrap(const Napi::CallbackInfo &info); private: + /** + * [sync] list supported backend list + * @returns array with objects { "name": "cpu", requirementsInstalled: true } + */ + static Napi::Value ListSupportedBackends(const Napi::CallbackInfo &info); + /** * [sync] create the session. 
* @param arg0 either a string (file path) or a Uint8Array diff --git a/js/node/src/session_options_helper.cc b/js/node/src/session_options_helper.cc index 55825a4a2baac..70e63da7cefa7 100644 --- a/js/node/src/session_options_helper.cc +++ b/js/node/src/session_options_helper.cc @@ -9,6 +9,19 @@ #include "common.h" #include "session_options_helper.h" +#ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_options.h" +#endif +#ifdef USE_DML +#include "core/providers/dml/dml_provider_factory.h" +#endif +#ifdef USE_TENSORRT +#include "core/providers/tensorrt/tensorrt_provider_factory.h" +#include "core/providers/tensorrt/tensorrt_provider_options.h" +#endif +#ifdef USE_COREML +#include "core/providers/coreml/coreml_provider_factory.h" +#endif const std::unordered_map<std::string, GraphOptimizationLevel> GRAPH_OPT_LEVEL_NAME_TO_ID_MAP = { {"disabled", ORT_DISABLE_ALL}, @@ -23,6 +36,8 @@ void ParseExecutionProviders(const Napi::Array epList, Ort::SessionOptions &sess for (uint32_t i = 0; i < epList.Length(); i++) { Napi::Value epValue = epList[i]; std::string name; + int deviceId = 0; + int coreMlFlags = 0; if (epValue.IsString()) { name = epValue.As<Napi::String>().Utf8Value(); } else if (!epValue.IsObject() || epValue.IsNull() || !epValue.As<Napi::Object>().Has("name") || @@ -30,14 +45,43 @@ void ParseExecutionProviders(const Napi::Array epList, Ort::SessionOptions &sess ORT_NAPI_THROW_TYPEERROR(epList.Env(), "Invalid argument: sessionOptions.executionProviders[", i, "] must be either a string or an object with property 'name'."); } else { - name = epValue.As<Napi::Object>().Get("name").As<Napi::String>().Utf8Value(); + auto obj = epValue.As<Napi::Object>(); + name = obj.Get("name").As<Napi::String>().Utf8Value(); + if (obj.Has("deviceId")) { + deviceId = obj.Get("deviceId").As<Napi::Number>(); + } + if (obj.Has("coreMlFlags")) { + coreMlFlags = obj.Get("coreMlFlags").As<Napi::Number>(); + } } // CPU execution provider if (name == "cpu") { // TODO: handling CPU EP options +#ifdef USE_CUDA } else if (name == "cuda") { - // TODO: handling Cuda EP options + OrtCUDAProviderOptionsV2 *options; + Ort::GetApi().CreateCUDAProviderOptions(&options); + options->device_id = deviceId; + sessionOptions.AppendExecutionProvider_CUDA_V2(*options); + Ort::GetApi().ReleaseCUDAProviderOptions(options); +#endif +#ifdef USE_TENSORRT + } else if (name == "tensorrt") { + OrtTensorRTProviderOptionsV2 *options; + Ort::GetApi().CreateTensorRTProviderOptions(&options); + options->device_id = deviceId; + sessionOptions.AppendExecutionProvider_TensorRT_V2(*options); + Ort::GetApi().ReleaseTensorRTProviderOptions(options); +#endif +#ifdef USE_DML + } else if (name == "dml") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_DML(sessionOptions, deviceId)); +#endif +#ifdef USE_COREML + } else if (name == "coreml") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreMlFlags)); +#endif } else { ORT_NAPI_THROW_ERROR(epList.Env(), "Invalid argument: sessionOptions.executionProviders[", i, "] is unsupported: '", name, "'."); diff --git a/js/node/src/tensor_helper.cc b/js/node/src/tensor_helper.cc index a082448aa8f01..1c0b141e6a44f 100644 --- a/js/node/src/tensor_helper.cc +++ b/js/node/src/tensor_helper.cc @@ -106,7 +106,7 @@ const std::unordered_map<std::string, ONNXTensorElementDataType> DATA_TYPE_NAME_ {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64}}; // currently only support tensor -Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value) { +Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo *memory_info) { ORT_NAPI_THROW_TYPEERROR_IF(!value.IsObject(), env, "Tensor must be an object."); // check 'dims' @@ -180,7
+180,6 @@ Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value) { "Tensor.data must be a typed array (", DATA_TYPE_TYPEDARRAY_MAP[elemType], ") for ", tensorTypeString, " tensors, but got typed array (", typedArrayType, ")."); - auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); char *buffer = reinterpret_cast(tensorDataTypedArray.ArrayBuffer().Data()); size_t bufferByteOffset = tensorDataTypedArray.ByteOffset(); // there is a bug in TypedArray::ElementSize(): https://github.com/nodejs/node-addon-api/pull/705 diff --git a/js/node/src/tensor_helper.h b/js/node/src/tensor_helper.h index 019e5e8231bd7..d5e8ef709f53e 100644 --- a/js/node/src/tensor_helper.h +++ b/js/node/src/tensor_helper.h @@ -9,7 +9,7 @@ #include "onnxruntime_cxx_api.h" // convert a Javascript OnnxValue object to an OrtValue object -Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value); +Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo *memory_info); // convert an OrtValue object to a Javascript OnnxValue object Napi::Value OrtValueToNapiValue(Napi::Env env, Ort::Value &value); diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index ea998963d956b..d0b0a4ab19641 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -183,6 +183,8 @@ stages: artifactNameNoVersionString: 'onnxruntime-linux-x64-tensorrt' buildJava: true buildJavaOption: '--build_java' + buildNodejs: true + buildNodejsOption: '--build_nodejs' #CUDA without tensorrt - template: templates/win-ci.yml @@ -795,7 +797,7 @@ stages: IsReleaseBuild: ${{ parameters.IsReleaseBuild }} ArtifactName: 'drop-nuget-dml' StageName: 'Windows_CI_GPU_DML_Dev' - BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" + BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --build_nodejs --cmake_generator "Visual Studio 17 2022" BuildArch: 'x64' msbuildArchitecture: 'amd64' EnvSetupScript: 'setup_env.bat' @@ -841,7 +843,7 @@ stages: IsReleaseBuild: ${{ parameters.IsReleaseBuild }} ArtifactName: 'drop-win-dml-arm64-zip' StageName: 'Windows_CI_GPU_DML_Dev_arm64' - BuildCommand: --build_dir $(Build.BinariesDirectory) --arm64 --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" + BuildCommand: --build_dir $(Build.BinariesDirectory) --arm64 --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --build_nodejs --cmake_generator "Visual Studio 17 2022" BuildArch: 'x64' EnvSetupScript: 'setup_env.bat' sln_platform: 'arm64' @@ -873,6 +875,7 @@ stages: DoCompliance: ${{ parameters.DoCompliance }} DoEsrp: ${{ parameters.DoEsrp }} RunTests: 'false' + BuildNodejs: 'false' NuPackScript: | msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /p:TargetArchitecture=arm /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML /p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} cd 
$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\ diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml index 8335b5e8c6fa1..b1e36e63e86ab 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml @@ -7,6 +7,7 @@ parameters: NuPackScript : '' ArtifactName: 'drop-nuget' DoNodejsPack: 'false' + BuildNodejs: 'true' DoEsrp: 'false' DoTestCoverage: 'false' BuildArch: 'x64' # Optional. Options: x86, x64 @@ -266,6 +267,24 @@ stages: displayName: 'Unzip package to test' workingDirectory: '$(Build.ArtifactStagingDirectory)' + - ${{ if eq(parameters.BuildNodejs, 'true') }}: + - task: CopyFiles@2 + displayName: 'Copy DirectML binaries to: $(Build.SourcesDirectory)\js\node\bin\napi-v3\win32\${{ parameters.sln_platform }}' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)' + Contents: 'DirectML.dll' + TargetFolder: '$(Build.SourcesDirectory)\js\node\bin\napi-v3\win32\${{ parameters.sln_platform }}' + - template: ../../templates/win-esrp-dll.yml + parameters: + FolderPath: '$(Build.SourcesDirectory)\js\node\bin\napi-v3\win32\${{ parameters.sln_platform }}' + DisplayName: 'ESRP - Sign Node.js binding binaries' + DoEsrp: ${{ parameters.DoEsrp }} + Pattern: '*.node' + - task: PublishPipelineArtifact@1 + inputs: + targetPath: '$(Build.SourcesDirectory)\js\node\bin\napi-v3\win32\${{ parameters.sln_platform }}' + artifactName: 'drop-onnxruntime-nodejs-win-${{ parameters.sln_platform }}-dml' + - ${{ if eq(parameters['DoCompliance'], 'true') }}: - template: ../../templates/compliance.yml parameters : diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 21cd3a44e8924..830325b05d086 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -532,9 +532,10 @@ stages: - stage: Nodejs_Packaging_CPU dependsOn: - Linux_C_API_Packaging_CPU + - Linux_C_API_Packaging_GPU_TensorRT_x64 - MacOS_C_API_Package_Publish - - Windows_Packaging_CPU_x64_${{ parameters.BuildVariant }} - - Windows_Packaging_CPU_arm64_${{ parameters.BuildVariant }} + - Windows_CI_GPU_DML_Dev + - Windows_CI_GPU_DML_Dev_arm64 condition: succeeded() jobs: - job: @@ -564,13 +565,13 @@ stages: - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet (Win x64)' inputs: - artifactName: 'onnxruntime-win-x64' + artifactName: 'drop-nuget-dml' targetPath: '$(Build.BinariesDirectory)/nuget-artifact' - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet (Win ARM64)' inputs: - artifactName: 'onnxruntime-win-arm64' + artifactName: 'drop-win-dml-arm64-zip' targetPath: '$(Build.BinariesDirectory)/nuget-artifact' - task: DownloadPipelineArtifact@0 @@ -594,14 +595,14 @@ stages: - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Nodejs (Win x64)' inputs: - artifactName: 'drop-onnxruntime-nodejs-win-x64' - targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/win32/x64/' + artifactName: 'drop-onnxruntime-nodejs-win-x64-dml' + targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/win32/' - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Nodejs (Win ARM64)' inputs: - artifactName: 'drop-onnxruntime-nodejs-win-arm64' - targetPath: 
'$(Build.BinariesDirectory)/nodejs-artifacts/win32/arm64/' + artifactName: 'drop-onnxruntime-nodejs-win-arm64-dml' + targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/win32/' - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Nodejs (macOS x86_64)' @@ -618,7 +619,7 @@ stages: - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Nodejs (Linux x64)' inputs: - artifactName: 'drop-onnxruntime-nodejs-linux-x64' + artifactName: 'drop-onnxruntime-nodejs-linux-x64-tensorrt' targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/linux/x64/' - task: DownloadPipelineArtifact@0 diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml index 5b9ffac6fabb0..a0fe44e7b96ff 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml @@ -15,6 +15,14 @@ parameters: type: string default: '' +- name: buildNodejs + type: boolean + default: true + +- name: buildNodejsOption + type: string + default: '' + stages: - stage: Linux_C_API_Packaging_GPU_TensorRT_x64 dependsOn: [] @@ -46,7 +54,7 @@ stages: docker run --gpus all -e CC=/opt/rh/devtoolset-11/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-11/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build \ --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda118xtrt86build \ /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release \ - --skip_submodule_sync --parallel --build_shared_lib ${{ parameters.buildJavaOption }} --use_tensorrt --cuda_version=$(CUDA_VERSION) --cuda_home=/usr/local/cuda-$(CUDA_VERSION) --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80' + --skip_submodule_sync --parallel --build_shared_lib ${{ parameters.buildJavaOption }} ${{ parameters.buildNodejsOption }} --use_tensorrt --cuda_version=$(CUDA_VERSION) --cuda_home=/usr/local/cuda-$(CUDA_VERSION) --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80' workingDirectory: $(Build.SourcesDirectory) - ${{ if eq(parameters.buildJava, true) }}: @@ -59,6 +67,13 @@ stages: libraryName: 'libonnxruntime.so' nativeLibraryName: 'libonnxruntime4j_jni.so' + - ${{ if eq(parameters.buildNodejs, 'true') }}: + - template: nodejs-artifacts-package-and-publish-steps-posix.yml + parameters: + arch: '${{parameters.OnnxruntimeNodejsBindingArch}}' + os: 'linux' + artifactName: 'drop-onnxruntime-nodejs-linux-x64-tensorrt' + - template: c-api-artifacts-package-and-publish-steps-posix.yml parameters: buildConfig: 'Release'
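As a follow-up usage sketch (not part of the patch): the `listSupportedBackends()` function exported by the updated binding reports which execution providers the installed binary was compiled with, where `bundled` presumably means the provider's native library ships inside the package (e.g. DirectML.dll) rather than being installed separately (CUDA, TensorRT). A caller could use it to choose a provider at runtime:

```ts
// Discover the providers supported by this onnxruntime-node binary before creating a session.
// Assumes a build that includes this patch; the preference order below is illustrative.
import { listSupportedBackends } from 'onnxruntime-node';

const backends = listSupportedBackends();
console.log(backends); // e.g. [ { name: 'cpu', bundled: true }, { name: 'dml', bundled: true } ]

const available = new Set(backends.map((b) => b.name));
const preferred = ['tensorrt', 'cuda', 'dml'].find((name) => available.has(name)) ?? 'cpu';
console.log(`Requesting execution provider: ${preferred}`);
```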