diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 5ab1605dd3a99..cb54bd02d5500 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -107,6 +107,7 @@ option(onnxruntime_ENABLE_MICROSOFT_INTERNAL "Use this option to enable/disable option(onnxruntime_USE_VITISAI "Build with Vitis-AI" OFF) option(onnxruntime_USE_TENSORRT "Build with TensorRT support" OFF) option(onnxruntime_USE_TENSORRT_BUILTIN_PARSER "Use TensorRT builtin parser" OFF) +option(onnxruntime_USE_NV "Build with TensorRT support" OFF) option(onnxruntime_ENABLE_LTO "Enable link time optimization" OFF) option(onnxruntime_CROSS_COMPILING "Cross compiling onnx runtime" OFF) option(onnxruntime_GCOV_COVERAGE "Compile with options necessary to run code coverage" OFF) @@ -250,6 +251,7 @@ option(onnxruntime_USE_LOCK_FREE_QUEUE "Build with lock-free task queue for thre option(onnxruntime_FORCE_GENERIC_ALGORITHMS "Disable optimized arch-specific algorithms. Use only for testing and debugging generic algorithms." OFF) option(onnxruntime_USE_TENSORRT_INTERFACE "Build ONNXRuntime shared lib which is compatible with TensorRT EP interface" OFF) +option(onnxruntime_USE_NV_INTERFACE "Build ONNXRuntime shared lib which is compatible with NV EP interface" OFF) option(onnxruntime_USE_CUDA_INTERFACE "Build ONNXRuntime shared lib which is compatible with Cuda EP interface" OFF) option(onnxruntime_USE_OPENVINO_INTERFACE "Build ONNXRuntime shared lib which is compatible with OpenVINO EP interface" OFF) option(onnxruntime_USE_VITISAI_INTERFACE "Build ONNXRuntime shared lib which is compatible with Vitis-AI EP interface" OFF) @@ -946,6 +948,15 @@ if (onnxruntime_USE_TENSORRT_INTERFACE AND (NOT onnxruntime_USE_TENSORRT)) list(APPEND ORT_INTERFACE_FLAGS -DUSE_TENSORRT=1) endif() +if (onnxruntime_USE_NV) + list(APPEND ORT_PROVIDER_FLAGS -DUSE_NV=1) + list(APPEND ONNXRUNTIME_PROVIDER_NAMES nv_tensorrt_rtx) +endif() + +if (onnxruntime_USE_NV_INTERFACE AND (NOT onnxruntime_USE_NV)) + list(APPEND ORT_INTERFACE_FLAGS -DUSE_NV=1) +endif() + if (onnxruntime_USE_RKNPU) list(APPEND ORT_PROVIDER_FLAGS -DUSE_RKNPU=1) list(APPEND ONNXRUNTIME_PROVIDER_NAMES rknpu) diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake index d248f3652e064..e96bb32a7cd21 100644 --- a/cmake/onnxruntime_framework.cmake +++ b/cmake/onnxruntime_framework.cmake @@ -63,7 +63,7 @@ endif() if(onnxruntime_ENABLE_INSTRUMENT) target_compile_definitions(onnxruntime_framework PRIVATE ONNXRUNTIME_ENABLE_INSTRUMENT) endif() -if(onnxruntime_USE_TENSORRT OR onnxruntime_USE_NCCL) +if(onnxruntime_USE_TENSORRT OR onnxruntime_USE_NCCL OR onnxruntime_USE_NV) # TODO: for now, core framework depends on CUDA. 
It should be moved to TensorRT EP # TODO: provider_bridge_ort.cc should not include nccl.h target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 67fa48b28278d..d1984156187f6 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -132,6 +132,10 @@ if (onnxruntime_USE_TENSORRT) include(onnxruntime_providers_tensorrt.cmake) endif() +if (onnxruntime_USE_NV) + include(onnxruntime_providers_nv.cmake) +endif() + if (onnxruntime_USE_VITISAI) include(onnxruntime_providers_vitisai.cmake) endif() diff --git a/cmake/onnxruntime_providers_nv.cmake b/cmake/onnxruntime_providers_nv.cmake new file mode 100644 index 0000000000000..06d44b5289518 --- /dev/null +++ b/cmake/onnxruntime_providers_nv.cmake @@ -0,0 +1,202 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + find_package(CUDAToolkit REQUIRED 12.8) + enable_language(CUDA) + if(onnxruntime_DISABLE_CONTRIB_OPS) + message( FATAL_ERROR "To compile TensorRT execution provider contrib ops have to be enabled to dump an engine using com.microsoft:EPContext node." ) + endif() + add_definitions(-DUSE_NV=1) + if (onnxruntime_NV_PLACEHOLDER_BUILDER) + add_definitions(-DORT_NV_PLACEHOLDER_BUILDER) + endif() + set(BUILD_LIBRARY_ONLY 1) + add_definitions("-DONNX_ML=1") + add_definitions("-DONNX_NAMESPACE=onnx") + set(CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) + set(TENSORRT_ROOT ${onnxruntime_TENSORRT_HOME}) + set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(PROTOBUF_LIBRARY ${PROTOBUF_LIB}) + if (WIN32) + set(OLD_CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4099 /wd4551 /wd4505 /wd4515 /wd4706 /wd4456 /wd4324 /wd4701 /wd4804 /wd4702 /wd4458 /wd4703") + if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4805") + endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -include algorithm") + set(DISABLED_WARNINGS_FOR_TRT /wd4456) + endif() + if ( CMAKE_COMPILER_IS_GNUCC ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wno-missing-field-initializers") + endif() + set(CXX_VERSION_DEFINED TRUE) + + find_path(TENSORRT_INCLUDE_DIR NvInfer.h + HINTS ${TENSORRT_ROOT} + PATH_SUFFIXES include) + + + file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h NVINFER_VER_CONTENT) + string(REGEX MATCH "define NV_TENSORRT_MAJOR * +([0-9]+)" NV_TENSORRT_MAJOR "${NVINFER_VER_CONTENT}") + string(REGEX REPLACE "define NV_TENSORRT_MAJOR * +([0-9]+)" "\\1" NV_TENSORRT_MAJOR "${NV_TENSORRT_MAJOR}") + string(REGEX MATCH "define NV_TENSORRT_MINOR * +([0-9]+)" NV_TENSORRT_MINOR "${NVINFER_VER_CONTENT}") + string(REGEX REPLACE "define NV_TENSORRT_MINOR * +([0-9]+)" "\\1" NV_TENSORRT_MINOR "${NV_TENSORRT_MINOR}") + string(REGEX MATCH "define NV_TENSORRT_PATCH * +([0-9]+)" NV_TENSORRT_PATCH "${NVINFER_VER_CONTENT}") + string(REGEX REPLACE "define NV_TENSORRT_PATCH * +([0-9]+)" "\\1" NV_TENSORRT_PATCH "${NV_TENSORRT_PATCH}") + math(EXPR NV_TENSORRT_MAJOR_INT "${NV_TENSORRT_MAJOR}") + math(EXPR NV_TENSORRT_MINOR_INT "${NV_TENSORRT_MINOR}") + math(EXPR NV_TENSORRT_PATCH_INT "${NV_TENSORRT_PATCH}") + + if (NV_TENSORRT_MAJOR) + MESSAGE(STATUS "NV_TENSORRT_MAJOR is ${NV_TENSORRT_MAJOR}") + else() + MESSAGE(STATUS "Can't find NV_TENSORRT_MAJOR macro") + endif() + + # Check TRT version >= 10.0.1.6 + if ((NV_TENSORRT_MAJOR_INT GREATER 10) OR + 
(NV_TENSORRT_MAJOR_INT EQUAL 10 AND NV_TENSORRT_MINOR_INT GREATER 0) OR + (NV_TENSORRT_MAJOR_INT EQUAL 10 AND NV_TENSORRT_PATCH_INT GREATER 0)) + set(TRT_GREATER_OR_EQUAL_TRT_10_GA ON) + else() + message( FATAL_ERROR "Only TensorRT 10.x or higher is supported." ) + endif() + + # TensorRT 10 GA onwards, the TensorRT libraries will have major version appended to the end on Windows, + # for example, nvinfer_10.dll, nvonnxparser_10.dll ... + if (WIN32 AND TRT_GREATER_OR_EQUAL_TRT_10_GA) + set(NVINFER_LIB "nvinfer_${NV_TENSORRT_MAJOR}") + set(PARSER_LIB "nvonnxparser_${NV_TENSORRT_MAJOR}") + endif() + + if (NOT NVINFER_LIB) + set(NVINFER_LIB "nvinfer") + endif() + + if (NOT PARSER_LIB) + set(PARSER_LIB "nvonnxparser") + endif() + + MESSAGE(STATUS "Looking for ${NVINFER_LIB}") + + find_library(TENSORRT_LIBRARY_INFER ${NVINFER_LIB} + HINTS ${TENSORRT_ROOT} + PATH_SUFFIXES lib lib64 lib/x64) + + if (NOT TENSORRT_LIBRARY_INFER) + MESSAGE(STATUS "Can't find ${NVINFER_LIB}") + endif() + + if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER) + MESSAGE(STATUS "Looking for ${PARSER_LIB}") + + find_library(TENSORRT_LIBRARY_NVONNXPARSER ${PARSER_LIB} + HINTS ${TENSORRT_ROOT} + PATH_SUFFIXES lib lib64 lib/x64) + + if (NOT TENSORRT_LIBRARY_NVONNXPARSER) + MESSAGE(STATUS "Can't find ${PARSER_LIB}") + endif() + + set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_NVONNXPARSER}) + MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}") + else() + if (TRT_GREATER_OR_EQUAL_TRT_10_GA) + set(ONNX_USE_LITE_PROTO ON) + endif() + onnxruntime_fetchcontent_declare( + onnx_tensorrt + URL ${DEP_URL_onnx_tensorrt} + URL_HASH SHA1=${DEP_SHA1_onnx_tensorrt} + EXCLUDE_FROM_ALL + ) + if (NOT CUDA_INCLUDE_DIR) + set(CUDA_INCLUDE_DIR ${CUDAToolkit_INCLUDE_DIRS}) # onnx-tensorrt repo needs this variable to build + endif() + # The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses + # unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose. + onnxruntime_fetchcontent_makeavailable(onnx_tensorrt) + include_directories(${onnx_tensorrt_SOURCE_DIR}) + set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) + if ( CMAKE_COMPILER_IS_GNUCC ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") + endif() + if (WIN32) + set(CMAKE_CUDA_FLAGS ${OLD_CMAKE_CUDA_FLAGS}) + unset(PROTOBUF_LIBRARY) + unset(OLD_CMAKE_CXX_FLAGS) + unset(OLD_CMAKE_CUDA_FLAGS) + set_target_properties(${PARSER_LIB} PROPERTIES LINK_FLAGS "/ignore:4199") + target_compile_options(nvonnxparser_static PRIVATE /FIio.h /wd4100) + target_compile_options(${PARSER_LIB} PRIVATE /FIio.h /wd4100) + endif() + # Static libraries are just nvonnxparser_static on all platforms + set(onnxparser_link_libs nvonnxparser_static) + set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER}) + MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}") + endif() + + include_directories(${TENSORRT_INCLUDE_DIR}) + # ${TENSORRT_LIBRARY} is empty if we link nvonnxparser_static. + # nvonnxparser_static is linked against tensorrt libraries in onnx-tensorrt + # See https://github.com/onnx/onnx-tensorrt/blob/8af13d1b106f58df1e98945a5e7c851ddb5f0791/CMakeLists.txt#L121 + # However, starting from TRT 10 GA, nvonnxparser_static doesn't link against tensorrt libraries. 
+ # Therefore, the above code finds ${TENSORRT_LIBRARY_INFER} + set(trt_link_libs ${CMAKE_DL_LIBS} ${TENSORRT_LIBRARY}) + file(GLOB_RECURSE onnxruntime_providers_nv_tensorrt_rtx_cc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/*.cc" + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_stream_handle.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_stream_handle.cc" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_graph.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_graph.cc" + ) + + source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_nv_tensorrt_rtx_cc_srcs}) + onnxruntime_add_shared_library_module(onnxruntime_providers_nv_tensorrt_rtx ${onnxruntime_providers_nv_tensorrt_rtx_cc_srcs}) + onnxruntime_add_include_to_target(onnxruntime_providers_nv_tensorrt_rtx onnxruntime_common) + target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE Eigen3::Eigen onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface Eigen3::Eigen) + add_dependencies(onnxruntime_providers_nv_tensorrt_rtx onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) + if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER) + target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart) + else() + target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart) + endif() + target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} + PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) + + # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found + set_target_properties(onnxruntime_providers_nv_tensorrt_rtx PROPERTIES LINKER_LANGUAGE CUDA) + set_target_properties(onnxruntime_providers_nv_tensorrt_rtx PROPERTIES FOLDER "ONNXRuntime") + target_compile_definitions(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ONNXIFI_BUILD_LIBRARY=1) + target_compile_options(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${DISABLED_WARNINGS_FOR_TRT}) + if (WIN32) + target_compile_options(onnxruntime_providers_nv_tensorrt_rtx INTERFACE /wd4456) + endif() + # set CUDA_MINIMAL as default for NV provider since we do not have fallback to CUDA + target_compile_definitions(onnxruntime_providers_nv_tensorrt_rtx PRIVATE USE_CUDA_MINIMAL=1) + + # Needed for the provider interface, as it includes training headers when training is enabled + if (onnxruntime_ENABLE_TRAINING_OPS) + target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ORTTRAINING_ROOT}) + if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) + onnxruntime_add_include_to_target(onnxruntime_providers_nv_tensorrt_rtx Python::Module) + endif() + endif() + + if(APPLE) + set_property(TARGET onnxruntime_providers_nv_tensorrt_rtx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/exported_symbols.lst") + elseif(UNIX) + set_property(TARGET onnxruntime_providers_nv_tensorrt_rtx APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") + set_property(TARGET 
onnxruntime_providers_nv_tensorrt_rtx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/version_script.lds -Xlinker --gc-sections") + elseif(WIN32) + set_property(TARGET onnxruntime_providers_nv_tensorrt_rtx APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/symbols.def") + else() + message(FATAL_ERROR "onnxruntime_providers_nv_tensorrt_rtx unknown platform, need to specify shared library exports for it") + endif() + + install(TARGETS onnxruntime_providers_nv_tensorrt_rtx + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 2743c1e522c9f..c57a2a962303d 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -928,6 +928,16 @@ if (onnxruntime_USE_TENSORRT) ) endif() +if (onnxruntime_USE_NV) + add_custom_command( + TARGET onnxruntime_pybind11_state POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + $ + $/onnxruntime/capi/ + ) +endif() + if (onnxruntime_USE_MIGRAPHX) add_custom_command( TARGET onnxruntime_pybind11_state POST_BUILD diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 4ffd49cdec01e..c0e31990552ea 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -61,13 +61,13 @@ function(AddTest) Threads::Threads) target_compile_definitions(${_UT_TARGET} PRIVATE -DUSE_ONNXRUNTIME_DLL) else() - if(onnxruntime_USE_CUDA) + if(onnxruntime_USE_CUDA OR onnxruntime_USE_NV) #XXX: we should not need to do this. onnxruntime_test_all.exe should not have direct dependency on CUDA DLLs, # otherwise it will impact when CUDA DLLs can be unloaded. 
target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart) - if(NOT onnxruntime_CUDA_MINIMAL) - target_link_libraries(${_UT_TARGET} PRIVATE cudnn_frontend) - endif() + endif() + if(onnxruntime_USE_CUDA AND NOT onnxruntime_CUDA_MINIMAL) + target_link_libraries(${_UT_TARGET} PRIVATE cudnn_frontend) endif() target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES}) endif() @@ -75,7 +75,7 @@ function(AddTest) onnxruntime_add_include_to_target(${_UT_TARGET} date::date flatbuffers::flatbuffers) target_include_directories(${_UT_TARGET} PRIVATE ${TEST_INC_DIR}) if (onnxruntime_USE_CUDA) - target_include_directories(${_UT_TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUDNN_INCLUDE_DIR}) + target_include_directories(${_UT_TARGET} PRIVATE ${CUDAToolkit_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR}) if (onnxruntime_USE_NCCL) target_include_directories(${_UT_TARGET} PRIVATE ${NCCL_INCLUDE_DIRS}) endif() @@ -87,6 +87,10 @@ function(AddTest) # used for instantiating placeholder TRT builder to mitigate TRT library load/unload overhead target_include_directories(${_UT_TARGET} PRIVATE ${TENSORRT_INCLUDE_DIR}) endif() + if (onnxruntime_USE_NV) + # used for instantiating placeholder TRT builder to mitigate TRT library load/unload overhead + target_include_directories(${_UT_TARGET} PRIVATE ${NV_INCLUDE_DIR} ${CUDAToolkit_INCLUDE_DIRS}) + endif() if(MSVC) target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options /utf-8>" @@ -679,6 +683,15 @@ if(onnxruntime_USE_TENSORRT) list(APPEND onnxruntime_test_providers_libs ${TENSORRT_LIBRARY_INFER}) endif() +if(onnxruntime_USE_NV) + list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/nv_tensorrt_rtx/*) + list(APPEND onnxruntime_test_framework_src_patterns "${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h") + list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_nv_tensorrt_rtx) + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_nv_tensorrt_rtx onnxruntime_providers_shared) + list(APPEND onnxruntime_test_providers_libs ${TENSORRT_LIBRARY_INFER}) +endif() + + if(onnxruntime_USE_MIGRAPHX) list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/migraphx/*) list(APPEND onnxruntime_test_framework_src_patterns "${ONNXRUNTIME_ROOT}/core/providers/migraphx/migraphx_execution_provider_utils.h") @@ -896,7 +909,7 @@ if (USE_ROCM) endif() set(test_all_args) -if (onnxruntime_USE_TENSORRT) +if (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV) # TRT EP CI takes much longer time when updating to TRT 8.2 # So, we only run trt ep and exclude other eps to reduce CI test time. 
# @@ -943,7 +956,7 @@ if (HAS_SHORTEN_64_TO_32 AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) target_compile_options(onnxruntime_test_all PRIVATE -Wno-error=shorten-64-to-32) endif() -if (UNIX AND onnxruntime_USE_TENSORRT) +if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV)) # The test_main.cc includes NvInfer.h where it has many deprecated declarations # simply ignore them for TensorRT EP build set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") @@ -1342,7 +1355,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) list(APPEND onnxruntime_shared_lib_test_LIBS cpuinfo) endif() if (onnxruntime_USE_CUDA) - list(APPEND onnxruntime_shared_lib_test_LIBS CUDA::cudart) + list(APPEND onnxruntime_shared_lib_test_LIBS) endif() if (onnxruntime_USE_ROCM) list(APPEND onnxruntime_shared_lib_test_LIBS hip::host) @@ -1350,6 +1363,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) if (onnxruntime_USE_TENSORRT) list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER}) endif() + if (onnxruntime_USE_NV) + list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER} CUDA::cudart) + endif() if (onnxruntime_USE_DML) list(APPEND onnxruntime_shared_lib_test_LIBS d3d12.lib) endif() @@ -1371,10 +1387,12 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) target_include_directories(onnxruntime_shared_lib_test PRIVATE ${ONNXRUNTIME_ROOT}) if (onnxruntime_USE_CUDA) - target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) target_sources(onnxruntime_shared_lib_test PRIVATE ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu) endif() - + if (onnxruntime_USE_NV) + target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) + endif() if (onnxruntime_USE_ROCM) target_include_directories(onnxruntime_shared_lib_test PRIVATE ${onnxruntime_ROCM_HOME}/include) target_compile_definitions(onnxruntime_shared_lib_test PRIVATE __HIP_PLATFORM_AMD__) @@ -1396,7 +1414,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) $/testdata) endif() - if (UNIX AND onnxruntime_USE_TENSORRT) + if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV)) # The test_main.cc includes NvInfer.h where it has many deprecated declarations # simply ignore them for TensorRT EP build set_property(TARGET onnxruntime_shared_lib_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") @@ -1578,7 +1596,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") list(APPEND custom_op_src_patterns "${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu" "${TEST_SRC_DIR}/testdata/custom_op_library/cuda/cuda_ops.*") - list(APPEND custom_op_lib_include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUDNN_INCLUDE_DIR}) + list(APPEND custom_op_lib_include ${CUDAToolkit_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR}) if (HAS_QSPECTRE) list(APPEND custom_op_lib_option "$<$:SHELL:--compiler-options /Qspectre>") endif() @@ -1685,6 +1703,9 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") if (onnxruntime_USE_TENSORRT) list(APPEND onnxruntime_customopregistration_test_LIBS ${TENSORRT_LIBRARY_INFER}) endif() + if (onnxruntime_USE_NV) + list(APPEND onnxruntime_customopregistration_test_LIBS ${TENSORRT_LIBRARY_INFER}) + endif() if (CMAKE_SYSTEM_NAME MATCHES "AIX") list(APPEND onnxruntime_customopregistration_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_lora onnxruntime_framework 
onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 ${PROTOBUF_LIB} onnx onnx_proto) endif() @@ -1703,7 +1724,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") $/testdata) endif() - if (UNIX AND onnxruntime_USE_TENSORRT) + if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV)) # The test_main.cc includes NvInfer.h where it has many deprecated declarations # simply ignore them for TensorRT EP build set_property(TARGET onnxruntime_customopregistration_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") diff --git a/include/onnxruntime/core/graph/constants.h b/include/onnxruntime/core/graph/constants.h index f072badd199ba..d3f1182909b5c 100644 --- a/include/onnxruntime/core/graph/constants.h +++ b/include/onnxruntime/core/graph/constants.h @@ -36,6 +36,7 @@ constexpr const char* kDnnlExecutionProvider = "DnnlExecutionProvider"; constexpr const char* kOpenVINOExecutionProvider = "OpenVINOExecutionProvider"; constexpr const char* kVitisAIExecutionProvider = "VitisAIExecutionProvider"; constexpr const char* kTensorrtExecutionProvider = "TensorrtExecutionProvider"; +constexpr const char* kNvTensorRTRTXExecutionProvider = "NvTensorRTRTXExecutionProvider"; constexpr const char* kNnapiExecutionProvider = "NnapiExecutionProvider"; constexpr const char* kQnnExecutionProvider = "QNNExecutionProvider"; constexpr const char* kRknpuExecutionProvider = "RknpuExecutionProvider"; diff --git a/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h b/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h new file mode 100644 index 0000000000000..0b1cbe6afac79 --- /dev/null +++ b/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h @@ -0,0 +1,38 @@ +#pragma once + +/** + + * @namespace onnxruntime::nv::provider_option_names + * @details The `provider_option_names` namespace contains the following constants: + * - `kDeviceId`: Specifies the GPU device ID to use. + * - `kHasUserComputeStream`: Indicates whether a user-provided compute stream is used. + * - `kUserComputeStream`: Specifies the user-provided compute stream. + * - `kMaxWorkspaceSize`: Sets the maximum workspace size for GPU memory allocation. + * - `kDumpSubgraphs`: Enables or disables dumping of subgraphs for debugging. + * - `kDetailedBuildLog`: Enables or disables detailed build logs for debugging. + * - `kProfilesMinShapes`: Specifies the minimum shapes for profiling. + * - `kProfilesMaxShapes`: Specifies the maximum shapes for profiling. + * - `kProfilesOptShapes`: Specifies the optimal shapes for profiling. + * - `kCudaGraphEnable`: Enables or disables CUDA graph optimizations. + * - `kONNXBytestream`: Specifies the ONNX model as a bytestream. + * - `kONNXBytestreamSize`: Specifies the size of the ONNX bytestream. 
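+ *
+ * Illustrative usage (not part of this header; the option values are hypothetical):
+ *   std::unordered_map<std::string, std::string> nv_options{
+ *       {onnxruntime::nv::provider_option_names::kDeviceId, "0"},
+ *       {onnxruntime::nv::provider_option_names::kCudaGraphEnable, "1"}};
+ * The keys come from the constants declared below; values are always passed as strings.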
+ */ +namespace onnxruntime { +namespace nv { +namespace provider_option_names { +constexpr const char* kDeviceId = "device_id"; +constexpr const char* kHasUserComputeStream = "has_user_compute_stream"; +constexpr const char* kUserComputeStream = "user_compute_stream"; +constexpr const char* kMaxWorkspaceSize = "nv_max_workspace_size"; +constexpr const char* kDumpSubgraphs = "nv_dump_subgraphs"; +constexpr const char* kDetailedBuildLog = "nv_detailed_build_log"; +constexpr const char* kProfilesMinShapes = "nv_profile_min_shapes"; +constexpr const char* kProfilesMaxShapes = "nv_profile_max_shapes"; +constexpr const char* kProfilesOptShapes = "nv_profile_opt_shapes"; +constexpr const char* kCudaGraphEnable = "nv_cuda_graph_enable"; +constexpr const char* kONNXBytestream = "nv_onnx_bytestream"; +constexpr const char* kONNXBytestreamSize = "nv_onnx_bytestream_size"; + +} // namespace provider_option_names +} // namespace nv +} // namespace onnxruntime diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index dbf4468db54b6..4866be40b4aba 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -299,6 +299,7 @@ ORT_RUNTIME_CLASS(ThreadingOptions); ORT_RUNTIME_CLASS(ArenaCfg); ORT_RUNTIME_CLASS(PrepackedWeightsContainer); ORT_RUNTIME_CLASS(TensorRTProviderOptionsV2); +ORT_RUNTIME_CLASS(NvTensorRtRtxProviderOptions); ORT_RUNTIME_CLASS(CUDAProviderOptionsV2); ORT_RUNTIME_CLASS(CANNProviderOptions); ORT_RUNTIME_CLASS(DnnlProviderOptions); diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index a2937b6e82a27..a9deb2dd3e341 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -559,6 +559,7 @@ ORT_DEFINE_RELEASE(ValueInfo); ORT_DEFINE_RELEASE(Node); ORT_DEFINE_RELEASE(Graph); ORT_DEFINE_RELEASE(Model); +ORT_DEFINE_RELEASE(KeyValuePairs) ORT_DEFINE_RELEASE_FROM_API_STRUCT(ModelCompilationOptions, GetCompileApi); #undef ORT_DEFINE_RELEASE @@ -675,6 +676,7 @@ struct AllocatedFree { struct AllocatorWithDefaultOptions; struct Env; +struct EpDevice; struct Graph; struct Model; struct Node; @@ -737,6 +739,77 @@ struct ThreadingOptions : detail::Base { ThreadingOptions& SetGlobalCustomJoinThreadFn(OrtCustomJoinThreadFn ort_custom_join_thread_fn); }; +namespace detail { +template +struct KeyValuePairsImpl : Ort::detail::Base { + using B = Ort::detail::Base; + using B::B; + + const char* GetValue(const char* key) const; + + // get the pairs in unordered_map. needs to copy to std::string so the hash works as expected + std::unordered_map GetKeyValuePairs() const; + // get the pairs in two vectors. entries will be 1:1 between keys and values. 
avoids copying to std::string + void GetKeyValuePairs(std::vector& keys, std::vector& values) const; +}; +} // namespace detail + +// Const object holder that does not own the underlying object +using ConstKeyValuePairs = detail::KeyValuePairsImpl>; + +/** \brief Wrapper around ::OrtKeyValuePair */ +struct KeyValuePairs : detail::KeyValuePairsImpl { + explicit KeyValuePairs(std::nullptr_t) {} ///< No instance is created + /// Take ownership of a pointer created by C API + explicit KeyValuePairs(OrtKeyValuePairs* p) : KeyValuePairsImpl{p} {} + + explicit KeyValuePairs(); + explicit KeyValuePairs(const std::unordered_map& kv_pairs); + + void Add(const char* key, const char* value); + void Remove(const char* key); + + ConstKeyValuePairs GetConst() const { return ConstKeyValuePairs{this->p_}; } +}; + +namespace detail { +template +struct HardwareDeviceImpl : Ort::detail::Base { + using B = Ort::detail::Base; + using B::B; + + OrtHardwareDeviceType Type() const; + uint32_t VendorId() const; + uint32_t DeviceId() const; + const char* Vendor() const; + ConstKeyValuePairs Metadata() const; +}; +} // namespace detail + +/** \brief Wrapper around ::OrtHardwareDevice + * \remarks HardwareDevice is always read-only for API users. + */ +using ConstHardwareDevice = detail::HardwareDeviceImpl>; + +namespace detail { +template +struct EpDeviceImpl : Ort::detail::Base { + using B = Ort::detail::Base; + using B::B; + + const char* EpName() const; + const char* EpVendor() const; + ConstKeyValuePairs EpMetadata() const; + ConstKeyValuePairs EpOptions() const; + ConstHardwareDevice Device() const; +}; +} // namespace detail + +/** \brief Wrapper around ::OrtEpDevice + * \remarks EpDevice is always read-only for API users. + */ +using ConstEpDevice = detail::EpDeviceImpl>; + /** \brief The Env (Environment) * * The Env holds the logging state used by all other objects. 
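The wrappers above (KeyValuePairs, ConstHardwareDevice, ConstEpDevice) are meant to be used together with the Env and SessionOptions additions in the hunks that follow (RegisterExecutionProviderLibrary, GetEpDevices, AppendExecutionProvider_V2). Below is a minimal sketch of one possible flow; it is not part of the patch, the registration name, library path, model path and option value are hypothetical, and it assumes the NV TensorRT RTX provider module can be loaded through RegisterExecutionProviderLibrary.

    // Sketch only: assumes the registration name and library path below are valid on the target system.
    #include <string>
    #include <vector>
    #include "onnxruntime_cxx_api.h"

    int main() {
      Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "nv_ep_demo"};

      // Hypothetical registration name and library path.
      env.RegisterExecutionProviderLibrary("nv_tensorrt_rtx",
                                           ORT_TSTR("onnxruntime_providers_nv_tensorrt_rtx"));

      // Enumerate devices reported by registered EPs and keep the NV TensorRT RTX ones
      // (the EP name string matches kNvTensorRTRTXExecutionProvider from constants.h).
      std::vector<Ort::ConstEpDevice> nv_devices;
      for (const auto& device : env.GetEpDevices()) {
        if (std::string(device.EpName()) == "NvTensorRTRTXExecutionProvider") {
          nv_devices.push_back(device);
        }
      }

      // Option keys come from nv_provider_options.h; the value here is an example.
      Ort::KeyValuePairs ep_options;
      ep_options.Add("nv_cuda_graph_enable", "1");

      Ort::SessionOptions session_options;
      if (!nv_devices.empty()) {
        session_options.AppendExecutionProvider_V2(env, nv_devices, ep_options);
      }

      Ort::Session session{env, ORT_TSTR("model.onnx"), session_options};
      return 0;
    }

The same options could instead be supplied as a std::unordered_map<std::string, std::string> via the second AppendExecutionProvider_V2 overload declared below.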
@@ -768,7 +841,14 @@ struct Env : detail::Base { Env& CreateAndRegisterAllocator(const OrtMemoryInfo* mem_info, const OrtArenaCfg* arena_cfg); ///< Wraps OrtApi::CreateAndRegisterAllocator - Env& CreateAndRegisterAllocatorV2(const std::string& provider_type, const OrtMemoryInfo* mem_info, const std::unordered_map& options, const OrtArenaCfg* arena_cfg); ///< Wraps OrtApi::CreateAndRegisterAllocatorV2 + Env& CreateAndRegisterAllocatorV2(const std::string& provider_type, const OrtMemoryInfo* mem_info, + const std::unordered_map& options, + const OrtArenaCfg* arena_cfg); ///< Wraps OrtApi::CreateAndRegisterAllocatorV2 + + Env& RegisterExecutionProviderLibrary(const char* registration_name, const std::basic_string& path); ///< Wraps OrtApi::RegisterExecutionProviderLibrary + Env& UnregisterExecutionProviderLibrary(const char* registration_name); ///< Wraps OrtApi::UnregisterExecutionProviderLibrary + + std::vector GetEpDevices() const; }; /** \brief Custom Op Domain @@ -919,7 +999,7 @@ struct ConstSessionOptionsImpl : Base { std::string GetConfigEntry(const char* config_key) const; ///< Wraps OrtApi::GetSessionConfigEntry bool HasConfigEntry(const char* config_key) const; ///< Wraps OrtApi::HasSessionConfigEntry - std::string GetConfigEntryOrDefault(const char* config_key, const std::string& def); + std::string GetConfigEntryOrDefault(const char* config_key, const std::string& def) const; }; template @@ -981,6 +1061,11 @@ struct SessionOptionsImpl : ConstSessionOptionsImpl { SessionOptionsImpl& AppendExecutionProvider(const std::string& provider_name, const std::unordered_map& provider_options = {}); + SessionOptionsImpl& AppendExecutionProvider_V2(Env& env, const std::vector& ep_devices, + const KeyValuePairs& ep_options); + SessionOptionsImpl& AppendExecutionProvider_V2(Env& env, const std::vector& ep_devices, + const std::unordered_map& ep_options); + SessionOptionsImpl& SetCustomCreateThreadFn(OrtCustomCreateThreadFn ort_custom_create_thread_fn); ///< Wraps OrtApi::SessionOptionsSetCustomCreateThreadFn SessionOptionsImpl& SetCustomThreadCreationOptions(void* ort_custom_thread_creation_options); ///< Wraps OrtApi::SessionOptionsSetCustomThreadCreationOptions SessionOptionsImpl& SetCustomJoinThreadFn(OrtCustomJoinThreadFn ort_custom_join_thread_fn); ///< Wraps OrtApi::SessionOptionsSetCustomJoinThreadFn diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index e41ef005349ac..57b4f1b3ead66 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -479,6 +479,120 @@ inline ThreadingOptions& ThreadingOptions::SetGlobalCustomJoinThreadFn(OrtCustom return *this; } +namespace detail { +template +inline const char* KeyValuePairsImpl::GetValue(const char* key) const { + return GetApi().GetKeyValue(this->p_, key); +} + +template +inline std::unordered_map KeyValuePairsImpl::GetKeyValuePairs() const { + std::unordered_map out; + + size_t num_pairs = 0; + const char* const* keys = nullptr; + const char* const* values = nullptr; + GetApi().GetKeyValuePairs(this->p_, &keys, &values, &num_pairs); + if (num_pairs > 0) { + out.reserve(num_pairs); + for (size_t i = 0; i < num_pairs; ++i) { + out.emplace(keys[i], values[i]); + } + } + + return out; +} + +template +inline void KeyValuePairsImpl::GetKeyValuePairs(std::vector& keys, + std::vector& values) const { + keys.clear(); + values.clear(); + + size_t num_pairs = 0; + const char* const* keys_ptr = 
nullptr; + const char* const* values_ptr = nullptr; + GetApi().GetKeyValuePairs(this->p_, &keys_ptr, &values_ptr, &num_pairs); + if (num_pairs > 0) { + keys.resize(num_pairs); + values.resize(num_pairs); + std::copy(keys_ptr, keys_ptr + num_pairs, keys.begin()); + std::copy(values_ptr, values_ptr + num_pairs, values.begin()); + } +} +} // namespace detail + +inline KeyValuePairs::KeyValuePairs() { + GetApi().CreateKeyValuePairs(&p_); +} + +inline KeyValuePairs::KeyValuePairs(const std::unordered_map& kv_pairs) { + GetApi().CreateKeyValuePairs(&p_); + for (const auto& kv : kv_pairs) { + GetApi().AddKeyValuePair(this->p_, kv.first.c_str(), kv.second.c_str()); + } +} + +inline void KeyValuePairs::Add(const char* key, const char* value) { + GetApi().AddKeyValuePair(this->p_, key, value); +} + +inline void KeyValuePairs::Remove(const char* key) { + GetApi().RemoveKeyValuePair(this->p_, key); +} + +namespace detail { +template +inline OrtHardwareDeviceType HardwareDeviceImpl::Type() const { + return GetApi().HardwareDevice_Type(this->p_); +} + +template +inline uint32_t HardwareDeviceImpl::VendorId() const { + return GetApi().HardwareDevice_VendorId(this->p_); +} + +template +inline uint32_t HardwareDeviceImpl::DeviceId() const { + return GetApi().HardwareDevice_DeviceId(this->p_); +} + +template +inline const char* HardwareDeviceImpl::Vendor() const { + return GetApi().HardwareDevice_Vendor(this->p_); +} + +template +inline ConstKeyValuePairs HardwareDeviceImpl::Metadata() const { + return ConstKeyValuePairs{GetApi().HardwareDevice_Metadata(this->p_)}; +} + +template +inline const char* EpDeviceImpl::EpName() const { + return GetApi().EpDevice_EpName(this->p_); +} + +template +inline const char* EpDeviceImpl::EpVendor() const { + return GetApi().EpDevice_EpVendor(this->p_); +} + +template +inline ConstKeyValuePairs EpDeviceImpl::EpMetadata() const { + return ConstKeyValuePairs(GetApi().EpDevice_EpMetadata(this->p_)); +} + +template +inline ConstKeyValuePairs EpDeviceImpl::EpOptions() const { + return ConstKeyValuePairs(GetApi().EpDevice_EpOptions(this->p_)); +} + +template +inline ConstHardwareDevice EpDeviceImpl::Device() const { + return ConstHardwareDevice(GetApi().EpDevice_Device(this->p_)); +} +} // namespace detail + inline Env::Env(OrtLoggingLevel logging_level, _In_ const char* logid) { ThrowOnError(GetApi().CreateEnv(logging_level, logid, &p_)); if (strcmp(logid, "onnxruntime-node") == 0) { @@ -551,6 +665,33 @@ inline Env& Env::CreateAndRegisterAllocatorV2(const std::string& provider_type, return *this; } +inline Env& Env::RegisterExecutionProviderLibrary(const char* registration_name, + const std::basic_string& path) { + ThrowOnError(GetApi().RegisterExecutionProviderLibrary(p_, registration_name, path.c_str())); + return *this; +} + +inline Env& Env::UnregisterExecutionProviderLibrary(const char* registration_name) { + ThrowOnError(GetApi().UnregisterExecutionProviderLibrary(p_, registration_name)); + return *this; +} + +inline std::vector Env::GetEpDevices() const { + size_t num_devices = 0; + const OrtEpDevice* const* device_ptrs = nullptr; + ThrowOnError(GetApi().GetEpDevices(p_, &device_ptrs, &num_devices)); + + std::vector devices; + if (num_devices > 0) { + devices.reserve(num_devices); + for (size_t i = 0; i < num_devices; ++i) { + devices.emplace_back(device_ptrs[i]); + } + } + + return devices; +} + inline CustomOpDomain::CustomOpDomain(const char* domain) { ThrowOnError(GetApi().CreateCustomOpDomain(domain, &p_)); } @@ -717,7 +858,8 @@ inline bool 
ConstSessionOptionsImpl::HasConfigEntry(const char* config_key) c } template -inline std::string ConstSessionOptionsImpl::GetConfigEntryOrDefault(const char* config_key, const std::string& def) { +inline std::string ConstSessionOptionsImpl::GetConfigEntryOrDefault(const char* config_key, + const std::string& def) const { if (!this->HasConfigEntry(config_key)) { return def; } @@ -955,6 +1097,53 @@ inline SessionOptionsImpl& SessionOptionsImpl::AppendExecutionProvider( return *this; } +namespace { +template +void SessionOptionsAppendEP(detail::SessionOptionsImpl& session_options, + Env& env, const std::vector& ep_devices, + const std::vector& ep_options_keys, + const std::vector& ep_options_values) { + std::vector ep_devices_ptrs; + ep_devices_ptrs.reserve(ep_devices.size()); + for (const auto& ep_device : ep_devices) { + ep_devices_ptrs.push_back(ep_device); + } + + ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_V2( + session_options, env, ep_devices_ptrs.data(), ep_devices_ptrs.size(), + ep_options_keys.data(), ep_options_values.data(), ep_options_keys.size())); +} +} // namespace + +template +inline SessionOptionsImpl& SessionOptionsImpl::AppendExecutionProvider_V2( + Env& env, const std::vector& ep_devices, const KeyValuePairs& ep_options) { + std::vector ep_options_keys, ep_options_values; + ep_options.GetKeyValuePairs(ep_options_keys, ep_options_values); + + SessionOptionsAppendEP(*this, env, ep_devices, ep_options_keys, ep_options_values); + + return *this; +} + +template +inline SessionOptionsImpl& SessionOptionsImpl::AppendExecutionProvider_V2( + Env& env, const std::vector& ep_devices, + const std::unordered_map& ep_options) { + std::vector ep_options_keys, ep_options_values; + ep_options_keys.reserve(ep_options.size()); + ep_options_values.reserve(ep_options.size()); + + for (const auto& [key, value] : ep_options) { + ep_options_keys.push_back(key.c_str()); + ep_options_values.push_back(value.c_str()); + } + + SessionOptionsAppendEP(*this, env, ep_devices, ep_options_keys, ep_options_values); + + return *this; +} + template inline SessionOptionsImpl& SessionOptionsImpl::SetCustomCreateThreadFn(OrtCustomCreateThreadFn ort_custom_create_thread_fn) { ThrowOnError(GetApi().SessionOptionsSetCustomCreateThreadFn(this->p_, ort_custom_create_thread_fn)); diff --git a/onnxruntime/contrib_ops/webgpu/bert/attention.cc b/onnxruntime/contrib_ops/webgpu/bert/attention.cc index 6e7919f281fb6..c6752b22ccc24 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/attention.cc @@ -155,7 +155,7 @@ Status AttentionProbsProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.MainFunctionBody() << "if (m + local_id.y < uniforms.M && n + local_id.x < total_sequence_length) {\n" << " let headOffset = batch_head_idx * uniforms.M * uniforms.N;\n" - << " let outputIdx = headOffset + m + local_id.y * uniforms.N + n + local_id.x;\n" + << " let outputIdx = headOffset + (m + local_id.y) * uniforms.N + n + local_id.x;\n" << " var sum: f32 = " << (components_ == 4 ? "value.x + value.y + value.z + value.w" : (components_ == 2 ? 
"value.x + value.y" : "value")) << ";\n"; shader.MainFunctionBody() << " output[outputIdx] = output_value_t(sum * uniforms.alpha)"; diff --git a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc index a451e3ad60e94..83c5d7bc8d92a 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc @@ -43,6 +43,14 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph continue; } + // Require that the node's output is consumed by a single QuantizeLinear node. + // Otherwise, if only the inputs are quantized, but not the output, then this node group would not + // be considered a QDQ node unit anyway. + std::vector children_nodes = graph.GetConsumerNodes(node.OutputDefs()[0]->Name()); + if (children_nodes.size() != 1 || children_nodes[0]->OpType() != QDQ::QOpName) { + continue; + } + Node& dq_0 = *graph.GetNode(parent_node_0->Index()); Node* dq_1 = nullptr; const ONNX_NAMESPACE::TensorProto* weight_proto = nullptr; diff --git a/onnxruntime/core/optimizer/transformer_memcpy.cc b/onnxruntime/core/optimizer/transformer_memcpy.cc index 8c0136c495403..2aa3cf30813b6 100644 --- a/onnxruntime/core/optimizer/transformer_memcpy.cc +++ b/onnxruntime/core/optimizer/transformer_memcpy.cc @@ -218,6 +218,7 @@ void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, auto node_provider_type = node.GetExecutionProviderType(); if ((node_provider_type == provider_) || (node_provider_type == kCudaExecutionProvider && kTensorrtExecutionProvider == provider_) || + (node_provider_type == kCudaExecutionProvider && kNvTensorRTRTXExecutionProvider == provider_) || (node_provider_type == kRocmExecutionProvider && kMIGraphXExecutionProvider == provider_)) { provider_nodes_.insert(&node); // note KernelCreateInfo might be nullptr for custom kernel @@ -266,6 +267,7 @@ void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, provider_output_defs_.insert(arg); } } else if (node_provider_type != kCudaExecutionProvider && node_provider_type != kTensorrtExecutionProvider && + node_provider_type != kCudaExecutionProvider && node_provider_type != kNvTensorRTRTXExecutionProvider && node_provider_type != kRocmExecutionProvider && node_provider_type != kMIGraphXExecutionProvider) { for (const auto* arg : node.InputDefs()) { if (arg->Exists()) @@ -307,6 +309,7 @@ void TransformerMemcpyImpl::BuildDefsMapping(const onnxruntime::NodeArg* arg, auto node_provider_type = it.GetExecutionProviderType(); if ((node_provider_type == provider_) || (node_provider_type == kCudaExecutionProvider && kTensorrtExecutionProvider == provider_) || + (node_provider_type == kCudaExecutionProvider && kNvTensorRTRTXExecutionProvider == provider_) || (node_provider_type == kRocmExecutionProvider && kMIGraphXExecutionProvider == provider_)) { const KernelCreateInfo* kci = nullptr; ORT_IGNORE_RETURN_VALUE(kernel_registries.SearchKernelRegistry(it, logger, &kci)); diff --git a/onnxruntime/core/platform/windows/debug_alloc.cc b/onnxruntime/core/platform/windows/debug_alloc.cc index f3520b4f7f7f5..fed61854860f0 100644 --- a/onnxruntime/core/platform/windows/debug_alloc.cc +++ b/onnxruntime/core/platform/windows/debug_alloc.cc @@ -75,45 +75,41 @@ struct SymbolHelper { SymbolHelper() = default; - static constexpr size_t kInitialBufferSize = sizeof(SYMBOL_INFO) + MAX_SYM_NAME; - - bool LoookupSymAndInitialize(const ULONG_PTR address, char* 
buffer, size_t buffer_size, SYMBOL_INFO* symbol) { - if (SymFromAddr(process_handle_, address, 0, symbol) != TRUE) { + bool LookupSymAndInitialize(const void* address, SYMBOL_INFO* symbol, std::ostream& message) { + if (SymFromAddr(process_handle_, reinterpret_cast(address), 0, symbol) != TRUE) { if (GetLastError() == ERROR_INVALID_HANDLE) { // Try to initialize first - if (!InitializeWhenNeeded() || SymFromAddr(process_handle_, address, 0, symbol) != TRUE) { - _snprintf_s(buffer, buffer_size, _TRUNCATE, "0x%08IX (Unknown symbol)", address); + if (!InitializeWhenNeeded() || + SymFromAddr(process_handle_, reinterpret_cast(address), 0, symbol) != TRUE) { + message << "0x" << address << " (Unknown symbol)"; return false; } } else { - _snprintf_s(buffer, buffer_size, _TRUNCATE, "0x%08IX (Unknown symbol)", address); + message << "0x" << address << " (Unknown symbol)"; return false; } } return true; } - void Lookup(std::string& string, const ULONG_PTR address) { - alignas(SYMBOL_INFO) char buffer[kInitialBufferSize] = {0}; - SYMBOL_INFO* symbol = reinterpret_cast(buffer); + void Lookup(const void* address, std::ostream& message) { + SYMBOL_INFO_PACKAGE symbol_info_package{}; + SYMBOL_INFO* symbol = &symbol_info_package.si; symbol->SizeOfStruct = sizeof(SYMBOL_INFO); - symbol->MaxNameLen = MAX_SYM_NAME; + symbol->MaxNameLen = std::size(symbol_info_package.name); - if (!LoookupSymAndInitialize(address, buffer, kInitialBufferSize, symbol)) { - string.append(buffer); + if (!LookupSymAndInitialize(address, symbol, message)) { return; } Line line; DWORD displacement; - if (SymGetLineFromAddr(process_handle_, address, &displacement, &line) == false) { - _snprintf_s(buffer, _TRUNCATE, "(unknown file & line number): %s", symbol->Name); - string.append(buffer); + if (SymGetLineFromAddr(process_handle_, reinterpret_cast(address), &displacement, &line) == false) { + message << "(unknown file & line number): " << symbol->Name; return; } - _snprintf_s(buffer, _TRUNCATE, "%s(%d): %s", line.FileName, static_cast(line.LineNumber), symbol->Name); - string.append(buffer); + message << line.FileName << "(" << line.LineNumber << "): " << symbol->Name; } struct Line : IMAGEHLP_LINE { @@ -221,17 +217,17 @@ Memory_LeakCheck::~Memory_LeakCheck() { const MemoryBlock& block = *static_cast(entry.lpData); const BYTE* pBlock = static_cast(entry.lpData) + sizeof(MemoryBlock); - std::string string; - char buffer[1024]; - _snprintf_s(buffer, _TRUNCATE, "%Iu bytes at location 0x%08IX\n", entry.cbData - sizeof(MemoryBlock), - UINT_PTR(pBlock)); - string.append(buffer); + std::ostringstream message; + message << (entry.cbData - sizeof(MemoryBlock)) << " bytes at location 0x" << static_cast(pBlock) + << "\n"; for (auto& p : block.m_pTraces) { if (!p) break; - symbols.Lookup(string, reinterpret_cast(p)); - string.push_back('\n'); + symbols.Lookup(p, message); + message << "\n"; } + const std::string string = message.str(); + // Google test has memory leaks that they haven't fixed. 
One such issue is tracked here: https://github.com/google/googletest/issues/692 // // In gtest-port.cc in function: static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() @@ -271,12 +267,8 @@ Memory_LeakCheck::~Memory_LeakCheck() { if (leaked_bytes) { DebugPrint("-----Ending Heap Trace-----\n\n"); - std::string string; - char buffer[1024]; - _snprintf_s(buffer, _TRUNCATE, "%d bytes of memory leaked in %d allocations", static_cast(leaked_bytes), static_cast(leak_count)); - string.append(buffer); - - std::cout << "\n----- MEMORY LEAKS: " << string.c_str() << "\n"; + std::cout << "\n----- MEMORY LEAKS: " << leaked_bytes << " bytes of memory leaked in " + << leak_count << " allocations\n"; if (!IsDebuggerPresent()) { exit(-1); } diff --git a/onnxruntime/core/providers/get_execution_providers.cc b/onnxruntime/core/providers/get_execution_providers.cc index 7d8c5525667b9..9ecabcad504b3 100644 --- a/onnxruntime/core/providers/get_execution_providers.cc +++ b/onnxruntime/core/providers/get_execution_providers.cc @@ -20,6 +20,14 @@ struct ProviderInfo { // kCpuExecutionProvider should always be last constexpr ProviderInfo kProvidersInPriorityOrder[] = { + { + kNvTensorRTRTXExecutionProvider, +#ifdef USE_NV + true, +#else + false, +#endif + }, { kTensorrtExecutionProvider, #ifdef USE_TENSORRT diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/exported_symbols.lst b/onnxruntime/core/providers/nv_tensorrt_rtx/exported_symbols.lst new file mode 100644 index 0000000000000..f4c41412594af --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/exported_symbols.lst @@ -0,0 +1 @@ +_GetProvider diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc new file mode 100644 index 0000000000000..42f8f9fe8a62c --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc @@ -0,0 +1,3232 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+#include +#include +#include +#include "core/providers/shared_library/provider_api.h" +#define ORT_API_MANUAL_INIT +#include "core/session/onnxruntime_cxx_api.h" +#include "core/common/common.h" +#include "core/common/narrow.h" +#include "core/common/safeint.h" +#include "nv_execution_provider.h" +#include "nv_execution_provider_utils.h" +#include "nv_execution_provider_custom_ops.h" +#include "onnx_ctx_model_helper.h" +#include "core/providers/cuda/shared_inc/cuda_call.h" +#include "core/providers/cuda/math/unary_elementwise_ops_impl.h" +#include "core/providers/cuda/gpu_data_transfer.h" +#include "core/session/allocator_adapters.h" +#include "cuda_runtime_api.h" +#include +#include +#include +#include +#include +#include +#include +// TODO: find a better way to share this +#include "core/providers/cuda/cuda_stream_handle.h" + +#ifdef _WIN32 +#include +#define LIBTYPE HINSTANCE +#define OPENLIB(libname) LoadLibrary(libname) +#define LIBFUNC(lib, fn) GetProcAddress((lib), (fn)) +#else +#include +#define LIBTYPE void* +#define OPENLIB(libname) dlopen((libname), RTLD_LAZY) +#define LIBFUNC(lib, fn) dlsym((lib), (fn)) +#endif + +#define CUDA_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUDA_CALL(expr)) + +using namespace ONNX_NAMESPACE; +using namespace ::onnxruntime::logging; +namespace { +// Check if cycle exists in the graph after partitioning +bool FindCycleHelper(size_t i, const std::list* adjacency_map, bool visited[], bool* st, std::vector& cycles) { + if (!visited[i]) { + visited[i] = true; + st[i] = true; + for (auto iter = adjacency_map[i].begin(); iter != adjacency_map[i].end(); ++iter) { + if (!visited[*iter] && FindCycleHelper(*iter, adjacency_map, visited, st, cycles)) { + cycles.push_back(*iter); + return true; + } else if (st[*iter]) { + cycles.push_back(*iter); + return true; + } + } + } + st[i] = false; + return false; +} +} // namespace + +namespace google { +namespace protobuf { +void ShutdownProtobufLibrary(); +} +} // namespace google + +struct ShutdownProtobuf { + ~ShutdownProtobuf() { + ::google::protobuf::ShutdownProtobufLibrary(); + } +} g_protobuf; + +namespace onnxruntime { + +namespace cuda { +template <> +void Impl_Cast( + cudaStream_t stream, + const int64_t* input_data, int32_t* output_data, + size_t count) { + return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); +} + +template <> +void Impl_Cast( + cudaStream_t stream, + const int32_t* input_data, int64_t* output_data, + size_t count) { + return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); +} + +template <> +void Impl_Cast( + cudaStream_t stream, + const double* input_data, float* output_data, + size_t count) { + return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); +} + +template <> +void Impl_Cast( + cudaStream_t stream, + const float* input_data, double* output_data, + size_t count) { + return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); +} +} // namespace cuda + +template <> +Status CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line) { + return g_host->CudaCall_false(retCode, exprString, libName, successCode, msg, file, line); +} + +template <> +void CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line) { + return g_host->CudaCall_true(retCode, exprString, libName, successCode, msg, file, line); 
+} + +#if NV_TENSORRT_MAJOR >= 10 +void* OutputAllocator::reallocateOutputAsync(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size, + uint64_t /*alignment*/, cudaStream_t /*stream*/) noexcept { + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. + size = std::max(size, static_cast(1)); + if (size > allocated_size) { + cudaFree(outputPtr); + outputPtr = nullptr; + allocated_size = 0; + if (cudaMalloc(&outputPtr, size) == cudaSuccess) { + allocated_size = size; + } + } + // if cudaMalloc fails, returns nullptr. + return outputPtr; +} +#else +// Only override this method when TensorRT <= 8.6 +void* OutputAllocator::reallocateOutput(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size, + uint64_t /*alignment*/) noexcept { + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. + size = std::max(size, static_cast(1)); + if (size > allocated_size) { + cudaFree(outputPtr); + outputPtr = nullptr; + allocated_size = 0; + if (cudaMalloc(&outputPtr, size) == cudaSuccess) { + allocated_size = size; + } + } + // if cudaMalloc fails, returns nullptr. + return outputPtr; +} +#endif + +void OutputAllocator::notifyShape(char const* /*tensorName*/, nvinfer1::Dims const& dims) noexcept { + output_shapes.clear(); + output_shapes.reserve(dims.nbDims); + for (int i = 0; i < dims.nbDims; i++) { + output_shapes.push_back(dims.d[i]); + } +} + +class Memcpy final : public OpKernel { + public: + Memcpy(const OpKernelInfo& info) : OpKernel(info) {} + + Status Compute(OpKernelContext* ctx) const override { + const auto* X = ctx->Input(0); + ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr."); + Tensor* Y = ctx->Output(0, X->Shape()); + ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor."); + auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device); + if (!gpu_data_transfer) + return Status(common::ONNXRUNTIME, common::EP_FAIL, "gpu data transfer is missing in TRT EP."); + if (!ctx->GetComputeStream()) + return Status(common::ONNXRUNTIME, common::EP_FAIL, "Compute Stream is missing in TRT MemCpy kernel's context."); + return gpu_data_transfer->CopyTensorAsync(*X, *Y, *(ctx->GetComputeStream())); + } +}; + +template +KernelCreateInfo BuildKernelCreateInfo(); + +ONNX_OPERATOR_KERNEL_EX( + MemcpyFromHost, + kOnnxDomain, + 1, + kNvTensorRTRTXExecutionProvider, + (*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 0) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + Memcpy); + +ONNX_OPERATOR_KERNEL_EX( + MemcpyToHost, + kOnnxDomain, + 1, + kNvTensorRTRTXExecutionProvider, + (*KernelDefBuilder::Create()) + .OutputMemoryType(OrtMemTypeCPUOutput, 0) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + Memcpy); + +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kNvTensorRTRTXExecutionProvider, kOnnxDomain, 1, MemcpyFromHost); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kNvTensorRTRTXExecutionProvider, kOnnxDomain, 1, MemcpyToHost); + +static std::shared_ptr s_kernel_registry; + +void InitializeRegistry() { + s_kernel_registry = KernelRegistry::Create(); + + static const BuildKernelCreateInfoFn function_table[] = { + BuildKernelCreateInfo, + BuildKernelCreateInfo, + }; + + for (auto& function_table_entry : function_table) { + 
ORT_THROW_IF_ERROR(s_kernel_registry->Register(function_table_entry())); + } +} + +void DeleteRegistry() { + s_kernel_registry.reset(); +} + +std::shared_ptr NvExecutionProvider::GetKernelRegistry() const { + return s_kernel_registry; +} + +// Per TensorRT documentation, logger needs to be a singleton. +TensorrtLogger& GetTensorrtLogger(bool verbose_log) { + const auto log_level = verbose_log ? nvinfer1::ILogger::Severity::kVERBOSE : nvinfer1::ILogger::Severity::kWARNING; + static TensorrtLogger trt_logger(log_level); + if (log_level != trt_logger.get_level()) { + trt_logger.set_level(verbose_log ? nvinfer1::ILogger::Severity::kVERBOSE : nvinfer1::ILogger::Severity::kWARNING); + } + return trt_logger; +} + +std::unique_lock NvExecutionProvider::GetApiLock() const { + static std::mutex singleton; + return std::unique_lock(singleton); +} + +/* + * Get the shape of "shape tensor" input + */ +template +Status GetShapeOfShapeTensor(Ort::ConstValue& input_tensor, + void* shape_values, + int shape_size, + cudaStream_t stream) { + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(shape_values, + input_tensor.GetTensorData(), + shape_size * sizeof(T), + cudaMemcpyDeviceToHost, + stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); + return Status::OK(); +} + +/* + * Apply TensorRT optimization profile shapes from provider options. + * + * This function supports single/multiple profile(s). + * (Note: An optimization profile describes a range of dimensions for each network input) + * + */ +bool ApplyProfileShapesFromProviderOptions(std::vector& trt_profiles, + nvinfer1::ITensor* input, + std::unordered_map>>& profile_min_shapes, + std::unordered_map>>& profile_max_shapes, + std::unordered_map>>& profile_opt_shapes, + ShapeRangesMap& input_explicit_shape_ranges) { + if (trt_profiles.size() == 0) { + LOGS_DEFAULT(WARNING) << "[Nv EP] Number of optimization profiles should be greater than 0, but it's 0."; + return false; + } + + const std::string& input_name = input->getName(); + if (profile_min_shapes.find(input_name) == profile_min_shapes.end()) { + return false; + } + + if (input_explicit_shape_ranges.find(input_name) == input_explicit_shape_ranges.end()) { + std::unordered_map>> inner_map; + input_explicit_shape_ranges[input_name] = inner_map; + } + + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Begin to apply profile shapes ..."; + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Input tensor name is '" << input_name << "', number of profiles found is " << trt_profiles.size(); + + for (size_t i = 0; i < trt_profiles.size(); i++) { + nvinfer1::Dims dims = input->getDimensions(); + int nb_dims = dims.nbDims; + + auto trt_profile = trt_profiles[i]; + + // Shape tensor + if (input->isShapeTensor()) { + int shape_size = nb_dims == 0 ? 
1 : static_cast(profile_min_shapes[input_name][i].size()); + std::vector shapes_min(shape_size), shapes_opt(shape_size), shapes_max(shape_size); + + LOGS_DEFAULT(VERBOSE) << "[Nv EP] shape size of this shape tensor is " << shape_size; + + for (int j = 0; j < shape_size; j++) { + auto min_value = profile_min_shapes[input_name][i][j]; + auto max_value = profile_max_shapes[input_name][i][j]; + auto opt_value = profile_opt_shapes[input_name][i][j]; + shapes_min[j] = static_cast(min_value); + shapes_max[j] = static_cast(max_value); + shapes_opt[j] = static_cast(opt_value); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] shapes_min.d[" << j << "] is " << shapes_min[j]; + LOGS_DEFAULT(VERBOSE) << "[Nv EP] shapes_max.d[" << j << "] is " << shapes_max[j]; + LOGS_DEFAULT(VERBOSE) << "[Nv EP] shapes_opt.d[" << j << "] is " << shapes_opt[j]; + + if (input_explicit_shape_ranges[input_name].find(j) == input_explicit_shape_ranges[input_name].end()) { + std::vector> profile_vector(trt_profiles.size()); + input_explicit_shape_ranges[input_name][j] = profile_vector; + } + input_explicit_shape_ranges[input_name][static_cast(j)][i].push_back(min_value); + input_explicit_shape_ranges[input_name][static_cast(j)][i].push_back(max_value); + input_explicit_shape_ranges[input_name][static_cast(j)][i].push_back(opt_value); + } + + trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN, &shapes_min[0], shape_size); + trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX, &shapes_max[0], shape_size); + trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT, &shapes_opt[0], shape_size); + } + // Execution tensor + else { + nvinfer1::Dims dims_min, dims_opt, dims_max; + dims_min.nbDims = nb_dims; + dims_max.nbDims = nb_dims; + dims_opt.nbDims = nb_dims; + + LOGS_DEFAULT(VERBOSE) << "[Nv EP] number of dimension of this execution tensor is " << nb_dims; + + for (int j = 0; j < nb_dims; j++) { + if (dims.d[j] == -1) { + auto min_value = profile_min_shapes[input_name][i][j]; + auto max_value = profile_max_shapes[input_name][i][j]; + auto opt_value = profile_opt_shapes[input_name][i][j]; + dims_min.d[j] = static_cast(min_value); + dims_max.d[j] = static_cast(max_value); + dims_opt.d[j] = static_cast(opt_value); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] dims_min.d[" << j << "] is " << dims_min.d[j]; + LOGS_DEFAULT(VERBOSE) << "[Nv EP] dims_max.d[" << j << "] is " << dims_max.d[j]; + LOGS_DEFAULT(VERBOSE) << "[Nv EP] dims_opt.d[" << j << "] is " << dims_opt.d[j]; + + if (input_explicit_shape_ranges[input_name].find(j) == input_explicit_shape_ranges[input_name].end()) { + std::vector> profile_vector(trt_profiles.size()); + input_explicit_shape_ranges[input_name][j] = profile_vector; + } + input_explicit_shape_ranges[input_name][static_cast(j)][i].push_back(min_value); + input_explicit_shape_ranges[input_name][static_cast(j)][i].push_back(max_value); + input_explicit_shape_ranges[input_name][static_cast(j)][i].push_back(opt_value); + } else { + dims_min.d[j] = dims.d[j]; + dims_max.d[j] = dims.d[j]; + dims_opt.d[j] = dims.d[j]; + } + } + + trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN, dims_min); + trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX, dims_max); + trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT, dims_opt); + } + } + return true; +} + +/* + * Apply TensorRT optimization profile shapes from input tensor value. 
+ * + * This function supports single/multiple profile(s). + * (Note: An optimization profile describes a range of dimensions for each network input) + * + * @param shape_tensor_values holds "shape tensor -> shape values" for the INT32 shape tensor input across this inference run + * @param shape_tensor_values_int64 holds "shape tensor -> shape values" for the INT64 shape tensor input across this inference run + */ +Status ApplyProfileShapesFromInputTensorValue(std::vector& trt_profiles, + Ort::KernelContext ctx, + nvinfer1::ITensor* input, + ShapeRangesMap& shape_ranges, + const std::unordered_map& input_indexes, + std::unordered_map>& shape_tensor_values, + std::unordered_map>& shape_tensor_values_int64, + cudaStream_t stream, + bool* engine_update) { + for (size_t i = 0; i < trt_profiles.size(); i++) { + const std::string& input_name = input->getName(); + nvinfer1::Dims dims = input->getDimensions(); + int nb_dims = dims.nbDims; + + size_t input_index = 0; + const auto& iter = input_indexes.find(input_name); + if (iter != input_indexes.end()) { + input_index = iter->second; + } + + auto input_tensor = ctx.GetInput(input_index); + auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); + const auto tensor_shapes = tensor_info.GetShape(); + auto& shape_ranges_per_input = shape_ranges[input_name]; + + auto trt_profile = trt_profiles[i]; + + // If there are multiple profiles, for second and rest of profiles, simply copy the min/max/opt profile values from the first profile. + // Following "if statement" won't be executed since TRT EP currently only allows single profile for non-explicit profiles case. + if (i > 0) { + if (input->isShapeTensor()) { + // shape tensor + int shape_size = nb_dims == 0 ? 1 : static_cast(tensor_shapes[0]); + std::vector shapes_min(shape_size), shapes_opt(shape_size), shapes_max(shape_size); + for (int j = 0; j < shape_size; j++) { + shapes_min[j] = *(trt_profiles[0]->getShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN)); + shapes_max[j] = *(trt_profiles[0]->getShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX)); + shapes_opt[j] = *(trt_profiles[0]->getShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT)); + } + trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN, &shapes_min[0], shape_size); + trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX, &shapes_max[0], shape_size); + trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT, &shapes_opt[0], shape_size); + } else { + // execution tensor + nvinfer1::Dims dims_min, dims_opt, dims_max; + dims_min = trt_profiles[0]->getDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN); + dims_max = trt_profiles[0]->getDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX); + dims_opt = trt_profiles[0]->getDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT); + trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN, dims_min); + trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX, dims_max); + trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT, dims_opt); + } + continue; + } + + // Create shape profile + if (input->isShapeTensor()) { + // Get shape values for shape tensor input + const auto tensor_type = tensor_info.GetElementType(); + // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension + int shape_size = dims.nbDims 
== 0 ? 1 : static_cast(tensor_shapes[0]); + // For setting TRT optimization profile. (Note: the min/opt/max profile values are still int32 even though int64 is supported after TRT 10) + std::vector values(shape_size); + + switch (tensor_type) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { + auto buffer = std::make_unique(shape_size); + auto status = GetShapeOfShapeTensor(input_tensor, buffer.get(), shape_size, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + shape_tensor_values[input_name].resize(shape_size); + for (int j = 0; j < shape_size; ++j) { + shape_tensor_values[input_name][j] = buffer[j]; + values[j] = buffer[j]; + } + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { + auto buffer = std::make_unique(shape_size); + auto status = GetShapeOfShapeTensor(input_tensor, buffer.get(), shape_size, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + shape_tensor_values_int64[input_name].resize(shape_size); + for (int j = 0; j < shape_size; ++j) { + shape_tensor_values_int64[input_name][j] = buffer[j]; + values[j] = static_cast(buffer[j]); + } + break; + } + default: { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT shape tensor data type: " + std::to_string(tensor_type) + " not supported."); + } + } + + // Update shape ranges + std::vector shapes_min(shape_size), shapes_opt(shape_size), shapes_max(shape_size); + int shape_range_size = static_cast(shape_ranges_per_input.size()); + if (shape_size == shape_range_size) { + // If shape size matches, check/update shape range + for (int j = 0; j < shape_size; ++j) { + auto& shape_range = shape_ranges_per_input[j][0]; // only has one profile + shapes_min[j] = static_cast(shape_range[0]); + shapes_max[j] = static_cast(shape_range[1]); + shapes_opt[j] = static_cast(shape_range[2]); + + const auto& tensor_shape_value = values[j]; + // Update shape range lower bound + if (tensor_shape_value < shape_range[0]) { + shape_range[0] = tensor_shape_value; + shapes_min[j] = tensor_shape_value; + *engine_update = true; + } + // Update shape range upper bound + if (tensor_shape_value > shape_range[1]) { + shape_range[1] = tensor_shape_value; + shape_range[2] = tensor_shape_value; + shapes_max[j] = tensor_shape_value; + shapes_opt[j] = tensor_shape_value; + *engine_update = true; + } + } + } else { + // If shape size doesn't match, initialize shape_range with the new shape value + shape_ranges_per_input.clear(); + for (int j = 0; j < shape_size; ++j) { + const auto& tensor_shape_value = values[j]; + std::vector> profile_vector; + std::vector shape_vector{tensor_shape_value, tensor_shape_value, tensor_shape_value}; + profile_vector.push_back(shape_vector); // only one profile needed + shape_ranges_per_input[j] = profile_vector; + shapes_min[j] = tensor_shape_value; + shapes_opt[j] = tensor_shape_value; + shapes_max[j] = tensor_shape_value; + } + *engine_update = true; + } + + trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN, &shapes_min[0], shape_size); + trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX, &shapes_max[0], shape_size); + trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT, &shapes_opt[0], shape_size); + } else { // Execution tensor + nvinfer1::Dims dims_min(dims), dims_opt(dims), dims_max(dims); + for (int j = 0, end = nb_dims; j < end; ++j) { + const auto& tensor_shape = tensor_shapes[j]; + if 
(shape_ranges_per_input.find(j) != shape_ranges_per_input.end()) { + auto& shape_range = shape_ranges_per_input[j][0]; // only has one profile + dims_min.d[j] = static_cast(shape_range[0]); + dims_max.d[j] = static_cast(shape_range[1]); + dims_opt.d[j] = static_cast(shape_range[2]); + + // Update minimum dimension + if (tensor_shape < shape_range[0]) { + shape_range[0] = tensor_shape; + dims_min.d[j] = static_cast(tensor_shape); + *engine_update = true; + } + // Update maximum dimension + if (tensor_shape > shape_range[1]) { + shape_range[1] = tensor_shape; + shape_range[2] = tensor_shape; + dims_max.d[j] = static_cast(tensor_shape); + dims_opt.d[j] = static_cast(tensor_shape); + *engine_update = true; + } + } + } + + trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN, dims_min); + trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX, dims_max); + trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT, dims_opt); + } + } + return Status::OK(); +} + +#define CASE_GET_INPUT_TENSOR(DATA_TYPE, SrcT) \ + case DATA_TYPE: { \ + auto input_tensor_ptr = input_tensor.GetTensorData(); \ + if (input_tensor_ptr != nullptr && elem_cnt > 0) { \ + data = const_cast(input_tensor_ptr); \ + } else { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ + data = scratch_buffers.back().get(); \ + } \ + break; \ + } + +#define CASE_GET_CAST_INPUT_TENSOR(DATA_TYPE, SrcT, DstT) \ + case DATA_TYPE: { \ + auto input_tensor_ptr = input_tensor.GetTensorData(); \ + if (input_tensor_ptr != nullptr && elem_cnt > 0) { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, elem_cnt * sizeof(DstT))); \ + data = scratch_buffers.back().get(); \ + cuda::Impl_Cast(stream, input_tensor_ptr, reinterpret_cast(data), elem_cnt); \ + } else { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ + data = scratch_buffers.back().get(); \ + } \ + break; \ + } + +#define CASE_GET_OUTPUT_TENSOR(DATA_TYPE, SrcT) \ + case DATA_TYPE: { \ + auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ + if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ + buffers[output_name] = output_tensor_ptr; \ + } else { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ + buffers[output_name] = scratch_buffers.back().get(); \ + } \ + break; \ + } + +#define CASE_GET_CAST_OUTPUT_TENSOR(DATA_TYPE, SrcT, DstT) \ + case DATA_TYPE: { \ + auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ + if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, elem_cnt * sizeof(DstT))); \ + buffers[output_name] = scratch_buffers.back().get(); \ + output_dim_sizes[i] = static_cast(elem_cnt); \ + } else { \ + scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ + buffers[output_name] = scratch_buffers.back().get(); \ + output_dim_sizes[i] = 1; \ + } \ + break; \ + } + +#define CASE_COPY_TENSOR(DATA_TYPE, DstT) \ + case DATA_TYPE: { \ + auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ + if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_tensor_ptr, allocator->getBuffer(), elem_cnt * sizeof(DstT), cudaMemcpyDeviceToDevice, stream)); \ + } \ + break; \ + } + +#define CASE_CAST_TENSOR(DATA_TYPE, SrcT, DstT) \ + case DATA_TYPE: { \ + auto output_tensor_ptr = 
output_tensor.GetTensorMutableData(); \ + if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ + cuda::Impl_Cast(stream, reinterpret_cast(allocator->getBuffer()), reinterpret_cast(output_tensor_ptr), elem_cnt); \ + } \ + break; \ + } + +/* + * Set Nv executio context input. + * + * There are two types of input tensor: (1) shape tensor and (2) execution tensor. + * The input buffer binding needs to be handled differently. + * + * @param shape_tensor_values holds "shape tensor -> shape values" for the INT32 shape tensor input across this inference run + * @param shape_tensor_values_int64 holds "shape tensor -> shape values" for the INT64 shape tensor input across this inference run + */ +Status BindContextInput(Ort::KernelContext& ctx, + nvinfer1::ICudaEngine* trt_engine, + nvinfer1::IExecutionContext* trt_context, + const char* input_name, + size_t input_index, + std::unordered_map>& shape_tensor_values, + std::unordered_map>& shape_tensor_values_int64, + std::vector>& scratch_buffers, + OrtAllocator* alloc, + cudaStream_t stream) { + auto input_tensor = ctx.GetInput(input_index); + auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); + const auto tensor_shapes = tensor_info.GetShape(); + const auto tensor_type = tensor_info.GetElementType(); + /* + * Return the number of elements specified by the tensor shape (all dimensions multiplied by each other). + * For 0 dimensions, 1 is returned. If any dimension is less than 0, the result is always -1. + * + * Examples:
+ * [] = 1<br/>
+ * [1,3,4] = 12<br/>
+ * [2,0,4] = 0<br/>
+ * [-1,3,4] = -1<br/>
+ */ + const auto elem_cnt = tensor_info.GetElementCount(); + + if (trt_engine->isShapeInferenceIO(input_name)) { + // Bind "shape tensor" input buffer + + // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension + int shape_size = trt_engine->getTensorShape(input_name).nbDims == 0 ? 1 : static_cast(tensor_shapes[0]); + switch (tensor_type) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { + // get shape tensor value if not present + if (shape_tensor_values.find(input_name) == shape_tensor_values.end()) { + auto input = std::make_unique(shape_size); + auto status = GetShapeOfShapeTensor(input_tensor, input.get(), shape_size, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + shape_tensor_values[input_name].resize(shape_size); + for (int i = 0; i < shape_size; ++i) { + shape_tensor_values[input_name][i] = input[i]; + } + } + + if (!trt_context->setTensorAddress(input_name, &shape_tensor_values[input_name][0])) { + std::string error_input_name = input_name; + std::string error_msg = + "Nv EP failed to call nvinfer1::IExecutionContext::setTensorAddress() for shape input '" + + error_input_name + "'"; + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, error_msg)); + } + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { + // get shape tensor value if not present + if (shape_tensor_values_int64.find(input_name) == shape_tensor_values_int64.end()) { + auto input = std::make_unique(shape_size); + auto status = GetShapeOfShapeTensor(input_tensor, input.get(), shape_size, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + shape_tensor_values_int64[input_name].resize(shape_size); + for (int i = 0; i < shape_size; ++i) { + shape_tensor_values_int64[input_name][i] = input[i]; + } + } + + if (!trt_context->setTensorAddress(input_name, &shape_tensor_values_int64[input_name][0])) { + std::string error_input_name = input_name; + std::string error_msg = + "Nv EP failed to call nvinfer1::IExecutionContext::setTensorAddress() for shape input '" + + error_input_name + "'"; + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, error_msg)); + } + break; + } + default: { + std::string error_input_name = input_name; + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The data type of shape tensor should be INT32 or INT64. Please check the data type of " + error_input_name); + } + } + } else { + // Set shape for input tensor which is execution tensor + nvinfer1::Dims dims = trt_context->getTensorShape(input_name); + int nb_dims = dims.nbDims; + for (int j = 0, end = nb_dims; j < end; ++j) { + dims.d[j] = static_cast(tensor_shapes[j]); + } + if (!trt_context->setInputShape(input_name, dims)) { + std::string error_input_name = input_name; + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP failed to call nvinfer1::IExecutionContext::setInputShape() for input '" + error_input_name + "'")); + } + + // Bind "execution tensor" input buffer + // + // Note: If an engine binding is an empty tensor, it still needs a non-null memory address, and different tensors should have different addresses. + // Therefore, in the case of empty tensor, TRT EP always allocates a dummy byte. 
+ // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#empty-tensors + void* data = nullptr; + switch (tensor_type) { + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, float) + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, uint16_t) + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL, bool) + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, int8_t) + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) + CASE_GET_CAST_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float) + default: { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP input onnx tensor data type: " + std::to_string(tensor_type) + " not supported."); + } + } + trt_context->setTensorAddress(input_name, data); + } + + return Status::OK(); +} + +/* + * Bind Nv executio context output. + * + * Please note that the "data-depedent shape" output needs corresponding allocator provided. + * + * + * param ctx - ORT kernel context + * param trt_context - A pointer to Nv executio context object + * param output_name - Output tensor name + * param output_index - The index of the output to the ORT kernel context + * param output_type - Data type of the output + * param i - Output iteration index + * param output_tensors - Output iteration index to output's ORT value + * param output_dim_sizes - Output iteration index to the multiplocation of its shape's dimensions + * param dds_output_set - DDS output set + * param dds_output_allocator_map - DDS output to its allocator + * param scratch_buffer - The allocation buffer created by TRT EP + * param allocator - ORT allocator + * param buffers - It holds all the output values which are binding to TRT's execution context + * + */ +Status BindContextOutput(Ort::KernelContext& ctx, + nvinfer1::IExecutionContext* trt_context, + const char* output_name, + size_t output_index, + size_t output_type, + size_t i, + std::unordered_map& output_tensors, + std::unordered_map& output_dim_sizes, + DDSOutputAllocatorMap& dds_output_allocator_map, + std::vector>& scratch_buffers, + OrtAllocator* alloc, + std::unordered_map& buffers) { + // Get output shape + nvinfer1::Dims dims = trt_context->getTensorShape(output_name); + int nb_dims = dims.nbDims; + bool is_DDS = false; + std::vector output_shapes(nb_dims); + for (int j = 0, end = nb_dims; j < end; ++j) { + // data-dependent shape + if (dims.d[j] == -1) { + is_DDS = true; + break; + } + output_shapes[j] = dims.d[j]; + } + + auto known_DDS = dds_output_allocator_map.find(output_name) != dds_output_allocator_map.end(); + + // If the output tensor has data-dependent shape, TRT EP will provide an IOutputAllocator for enqueueV3 to dynamically allocate memory buffer. + // Once enqueueV3 returns, TRT EP will then bind the output allocation to ORT kernel context output. + // (Please note that we take strategy A mentioned in https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#dynamic-shaped-output, + // which we defer allocation until the size is known and don't call IExecution::setTensorAddress) + // + // Otherwise, if the shape of the output tensor is known prior to the runtime, ORT will pre-allocate memory buffer for the output tensor for enqueueV3. 
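Editor's note: a minimal sketch, not part of this change, of the deferred-allocation flow (strategy A) described in the comment above. It assumes the OutputAllocator class defined earlier in this file; the helper name BindDdsOutputSketch and the tensor name "logits" are placeholders.

// Strategy A for a data-dependent-shape (DDS) output: register an allocator instead of
// calling setTensorAddress(), and let TensorRT call back into it during enqueueV3().
static void BindDdsOutputSketch(nvinfer1::IExecutionContext& trt_context, cudaStream_t stream) {
  auto dds_allocator = std::make_unique<OutputAllocator>();
  trt_context.setOutputAllocator("logits", dds_allocator.get());  // defer allocation, no setTensorAddress()
  trt_context.enqueueV3(stream);  // reallocateOutputAsync()/notifyShape() run during enqueue
  // After enqueueV3() returns, the allocator holds the real shape and the device buffer.
  const auto& shape = dds_allocator->getOutputShape();
  void* device_buffer = dds_allocator->getBuffer();
  (void)shape;
  (void)device_buffer;  // a real caller would now copy/cast into the ORT output, see BindKernelOutput()
}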
+ if (is_DDS || known_DDS) { + if (!known_DDS) { + auto allocatorPtr = std::make_unique(); + trt_context->setOutputAllocator(output_name, allocatorPtr.get()); + dds_output_allocator_map[output_name] = std::move(allocatorPtr); + } + } else { + output_tensors[i] = ctx.GetOutput(output_index, output_shapes); + auto& output_tensor = output_tensors[i]; + const auto elem_cnt = output_tensor.GetTensorTypeAndShapeInfo().GetElementCount(); + + switch (output_type) { + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, float) + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, uint16_t) + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL, bool) + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, int8_t) + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) + CASE_GET_CAST_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float) + default: { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP output tensor data type: " + std::to_string(output_type) + " not supported."); + } + } + trt_context->setTensorAddress(output_name, buffers[output_name]); + } + + return Status::OK(); +} + +/* + * Bind ORT kernel context Output. + * + * In the case of DDS (data-dependent shape) output, TRT requires a provided allocator to allocate memory during runtime. + * Once the output has been put in the allocation buffer, ORT calls this function to bind the allocation to ORT kernel context output. + * + * Note: Current approach of setting the ORT kernel context output is copying the output data from allocation buffer to ORT context output address which is not optimal, + * we are waiting for ORT core to support "assign" memory address to ORT context output. Some works need to be done in ORT memory planner to be aware of this memory support. + */ +Status BindKernelOutput(Ort::KernelContext& ctx, + OrtMemoryInfo* /*mem_info*/, + DDSOutputAllocatorMap& allocator_map, + char const* output_name, + size_t output_index, + size_t output_type, + cudaStream_t stream) { + auto allocator = allocator_map[output_name].get(); + auto& shape = allocator->getOutputShape(); + auto output_tensor = ctx.GetOutput(output_index, shape); + + /* + * Return the number of elements specified by the tensor shape (all dimensions multiplied by each other). + * For 0 dimensions, 1 is returned. If any dimension is less than 0, the result is always -1. + * + * Examples:
+ * [] = 1<br/>
+ * [1,3,4] = 12<br/>
+ * [2,0,4] = 0<br/>
+ * [-1,3,4] = -1<br/>
+ */ + auto elem_cnt = output_tensor.GetTensorTypeAndShapeInfo().GetElementCount(); + + /* + * Copy output data from allocation buffer to ORT kernel context output location or + * cast (int32 or float) -> (int64 or double) to ORT kernel context output location. + * + * Note: + * 1. If the output tensor is empty tensor (i.e. any of the dimension is 0) which means element count is 0, + * TRT EP does not perform cuda memory copy nor cuda cast to prevent overwriting other location that might belong to other tensors. + * 2. The cudaMemcpyAsync() and cuda::Impl_Cast() (implemented as _UnaryElementWise() in cuda ep) are all async, but we + * don't need to explicitly call cudaStreamSynchronize() after those APIs due to CUDA EP and TRT EP uses same stream, + * and within the same stream, operations are guaranteed to be executed in order. + */ + switch (output_type) { + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, float) + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, uint16_t) + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL, bool) + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, int8_t) + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) + CASE_CAST_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, float, double) + default: { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP output tensor data type: " + std::to_string(output_type) + " not supported."); + } + } + return Status::OK(); +} + +NvExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, bool has_user_compute_stream, cudaStream_t stream) { + if (has_user_compute_stream) { + CUDA_CALL_THROW(cudaSetDevice(device_id)); + (void)(stream); + } +} + +NvExecutionProvider::PerThreadContext::~PerThreadContext() { + trt_context_map_.clear(); +} + +/* + * Returns true if the shape ranges maintained by the PerThreadContext is different from the shape ragnes maintained by TRT EP, meaning the + * engine is being updated and the execution context maintained by the PerThreadContext should be updated as well. Otherwise, returns false. + * + */ +bool NvExecutionProvider::PerThreadContext::CompareProfileShapes(std::string fused_node, ShapeRangesMap& shape_ranges) { + if (shape_ranges.size() > 0) { + if (input_shape_ranges_[fused_node] != shape_ranges) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] The shape ranges maintained by the PerThreadContext is different from the shape ranges maintained by TRT EP. \ + This means the engine is updated and will need to update the execution context as well."; + return true; + } + } + return false; +} + +/* + * Updates the shape ranges maintained by the PerThreadContext. + * As long as the execution context maintained by the PerThreadContext is updated, the associated shape ranges should be updated as well. 
+ * + */ +void NvExecutionProvider::PerThreadContext::UpdateProfileShapes(std::string fused_node, ShapeRangesMap& shape_ranges) { + input_shape_ranges_[fused_node] = shape_ranges; +} + +void NvExecutionProvider::PerThreadContext::ResetTensorRTContext(std::string fused_node) { + auto it = trt_context_map_.find(fused_node); + if (it != trt_context_map_.end()) { + trt_context_map_[fused_node].reset(); + } +} + +bool NvExecutionProvider::PerThreadContext::UpdateTensorRTContext(std::string fused_node, std::unique_ptr context) { + if (!context) { + context = std::make_unique(); + } + trt_context_map_[fused_node] = std::move(context); + + if (trt_context_map_[fused_node]) { + return true; + } + return false; +} + +bool NvExecutionProvider::PerThreadContext::IsTensorRTContextInMap(std::string fused_node) { + auto it = trt_context_map_.find(fused_node); + if (it != trt_context_map_.end()) { + return true; + } + return false; +} + +nvinfer1::IExecutionContext& NvExecutionProvider::PerThreadContext::GetTensorRTContext(std::string fused_node) { + auto it = trt_context_map_.find(fused_node); + if (it != trt_context_map_.end()) { + return *(it->second); // dereference shared pointer + } + auto context = std::make_unique(); + trt_context_map_[fused_node] = std::move(context); + return *(trt_context_map_[fused_node]); // dereference shared pointer +} + +void NvExecutionProvider::ReleasePerThreadContext() const { + const auto& per_thread_context_cache = PerThreadContextCache(); + + auto cached_context_it = per_thread_context_cache->find(this); + ORT_ENFORCE(cached_context_it != per_thread_context_cache->end()); + auto cached_context = cached_context_it->second.lock(); + ORT_ENFORCE(cached_context); + + { + std::lock_guard lock(context_state_.mutex); + context_state_.active_contexts.erase(cached_context); + context_state_.retired_context_pool.push_back(cached_context); + } + + per_thread_context_cache->erase(cached_context_it); +} + +NvExecutionProvider::PerThreadContext& NvExecutionProvider::GetPerThreadContext() const { + const auto& per_thread_context_cache = PerThreadContextCache(); + + // try to use cached context + auto cached_context_it = per_thread_context_cache->find(this); + if (cached_context_it != per_thread_context_cache->end()) { + auto cached_context = cached_context_it->second.lock(); + ORT_ENFORCE(cached_context); + return *cached_context; + } + + // get context and update cache + std::shared_ptr context; + { + std::lock_guard lock(context_state_.mutex); + + // get or create a context + if (context_state_.retired_context_pool.empty()) { + context = std::make_shared(narrow(info_.device_id), + info_.has_user_compute_stream, stream_); + } else { + context = context_state_.retired_context_pool.back(); + context_state_.retired_context_pool.pop_back(); + } + + // insert into active_contexts, should not already be present + const auto active_contexts_insert_result = context_state_.active_contexts.insert(context); + ORT_ENFORCE(active_contexts_insert_result.second); + + // insert into caches_to_update_on_destruction, may already be present + ORT_IGNORE_RETURN_VALUE(context_state_.caches_to_update_on_destruction.insert(per_thread_context_cache)); + } + + per_thread_context_cache->insert(std::make_pair(this, context)); + + return *context; +} + +NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info) + : IExecutionProvider{onnxruntime::kNvTensorRTRTXExecutionProvider, + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, + narrow(info.device_id))}, + info_(info), + 
device_id_(info.device_id) { + InitProviderOrtApi(); + // TODO(maximlianm) remove this since we should be able to compile an AOT context file without GPU + CUDA_CALL_THROW(cudaSetDevice(device_id_)); + cudaDeviceProp prop; + CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_)); + compute_capability_ = GetComputeCapacity(prop); + if (info.has_user_compute_stream) { + external_stream_ = true; + stream_ = static_cast(info.user_compute_stream); + } + + std::string profile_min_shapes, profile_max_shapes, profile_opt_shapes; + + // incase the EP context is dumped the engine cache has to be enabled + auto enable_engine_cache_for_ep_context_model = [this]() { + if (dump_ep_context_model_ && ep_context_embed_mode_ == 0) { + engine_cache_enable_ = true; + } + }; + + // Get environment variables + if (info.has_trt_options) { + max_partition_iterations_ = info.max_partition_iterations; + min_subgraph_size_ = info.min_subgraph_size; + max_workspace_size_ = info.max_workspace_size; + dump_subgraphs_ = info.dump_subgraphs; + weight_stripped_engine_enable_ = info.weight_stripped_engine_enable; + onnx_model_folder_path_ = info.onnx_model_folder_path; + onnx_model_bytestream_ = info.onnx_bytestream; + onnx_model_bytestream_size_ = info.onnx_bytestream_size; + if ((onnx_model_bytestream_ != nullptr && onnx_model_bytestream_size_ == 0) || + (onnx_model_bytestream_ == nullptr && onnx_model_bytestream_size_ != 0)) { + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "When providing either 'trt_onnx_bytestream_size' or " + "'trt_onnx_bytestream' both have to be provided")); + } + detailed_build_log_ = info.detailed_build_log; + dump_ep_context_model_ = info.dump_ep_context_model; + ep_context_file_path_ = info.ep_context_file_path; + ep_context_embed_mode_ = info.ep_context_embed_mode; + enable_engine_cache_for_ep_context_model(); + cache_prefix_ = info.engine_cache_prefix; + // use a more global cache if given + engine_decryption_enable_ = info.engine_decryption_enable; + if (engine_decryption_enable_) { + engine_decryption_lib_path_ = info.engine_decryption_lib_path; + } + force_sequential_engine_build_ = info.force_sequential_engine_build; + context_memory_sharing_enable_ = info.context_memory_sharing_enable; + sparsity_enable_ = info.sparsity_enable; + auxiliary_streams_ = info.auxiliary_streams; + profile_min_shapes = info.profile_min_shapes; + profile_max_shapes = info.profile_max_shapes; + profile_opt_shapes = info.profile_opt_shapes; + cuda_graph_enable_ = info.cuda_graph_enable; + op_types_to_exclude_ = info.op_types_to_exclude; + } else { + LOGS_DEFAULT(INFO) << "[Nv EP] Options were not specified"; + } + + // Validate setting + if (max_partition_iterations_ <= 0) { + // LOGS_DEFAULT(WARNING) << "[Nv EP] TensorRT option nv_max_partition_iterations must be a positive integer value. Set it to 1000"; + max_partition_iterations_ = 1000; + } + if (min_subgraph_size_ <= 0) { + // LOGS_DEFAULT(WARNING) << "[Nv EP] TensorRT option nv_min_subgraph_size must be a positive integer value. 
Set it to 1"; + min_subgraph_size_ = 1; + } + + // If ep_context_file_path_ is provided as a directory, create it if it's not existed + if (dump_ep_context_model_ && !ep_context_file_path_.empty() && std::filesystem::path(ep_context_file_path_).extension().empty() && !std::filesystem::is_directory(ep_context_file_path_)) { + if (!std::filesystem::create_directory(ep_context_file_path_)) { + throw std::runtime_error("Failed to create directory " + ep_context_file_path_); + } + } + + // If dump_ep_context_model_ is enabled, TRT EP forces cache_path_ to be the relative path of ep_context_file_path_. + // For example, + // - original cache path = "engine_cache_dir" -> new cache path = "./context_model_dir/engine_cache_dir" + // - original cache path = "" -> new cache path = "./context_model_dir" + // The new cache path will be saved as the "ep_cache_context" node attritue of the EP context node. + // For security reason, it needs to make sure the engine cache is saved inside context model directory. + if (dump_ep_context_model_) { + // TODO(maximilianm) not sure if this is still needed + engine_cache_enable_ = true; + if (IsAbsolutePath(cache_path_)) { + LOGS_DEFAULT(ERROR) << "In the case of dumping context model and for security purpose, the trt_engine_cache_path should be set with a relative path, but it is an absolute path: " << cache_path_; + } + if (IsRelativePathToParentPath(cache_path_)) { + LOGS_DEFAULT(ERROR) << "In the case of dumping context model and for security purpose, The trt_engine_cache_path has '..', it's not allowed to point outside the directory."; + } + + // Engine cache relative path to context model directory. + // It's used when dumping the "ep_cache_context" node attribute. + engine_cache_relative_path_to_context_model_dir = cache_path_; + + // Make cache_path_ to be the relative path of ep_context_file_path_ + cache_path_ = GetPathOrParentPathOfCtxModel(ep_context_file_path_).append(cache_path_).string(); + } + + if (engine_decryption_enable_) { + LIBTYPE handle = OPENLIB(engine_decryption_lib_path_.c_str()); + if (handle == nullptr) { + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP could not open shared library from " + engine_decryption_lib_path_)); + } + engine_decryption_ = (int (*)(const char*, char*, size_t*))LIBFUNC(handle, "decrypt"); + engine_encryption_ = (int (*)(const char*, char*, size_t))LIBFUNC(handle, "encrypt"); + if (engine_decryption_ == nullptr) { + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP could not find decryption function in shared library from " + engine_decryption_lib_path_)); + } + } + + // cuda graph: + // cudaStreamSynchronize() is not allowed in cuda graph capture. + // + // external stream: + // If user provides "external" cuda stream, only this cuda stream will be used even if multiple threads are running InferenceSession.Run() concurrently. + // So, no need to synchronize different streams after enqueueV3. 
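Editor's note: an illustrative sketch, not part of this change, of the external-stream contract described in the comment above. All names are placeholders and only standard CUDA runtime calls are used; the assumption is that the application hands its own stream to the session as the user compute stream.

// The application owns the stream, so work enqueued by the EP (enqueueV3, async copies)
// is ordered with the application's own work on that stream. The EP can therefore skip
// per-enqueue synchronization; the application synchronizes once, when it needs results.
static void UserComputeStreamSketch() {
  cudaStream_t app_stream = nullptr;
  cudaStreamCreate(&app_stream);
  // ... create the inference session with app_stream as the user compute stream and run it ...
  cudaStreamSynchronize(app_stream);  // single sync point, decided by the application
  cudaStreamDestroy(app_stream);
}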
+ if (cuda_graph_enable_ || external_stream_) { + sync_stream_after_enqueue_ = false; + } + + { + auto lock = GetApiLock(); + runtime_ = std::unique_ptr(nvinfer1::createInferRuntime(GetTensorrtLogger(detailed_build_log_))); + } + + trt_version_ = getInferLibVersion(); + CUDA_CALL_THROW(cudaRuntimeGetVersion(&cuda_version_)); + + LOGS_DEFAULT(VERBOSE) << "[Nv EP] TensorRT version is " << trt_version_; + LOGS_DEFAULT(VERBOSE) << "[Nv EP] CUDA version is " << cuda_version_; + + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Nv provider options: " + << "device_id: " << device_id_ + << ", nv_max_partition_iterations: " << max_partition_iterations_ + << ", nv_min_subgraph_size: " << min_subgraph_size_ + << ", nv_max_workspace_size: " << max_workspace_size_ + << ", nv_dump_subgraphs: " << dump_subgraphs_ + << ", nv_weight_stripped_engine_enable: " << weight_stripped_engine_enable_ + << ", nv_onnx_model_folder_path: " << onnx_model_folder_path_ + << ", nv_engine_decryption_enable: " << engine_decryption_enable_ + << ", nv_engine_decryption_lib_path: " << engine_decryption_lib_path_ + << ", nv_force_sequential_engine_build: " << force_sequential_engine_build_ + << ", nv_context_memory_sharing_enable: " << context_memory_sharing_enable_ + << ", nv_sparsity_enable: " << sparsity_enable_ + << ", nv_auxiliary_streams: " << auxiliary_streams_ + << ", nv_cuda_graph_enable: " << cuda_graph_enable_ + << ", nv_dump_ep_context_model: " << dump_ep_context_model_ + << ", nv_ep_context_file_path: " << ep_context_file_path_ + << ", nv_ep_context_embed_mode: " << ep_context_embed_mode_ + << ", nv_cache_prefix: " << cache_prefix_ + << ", nv_onnx_model_bytestream_size_: " << onnx_model_bytestream_size_ + << ", nv_op_types_to_exclude: " << op_types_to_exclude_; +} + +NvExecutionProvider::~NvExecutionProvider() { + // clean up thread local context caches + { + std::lock_guard lock(context_state_.mutex); + for (const auto& cache_weak : context_state_.caches_to_update_on_destruction) { + const auto cache = cache_weak.lock(); + if (!cache) continue; + ORT_IGNORE_RETURN_VALUE(cache->erase(this)); + } + } + + if (!external_stream_ && stream_) { + ORT_IGNORE_RETURN_VALUE(CUDA_CALL(cudaStreamDestroy(stream_))); + } + ReleaseTensorRTCustomOpDomainList(info_.custom_op_domain_list); + + if (alloc_ != nullptr) { + // This code is same as OrtApis::ReleaseAllocator defined in allocator_adapters.cc. + // We can't get api inside destructor so that's why we duplicate the code here. + delete static_cast(alloc_); + } +} + +bool NvExecutionProvider::IsGraphCaptureEnabled() const { + return cuda_graph_enable_; +} + +bool NvExecutionProvider::IsGraphCaptureAllowed() const { + return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_; +} + +void NvExecutionProvider::CaptureBegin(int) { + cuda_graph_.Reset(); + cuda_graph_.CaptureBegin(0); +} + +void NvExecutionProvider::CaptureEnd(int) { + cuda_graph_.CaptureEnd(0); + is_graph_captured_ = true; +} + +bool NvExecutionProvider::IsGraphCaptured(int) const { + return is_graph_captured_; +} + +Status NvExecutionProvider::ReplayGraph(int) { + ORT_ENFORCE(IsGraphCaptured(0)); + // Please note that CUDAGraph::Replay() is not thread safe. + // ORT TRT calls ReplayGraph() in compute_func() where synchronization is enforced due to lock_guard(), + // therefore calling CUDAGraph::Replay() here is guaranteed to be thread safe. 
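Editor's note: a small sketch, not part of this change, of the serialization the comment above relies on, assuming the CUDAGraph helper this file already uses. The mutex and function names stand in for the per-node locking performed inside compute_func().

// CUDAGraph::Replay() is not thread safe, so every replay must happen under the same
// lock that guards the rest of the run for that fused node.
static std::mutex g_run_mutex;  // stands in for the lock taken inside compute_func()
static Status GuardedReplaySketch(CUDAGraph& cuda_graph) {
  std::lock_guard<std::mutex> lock(g_run_mutex);
  return cuda_graph.Replay(0);  // safe: no concurrent Replay()/CaptureBegin()/CaptureEnd()
}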
+ return cuda_graph_.Replay(0); +} + +void NvExecutionProvider::IncrementRegularRunCountBeforeGraphCapture() { + // Please note that this function is not thread safe. + // ORT TRT calls this function in compute_func() where synchronization is enforced due to lock_guard(), + // therefore following increment is guaranteed to be thread safe. + ++regular_run_count_before_graph_capture_; +} + +std::vector NvExecutionProvider::CreatePreferredAllocators() { + AllocatorCreationInfo default_memory_info( + [](OrtDevice::DeviceId device_id) { return CreateCUDAAllocator(device_id, onnxruntime::CUDA); }, + narrow(device_id_)); + + AllocatorCreationInfo pinned_allocator_info( + [](OrtDevice::DeviceId device_id) { + ORT_UNUSED_PARAMETER(device_id); + return CreateCUDAPinnedAllocator(onnxruntime::CUDA_PINNED); + }, + 0); + + return std::vector{CreateAllocator(default_memory_info), CreateAllocator(pinned_allocator_info)}; +} + +std::unique_ptr NvExecutionProvider::GetDataTransfer() const { + return onnxruntime::CreateGPUDataTransfer(); +} + +Status NvExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { + return Status::OK(); +} + +Status NvExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { + if (sync_stream && external_stream_) { + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); + } + return Status::OK(); +} + +// Get the pointer to the IBuilder instance. +// Note: This function is not thread safe. Calls to this function from different threads must be serialized +// even though it doesn't make sense to have multiple threads initializing the same inference session. +nvinfer1::IBuilder* NvExecutionProvider::GetBuilder(TensorrtLogger& trt_logger) const { + if (!builder_) { + { + auto lock = GetApiLock(); + builder_ = std::unique_ptr(nvinfer1::createInferBuilder(trt_logger)); + } + } + return builder_.get(); +} + +void NvExecutionProvider::GetCustomOpDomainList(std::vector& custom_op_domain_list) const { + std::string extra_plugin_lib_paths{""}; + if (info_.has_trt_options) { + if (!info_.extra_plugin_lib_paths.empty()) { + extra_plugin_lib_paths = info_.extra_plugin_lib_paths; + } + } + auto status = CreateTensorRTCustomOpDomainList(custom_op_domain_list, extra_plugin_lib_paths); + if (status != Status::OK()) { + LOGS_DEFAULT(WARNING) << "[Nv EP] Failed to get TRT plugins from TRT plugin registration."; + } +} + +// Check the graph is the subgraph of control flow op +bool NvExecutionProvider::IsSubGraphOfControlFlowOp(const GraphViewer& graph) const { + if (graph.IsSubgraph()) { + const auto& node = graph.ParentNode(); + if (control_flow_op_set_.find(node->OpType()) != control_flow_op_set_.end()) { + return true; + } + } + return false; +} + +// Check whether all the nodes of the graph are assigned to specific ep +bool NvExecutionProvider::AllNodesAssignedToSpecificEP(const GraphViewer& graph, const std::string& provider_type) const { + const int number_of_ort_nodes = graph.NumberOfNodes(); + std::vector nodes_vector(number_of_ort_nodes); + std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0); + const std::vector& node_index = graph.GetNodesInTopologicalOrder(1 /*priority-based topological sort*/); + for (const auto& index : nodes_vector) { + const auto& node = graph.GetNode(node_index[index]); + if (node->GetExecutionProviderType() != provider_type) { + return false; + } + } + + return number_of_ort_nodes != 0; +} + +// Check whether all the nodes of subgraph are supported +bool 
NvExecutionProvider::IsSubGraphFullySupported(SubGraphCollection_t supported_nodes_vector, const int number_of_ort_nodes) const { + int number_of_trt_nodes = 0; + for (const auto& group : supported_nodes_vector) { + if (!group.first.empty()) { + number_of_trt_nodes += static_cast(group.first.size()); + } + } + + return number_of_trt_nodes == number_of_ort_nodes; +} + +std::unique_ptr NvExecutionProvider::GetSubGraph(SubGraph_t graph_nodes_index, const GraphViewer& graph, const HashValue& model_hash, int subgraph_index) const { + const std::vector& node_index = graph.GetNodesInTopologicalOrder(1 /*priority-based topological sort*/); + std::unordered_set node_set; + node_set.reserve(graph_nodes_index.first.size()); + for (const auto& index : graph_nodes_index.first) { + node_set.insert(node_index[index]); + } + + // Get parent graph output names + std::unordered_set graph_output_names; + for (const auto* output_arg : graph.GetOutputs()) { + graph_output_names.insert(output_arg->Name()); + } + + // Find inputs and outputs of the subgraph + std::unique_ptr sub_graph = onnxruntime::IndexedSubGraph::Create(); + std::unordered_map original_inputs, fused_inputs, fused_outputs, fused_outputs_to_add, graph_outputs_to_add; + std::unordered_set erased; + int input_order = 0; + int output_order = 0; + + std::vector initializers; + for (const auto& index : graph_nodes_index.first) { + sub_graph->Nodes().push_back(node_index[index]); + const auto& node = graph.GetNode(node_index[index]); + for (const auto& input : node->InputDefs()) { + if (graph.IsConstantInitializer(input->Name(), true)) { + initializers.push_back(input->Name()); + continue; + } + const auto& it = fused_outputs.find(input); + if (it != fused_outputs.end()) { + fused_outputs.erase(it); + erased.insert(input); + } else if (erased.find(input) == erased.end()) { + // Only when input is neither in output list nor erased list, add the input to input list + fused_inputs[input] = input_order++; + } + } + + for (const auto& input : node->ImplicitInputDefs()) { + if (graph.IsConstantInitializer(input->Name(), true)) { + initializers.push_back(input->Name()); + continue; + } + const auto& it = fused_outputs.find(input); + if (it != fused_outputs.end()) { + fused_outputs.erase(it); + erased.insert(input); + } else if (erased.find(input) == erased.end()) { + // Only when input is neither in output list nor erased list, add the input to input list + fused_inputs[input] = input_order++; + } + } + + // For output searching, there are two special cases, + // One is, if node's OutputEdges are more than its outputs, meaning certain output is used more than once, + // if the output is connected to nodes that don't belong to the subgraph, the output need to be added + // to the output list + // The other one is, if subgraph's node output is parent graph's output. the node output should + // be also added to the subgraph's output list + if (node->GetOutputEdgesCount() > node->OutputDefs().size()) { + for (auto it = node->OutputEdgesBegin(), end = node->OutputEdgesEnd(); it != end; ++it) { + const auto& node_idx = it->GetNode().Index(); + const onnxruntime::NodeArg* output; + // The dst_arg_index from GetDstArgIndex() could be the index for explicit/implicit input defs of the node. + // We need to get the correct input index accordingly. 
(See Graph::BuildConnections() in graph.cc for more details) + if (it->GetDstArgIndex() < static_cast(it->GetNode().InputDefs().size())) { + output = (it->GetNode()).InputDefs()[it->GetDstArgIndex()]; + } else { + output = (it->GetNode()).ImplicitInputDefs()[it->GetDstArgIndex() - static_cast(it->GetNode().InputDefs().size())]; + } + if (node_set.find(node_idx) != node_set.end()) { + const auto& iter = fused_inputs.find(output); + if (iter != fused_inputs.end()) { + fused_inputs.erase(iter); + erased.insert(output); + } else if (erased.find(output) == erased.end()) { + if (graph_output_names.find(output->Name()) != graph_output_names.end()) { + graph_outputs_to_add[output] = output_order; + } + fused_outputs[output] = output_order++; + } + } else { + fused_outputs_to_add[output] = output_order++; + } + } + } else { + for (const auto& output : node->OutputDefs()) { + const auto& it = fused_inputs.find(output); + if (it != fused_inputs.end()) { + fused_inputs.erase(it); + erased.insert(output); + } + // Only when output is neither in input list nor erased list, add the output to output list + else if (erased.find(output) == erased.end()) { + if (graph_output_names.find(output->Name()) != graph_output_names.end()) { + graph_outputs_to_add[output] = output_order; + } + fused_outputs[output] = output_order++; + } + } + } + } + + fused_outputs.insert(fused_outputs_to_add.begin(), fused_outputs_to_add.end()); + fused_outputs.insert(graph_outputs_to_add.begin(), graph_outputs_to_add.end()); + + std::multimap inputs, outputs; + + // Get the input order of the original graph + int order = 0; + for (const auto* input : graph.GetInputs()) { + original_inputs[input] = order++; + } + + // input order needs to be consistent with original graph's input order + for (auto it = fused_inputs.begin(), end = fused_inputs.end(); it != end; ++it) { + const auto& iter = original_inputs.find(it->first); + if (iter != original_inputs.end()) { + inputs.insert(std::pair(iter->second, iter->first)); + } else { + inputs.insert(std::pair(it->second, it->first)); + } + } + + // Sort outputs by the order they were added + for (auto it = fused_outputs.begin(), end = fused_outputs.end(); it != end; ++it) { + outputs.insert(std::pair(it->second, it->first)); + } + + // Generate unique kernel name for TRT subgraph + std::string subgraph_id = std::to_string(model_hash) + "_" + std::to_string(subgraph_index); + auto meta_def = IndexedSubGraph_MetaDef::Create(); + const std::string graph_type = graph.IsSubgraph() ? 
"subgraph" : "graph"; + meta_def->name() = "TRTKernel_" + graph_type + "_" + graph.Name() + "_" + subgraph_id; + LOGS_DEFAULT(INFO) << "[Nv EP] TensorRT subgraph MetaDef name " + meta_def->name(); + + // Assign inputs and outputs to subgraph's meta_def + for (const auto& input : inputs) { + if (input.second->Exists()) { + meta_def->inputs().push_back(input.second->Name()); + } + } + + for (const auto& initializer : initializers) { + meta_def->constant_initializers().push_back(initializer); + } + + for (const auto& output : outputs) { + if (output.second->Exists()) { + meta_def->outputs().push_back(output.second->Name()); + } + } + + meta_def->domain() = kMSDomain; + meta_def->since_version() = 1; + sub_graph->SetMetaDef(std::move(meta_def)); + + return sub_graph; +} + +SubGraphCollection_t NvExecutionProvider::GetSupportedList(SubGraphCollection_t nodes_vector_input, int iterations, const int max_iterations, + const GraphViewer& graph, bool* early_termination) const { + // Return if iterations are exceeding predefined number + SubGraphCollection_t nodes_list_output; + if (iterations > max_iterations) { + *early_termination = true; + return nodes_list_output; + } + + // Get parent graph output names + std::unordered_set graph_output_names; + for (const auto* output_arg : graph.GetOutputs()) { + graph_output_names.insert(output_arg->Name()); + } + + iterations++; + const std::vector& node_index = graph.GetNodesInTopologicalOrder(1 /*priority-based topological sort*/); + for (const auto& group : nodes_vector_input) { + // Construct subgraph + if (!group.first.empty()) { + if (group.second) { + nodes_list_output.push_back(group); + } else { + auto model_build = graph.CreateModel(*GetLogger()); + auto& graph_build = model_build->MainGraph(); + bool has_control_flow_op = false; + + // Add node and node args + // If node output is also parent graph output, the output will be added to the + // subgraph's output list + std::vector subgraph_output_names; + for (const auto& index : group.first) { + const auto& node = graph.GetNode(node_index[index]); + std::vector inputs, outputs; + for (auto input : node->InputDefs()) { + auto& n_input = graph_build.GetOrCreateNodeArg(input->Name(), input->TypeAsProto()); + inputs.push_back(&n_input); + const ONNX_NAMESPACE::TensorProto* initializer = nullptr; + if (graph.GetInitializedTensor(input->Name(), initializer)) { + const ONNX_NAMESPACE::TensorProto* subgraph_initializer = nullptr; + if (!graph_build.GetInitializedTensor(input->Name(), subgraph_initializer)) { + graph_build.AddInitializedTensor(*(initializer)); + } + } + } + + for (auto input : node->ImplicitInputDefs()) { + const ONNX_NAMESPACE::TensorProto* initializer = nullptr; + if (graph.GetInitializedTensor(input->Name(), initializer)) { + const ONNX_NAMESPACE::TensorProto* subgraph_initializer = nullptr; + if (!graph_build.GetInitializedTensor(input->Name(), subgraph_initializer)) { + graph_build.AddInitializedTensor(*(initializer)); + } + } + } + for (auto output : node->OutputDefs()) { + auto& n_output = graph_build.GetOrCreateNodeArg(output->Name(), output->TypeAsProto()); + outputs.push_back(&n_output); + const auto name = output->Name(); + if (graph_output_names.find(name) != graph_output_names.end()) { + subgraph_output_names.push_back(name); + } + } + + if (control_flow_op_set_.find(node->OpType()) != control_flow_op_set_.end()) { + has_control_flow_op = true; + } + + // If the node has subgraph, it's possible that the ORT graph of that subgraph and the GraphProto in the node attributes are 
not in sync because of graph optimization. + // Therefore, we need to force GraphProto attributes to be updated in order to get the valid GraphProto. + if (node->GetAttributes().size() > 0) { + auto node_proto = ONNX_NAMESPACE::NodeProto::Create(); + // we need to update any GraphProto attributes for subgraphs so that any changes made by things + // such as the optimizers are captured. otherwise we can end up saving an invalid graph. + node->ToProto(*node_proto, /* update_subgraphs */ true); + const int num_attributes = node_proto->attribute_size(); + auto node_attributes = ONNX_NAMESPACE::NodeAttributes::Create(); + node_attributes->reserve(num_attributes); + + for (int i = 0; i < num_attributes; ++i) { + auto& attr = node_proto->attribute(i); + node_attributes->emplace(attr.name(), attr); + } + + // The GraphProto attributes are the updated ones. + graph_build.AddNode(node->Name(), node->OpType(), node->Description(), inputs, outputs, node_attributes.get(), node->Domain()); + } else { + // The GraphProto attributes are the original ones. + graph_build.AddNode(node->Name(), node->OpType(), node->Description(), inputs, outputs, &node->GetAttributes(), node->Domain()); + } + } + + // Only if the newly built graph has control flow op as well as it has parent node, + // it needs to handle outer scope values before calling graph.Resolve(). + if (has_control_flow_op && graph.ParentNode()) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Handle outer scope values for the subgraph " << graph_build.Name(); + BuildSubGraphContext(graph_build); + SetGraphOuterScopeValuesAndInputs(graph_build, graph.GetGraph()); + SetAllGraphInputs(graph_build); + } + + ORT_ENFORCE(graph_build.Resolve().IsOK()); + + // Add parent graph output to the subgraph + int i = 0; + std::vector subgraph_outputs; + subgraph_outputs.resize(subgraph_output_names.size()); + for (auto& name : subgraph_output_names) { + auto output_arg = graph.GetNodeArg(name); + auto& subgraph_output_arg = graph_build.GetOrCreateNodeArg(output_arg->Name(), output_arg->TypeAsProto()); + subgraph_outputs[i] = &subgraph_output_arg; + ++i; + } + auto& graph_build_outputs = graph_build.GetOutputs(); + subgraph_outputs.insert(subgraph_outputs.begin(), graph_build_outputs.begin(), graph_build_outputs.end()); + graph_build.SetOutputs(graph_build_outputs); + ORT_ENFORCE(graph_build.Resolve().IsOK()); + + // Check if input tensors have shapes + if (iterations > 1) { + auto graph_inputs = graph_build.GetInputs(); + for (auto input_arg : graph_inputs) { + bool has_dim_value_or_param = true; + auto input_shape = input_arg->Shape(); + if (input_shape != nullptr) { + auto dim_size = input_shape->dim_size(); + for (int i = 0; i < dim_size; ++i) { + auto& dim = input_shape->dim(i); + if (!dim.has_dim_value() && !dim.has_dim_param()) { + has_dim_value_or_param = false; + break; + } + } + } + + if (input_shape == nullptr || !has_dim_value_or_param) { + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "TensorRT input: " + input_arg->Name() + " has no shape specified. " + + "Please run shape inference on the onnx model first. Details can be found in " + + "https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html#shape-inference-for-tensorrt-subgraphs")); + } + } + } + + // Serialize modelproto to string + auto graph_viewer = graph_build.CreateGraphViewer(); + auto model = graph_viewer->CreateModel(*GetLogger()); + auto model_proto = model->ToProto(); + + // ORT's default topological sort is using reversed DFS. 
+ // When creating model proto from graph viewer, let ORT use priority-based topological sort based on node index. + // The reason is, in some cases, for example ResNet50, using default topological sort will end up with generating + // the model proto that has different node ordering compared to original onnx model. + graph_viewer->ToProto(*model_proto->mutable_graph(), true, true, 1 /*priority-based topological sort*/); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + + std::string string_buf; + model_proto->SerializeToString(string_buf); + + if (dump_subgraphs_) { + // Dump TensorRT subgraph for debugging + std::fstream dump("NvExecutionProvider_TRT_Subgraph.onnx", std::ios::out | std::ios::trunc | std::ios::binary); + model_proto->SerializeToOstream(dump); + } + + // Get supported node list recursively + SubGraphCollection_t parser_nodes_list; + TensorrtLogger& trt_logger = GetTensorrtLogger(detailed_build_log_); + auto trt_builder = GetBuilder(trt_logger); + auto network_flags = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kSTRONGLY_TYPED); + auto trt_network = std::unique_ptr(trt_builder->createNetworkV2(network_flags)); + + // limit the scope of trt_parser so that model gets unloaded from memory asap + { + auto trt_parser = tensorrt_ptr::unique_pointer(nvonnxparser::createParser(*trt_network, trt_logger)); + + auto is_model_supported = trt_parser->supportsModelV2(string_buf.data(), string_buf.size(), model_path_); + + // Note: Calling getNbSubgraphs or getSubgraphNodes before calling supportsModelV2 results in undefined behavior. + auto num_subgraphs = trt_parser->getNbSubgraphs(); + parser_nodes_list.reserve(num_subgraphs); + + for (int64_t i = 0; i < num_subgraphs; ++i) { + int64_t subgraph_len = 0; + int64_t* nodes = trt_parser->getSubgraphNodes(i, subgraph_len); + parser_nodes_list.emplace_back(); + parser_nodes_list.back().first.reserve(subgraph_len); + for (int64_t j = 0; j < subgraph_len; ++j) { + parser_nodes_list.back().first.push_back(nodes[j]); + } + parser_nodes_list.back().second = is_model_supported ? true : false; + } + } + SubGraphCollection_t next_nodes_list; + const std::vector& subgraph_node_index = graph_viewer->GetNodesInTopologicalOrder(1 /*priority-based topological sort*/); + next_nodes_list = GetSupportedList(parser_nodes_list, iterations, max_iterations, *graph_viewer, early_termination); + for (size_t i = 0, end = next_nodes_list.size(); i < end; ++i) { + for (size_t j = 0, end = next_nodes_list[i].first.size(); j < end; ++j) { + /* + * Convert the supported node list returning from onnx-tensorrt parser to the node list recognized by ORT TRT. + * + * TRT EP reconstructs the graph based on the nodes in group.first and feeds this graph (converts to model proto and to string buffer) to onnx-tensorrt parser. + * The node index in the list returning from onnx-tensorrt parser might not be the same as the node index in group.first. Therefore, TRT EP needs a node index mapping table here. + * + * The order of iterating the nodes in group.first and calling graph_build.AddNode() determines the node order in the newly constructed graph (see Graph::AllocateNode() in graph.cc), + * however, once the graph is converted to model proto, the node proto order in model proto (ex: onnx-tensorrt calls model.graph().node() to iterate NodeProto in ModelProto) is decided by topo sort. + * + * The topo sort list (i.e. 
subgraph_node_index) acts as the node index mapping table: + * subgraph_node_index[node index from onnx-tensorrt parser] = index in group.first + * + * In the past, TRT EP uses ORT's default reversed DFS topo sort which might end up with the sorting result not sequence of 0, 1, ... n-1, ex: the subgraph_node_index = [0,2,1,3,4]. + * With the change of using ORT's priority-based topo sort (node with lower node index outputs first) the sorting result is the sequence of 0, 1, ... n-1 for most of the cases, + * therefore subgraph_node_index as a mapping table is not needed anymore. + * + * TODO: Remove the subgraph_node_index + */ + next_nodes_list[i].first[j] = group.first[subgraph_node_index[next_nodes_list[i].first[j]]]; + } + nodes_list_output.push_back(next_nodes_list[i]); + } + } + } + } + return nodes_list_output; +} + +// Detect and remove cycles from supported node list +bool NvExecutionProvider::DetectTensorRTGraphCycles(SubGraphCollection_t& supported_nodes_vector, const GraphViewer& graph, const HashValue& model_hash, bool remove_cycles) const { + const std::vector& node_index = graph.GetNodesInTopologicalOrder(1 /*priority-based topological sort*/); + bool trt_cycle = true, cycle_detected = false; + while (trt_cycle) { + trt_cycle = false; + std::unordered_map node_to_index_map; + std::unordered_map index_to_node_map; + std::unordered_map> input_to_nodes_map, node_to_outputs_map; + std::unordered_set non_trt_node_index(node_index.begin(), node_index.end()); + size_t id = 0; + int subgraph_index = 0; + for (const auto& group : supported_nodes_vector) { + if (!group.first.empty()) { + // Construct subgraph from node list + std::unique_ptr sub_graph = GetSubGraph(group, graph, model_hash, subgraph_index); + + // Create node to inputs/outputs/index maps + const auto& meta_def = sub_graph->GetMetaDef(); + const std::string node_name = meta_def->name(); + if (node_to_index_map.find(node_name) == node_to_index_map.end()) { + index_to_node_map[id] = node_name; + node_to_index_map[node_name] = id++; + } + + if (meta_def != nullptr) { + for (const auto& input : meta_def->inputs()) { + input_to_nodes_map[input].insert(node_name); + } + for (const auto& output : meta_def->outputs()) { + node_to_outputs_map[node_name].insert(output); + } + } + + // Remove TensorRT nodes from node index list + for (const auto& index : group.first) { + non_trt_node_index.erase(node_index[index]); + } + subgraph_index++; + } + } + + // Add non TensorRT nodes to the maps + for (const auto& index : non_trt_node_index) { + const auto& node = graph.GetNode(index); + const std::string node_name = node->Name(); + if (node_to_index_map.find(node_name) == node_to_index_map.end()) { + index_to_node_map[id] = node_name; + node_to_index_map[node_name] = id++; + } + + for (const auto& input : node->InputDefs()) { + input_to_nodes_map[input->Name()].insert(node_name); + } + + for (const auto& input : node->ImplicitInputDefs()) { + input_to_nodes_map[input->Name()].insert(node_name); + } + + for (const auto& output : node->OutputDefs()) { + node_to_outputs_map[node_name].insert(output->Name()); + } + } + + // Create adjacency list + size_t graph_size = node_to_index_map.size(); + std::list* adjacency_map = new std::list[graph_size]; + for (const auto& node : node_to_outputs_map) { + for (auto iter = node.second.begin(); iter != node.second.end(); ++iter) { + const auto& loc = input_to_nodes_map.find(*iter); + if (loc != input_to_nodes_map.end()) { + size_t parent_node_index = node_to_index_map.find(node.first)->second; + for 
(auto child_node : loc->second) { + size_t child_node_index = node_to_index_map.find(child_node)->second; + adjacency_map[parent_node_index].push_back(child_node_index); + } + } + } + } + + // Check cycle in the graph + bool* visited = new bool[graph_size]; + bool* st = new bool[graph_size]; + for (size_t i = 0; i < graph_size; ++i) { + visited[i] = false; + st[i] = false; + } + + std::vector cycles; + bool has_cycle = false; + for (size_t i = 0; i < graph_size; ++i) { + if (FindCycleHelper(i, adjacency_map, visited, st, cycles)) { + has_cycle = true; + cycle_detected = true; + break; + } + } + + // Remove TensorRT subgraph from the supported node list if it's part of the cycle + if (has_cycle && remove_cycles) { + for (size_t i = 0; i < cycles.size(); ++i) { + auto loc = index_to_node_map.find(cycles[i]); + if (loc != index_to_node_map.end() && loc->second.find("TRTKernel") != std::string::npos) { + supported_nodes_vector.erase(supported_nodes_vector.begin() + cycles[i]); + trt_cycle = true; + break; + } + } + } + + delete[] adjacency_map; + delete[] visited; + delete[] st; + } + return cycle_detected; +} + +std::vector> +NvExecutionProvider::GetCapability(const GraphViewer& graph, + const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /*graph_optimizer_registry*/, + IResourceAccountant* /* resource_accountant */) const { + // Construct subgraph capability from node list + std::vector> result; + // Get ModelPath + const auto& path_string = graph.ModelPath().string(); +#ifdef _WIN32 + strncpy_s(model_path_, path_string.c_str(), sizeof(model_path_) - 1); +#else + strncpy(model_path_, path_string.c_str(), sizeof(model_path_) - 1); +#endif + model_path_[sizeof(model_path_) - 1] = '\0'; + + // If the model consists of only a single "EPContext" contrib op, it means TRT EP can fetch the precompiled engine info from the node and + // load the engine directly without having to go through the processes of graph proto reconstruction, calling TRT parser and engine compilation. + // So, simply return the ComputeCapability here. + if (graph.NumberOfNodes() == 1 && GraphHasCtxNode(graph)) { + SubGraph_t supported_node_vector = {{0}, true}; + std::unique_ptr sub_graph = GetSubGraph(supported_node_vector, graph, TRTGenerateId(graph, std::to_string(trt_version_), std::to_string(cuda_version_)), 0); + result.push_back(ComputeCapability::Create(std::move(sub_graph))); + return result; + } + + // Generate unique kernel name for TRT graph + HashValue model_hash = TRTGenerateId(graph, std::to_string(trt_version_), std::to_string(cuda_version_)); + + // Get supported node list from TensorRT parser + const int number_of_ort_nodes = graph.NumberOfNodes(); + std::vector nodes_vector(number_of_ort_nodes); + std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0); + + auto get_exclude_ops_set = [&](std::string node_list_to_exclude) -> std::set { + std::set set; + if (!node_list_to_exclude.empty()) { + std::stringstream node_list(node_list_to_exclude); + std::string node; + while (std::getline(node_list, node, ',')) { + set.insert(node); + } + } + return set; + }; + + auto exclude_ops_set = get_exclude_ops_set(op_types_to_exclude_); + + SubGraphCollection_t parser_nodes_vector, supported_nodes_vector; + const std::vector& node_index = graph.GetNodesInTopologicalOrder(1 /*priority-based topological sort*/); + bool new_subgraph = true; + + /* Iterate all the nodes and exclude the node if: + * 1. It's a control flow op and its subgraph(s) is not fully TRT eligible. + * 2. It's a DDS op. 
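+ * 3. It's in the configured exclude list (op_types_to_exclude_) or belongs to the contrib-op domain (kMSDomain).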
+ */ + for (const auto& index : nodes_vector) { + const auto& node = graph.GetNode(node_index[index]); + bool supported_node = true; + + /* If current node is control flow op, we take different approach based on following four cases: + * + * (1) control flow op is supported by TRT, and its subgraphs are all supported by TRT. Assign this node to TRT. + * (2) control flow op is supported by TRT, but not all its subgraphs supported by TRT. Don't assign this node to TRT. + * (3) control flow op is not supported by TRT, but its subgraphs all supported by TRT. Don't assign this node to TRT. + * (4) control flow op is not supported by TRT, and not all its subgraphs supported by TRT. Don't assign this node to TRT. + * + * For cases 2, 3, 4, even though the control flow op is not assigned to TRT, any portion of its subgraphs that can run in TRT will be still fused and assigned to TRT EP. + */ + if (control_flow_op_set_.find(node->OpType()) != control_flow_op_set_.end()) { + auto supported_control_flow_op = [&](const Node* node) { + auto sub_graphs = node->GetSubgraphs(); + if (sub_graphs.size() != 0) { + for (auto sub_graph : sub_graphs) { + // TRT EP should consider the empty subgraph is fully supported by TRT. + if (sub_graph->CreateGraphViewer()->NumberOfNodes() == 0) { + continue; + } + if (!AllNodesAssignedToSpecificEP(*(sub_graph->CreateGraphViewer()), kNvTensorRTRTXExecutionProvider)) { + // if not all its subgraphs are supported, we need to exclude this control flow op + return false; + } + } + } + return true; + }; + supported_node = supported_control_flow_op(node); + } + + // Exclude any ops, if applicable + if (exclude_ops_set.find(node->OpType()) != exclude_ops_set.end()) { + supported_node = false; + } + // Exclude contrib ops + if (node->Domain() == kMSDomain) { + supported_node = false; + } + + if (supported_node) { + if (new_subgraph) { + parser_nodes_vector.emplace_back(); + // Mark all new graphs as "UnKnown" which will later be parsed by TRT parser + parser_nodes_vector.back().second = false; + new_subgraph = false; + } + parser_nodes_vector.back().first.emplace_back(index); + } else { + new_subgraph = true; + } + } + + bool early_termination = false; + supported_nodes_vector = GetSupportedList(parser_nodes_vector, 0, max_partition_iterations_, graph, &early_termination); + if (early_termination) { + supported_nodes_vector.clear(); + } + + // Remove subgraphs if its size is less than the predefined minimal size + for (auto it = supported_nodes_vector.begin(); it != supported_nodes_vector.end(); ++it) { + const size_t subgraph_size = it->first.size(); + if (subgraph_size < min_subgraph_size_) { + supported_nodes_vector.erase(it--); + } + } + + // Detect and remove cycles from supported node list + DetectTensorRTGraphCycles(supported_nodes_vector, graph, model_hash); + + // Consolidate supported node list + if (supported_nodes_vector.size() > 1) { + nodes_vector.clear(); + for (const auto& group : supported_nodes_vector) { + if (!group.first.empty()) { + nodes_vector.insert(nodes_vector.end(), group.first.begin(), group.first.end()); + } + } + SubGraphCollection_t consolidated_supported_nodes_vector = {{nodes_vector, true}}; + if (DetectTensorRTGraphCycles(consolidated_supported_nodes_vector, graph, model_hash, false)) { + LOGS_DEFAULT(INFO) << "[Nv EP] TensorRT nodes are not consolidated because graph will have cycles after consolidation"; + } else { + LOGS_DEFAULT(INFO) << "[Nv EP] TensorRT nodes are consolidated into one subgraph"; + supported_nodes_vector = 
consolidated_supported_nodes_vector; + } + } + + // Handle the case where the graph is subgraph of control flow op. + // The purpose is to make control flow op as well as its subgraphs run on TRT. + // Here we need to check whether subgraph is fully supported by TRT and don't fuse the nodes of the subgraph until control flow op level. + if (IsSubGraphOfControlFlowOp(graph) && IsSubGraphFullySupported(supported_nodes_vector, number_of_ort_nodes)) { + const std::vector& node_index = graph.GetNodesInTopologicalOrder(1 /*priority-based topological sort*/); + bool all_subgraphs_are_supported = true; + + // "If" control flow op has two subgraph bodies, "then" body and "else" body respectively. + // Check its parent node's another subgraph to see whether that subgraph is also fully supported by TRT. + if (graph.ParentNode()->OpType() == "If") { + all_subgraphs_are_supported = false; + SubGraphCollection_t subgraph_supported_nodes_vector; + auto sub_graphs = graph.ParentNode()->GetSubgraphs(); + for (auto sub_graph : sub_graphs) { + if (sub_graph.get() != &graph.GetGraph()) { + auto sub_graph_viewer = sub_graph->CreateGraphViewer(); + const int number_of_ort_subgraph_nodes = sub_graph_viewer->NumberOfNodes(); + std::vector subgraph_nodes_vector(number_of_ort_subgraph_nodes); + std::iota(std::begin(subgraph_nodes_vector), std::end(subgraph_nodes_vector), 0); + SubGraphCollection_t parser_subgraph_nodes_vector = {{subgraph_nodes_vector, false}}; + bool subgraph_early_termination = false; + + // Another subgraph of "If" control flow op has no nodes. + // In this case, TRT EP should consider this empty subgraph is fully supported by TRT. + if (sub_graph_viewer->NumberOfNodes() == 0) { + all_subgraphs_are_supported = true; + break; + } + // Another subgraph of "If" control flow op has been parsed by GetCapability before and all subgraph's nodes assigned to TRT EP. + else if (AllNodesAssignedToSpecificEP(*sub_graph_viewer, kNvTensorRTRTXExecutionProvider)) { + all_subgraphs_are_supported = true; + break; + } + // Another subgraph of "If" control flow has been parsed by GetCapability and not all subgraph's nodes assigned to TRT EP. + // (Note: GetExecutionProviderType() returns "" meaning node has not yet been assigned to any EPs) + else if (!AllNodesAssignedToSpecificEP(*sub_graph_viewer, "")) { + all_subgraphs_are_supported = false; + break; + } + + // Another subgraph of "If" control flow has not yet been parsed by GetCapability. + subgraph_supported_nodes_vector = GetSupportedList(parser_subgraph_nodes_vector, 0, max_partition_iterations_, *sub_graph_viewer, &subgraph_early_termination); + all_subgraphs_are_supported = IsSubGraphFullySupported(subgraph_supported_nodes_vector, number_of_ort_subgraph_nodes); + break; + } + } + } + + if (all_subgraphs_are_supported) { + // We want the subgraph nodes to be assigned to TRT EP but don't want them to be fused until later at the control flow op level. + // Simply request the subgraph nodes with a single ComputeCapability for each with no MetaDef (i.e. what the default implementation for IExecutionProvider::GetCapability does). 
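+ // Note: group.first holds positions in the priority-based topological order, so node_index[index]
+ // translates each position back to the original ORT NodeIndex before requesting the capability.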
+ for (const auto& group : supported_nodes_vector) { + if (!group.first.empty()) { + for (const auto& index : group.first) { + std::unique_ptr sub_graph = onnxruntime::IndexedSubGraph::Create(); + sub_graph->Nodes().push_back(node_index[index]); + result.push_back(ComputeCapability::Create(std::move(sub_graph))); + } + } + } + LOGS_DEFAULT(INFO) << "[Nv EP] Whole graph will run on Nv execution provider"; + + // The context map is only used during EP compile time, release it to save memory space. + subgraph_context_map_.clear(); + return result; + } + } + + int number_of_trt_nodes = 0, subgraph_index = 0; + for (const auto& group : supported_nodes_vector) { + if (!group.first.empty()) { + std::unique_ptr sub_graph = GetSubGraph(group, graph, model_hash, subgraph_index); + result.push_back(ComputeCapability::Create(std::move(sub_graph))); + number_of_trt_nodes += static_cast(group.first.size()); + subgraph_index++; + } + } + + const size_t number_of_subgraphs = supported_nodes_vector.size(); + if (number_of_trt_nodes == 0) { + LOGS_DEFAULT(WARNING) << "[Nv EP] No graph will run on Nv execution provider"; + } else if (number_of_trt_nodes == number_of_ort_nodes) { + LOGS_DEFAULT(INFO) << "[Nv EP] Whole graph will run on Nv execution provider"; + } else { + LOGS_DEFAULT(INFO) << "[Nv EP] Graph is partitioned and number of subgraphs running on Nv executio provider is " << number_of_subgraphs; + } + + // The context map is only used during EP compile time, release it to save memory space. + subgraph_context_map_.clear(); + return result; +} + +/** + * Refit the weight-stripped engine + */ +common::Status NvExecutionProvider::RefitEngine(std::string onnx_model_filename, + std::string& onnx_model_folder_path, + std::string& weight_stripped_engine_cath_path, + bool path_check, + const void* onnx_model_bytestream, + size_t onnx_model_bytestream_size, + nvinfer1::ICudaEngine* trt_engine, + bool serialize_refitted_engine, + bool detailed_build_log) { + bool refit_from_file = onnx_model_bytestream == nullptr && onnx_model_bytestream_size == 0; + std::filesystem::path onnx_model_path{onnx_model_folder_path}; + if (refit_from_file) { + if (!onnx_model_filename.empty()) { + onnx_model_path.append(onnx_model_filename); + } + if (onnx_model_path.empty()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The ONNX model was not provided as path. " + "Please use provide an ONNX bytestream to enable refitting the weightless engine."); + } else { + // check if file path to ONNX is legal + if (path_check && IsAbsolutePath(onnx_model_path.string())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "For security purpose, the ONNX model path should be set with " + "a relative path, but it is an absolute path: " + + onnx_model_path.string()); + } + if (path_check && IsRelativePathToParentPath(onnx_model_path.string())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The ONNX model path has '..'. 
For security purpose, it's not " + "allowed to point outside the directory."); + } + + if (!(std::filesystem::exists(onnx_model_path) && std::filesystem::is_regular_file(onnx_model_path))) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The ONNX model " + onnx_model_path.string() + + " does not exist."); + } + } + } + + // weight-stripped engine refit logic + TensorrtLogger& trt_logger = GetTensorrtLogger(detailed_build_log); + auto refitter = std::unique_ptr(nvinfer1::createInferRefitter(*trt_engine, trt_logger)); + auto parser_refitter = std::unique_ptr( + nvonnxparser::createParserRefitter(*refitter, trt_logger)); + if (refit_from_file) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Refitting from file on disk: " << onnx_model_path.string(); + if (!parser_refitter->refitFromFile(onnx_model_path.string().c_str())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP's IParserRefitter could not refit deserialized weight-stripped engine with weights contained in: " + onnx_model_path.string()); + } + } else { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Refitting from byte array"; + if (!parser_refitter->refitFromBytes(onnx_model_bytestream, onnx_model_bytestream_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP's IParserRefitter could not refit deserialized weight-stripped engine with weights contained in the provided bytestraem"); + } + } + if (refitter->refitCudaEngine()) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Successfully refitted the weight-stripped engine."; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP's IRefitter could not refit deserialized weight-stripped engine with weights contained in: " + onnx_model_path.string()); + } + + // serialize the refitted engine to disk + if (serialize_refitted_engine) { + std::string refitted_engine_cache = GetWeightRefittedEnginePath(weight_stripped_engine_cath_path); + nvinfer1::IHostMemory* serialized_engine = trt_engine->serialize(); + std::ofstream engine_file(refitted_engine_cache, std::ios::binary | std::ios::out); + engine_file.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Serialize the refitted engine to " << refitted_engine_cache; + } + return Status::OK(); +} + +common::Status NvExecutionProvider::Compile(const std::vector& fused_nodes_and_graphs, + std::vector& node_compute_funcs) { + for (auto& fused_node_graph : fused_nodes_and_graphs) { + const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; + const Node& fused_node = fused_node_graph.fused_node; + // Build map from input name to its index in input definitions + std::unordered_map input_map; + const auto& input_defs = fused_node.InputDefs(); + input_map.reserve(input_defs.size()); + for (size_t i = 0, end = input_defs.size(); i < end; ++i) { + input_map[input_defs[i]->Name()] = i; + } + + // Build map from output name to its index in output definitions + std::unordered_map output_map; + const auto& output_defs = fused_node.OutputDefs(); + output_map.reserve(output_defs.size()); + for (size_t i = 0, end = output_defs.size(); i < end; ++i) { + output_map[output_defs[i]->Name()] = i; + } + + Status status; + if (GraphHasCtxNode(graph_body_viewer)) { + status = CreateNodeComputeInfoFromPrecompiledEngine(graph_body_viewer, + fused_node, + input_map, + output_map, + node_compute_funcs); + } else { + status = CreateNodeComputeInfoFromGraph(graph_body_viewer, fused_node, input_map, output_map, node_compute_funcs); + } + if (status != Status::OK()) { + return 
ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage()); + } + } + return Status::OK(); +} + +Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& graph_body_viewer, + const Node& fused_node, + std::unordered_map& input_map, + std::unordered_map& output_map, + std::vector& node_compute_funcs) { + // Reconstruct graph proto from fused node's function body + auto model = graph_body_viewer.CreateModel(*GetLogger()); + auto model_proto = model->ToProto(); + + // ORT's default topological sort is using reversed DFS. + // When creating model proto from graph viewer, let ORT use priority-based topological sort based on node index. + // The reason is, in some cases, for example ResNet50, using default topological sort will end up with generating + // the model proto that has different node ordering compared to original onnx model. + graph_body_viewer.ToProto(*model_proto->mutable_graph(), true, true, 1 /*priority-based topological sort*/); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + std::string string_buf; + model_proto->SerializeToString(string_buf); + + if (dump_subgraphs_) { + // Dump TensorRT subgraphs + std::fstream dump(fused_node.Name() + ".onnx", std::ios::out | std::ios::trunc | std::ios::binary); + model_proto->SerializeToOstream(dump); + } + + TensorrtLogger& trt_logger = GetTensorrtLogger(detailed_build_log_); + auto trt_builder = GetBuilder(trt_logger); + auto network_flags = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kSTRONGLY_TYPED); + auto trt_network = std::unique_ptr(trt_builder->createNetworkV2(network_flags)); + auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); + auto trt_parser = tensorrt_ptr::unique_pointer(nvonnxparser::createParser(*trt_network, trt_logger)); + trt_parser->parse(string_buf.data(), string_buf.size(), model_path_); + if (max_workspace_size_ > 0) { + trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); + } + + int num_inputs = trt_network->getNbInputs(); + int num_outputs = trt_network->getNbOutputs(); + std::unordered_map input_indexes(num_inputs); + std::unordered_map output_indexes(num_outputs); + std::unordered_map output_types(num_outputs); + + /* + * Initialize shape range for each dynamic shape input tensor: + * 1) If user explicitly specifies optimization profiles via provider options, TRT EP will create those profiles during EP compile time. + * It won't make adjustment for profile values during EP compute time. + * + * 2) If no explicit optimization profiles provided by user, TRT EP will firstly set min/max/opt shape to [INT_MAX, INT_MIN, INT_MIN]. + * Later in EP compute time, the shape will be adjusted to [min_input_value, max_input_value, max_input_value] based on input tensor value. + * + * + * Once the TRT profiles are created: + * 1) If all the dynamic shape input tensors have associated profiles explicitly provided by user, those profiles will be applied to TRT builder config + * and the engine will be built at EP compile time. + * + * 2) As long as one of the dynamic shape input tensors has no explicitly associated profile, TRT EP will create default shape as described above, + * and all the profiles won't be applied and engine won't be built until EP compute time. 
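+ *
+ * Example (implicit case, for illustration): for an input declared as [N, 3, H, W] with N, H and W dynamic,
+ * the code below fills a single profile that sets min=0, opt=1 and max=numeric max for every dynamic
+ * dimension, while the static dimension (3) gets identical min/opt/max values.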
+ */ + bool has_explicit_profile = false; + bool has_implicit_profile = false; + int num_profiles = 0; + std::vector trt_profiles; + + // Following c++ map data structure is used to help serialize/deserialize profiles where it saves dynamic shape dimension(s) and min/max/opt values for dynamic shape input tensor. + // + // (1) Single profile case: + // For example, assume tensor_a has two dynamic shape dimensions: dim_0 and dim_2, and tensor_b + // has one dynamic shape dimension: dim_1. The data will be: + // { + // tensor_a: { + // dim_0: [[min_shape, max_shape, opt_shape]], + // dim_2: [[min_shape, max_shape, opt_shape]] + // }, + // tensor_b: { + // dim_1: [[min_shape, max_shape, opt_shape]] + // } + // } + // + // (2) Multiple profiles case: + // For example, assume tensor_a has one dynamic shap dimension: dim 0, and tensor_b has one dynamic shape dimension: dim_1, + // and both of the tensors have two profiles. The data will be: + // { + // tensor_a: { + // dim_0: [[min_shape_0, max_shape_0, opt_shape_0], [min_shape_1, max_shape_1, opt_shape_1]] + // }, + // tensor_b: { + // dim_1: [[min_shape_2, max_shape_2, opt_shape_2], [min_shape_3, max_shape_3, opt_shape_3]] + // } + // } + ShapeRangesMap input_explicit_shape_ranges; + ShapeRangesMap input_implicit_shape_ranges; + + auto tensor_is_dynamic = [&](nvinfer1::ITensor* tensor) -> bool { + if (tensor->isShapeTensor()) { + return true; + } else { + nvinfer1::Dims dims = tensor->getDimensions(); + // Execution tensor + for (int j = 0, end = dims.nbDims; j < end; ++j) { + if (dims.d[j] == -1) { + return true; + } + } + } + return false; + }; + + bool has_dynamic_shape = false; // True if input tensor has dynamic shape and no explicit profile is specified, otherwise false + if ((!profile_min_shapes_.empty()) && (!profile_max_shapes_.empty()) && (!profile_opt_shapes_.empty())) { + has_explicit_profile = true; + has_dynamic_shape = true; + num_profiles = GetNumProfiles(profile_min_shapes_); + for (int i = 0; i < num_profiles; i++) { + trt_profiles.push_back(trt_builder->createOptimizationProfile()); + } + } else { + for (unsigned int i = 0, end = num_inputs; i < end; ++i) { + auto input = trt_network->getInput(i); + has_dynamic_shape |= tensor_is_dynamic(input); + } + if (has_dynamic_shape) { + LOGS_DEFAULT(WARNING) << "[Nv EP] No explicit optimization profile was specified. " + "We will assume a single profile with fully dynamic range. " + "This feature is experimental and may change in the future." 
+ "If you plan to use this model as fixed shape we recommend using a free dimension override: " + "https://onnxruntime.ai/docs/tutorials/web/env-flags-and-session-options.html#freedimensionoverrides."; + trt_profiles.push_back(trt_builder->createOptimizationProfile()); + } + } + if (has_dynamic_shape) { + // Iterate all input tensors to check dynamic shape + for (unsigned int i = 0, end = num_inputs; i < end; ++i) { + auto input = trt_network->getInput(i); + const std::string& input_name = input->getName(); + nvinfer1::Dims dims = input->getDimensions(); + + // Apply explicit optimization profiles provided by user + bool apply_profile = false; + bool tensor_has_profile = profile_min_shapes_.find(input_name) != profile_min_shapes_.end() && + profile_opt_shapes_.find(input_name) != profile_opt_shapes_.end() && + profile_max_shapes_.find(input_name) != profile_max_shapes_.end(); + if (has_explicit_profile && tensor_has_profile) { + apply_profile = ApplyProfileShapesFromProviderOptions(trt_profiles, input, profile_min_shapes_, profile_max_shapes_, profile_opt_shapes_, input_explicit_shape_ranges); + } else { + LOGS_DEFAULT(INFO) << "[Nv EP] Creating implicit profile for tensor " << input_name; + profile_min_shapes_[input_name] = std::vector>{{}}; + profile_min_shapes_[input_name][0].resize(dims.nbDims); + profile_opt_shapes_[input_name] = std::vector>{{}}; + profile_opt_shapes_[input_name][0].resize(dims.nbDims); + profile_max_shapes_[input_name] = std::vector>{{}}; + profile_max_shapes_[input_name][0].resize(dims.nbDims); + for (int idx_dim = 0; idx_dim < dims.nbDims; ++idx_dim) { + auto dim_value = dims.d[idx_dim]; + if (dim_value == -1) { + has_implicit_profile = true; + // TODO(maximilianm) this is needed until we have a wildcard in the API to support dynamic shapes + profile_min_shapes_[input_name][0][idx_dim] = 0; + // TODO(maximilianm) This can be buggy since shape inference can failt with 1 being used as optimal shape + // [2025-04-04 15:41:58 ERROR] IBuilder::buildSerializedNetwork: Error Code 4: Internal Error (kOPT values for profile 0 violate shape constraints: + // /conv1/Conv: spatial dimension of convolution/deconvolution output cannot be negative (build-time output dimension of axis 2 is + // (+ (CEIL_DIV (+ h -6) 2) 1)) Condition '<' violated: 2 >= 1.) + profile_opt_shapes_[input_name][0][idx_dim] = 1; + profile_max_shapes_[input_name][0][idx_dim] = std::numeric_limits::max(); + } else { + profile_min_shapes_[input_name][0][idx_dim] = dim_value; + profile_opt_shapes_[input_name][0][idx_dim] = dim_value; + profile_max_shapes_[input_name][0][idx_dim] = dim_value; + } + } + apply_profile = ApplyProfileShapesFromProviderOptions(trt_profiles, input, profile_min_shapes_, profile_max_shapes_, profile_opt_shapes_, input_explicit_shape_ranges); + } + if (!apply_profile) { + std::ostringstream msg; + msg << "Optimization profile could not be applied for tensor:\n"; + msg << input_name; + msg << "\n["; + for (int idx_dim = 0; idx_dim < dims.nbDims; ++idx_dim) { + msg << dims.d[idx_dim] << ","; + } + msg << "]"; + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, msg.str()); + } + } + + // Set explicit profiles in TRT config if all dynamic shape inputs have associated profiles provided by user + if (has_explicit_profile || has_implicit_profile) { + // TRT EP has a constraint here. + // Users need to provide all the dynamic shape inputs with associated profiles if they want to explicitly specify profiles through provider options. 
+ for (auto trt_profile : trt_profiles) { + trt_config->addOptimizationProfile(trt_profile); + } + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "No explicit or implicit shapes were provided for dynamic shape inputs."); + ; + } + } + std::string trt_node_name_with_precision = fused_node.Name() + "_strong_typed"; + + // enable sparse weights + if (sparsity_enable_) { + trt_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Sparse weights are allowed"; + } + + // limit auxiliary streams + if (auxiliary_streams_ >= 0) { + trt_config->setMaxAuxStreams(auxiliary_streams_); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Auxiliary streams are se to " << auxiliary_streams_; + } + + if (weight_stripped_engine_enable_) { + trt_config->setFlag(nvinfer1::BuilderFlag::kSTRIP_PLAN); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] STRIP_PLAN is enabled"; + trt_config->setFlag(nvinfer1::BuilderFlag::kREFIT_IDENTICAL); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] REFIT_IDENTICAL is enabled"; + } + + // Build TRT engine (if needed) and load TRT engine if: + // (1) Graph has no dynamic shape input + // (2) All the dynamic shape inputs have associated explicit profiles specified by user + // + // Otherwise engine will be handled at inference time. + std::unique_ptr trt_engine; + std::unique_ptr trt_context; + + std::string cache_path = ""; + std::string cache_suffix = ""; + // Customize cache prefix if assigned + if (!cache_prefix_.empty()) { + // Generate cache suffix in case user would like to customize cache prefix + cache_suffix = "_" + GetCacheSuffix(fused_node.Name(), trt_node_name_with_precision); + cache_path = GetCachePath(cache_path_, cache_prefix_) + cache_suffix; + } else { + cache_path = GetCachePath(cache_path_, trt_node_name_with_precision); + } + + // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache + // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity + const std::string cache_path_prefix = cache_path; + std::string engine_cache_path = cache_path_prefix + ".engine"; + const std::string encrypted_engine_cache_path = engine_cache_path + ".encrypted"; + const std::string profile_cache_path = cache_path_prefix + ".profile"; + + // If weight-stripped engine is enabled and refitted engine cache is not present, + // TRT EP will use the engine cache with ".stripped.engine" appended to the end. 
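+ // e.g. "<cache_path_prefix>.engine" falls back to "<cache_path_prefix>.stripped.engine" until a refitted engine has been serialized.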
+ const std::filesystem::path engine_cache_fs_path = engine_cache_path; + if (weight_stripped_engine_enable_ && !std::filesystem::exists(engine_cache_fs_path)) { + engine_cache_path = cache_path_prefix + ".stripped.engine"; + weight_stripped_engine_refit_ = true; + } + + // Generate file name for dumping ep context model + if (dump_ep_context_model_ && ctx_model_path_.empty()) { + ctx_model_path_ = GetCtxModelPath(ep_context_file_path_, model_path_); + } + { + auto lock = GetApiLock(); + // Build engine + std::chrono::steady_clock::time_point engine_build_start; + if (detailed_build_log_) { + engine_build_start = std::chrono::steady_clock::now(); + } + std::unique_ptr serialized_engine{trt_builder->buildSerializedNetwork(*trt_network, *trt_config)}; + if (serialized_engine == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP failed to create engine from network for fused node: " + fused_node.Name()); + } + trt_engine = std::unique_ptr(runtime_->deserializeCudaEngine(serialized_engine->data(), serialized_engine->size())); + if (trt_engine == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP failed to deserialize engine for fused node: " + fused_node.Name()); + } + if (detailed_build_log_) { + auto engine_build_stop = std::chrono::steady_clock::now(); + LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_node_name_with_precision << " took: " << std::chrono::duration_cast(engine_build_stop - engine_build_start).count() << "ms" << std::endl; + } + // dump EP context node model + if (dump_ep_context_model_) { + // "ep_cache_context" node attribute should be a relative path to context model directory + if (ep_cache_context_attr_.empty()) { + auto cache_file_name = std::filesystem::path(engine_cache_path).filename(); + ep_cache_context_attr_ = std::filesystem::path(engine_cache_relative_path_to_context_model_dir).append(cache_file_name.string()).string(); + } + std::string compute_capability_hw_compat = compute_capability_ + "+"; + std::unique_ptr model_proto{CreateCtxModel(graph_body_viewer, + ep_cache_context_attr_, + reinterpret_cast(serialized_engine->data()), + serialized_engine->size(), + ep_context_embed_mode_, + compute_capability_hw_compat, + model_path_, + GetLogger())}; + DumpCtxModel(model_proto.get(), ctx_model_path_); + } + } + + if (weight_stripped_engine_refit_) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Refit engine from main ONNX file after engine build"; + char* onnx = string_buf.data(); + size_t onnx_size = string_buf.size(); + auto status = RefitEngine(model_path_, + onnx_model_folder_path_, + engine_cache_path, + false /* path check for security */, + onnx, + onnx_size, + trt_engine.get(), + true /* serialize refitted engine to disk */, + detailed_build_log_); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } + + // Build context + // Note: Creating an execution context from an engine is thread safe per TRT doc + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + if (context_memory_sharing_enable_) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif + size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + if (mem_size > max_ctx_mem_size_) { + max_ctx_mem_size_ = mem_size; + } +#if NV_TENSORRT_MAJOR < 10 + trt_context = std::unique_ptr(trt_engine->createExecutionContextWithoutDeviceMemory()); +#else + trt_context = 
std::unique_ptr(trt_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); +#endif + } else { + trt_context = std::unique_ptr(trt_engine->createExecutionContext()); + } + if (!trt_context) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP could not build execution context for fused node: " + fused_node.Name()); + } + + // Create input to index map + for (int i = 0; i < num_inputs; ++i) { + auto input = trt_network->getInput(i); + const std::string& input_name = input->getName(); + const auto& iter = input_map.find(input_name); + if (iter != input_map.end()) { + input_indexes[input_name] = iter->second; + } + } + + // Create output to index and type maps + const auto& graph_output = model_proto->graph().output(); + for (int i = 0; i < num_outputs; ++i) { + const std::string& output_name = trt_network->getOutput(i)->getName(); + const auto& iter = output_map.find(output_name); + if (iter != output_map.end()) { + output_indexes[output_name] = iter->second; + } + const auto& tensor_type = graph_output[i].type().tensor_type(); + output_types[output_name] = tensor_type.elem_type(); + } + + // Save TRT engine, other TRT objects and input/output info to map + parsers_.emplace(fused_node.Name(), std::move(trt_parser)); + engines_.emplace(fused_node.Name(), std::move(trt_engine)); + contexts_.emplace(fused_node.Name(), std::move(trt_context)); + networks_.emplace(fused_node.Name(), std::move(trt_network)); + input_info_[fused_node.Name()].push_back(input_indexes); + output_info_[fused_node.Name()].push_back(output_indexes); + output_info_[fused_node.Name()].push_back(output_types); + input_shape_ranges_[fused_node.Name()] = input_implicit_shape_ranges; + profiles_.emplace(fused_node.Name(), std::move(trt_profiles)); + + // Create function state + // TODO: remove default capture + NodeComputeInfo compute_info; + compute_info.create_state_func = [=](ComputeContext* context, FunctionState* state) { + std::unique_ptr p = std::make_unique(); + *p = {context->allocate_func, context->release_func, context->allocator_handle, context->node_name, builder_.get(), + &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], + &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], + input_shape_ranges_[context->node_name], &tensorrt_mu_, trt_node_name_with_precision, + engine_cache_enable_, cache_path_, + runtime_.get(), profiles_[context->node_name], + context_memory_sharing_enable_, &max_ctx_mem_size_, + engine_decryption_enable_, engine_decryption_, engine_encryption_, + detailed_build_log_, sparsity_enable_, + auxiliary_streams_, cuda_graph_enable_, cache_prefix_, cache_suffix}; + *state = p.release(); + return 0; + }; + + // Release function state + compute_info.release_state_func = [](FunctionState state) { + delete static_cast(state); + }; + + // Create compute function + compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) { + Ort::KernelContext ctx(context); + + TensorrtFuncState* trt_state = reinterpret_cast(state); + + // The whole compute_function should be considered the critical section where multiple threads may update kernel function state, access one builder, create/serialize/save engine, + // save profile and serialize/save timing cache. Therefore, those operations should be synchronized across different threads when ORT is using multithreading. 
+ // More details here, https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + std::lock_guard lock(*(trt_state->tensorrt_mu_ptr)); + const std::unordered_map& input_indexes = (trt_state->input_info)[0]; + const std::unordered_map& output_indexes = (trt_state->output_info)[0]; + const std::unordered_map& output_types = (trt_state->output_info)[1]; + auto fused_node_name = trt_state->fused_node_name; + // This map "shape_ranges" contains the shape range info for setting TRT optimization profiles. + // The info is used for both shape tensor and execution tensor: + // tensor name->(dimension->[min, max, opt]) + auto& shape_ranges = trt_state->input_shape_ranges; + std::unordered_map> shape_tensor_values; // This map holds "shape tensor -> shape values" for the shape tensor input across this inference run + std::unordered_map> shape_tensor_values_int64; // same as above but for int64 shape tensor input + auto& dds_output_allocator_map = this->dds_output_allocator_maps_[fused_node_name]; + auto trt_engine = trt_state->engine->get(); + auto trt_context = trt_state->context->get(); + auto trt_profiles = trt_state->profiles; + auto max_context_mem_size_ptr = trt_state->max_context_mem_size_ptr; + int num_inputs = static_cast(input_indexes.size()); + int num_outputs = static_cast(output_indexes.size()); + std::unordered_set input_names; + + OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, narrow(device_id_)); + OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device, device_id_); + if (alloc_ == nullptr) { + Ort::ThrowOnError(api->KernelContext_GetAllocator(context, &mem_info, &alloc_)); + } + OrtAllocator* alloc = alloc_; + + void* cuda_stream; + Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); + cudaStream_t stream = static_cast(cuda_stream); + + // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache + // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity + // Prepare cache name + std::string cache_path = ""; + // Customize cache prefix if assigned + if (!cache_prefix_.empty()) { + cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->cache_prefix) + trt_state->cache_suffix; + } else { + cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision); + } + + // Enable hardware compatility mode if assigned + std::string cache_hw_compat = "_sm" + compute_capability_; + + // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache + // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity + const std::string cache_path_prefix = cache_path + cache_hw_compat; + std::string engine_cache_path = cache_path_prefix + ".engine"; + const std::string encrypted_engine_cache_path = engine_cache_path + ".encrypted"; + const std::string profile_cache_path = cache_path_prefix + ".profile"; + + // If weight-stripped engine is enabled and refitted engine cache is not present, + // TRT EP will use the engine cache with ".stripped.engine" appended to the end. 
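+ // As at compile time, fall back to the "*.stripped.engine" cache when the refitted engine file is not present yet.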
+ const std::filesystem::path engine_cache_fs_path = engine_cache_path; + if (weight_stripped_engine_enable_ && !std::filesystem::exists(engine_cache_fs_path)) { + engine_cache_path = cache_path_prefix + ".stripped.engine"; + weight_stripped_engine_refit_ = true; + } + + // Check and update shape ranges for dynamic shape inputs. + for (int i = 0, end = num_inputs; i < end; ++i) { + auto input = trt_state->network->get()->getInput(i); + const std::string& input_name = input->getName(); + input_names.insert(input_name); + + // If there is any input tensor in shape_ranges, it means this input tensor has dynamic shape and its profile shape values have not yet resolved. + // TRT EP will help determine the min/max/opt profile values based on current input tensor value. + if (shape_ranges.find(input_name) != shape_ranges.end()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "Nv EP failed to parse input tensor and generate optimization profiles."); + } + } + + if (weight_stripped_engine_refit_) { + auto status = RefitEngine(model_path_, + onnx_model_folder_path_, + engine_cache_path, + false /* path check for security */, + onnx_model_bytestream_, + onnx_model_bytestream_size_, + trt_engine, + true /* serialize refitted engine to disk */, + detailed_build_log_); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } + + // Check before using trt_engine + if (trt_engine == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "No engine is found."); + } + + // Get input and output binding names + int total_bindings = trt_engine->getNbIOTensors(); + std::vector input_binding_names, output_binding_names; + for (int i = 0, end = total_bindings; i < end; ++i) { + auto const& name = trt_engine->getIOTensorName(i); + auto const& mode = trt_engine->getTensorIOMode(name); + if (mode == nvinfer1::TensorIOMode::kINPUT) { + input_binding_names.push_back(name); + } else { + output_binding_names.push_back(name); + } + } + + /* + * Set input shapes and bind input buffers + */ + std::vector> scratch_buffers; + for (size_t i = 0, end = input_binding_names.size(); i < end; ++i) { + char const* input_name = input_binding_names[i]; + + size_t input_index = 0; + const auto iter = input_indexes.find(input_name); + if (iter != input_indexes.end()) { + input_index = iter->second; + } + auto input_tensor = ctx.GetInput(input_index); + auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); + const auto tensor_shapes = tensor_info.GetShape(); + + auto status = BindContextInput(ctx, trt_engine, trt_context, input_name, input_index, shape_tensor_values, shape_tensor_values_int64, scratch_buffers, alloc, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } + + /* + * Set output shapes and bind output buffers + */ + std::unordered_map buffers; + buffers.reserve(num_outputs); + using OutputOrtValue = Ort::UnownedValue; + std::unordered_map output_tensors; + output_tensors.reserve(num_outputs); + std::unordered_map output_dim_sizes; + output_dim_sizes.reserve(num_outputs); + + for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { + char const* output_name = output_binding_names[i]; + + size_t output_index = 0; + const auto& index_iter = output_indexes.find(output_name); + if (index_iter != output_indexes.end()) { + output_index = index_iter->second; + } + + size_t output_type = 0; + const auto type_iter = output_types.find(output_name); + if (type_iter != output_types.end()) { + 
output_type = type_iter->second; + } + + Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, + dds_output_allocator_map, scratch_buffers, alloc, buffers); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } + + // Set execution context memory + if (trt_state->context_memory_sharing_enable) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif + size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + if (mem_size > *max_context_mem_size_ptr) { + *max_context_mem_size_ptr = mem_size; + } + trt_context->setDeviceMemory(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, *max_context_mem_size_ptr).get()); + } + + // Start CUDA graph capture. + // Note: The reason we don't put graph capture in OnRunStart() like CUDA EP does is because + // current ORT TRT doesn't get cuda stream until compute time and graph capture requires cuda stream. + if (cuda_graph_enable_ && IsGraphCaptureAllowed() && !IsGraphCaptured(0)) { + LOGS_DEFAULT(INFO) << "Capturing the cuda graph for this model"; + cuda_graph_.SetStream(stream); + CaptureBegin(0); + } + + // Run TRT inference + if (!trt_context->enqueueV3(stream)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Nv EP execution context enqueue failed."); + } + + /* + * Given that InferenceSession::Run() is guaranteed to be thread-safe meaning multiple threads can call this function concurrently, + * TRT EP needs to carefully take care of concurrency here, if not, following concurrent issue might happen: + * + * It's suggested that to perform inference concurrently in multiple streams, use one trt execution context per stream. + * In the design of TRT EP (Not apply per-thread context implementation) and if multiple threads are calling InferenceSession::Run() concurrently, + * the trt execution context instance is shared by all the threads and each thread aquires different stream from ORT. + * So TRT EP will end up having one trt execution context using multiple streams which is not suggested. + * But, since the whole compute_func() is protected by the lock and if cudaStreamSynchronize() is enforced here, one trt execution context per stream + * is guaranteed. + * + * Therefore, TRT EP needs to call cudaStreamSynchronize() which means to wait until stream has completed all operations to prevent the concurrent issue mentioned above. + * However, if cuda graph is enabled, TRT EP won't call cudaStreamSynchronize() since it's not allowed during graph capture. + */ + if (sync_stream_after_enqueue_) { + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); + } + + // Assign TRT output back to ORT output + // (1) Bind TRT DDS output to ORT kernel context output. 
(It needs to wait until enqueueV3 is finished) + // (2) Cast TRT INT32 output to ORT INT64 output or TRT double output to float output + for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { + char const* output_name = output_binding_names[i]; + + size_t output_type = 0; + const auto& iter = output_types.find(output_name); + if (iter != output_types.end()) { + output_type = iter->second; + } + + if (dds_output_allocator_map.find(output_name) != dds_output_allocator_map.end()) { + size_t output_index = 0; + const auto& index_iter = output_indexes.find(output_name); + if (index_iter != output_indexes.end()) { + output_index = index_iter->second; + } + auto status = BindKernelOutput(ctx, &mem_info, dds_output_allocator_map, output_name, output_index, output_type, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage()); + } + } else { + auto& output_tensor = output_tensors[i]; + if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { + auto output_tensor_ptr = output_tensor.GetTensorMutableData(); + if (output_tensor_ptr != nullptr) { + cuda::Impl_Cast(stream, reinterpret_cast(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]); + } + } + } + } + + // End CUDA graph capture. + // Note: One reason we don't put end of graph capture in OnRunEnd() like CUDA EP does is because of cuda stream mentioned in graph capture + // above, another reason is because OnRunEnd() is not synchronized with OnRunStart() and ExecuteGraph() per inference_session.cc. + // It's safe to start/end CUDA graph capture in compute_func() here since cuda graph object is maintained by a per thread basis. + if (cuda_graph_enable_ && !IsGraphCaptured(0)) { + if (IsGraphCaptureAllowed()) { + CaptureEnd(0); + // CUDA work issued to a capturing stream doesn’t actually run on the GPU, + // so run the captured graph here to actually execute the work. 
+ ORT_RETURN_IF_ERROR(ReplayGraph(0)); + } else { + IncrementRegularRunCountBeforeGraphCapture(); + } + } + + return Status::OK(); + }; + + node_compute_funcs.push_back(compute_info); + return Status::OK(); +} + +Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const GraphViewer& graph_body_viewer, + const Node& fused_node, + std::unordered_map& input_map, + std::unordered_map& output_map, + std::vector& node_compute_funcs) { + std::unique_ptr trt_engine; + std::unique_ptr trt_context; + std::unordered_map input_indexes; // TRT engine input name -> ORT kernel context input index + std::unordered_map output_indexes; // TRT engine output name -> ORT kernel context output index + std::unordered_map output_types; // TRT engine output name -> ORT output tensor type + + // Get engine binary data and deserialize it + auto trt_cache_model_handler = TensorRTCacheModelHandler(&trt_engine, + runtime_.get(), + model_path_, + compute_capability_, + weight_stripped_engine_enable_, + onnx_model_folder_path_, + onnx_model_bytestream_, + onnx_model_bytestream_size_, + detailed_build_log_); + auto status = trt_cache_model_handler.GetEpContextFromGraph(graph_body_viewer); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + + // Build context + // + // Note: Creating an execution context from an engine is thread safe per TRT doc + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + if (context_memory_sharing_enable_) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif + size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + if (mem_size > max_ctx_mem_size_) { + max_ctx_mem_size_ = mem_size; + } +#if NV_TENSORRT_MAJOR < 10 + trt_context = std::unique_ptr(trt_engine->createExecutionContextWithoutDeviceMemory()); +#else + trt_context = std::unique_ptr(trt_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); +#endif + } else { + trt_context = std::unique_ptr(trt_engine->createExecutionContext()); + } + if (!trt_context) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP could not build execution context for fused node: " + fused_node.Name()); + } + + // Create input/output to index maps + for (int32_t i = 0; i < trt_engine->getNbIOTensors(); ++i) { + auto const& name = trt_engine->getIOTensorName(i); + auto const& mode = trt_engine->getTensorIOMode(name); + if (mode == nvinfer1::TensorIOMode::kINPUT) { + const auto& iter = input_map.find(name); + if (iter != input_map.end()) { + input_indexes[name] = iter->second; + } + } else { + const auto& iter = output_map.find(name); + if (iter != output_map.end()) { + output_indexes[name] = iter->second; + } + } + } + + // Create output to type map + for (auto node_arg : graph_body_viewer.GetOutputs()) { + auto output_name = node_arg->Name(); + auto& type = node_arg->TypeAsProto()->tensor_type(); + output_types[output_name] = type.elem_type(); + } + + // Save TRT engine, TRT context and input/output info to map + engines_.emplace(fused_node.Name(), std::move(trt_engine)); + contexts_.emplace(fused_node.Name(), std::move(trt_context)); + input_info_[fused_node.Name()].push_back(input_indexes); + output_info_[fused_node.Name()].push_back(output_indexes); + output_info_[fused_node.Name()].push_back(output_types); + + // Create function state + // TODO: remove default capture + NodeComputeInfo compute_info; + 
compute_info.create_state_func = [=](ComputeContext* context, FunctionState* state) { + std::unique_ptr p = std::make_unique(); + *p = {context->allocate_func, + context->release_func, + context->allocator_handle, + context->node_name, + &engines_[context->node_name], + &contexts_[context->node_name], + input_info_[context->node_name], + output_info_[context->node_name], + context_memory_sharing_enable_, + &max_ctx_mem_size_, + &tensorrt_mu_}; + *state = p.release(); + return 0; + }; + + // Release function state + compute_info.release_state_func = [](FunctionState state) { + delete static_cast(state); + }; + + // Create compute function + compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) { + Ort::KernelContext ctx(context); + + TensorrtShortFuncState* trt_state = reinterpret_cast(state); + + // The whole compute_function should be considered the critical section. + // More details here, https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + std::lock_guard lock(*(trt_state->tensorrt_mu_ptr)); + + const std::unordered_map& input_indexes = (trt_state->input_info)[0]; + const std::unordered_map& output_indexes = (trt_state->output_info)[0]; + const std::unordered_map& output_types = (trt_state->output_info)[1]; + auto fused_node_name = trt_state->fused_node_name; + auto& dds_output_allocator_map = this->dds_output_allocator_maps_[fused_node_name]; + auto trt_engine = trt_state->engine->get(); + auto trt_context = trt_state->context->get(); + auto max_context_mem_size_ptr = trt_state->max_context_mem_size_ptr; + int num_outputs = static_cast(output_indexes.size()); + std::unordered_map> shape_tensor_values; // This map holds "shape tensor -> shape values" for the shape tensor input across this inference run + std::unordered_map> shape_tensor_values_int64; // same as above but for int64 shape tensor input + + OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, narrow(device_id_)); + OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device, device_id_); + if (alloc_ == nullptr) { + Ort::ThrowOnError(api->KernelContext_GetAllocator(context, &mem_info, &alloc_)); + } + OrtAllocator* alloc = alloc_; + + void* cuda_stream; + Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); + cudaStream_t stream = static_cast(cuda_stream); + + // Check before using trt_engine + if (trt_engine == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "No engine is found."); + } + + // Get input and output binding names + int total_bindings = trt_engine->getNbIOTensors(); + std::vector input_binding_names, output_binding_names; + for (int i = 0, end = total_bindings; i < end; ++i) { + auto const& name = trt_engine->getIOTensorName(i); + auto const& mode = trt_engine->getTensorIOMode(name); + if (mode == nvinfer1::TensorIOMode::kINPUT) { + input_binding_names.push_back(name); + } else { + output_binding_names.push_back(name); + } + } + + /* + * Set input shapes and bind input buffers + */ + std::vector> scratch_buffers; + for (size_t i = 0, end = input_binding_names.size(); i < end; ++i) { + char const* input_name = input_binding_names[i]; + + size_t input_index = 0; + const auto iter = input_indexes.find(input_name); + if (iter != input_indexes.end()) { + input_index = iter->second; + } + + Status status = BindContextInput(ctx, trt_engine, trt_context, input_name, input_index, shape_tensor_values, shape_tensor_values_int64, scratch_buffers, alloc, stream); + if (status != 
Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } + + /* + * Set output shapes and bind output buffers + */ + std::unordered_map buffers; + buffers.reserve(num_outputs); + using OutputOrtValue = Ort::UnownedValue; + std::unordered_map output_tensors; + output_tensors.reserve(num_outputs); + std::unordered_map output_dim_sizes; + output_dim_sizes.reserve(num_outputs); + + for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { + char const* output_name = output_binding_names[i]; + + size_t output_index = 0; + const auto& index_iter = output_indexes.find(output_name); + if (index_iter != output_indexes.end()) { + output_index = index_iter->second; + } + + size_t output_type = 0; + const auto type_iter = output_types.find(output_name); + if (type_iter != output_types.end()) { + output_type = type_iter->second; + } + + Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, + dds_output_allocator_map, scratch_buffers, alloc, buffers); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } + + // Set execution context memory + if (trt_state->context_memory_sharing_enable) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif + size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + if (mem_size > *max_context_mem_size_ptr) { + *max_context_mem_size_ptr = mem_size; + } + trt_context->setDeviceMemory(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, *max_context_mem_size_ptr).get()); + } + + // Start CUDA graph capture. + // Note: The reason we don't put graph capture in OnRunStart() like CUDA EP does is because + // current ORT TRT doesn't get cuda stream until compute time and graph capture requires cuda stream. + if (cuda_graph_enable_ && IsGraphCaptureAllowed() && !IsGraphCaptured(0)) { + LOGS_DEFAULT(INFO) << "Capturing the cuda graph for this model"; + cuda_graph_.SetStream(stream); + CaptureBegin(0); + } + + // Run TRT inference + if (!trt_context->enqueueV3(stream)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Nv EP execution context enqueue failed."); + } + + /* + * Given that InferenceSession::Run() is guaranteed to be thread-safe meaning multiple threads can call this function concurrently, + * TRT EP needs to carefully take care of concurrency here, if not, following concurrent issue might happen: + * + * It's suggested that to perform inference concurrently in multiple streams, use one trt execution context per stream. + * In the design of TRT EP (Not apply per-thread context implementation) and if multiple threads are calling InferenceSession::Run() concurrently, + * the trt execution context instance is shared by all the threads and each thread aquires different stream from ORT. + * So TRT EP will end up having one trt execution context using multiple streams which is not suggested. + * But, since the whole compute_func() is protected by the lock and if cudaStreamSynchronize() is enforced here, one trt execution context per stream + * is guaranteed. + * + * Therefore, TRT EP needs to call cudaStreamSynchronize() which means to wait until stream has completed all operations to prevent the concurrent issue mentioned above. + * However, if cuda graph is enabled, TRT EP won't call cudaStreamSynchronize() since it's not allowed during graph capture. 
+ */ + if (sync_stream_after_enqueue_) { + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); + } + + // Assign TRT output back to ORT output + // (1) Bind TRT DDS output to ORT kernel context output. (It needs to wait until enqueueV3 is finished) + // (2) Cast TRT INT32 output to ORT INT64 output or TRT double output to float output + for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { + char const* output_name = output_binding_names[i]; + + size_t output_type = 0; + const auto& iter = output_types.find(output_name); + if (iter != output_types.end()) { + output_type = iter->second; + } + + if (dds_output_allocator_map.find(output_name) != dds_output_allocator_map.end()) { + size_t output_index = 0; + const auto& index_iter = output_indexes.find(output_name); + if (index_iter != output_indexes.end()) { + output_index = index_iter->second; + } + auto status = BindKernelOutput(ctx, &mem_info, dds_output_allocator_map, output_name, output_index, output_type, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage()); + } + } else { + auto& output_tensor = output_tensors[i]; + if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { + auto output_tensor_ptr = output_tensor.GetTensorMutableData(); + if (output_tensor_ptr != nullptr) { + cuda::Impl_Cast(stream, reinterpret_cast(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]); + } + } + } + } + + // End CUDA graph capture. + // Note: One reason we don't put end of graph capture in OnRunEnd() like CUDA EP does is because of cuda stream mentioned in graph capture + // above, another reason is because OnRunEnd() is not synchronized with OnRunStart() and ExecuteGraph() per inference_session.cc. + // It's safe to start/end CUDA graph capture in compute_func() here since cuda graph object is maintained by a per thread basis. + if (cuda_graph_enable_ && !IsGraphCaptured(0)) { + if (IsGraphCaptureAllowed()) { + CaptureEnd(0); + // CUDA work issued to a capturing stream doesn’t actually run on the GPU, + // so run the captured graph here to actually execute the work. + ORT_RETURN_IF_ERROR(ReplayGraph(0)); + } else { + IncrementRegularRunCountBeforeGraphCapture(); + } + } + + return Status::OK(); + }; + + node_compute_funcs.push_back(compute_info); + return Status::OK(); +} + +void NvExecutionProvider::RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_handle_registry, AllocatorMap& allocators) const { + auto allocator = allocators[GetOrtDeviceByMemType(OrtMemTypeCPU)]; + RegisterCudaStreamHandles(stream_handle_registry, + OrtDevice::GPU, + allocator, + true /* release_cpu_buffer_on_cuda_stream */, + stream_, + external_stream_ /* use_existing_stream */, + external_cudnn_handle_, + external_cublas_handle_, + {}); +} + +OrtDevice NvExecutionProvider::GetOrtDeviceByMemType(OrtMemType mem_type) const { + if (mem_type == OrtMemTypeCPUInput) return OrtDevice(); + if (mem_type == OrtMemTypeCPUOutput) return OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, 0 /*CPU device id always be 0*/); + return default_device_; +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h new file mode 100644 index 0000000000000..76044b4fc2017 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h @@ -0,0 +1,526 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
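As an aside on the compute function above: stripped of ORT bookkeeping, the hot path is setTensorAddress() for every I/O tensor followed by enqueueV3(), plus the optional synchronize mirroring sync_stream_after_enqueue_. A self-contained sketch, assuming all buffers are pre-sized device pointers (i.e. no data-dependent shapes):

#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <string>
#include <unordered_map>

bool RunOnce(nvinfer1::ICudaEngine& engine, nvinfer1::IExecutionContext& context,
             const std::unordered_map<std::string, void*>& device_buffers,
             cudaStream_t stream, bool sync_after_enqueue) {
  // Bind every input and output tensor by name before enqueueing.
  for (int32_t i = 0; i < engine.getNbIOTensors(); ++i) {
    const char* name = engine.getIOTensorName(i);
    context.setTensorAddress(name, device_buffers.at(name));
  }
  if (!context.enqueueV3(stream)) {
    return false;
  }
  // Waiting here keeps a single shared execution context from being enqueued on
  // two streams concurrently, as discussed in the comment block above.
  return !sync_after_enqueue || cudaStreamSynchronize(stream) == cudaSuccess;
}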
+ +#pragma once +#include +#ifndef USE_CUDA_MINIMAL +#include +#else +typedef void* cudnnHandle_t; +typedef void* cublasHandle_t; +typedef void* cudnnStatus_t; +#endif +#include "core/providers/nv_tensorrt_rtx/nv_includes.h" + +#include +#include "core/providers/cuda/cuda_graph.h" +#include "nv_execution_provider_info.h" + +namespace onnxruntime { + +class TensorrtLogger : public nvinfer1::ILogger { + nvinfer1::ILogger::Severity verbosity_; + + public: + TensorrtLogger(Severity verbosity = Severity::kWARNING) + : verbosity_(verbosity) {} + void log(Severity severity, const char* msg) noexcept override { + if (severity <= verbosity_) { + time_t rawtime = std::time(0); + struct tm stm; +#ifdef _MSC_VER + gmtime_s(&stm, &rawtime); +#else + gmtime_r(&rawtime, &stm); +#endif + char buf[256]; + strftime(&buf[0], 256, + "%Y-%m-%d %H:%M:%S", + &stm); + const char* sevstr = (severity == Severity::kINTERNAL_ERROR ? " BUG" : severity == Severity::kERROR ? " ERROR" + : severity == Severity::kWARNING ? "WARNING" + : severity == Severity::kINFO ? " INFO" + : "UNKNOWN"); + if (severity <= Severity::kERROR) { + LOGS_DEFAULT(ERROR) << "[" << buf << " " << sevstr << "] " << msg; + } else { + LOGS_DEFAULT(WARNING) << "[" << buf << " " << sevstr << "] " << msg; + } + } + } + void set_level(Severity verbosity) { + verbosity_ = verbosity; + } + Severity get_level() const { + return verbosity_; + } +}; + +namespace tensorrt_ptr { + +struct TensorrtInferDeleter { + template + void operator()(T* obj) const { + if (obj) { + delete obj; + } + } +}; + +template +using unique_pointer = std::unique_ptr; +}; // namespace tensorrt_ptr + +// +// Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is +// not possible. +// +class OutputAllocator : public nvinfer1::IOutputAllocator { + public: +#if NV_TENSORRT_MAJOR >= 10 + void* reallocateOutputAsync(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment, cudaStream_t stream) noexcept override; +#else + void* reallocateOutput(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept override; +#endif + void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override; + + void* getBuffer() { + return outputPtr; + } + + std::vector& getOutputShape() { + return output_shapes; + } + + uint64_t getSize() { + return allocated_size; + } + + ~OutputAllocator() override { + cudaFree(outputPtr); + } + + private: + void* outputPtr{nullptr}; + uint64_t allocated_size = 0; + std::vector output_shapes; +}; + +/* + * This map saves the dimension range of the shape of the shape tensor or execution tensor: + * tensor name -> ( dimension -> [min, max, opt] ) + */ +using ShapeRangesMap = std::unordered_map>>>; + +// Information to construct kernel function state. 
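For the OutputAllocator declared above: a data-dependent-shape (DDS) output is typically handled by attaching the allocator to the execution context so TensorRT can grow the buffer and report the final shape during execution. This is a sketch only, with an illustrative tensor name; the EP's real binding lives in BindContextOutput/BindKernelOutput:

#include <NvInfer.h>
#include <cuda_runtime_api.h>

bool RunWithDdsOutput(nvinfer1::IExecutionContext& context, OutputAllocator& allocator,
                      cudaStream_t stream) {
  // "scores" stands in for an output whose shape is only known at run time.
  context.setOutputAllocator("scores", &allocator);  // TRT calls reallocateOutputAsync()/notifyShape()
  if (!context.enqueueV3(stream)) {
    return false;
  }
  cudaStreamSynchronize(stream);                      // buffer and shape are settled once the work finishes
  void* device_data = allocator.getBuffer();          // device memory grown on demand
  const auto& shape = allocator.getOutputShape();     // filled in by notifyShape()
  return device_data != nullptr && !shape.empty();
}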
+struct TensorrtFuncState { + AllocateFunc test_allocate_func = nullptr; + DestroyFunc test_release_func = nullptr; + AllocatorHandle allocator = nullptr; + std::string fused_node_name; + nvinfer1::IBuilder* builder; + tensorrt_ptr::unique_pointer* parser = nullptr; + std::unique_ptr* engine = nullptr; + std::unique_ptr* context = nullptr; + std::unique_ptr* network = nullptr; + std::vector> input_info; + std::vector> output_info; + std::unordered_map>>> input_shape_ranges; + std::mutex* tensorrt_mu_ptr = nullptr; + std::string trt_node_name_with_precision; + bool engine_cache_enable = false; + std::string engine_cache_path; + nvinfer1::IRuntime* runtime = nullptr; + std::vector profiles; + bool context_memory_sharing_enable = false; + size_t* max_context_mem_size_ptr = nullptr; + bool engine_decryption_enable = false; + int (*engine_decryption)(const char*, char*, size_t*) = nullptr; + int (*engine_encryption)(const char*, char*, size_t) = nullptr; + bool detailed_build_log = false; + bool sparsity_enable = false; + int auxiliary_streams = -1; + bool cuda_graph_enable = 0; + std::string cache_prefix; + std::string cache_suffix; +}; + +// Minimum information to construct kernel function state for direct engine load code path +struct TensorrtShortFuncState { + AllocateFunc test_allocate_func = nullptr; + DestroyFunc test_release_func = nullptr; + AllocatorHandle allocator = nullptr; + std::string fused_node_name; + std::unique_ptr* engine = nullptr; + std::unique_ptr* context = nullptr; + std::vector> input_info; + std::vector> output_info; + bool context_memory_sharing_enable = false; + size_t* max_context_mem_size_ptr = nullptr; + std::mutex* tensorrt_mu_ptr = nullptr; +}; + +// Holds important information for building valid ORT graph. +struct SubGraphContext { + std::unordered_set output_args; + std::unordered_map inputs_and_initializers; + std::unordered_map manually_added_graph_inputs; +}; + +using SubGraphContextMap = std::unordered_map>; +using DDSOutputAllocatorMap = std::unordered_map>; +std::string GetWeightRefittedEnginePath(std::string engine_cache_path); + +// Logical device representation. 
+class NvExecutionProvider : public IExecutionProvider { + public: + explicit NvExecutionProvider(const NvExecutionProviderInfo& info); + // TODO: we might want to transition to this, it allows for an easier option specification: + // explicit NvExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options); + virtual ~NvExecutionProvider(); + + cublasHandle_t PerThreadDefaultCublasHandle() { + return GetPerThreadContext().CublasHandle(); + } + + cudnnHandle_t PerThreadDefaultCudnnHandle() { + return GetPerThreadContext().CudnnHandle(); + } + + virtual std::shared_ptr GetKernelRegistry() const override; + std::unique_ptr GetDataTransfer() const override; + + std::vector> + GetCapability(const GraphViewer& graph, + const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& graph_optimizer_registry, + IResourceAccountant* /* resource_accountant */) const override; + + int GetDeviceId() const { return device_id_; } + + common::Status Compile(const std::vector& fused_nodes_and_graphs, + std::vector& node_compute_funcs) override; + + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; + + ProviderOptions GetProviderOptions() const override { + return NvExecutionProviderInfo::ToProviderOptions(info_); + } + + void RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_handle_registry, AllocatorMap& allocators) const override; + + void GetCustomOpDomainList(std::vector& custom_op_domain_list) const override; + + OrtDevice GetOrtDeviceByMemType(OrtMemType mem_type) const override; + + std::vector CreatePreferredAllocators() override; + + bool IsGraphCaptureEnabled() const override; + bool IsGraphCaptured(int graph_annotation_id) const override; + Status ReplayGraph(int graph_annotation_id) override; + + static common::Status RefitEngine(std::string onnx_model_filename, + std::string& onnx_model_folder_path, + std::string& weight_stripped_engine_cath_path, + bool path_check, + const void* onnx_model_bytestream, + size_t onnx_model_bytestream_size, + nvinfer1::ICudaEngine* trt_engine, + bool serialize_refitted_engine, + bool detailed_build_log); + + private: + mutable NvExecutionProviderInfo info_; + bool external_stream_ = false; + cudaStream_t stream_ = nullptr; + int max_partition_iterations_ = 1000; + size_t min_subgraph_size_ = 1; + size_t max_workspace_size_ = 0; + bool force_sequential_engine_build_ = false; + bool dump_subgraphs_ = false; + bool engine_cache_enable_ = false; + bool weight_stripped_engine_enable_ = false; + bool weight_stripped_engine_refit_ = false; + std::string onnx_model_folder_path_; + const void* onnx_model_bytestream_; + size_t onnx_model_bytestream_size_; + bool sparsity_enable_ = false; + int auxiliary_streams_ = -1; + std::string cache_path_, engine_decryption_lib_path_; + std::unique_ptr runtime_ = nullptr; + std::mutex tensorrt_mu_; + int device_id_; + std::string compute_capability_; + bool context_memory_sharing_enable_ = false; + size_t max_ctx_mem_size_ = 0; + IAllocatorUniquePtr context_memory_ = nullptr; + mutable char model_path_[4096] = {}; // Reserved for max path length + bool engine_decryption_enable_ = false; + int (*engine_decryption_)(const char*, char*, size_t*) = nullptr; + int (*engine_encryption_)(const char*, char*, size_t) = nullptr; + bool detailed_build_log_ = false; + bool cuda_graph_enable_ = false; + std::string cache_prefix_; + std::string op_types_to_exclude_; + + // The 
format is as for TENSORRT_VERSION: (MAJOR * 100 + MINOR) * 100 + PATCH + int32_t trt_version_; + int32_t cuda_version_; + + // The OrtAllocator object will be get during ep compute time + // and should be kept for the lifetime of TRT EP object. + OrtAllocator* alloc_ = nullptr; + + // For create/dump EP context node model + bool dump_ep_context_model_ = false; + std::string ep_context_file_path_; + int ep_context_embed_mode_ = 0; + std::string ctx_model_path_; + std::string ep_cache_context_attr_; + std::string engine_cache_relative_path_to_context_model_dir; + + std::unordered_set control_flow_op_set_ = {"If", "Loop", "Scan"}; + mutable std::unordered_map> subgraph_context_map_; + + mutable std::unique_ptr builder_; + + // Following maps that hold TRT objects will be accessible by different threads if ORT is using multithreading. + // In general, TensorRT objects are not thread safe; accesses to an object from different threads must be serialized by the client. + // But there are still some thread safe operations, please see here https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + // For those non thread safe operations, TRT EP uses (1) lock_guard or (2) PerThreadContext to make sure synchronization. + std::unordered_map> parsers_; + std::unordered_map> engines_; + std::unordered_map> contexts_; + std::unordered_map> builders_; + std::unordered_map> networks_; + std::unordered_map>> input_info_; + std::unordered_map>> output_info_; + std::unordered_map>> profile_min_shapes_; + std::unordered_map>> profile_max_shapes_; + std::unordered_map>> profile_opt_shapes_; + std::unordered_map input_shape_ranges_; // The profile shape ranges that the engine is built with + std::unordered_map> profiles_; + std::unordered_map dds_output_allocator_maps_; + + // for external stream, we need to create its cudnn/cublass handle before cuda EP enable cuda graph capture + cudnnHandle_t external_cudnn_handle_ = nullptr; + cublasHandle_t external_cublas_handle_ = nullptr; + + // Call cudaStreamSynchronize() after TRT enqueueV3() + mutable bool sync_stream_after_enqueue_ = true; + + CUDAGraph cuda_graph_; + bool is_graph_captured_ = false; + int regular_run_count_before_graph_capture_ = 0; + // There is chance (currently only happens in CUDA EP) that the second regular run allocates GPU memory for causes like: + // (1) memory pattern is enabled. (2) arena allocation for stream. + // Since no GPU memory allocation is allowed during graph capturing, we need at least two regular runs + // to allocate enough memory in Arena before graph capturing. + const int min_num_runs_before_cuda_graph_capture_ = 1; // required min regular runs before graph capture for the necessary memory allocations. + + // [Note] We don't use PerThreadContext for now since it has issue with multithreading + // + // TRT or CUDA objects that must be maintained on a per thread basis will be put under this PerThreadContext data structure. + // For example, TensorRT execution context and CUDA graph are the ones to be put here. 
+ class PerThreadContext final { + public: + PerThreadContext(OrtDevice::DeviceId device_id, bool has_user_compute_stream, cudaStream_t stream); + ~PerThreadContext(); + + cublasHandle_t CublasHandle() const { + return external_cublas_handle_; + } + + cudnnHandle_t CudnnHandle() const { + return external_cudnn_handle_; + } + + bool IsTensorRTContextInMap(std::string fused_node); + nvinfer1::IExecutionContext& GetTensorRTContext(std::string fused_node); + bool UpdateTensorRTContext(std::string fused_node, std::unique_ptr context); + void ResetTensorRTContext(std::string fused_node); + bool CompareProfileShapes(std::string fused_node, ShapeRangesMap& shape_ranges); + void UpdateProfileShapes(std::string fused_node, ShapeRangesMap& shape_ranges); + + void InitCUDAGraph(); + void SetGraphStream(cudaStream_t stream); + bool IsGraphCaptureAllowed() const; + void CaptureBegin(int graph_annotation_id); + void CaptureEnd(int graph_annotation_id); + bool IsGraphCaptured(int graph_annotation_id) const; + Status ReplayGraph(int graph_annotation_id); + void IncrementRegularRunCountBeforeGraphCapture(); + + private: + cudnnHandle_t external_cudnn_handle_ = nullptr; + cublasHandle_t external_cublas_handle_ = nullptr; + + // Maintaining execution context on a per thread basis is suggested by TRT doc. + // Also, for enqueueV2() in execution context, to perform inference concurrently in multiple streams, use one execution context per stream. + // ORT multi-streams feature uses one stream for one thread, therefore maintaining execution context on a per thread basis is necessary for TRT EP, + // otherwise it may result in undefined behavior or synchronization issues. + // + // See more details here: + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_execution_context.html#a63cd95430852038ce864e17c670e0b36 + std::unordered_map> trt_context_map_; + + // The profile shape ranges for the engine that the execution context maintained by the PerThreadContext is built with. + // TRT EP needs this info to determine whether to rebuild the execution context. + std::unordered_map input_shape_ranges_; + + // Cuda graph with multi threads will be supported in the future, so cuda_graph_ is put under PerThreadContext. + // ORT TRT only supports CUDA graph when whole model is supported by TRT, so simply maintaining a CUDAGraph instance is enough (no need to maintain one CUDAGraph instance per TRT subgraph) + CUDAGraph cuda_graph_; + bool is_graph_captured_ = false; + int regular_run_count_before_graph_capture_ = 0; + // There is chance (currently only happens in CUDA EP) that the second regular run allocates GPU memory for causes like: + // (1) memory pattern is enabled. (2) arena allocation for stream. + // Since no GPU memory allocation is allowed during graph capturing, we need at least two regular runs + // to allocate enough memory in Arena before graph capturing. + const int min_num_runs_before_cuda_graph_capture_ = 1; // required min regular runs before graph capture for the necessary memory allocations. 
+ }; + + using PerThreadContextMap = std::unordered_map>; + // thread local PerThreadContext cache + + struct ContextCacheHolder { + ContextCacheHolder() { + // Keep a weak pointer to the object, if the weak pointer can be locked, then the shared pointer is still around, so we can reset it + RunOnUnload([&, weak_p_ = std::weak_ptr(p)] { + if (auto lock = weak_p_.lock()) { + p.reset(); + } + }); + } + + std::shared_ptr p = std::make_shared(); + }; + + static const std::shared_ptr& PerThreadContextCache() { + thread_local const ContextCacheHolder per_thread_context_cache; + return per_thread_context_cache.p; + } + + struct PerThreadContextState { + // contexts that are currently active + std::set, std::owner_less>> active_contexts; + // contexts available for reuse + std::vector> retired_context_pool; + // weak references to thread local caches from which this NvExecutionProvider instance's entry should be removed + // upon destruction + std::set, std::owner_less>> + caches_to_update_on_destruction; + // synchronizes access to PerThreadContextState members + std::mutex mutex; + }; + + // The execution provider maintains the PerThreadContexts in this structure. + // Synchronization is required to update the contained structures. + // On the other hand, access to an individual PerThreadContext is assumed to be from a single thread at a time, + // so synchronization is not required for that. + mutable PerThreadContextState context_state_; + + PerThreadContext& GetPerThreadContext() const; + void ReleasePerThreadContext() const; + + /**Get IndexedSubGraph based on node list of the subgraph*/ + std::unique_ptr GetSubGraph(SubGraph_t graph_nodes_index, + const GraphViewer& graph, const HashValue& model_hash, int subgraph_index) const; + + /** + Get TensorRT supported node lists by calling Onnx-TensorRT parser recursively. Since each time the parser + can only detect first unsupported node failure, it needs to wait for Onnxruntime to partition the graph + and then detect next failure again. If there are too many iterations, which means many nodes in the graph + are not supported by TensorRT, the process will be terminated and the whole graph is simply assigned to + other execution provider. + */ + SubGraphCollection_t GetSupportedList(SubGraphCollection_t supported_nodes_list, int iterations, const int max_iterations, + const GraphViewer& graph, bool* early_termination) const; + + bool DetectTensorRTGraphCycles(SubGraphCollection_t& supported_nodes_vector, const GraphViewer& graph, const HashValue& model_hash, bool remove_cycles = true) const; + + /** + Get a unique_lock object to control the concurrency behavior. + Every api call not in the thread-safe operations(https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading) + should be protected by a lock when invoked by multiple threads concurrently. 
+ */ + std::unique_lock GetApiLock() const; + + /**Check the graph is the subgraph of control flow op*/ + bool IsSubGraphOfControlFlowOp(const GraphViewer& graph) const; + + /**Check whether all the nodes of the graph are assigned to specific ep*/ + bool AllNodesAssignedToSpecificEP(const GraphViewer& graph, const std::string& provider_type) const; + + /**Check whether all the nodes of subgraph are supported*/ + bool IsSubGraphFullySupported(SubGraphCollection_t supported_nodes_vector, const int number_of_ort_nodes) const; + + /** + * Set inputs, initializers and outputs for all subgraphs during NvExecutionProvider::GetSupportedList() + * and save those information in subgraph context data structure. It's useful for building a valid graph and + * make Graph::Resolve() happy especially when dealing with nested control-flow op graph. + */ + void BuildSubGraphContext(const Graph& build_graph) const; + + /** + * Set outer scope values for subgraphs and add thoes values as top-level graph's inputs if needed. + */ + void SetGraphOuterScopeValuesAndInputs(Graph& build_graph, const Graph& graph) const; + + /** + * If ORT TRT manually sets graph input in NvExecutionProvider::SetGraphOuterScopeValuesAndInputs(), + * we have to manully set all the graph inputs in order to pass Graph::Resolve(). + */ + void SetAllGraphInputs(Graph& graph) const; + + /** + * The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage + * Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this fuction again. + */ + bool IsInputInitializerOrOutput(const Graph& graph, const std::string& name, bool check_ancestors) const; + + /** + * The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage + * Graph::ResolveContext::IsOuterScopeValue(). We have to implement this fuction again. + */ + bool IsOuterScopeValue(const Graph& graph, const std::string& name) const; + + /** + * The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage + * Graph::ResolveContext::IsLocalValue(). We have to implement this fuction again. + */ + bool IsLocalValue(const Graph& graph, const std::string& name) const; + + /** + * Create a vector of NodeComputeInfo instances directly from "TRT engine" wrapped onnx model without + * going through the time-consuming processes of model parsing and engine building. + */ + Status CreateNodeComputeInfoFromPrecompiledEngine(const GraphViewer& graph_body_viewer, + const Node& fused_node, + std::unordered_map& input_map, + std::unordered_map& output_map, + std::vector& node_compute_funcs); + + /** + * Create a vector of NodeComputeInfo instances from graph. + */ + Status CreateNodeComputeInfoFromGraph(const GraphViewer& graph_body_viewer, + const Node& fused_node, + std::unordered_map& input_map, + std::unordered_map& output_map, + std::vector& node_compute_funcs); + + bool IsGraphCaptureAllowed() const; + void CaptureBegin(int graph_annotation_id); + void CaptureEnd(int graph_annotation_id); + void IncrementRegularRunCountBeforeGraphCapture(); + + /** + * Get the pointer to the IBuilder instance. + * This function only creates the instance at the first time it's being called." 
+ */ + nvinfer1::IBuilder* GetBuilder(TensorrtLogger& trt_logger) const; +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_custom_ops.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_custom_ops.cc new file mode 100644 index 0000000000000..5559e2e791d40 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_custom_ops.cc @@ -0,0 +1,141 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include + +#include "core/framework/provider_options.h" +#include "nv_execution_provider_custom_ops.h" +#include "nv_execution_provider.h" + +// The filename extension for a shared library is different per platform +#ifdef _WIN32 +#define LIBRARY_PREFIX +#define LIBRARY_EXTENSION ORT_TSTR(".dll") +#elif defined(__APPLE__) +#define LIBRARY_PREFIX "lib" +#define LIBRARY_EXTENSION ".dylib" +#else +#define LIBRARY_PREFIX "lib" +#define LIBRARY_EXTENSION ".so" +#endif + +namespace onnxruntime { +extern TensorrtLogger& GetTensorrtLogger(bool verbose); + +/* + * Create custom op domain list for TRT plugins. + * + * Here, we collect all registered TRT plugins from TRT registry and create custom ops with "trt.plugins" domain. + * Additionally, if users specify extra plugin libraries, TRT EP will load them at runtime which will register those + * plugins to TRT plugin registry and later TRT EP can get them as well. + * + * There are several TRT plugins registered as onnx schema op through contrib op with ONNX domain in the past, + * for example, EfficientNMS_TRT, MultilevelCropAndResize_TRT, PyramidROIAlign_TRT and DisentangledAttention_TRT. + * In order not to break the old models using those TRT plugins which were registered with ONNX domain and maintain + * backward compatible, we need to keep those legacy TRT plugins registered with ONNX domain with contrib ops. + * + * Note: Current TRT plugin doesn't have APIs to get number of inputs/outputs of the plugin. + * So, TensorRTCustomOp uses variadic inputs/outputs to pass ONNX graph validation. + */ +common::Status CreateTensorRTCustomOpDomainList(std::vector& domain_list, const std::string extra_plugin_lib_paths) { + static std::unique_ptr custom_op_domain = std::make_unique(); + static std::vector> created_custom_op_list; + static std::mutex mutex; + std::lock_guard lock(mutex); + if (custom_op_domain->domain_ != "" && custom_op_domain->custom_ops_.size() > 0) { + domain_list.push_back(custom_op_domain.get()); + return Status::OK(); + } + + // Load any extra TRT plugin library if any. + // When the TRT plugin library is loaded, the global static object is created and the plugin is registered to TRT registry. + // This is done through macro, for example, REGISTER_TENSORRT_PLUGIN(VisionTransformerPluginCreator). 
+ // extra_plugin_lib_paths has the format of "path_1;path_2....;path_n" + static bool is_loaded = false; + if (!extra_plugin_lib_paths.empty() && !is_loaded) { + std::stringstream extra_plugin_libs(extra_plugin_lib_paths); + std::string lib; + while (std::getline(extra_plugin_libs, lib, ';')) { + auto status = LoadDynamicLibrary(ToPathString(lib)); + if (status == Status::OK()) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Successfully load " << lib; + } else { + LOGS_DEFAULT(WARNING) << "[Nv EP]" << status.ToString(); + } + } + is_loaded = true; + } + + try { + // Get all registered TRT plugins from registry + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Getting all registered TRT plugins from TRT plugin registry ..."; + TensorrtLogger trt_logger = GetTensorrtLogger(false); + void* library_handle = nullptr; + const auto& env = onnxruntime::GetDefaultEnv(); + auto full_path = env.GetRuntimePath() + + PathString(LIBRARY_PREFIX ORT_TSTR("nvinfer_plugin") LIBRARY_EXTENSION); + ORT_THROW_IF_ERROR(env.LoadDynamicLibrary(full_path, false, &library_handle)); + + bool (*dyn_initLibNvInferPlugins)(void* logger, char const* libNamespace); + ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(library_handle, "initLibNvInferPlugins", (void**)&dyn_initLibNvInferPlugins)); + dyn_initLibNvInferPlugins(&trt_logger, ""); + LOGS_DEFAULT(INFO) << "[Nv EP] Default plugins successfully loaded."; + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) // Ignore warning C4996: 'nvinfer1::*' was declared deprecated +#endif + } catch (const std::exception&) { + LOGS_DEFAULT(INFO) << "[Nv EP] Default plugin library is not on the path and is therefore ignored"; + } + try { + int num_plugin_creator = 0; + auto plugin_creators = getPluginRegistry()->getPluginCreatorList(&num_plugin_creator); + std::unordered_set registered_plugin_names; + + for (int i = 0; i < num_plugin_creator; i++) { + auto plugin_creator = plugin_creators[i]; + std::string plugin_name(plugin_creator->getPluginName()); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] " << plugin_name << ", version : " << plugin_creator->getPluginVersion(); + + // plugin has different versions and we only register once + if (registered_plugin_names.find(plugin_name) != registered_plugin_names.end()) { + continue; + } + + created_custom_op_list.push_back(std::make_unique(onnxruntime::kNvTensorRTRTXExecutionProvider, nullptr)); // Make sure TensorRTCustomOp object won't be cleaned up + created_custom_op_list.back().get()->SetName(plugin_creator->getPluginName()); + custom_op_domain->custom_ops_.push_back(created_custom_op_list.back().get()); + registered_plugin_names.insert(plugin_name); + } + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + + custom_op_domain->domain_ = "trt.plugins"; + domain_list.push_back(custom_op_domain.get()); + } catch (const std::exception&) { + LOGS_DEFAULT(WARNING) << "[Nv EP] Failed to get TRT plugins from TRT plugin registration. 
Therefore, TRT EP can't create custom ops for TRT plugins"; + } + return Status::OK(); +} + +void ReleaseTensorRTCustomOpDomain(OrtCustomOpDomain* domain) { + if (domain != nullptr) { + for (auto ptr : domain->custom_ops_) { + if (ptr != nullptr) { + delete ptr; + } + } + delete domain; + } +} + +void ReleaseTensorRTCustomOpDomainList(std::vector& custom_op_domain_list) { + for (auto ptr : custom_op_domain_list) { + ReleaseTensorRTCustomOpDomain(ptr); + } +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_custom_ops.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_custom_ops.h new file mode 100644 index 0000000000000..897c2ce0e0b98 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_custom_ops.h @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#define ORT_API_MANUAL_INIT +#include "core/session/onnxruntime_c_api.h" +#include "core/session/onnxruntime_cxx_api.h" +#include "core/providers/shared_library/provider_api.h" +#include "nv_execution_provider_info.h" + +using namespace onnxruntime; + +namespace onnxruntime { + +common::Status LoadDynamicLibrary(onnxruntime::PathString library_name); +common::Status CreateTensorRTCustomOpDomainList(std::vector& domain_list, + const std::string extra_plugin_lib_paths); +common::Status CreateTensorRTCustomOpDomainList(NvExecutionProviderInfo& info); +void ReleaseTensorRTCustomOpDomain(OrtCustomOpDomain* domain); +void ReleaseTensorRTCustomOpDomainList(std::vector& custom_op_domain_list); + +struct TensorRTCustomKernel { + TensorRTCustomKernel(const OrtKernelInfo* /*info*/, void* compute_stream) + : compute_stream_(compute_stream) { + } + + void Compute(OrtKernelContext* /*context*/) { + // The implementation is in TensorRT plugin. No need to implement it here. 
+ }; + + private: + void* compute_stream_; +}; + +struct TensorRTCustomOp : Ort::CustomOpBase { + explicit TensorRTCustomOp(const char* provider, void* compute_stream) : provider_(provider), + compute_stream_(compute_stream) { + } + + void* CreateKernel(const OrtApi& /* api */, const OrtKernelInfo* info) const { + return new TensorRTCustomKernel(info, compute_stream_); + }; + + const char* GetName() const { return name_; }; + + void SetName(const char* name) { name_ = name; }; + + const char* GetExecutionProviderType() const { return provider_; }; + + size_t GetInputTypeCount() const { return num_inputs_; }; + + void SetInputTypeCount(size_t num) { num_inputs_ = num; }; + + ONNXTensorElementDataType GetInputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; }; + + OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t) const { + return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_VARIADIC; + }; + + size_t GetOutputTypeCount() const { return num_outputs_; }; + + void SetOutputTypeCount(size_t num) { num_outputs_ = num; }; + + ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; }; + + OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(size_t) const { + return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_VARIADIC; + }; + + bool GetVariadicInputHomogeneity() const { + return false; // heterogenous + } + + bool GetVariadicOutputHomogeneity() const { + return false; // heterogeneous + } + + private: + const char* provider_{onnxruntime::kNvTensorRTRTXExecutionProvider}; + void* compute_stream_; + const char* name_; + size_t num_inputs_ = 1; // set to 1 to match with default min_arity for variadic input + size_t num_outputs_ = 1; // set to 1 to match with default min_arity for variadic output +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_helper.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_helper.cc new file mode 100644 index 0000000000000..5373b6fd08afc --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_helper.cc @@ -0,0 +1,261 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/shared_library/provider_api.h" +#include "nv_execution_provider.h" +#include "core/framework/murmurhash3.h" +#include + +namespace onnxruntime { + +namespace { +// Get unique graph name based on graph's name and all nodes' name +std::string GetUniqueGraphName(const Graph& graph) { + HashValue model_hash = 0; + uint32_t hash[4] = {0, 0, 0, 0}; + + auto hash_str = [&hash](const std::string& str) { + MurmurHash3::x86_128(str.data(), gsl::narrow_cast(str.size()), hash[0], &hash); + }; + + // Hash all nodes' name + for (int i = 0; i < graph.MaxNodeIndex(); ++i) { + auto node = graph.GetNode(i); + if (node == nullptr) { + continue; + } + hash_str(node->Name()); + } + + model_hash = hash[0] | (uint64_t(hash[1]) << 32); + + return graph.Name() + "_" + std::to_string(model_hash); +} +} // namespace + +// The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage +// Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this fuction again. 
+bool NvExecutionProvider::IsInputInitializerOrOutput(const Graph& graph, + const std::string& name, + bool check_ancestors) const { + const Graph* parent_graph = nullptr; + return IsLocalValue(graph, name) || + (check_ancestors && (parent_graph = graph.ParentGraph()) != nullptr && + IsInputInitializerOrOutput(*parent_graph, name, check_ancestors)); +} + +// The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage +// Graph::ResolveContext::IsOuterScopeValue(). We have to implement this function again. +bool NvExecutionProvider::IsOuterScopeValue(const Graph& graph, + const std::string& name) const { + const Graph* parent_graph = nullptr; + return (parent_graph = graph.ParentGraph()) != nullptr && + IsInputInitializerOrOutput(*parent_graph, name, true); +} + +// The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage +// Graph::ResolveContext::IsLocalValue(). We have to implement this function again. +bool NvExecutionProvider::IsLocalValue(const Graph& graph, + const std::string& name) const { + std::string unique_graph_name = GetUniqueGraphName(graph); + if (subgraph_context_map_.find(unique_graph_name) == subgraph_context_map_.end()) { + return false; + } + SubGraphContext* context = subgraph_context_map_.at(unique_graph_name).get(); + return context->output_args.find(name) != context->output_args.cend() || + context->inputs_and_initializers.find(name) != context->inputs_and_initializers.cend(); +} + +/** + * Set inputs, initializers and outputs for all subgraphs during NvExecutionProvider::GetSupportedList() + * and save those information in subgraph context data structure. It's useful for building a valid graph and + * make Graph::Resolve() happy especially when dealing with nested control-flow op graph. + */ +void NvExecutionProvider::BuildSubGraphContext(const Graph& graph) const { + // Iterate all the nodes and recurse into inner most subgraph first + for (int i = 0; i < graph.MaxNodeIndex(); ++i) { + auto node = graph.GetNode(i); + if (node == nullptr) { + continue; + } + + auto subgraph_map = node->GetAttributeNameToSubgraphMap(); + for (auto& entry : subgraph_map) { + const Graph* subgraph = entry.second; + BuildSubGraphContext(*subgraph); + } + } + + std::string unique_graph_name = GetUniqueGraphName(graph); + + // Subgraph context has been built before, no need to do it again + if (subgraph_context_map_.find(unique_graph_name) != subgraph_context_map_.end()) { + return; + } + + subgraph_context_map_.emplace(unique_graph_name, std::make_unique()); + SubGraphContext* context = subgraph_context_map_.at(unique_graph_name).get(); + + // Collect all nodes' outputs and nodes' name + for (int i = 0; i < graph.MaxNodeIndex(); ++i) { + auto node = graph.GetNode(i); + if (node == nullptr) { + continue; + } + + for (const auto& output : node->OutputDefs()) { + context->output_args.insert(output->Name()); + } + } + + // Go thru all node's inputs + for (int i = 0; i < graph.MaxNodeIndex(); ++i) { + auto node = graph.GetNode(i); + if (node == nullptr) { + continue; + } + + for (const auto& input : node->InputDefs()) { + if (context->output_args.find(input->Name()) != context->output_args.end()) { + continue; + } + // This input arg is not the output of another node so must come from either a graph input or an initializer. + context->inputs_and_initializers[input->Name()] = input; + } + } +} + +// Set outer scope values for subgraphs and add thoes values as top-level graph's inputs if needed. 
+void NvExecutionProvider::SetGraphOuterScopeValuesAndInputs(Graph& graph_build, + const Graph& graph) const { + // Iterate all the nodes and recurse into inner most subgraph first for both newly built graph and original graph + for (int i = 0; i < graph_build.MaxNodeIndex(); ++i) { + auto graph_build_node = graph_build.GetNode(i); + if (graph_build_node == nullptr) { + continue; + } + + auto graph_build_map = graph_build_node->GetAttributeNameToMutableSubgraphMap(); + std::unordered_map> subgraph_map; + const Node* graph_node = nullptr; + + // Find corresponding original graph node's subgraphs + for (int j = 0; j < graph.MaxNodeIndex(); ++j) { + if (graph.GetNode(j) && graph.GetNode(j)->Name() == graph_build_node->Name()) { + graph_node = graph.GetNode(j); + subgraph_map = graph_node->GetAttributeNameToSubgraphMap(); + break; + } + } + + for (auto& entry : graph_build_map) { + auto attr_name = entry.first; + Graph* subgraph_build = entry.second; + if (subgraph_map.find(attr_name) != subgraph_map.end()) { + // recurse into subgraph + const Graph* subgraph = subgraph_map.at(attr_name); + SetGraphOuterScopeValuesAndInputs(*subgraph_build, *subgraph); + } + } + } + + // Start from the inner most subgraph first and check whether its outer scope values are existed in the + // newly built graph. If not, we need to add those outer scope values as explicit inputs to the top-level + // of newly built graph. + if (graph_build.ParentNode()) { + auto top_level_graph = &graph_build; + while (top_level_graph->MutableParentGraph()) { + top_level_graph = top_level_graph->MutableParentGraph(); + } + std::string unique_graph_name = GetUniqueGraphName(*top_level_graph); + if (subgraph_context_map_.find(unique_graph_name) == subgraph_context_map_.end()) { + LOGS_DEFAULT(ERROR) << "[Nv EP] Can't find top-level graph context. \ + Please check BuildSubGraphContext() has built the graph context correctly."; + return; + } + + SubGraphContext* context = subgraph_context_map_.at(unique_graph_name).get(); + + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Subgraph name is " << graph_build.Name(); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Its parent node is " << graph.ParentNode()->Name(); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Its parent node's implicit inputs:"; + + // Iterate all the implicit inputs to set outer scope value for the newly built subgraph + for (const auto& input : graph.ParentNode()->ImplicitInputDefs()) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] \t" << input->Name(); + + // The node arg in parent node's implicit inputs could be used for parent node's other subgraph, for example + // "If" op has two subgraphs. So we need to make sure that the node arg is used in current subgraph only. + // (GetNodeArg searches for specific node arg in all node args in the graph) + if (graph_build.GetNodeArg(input->Name())) { + graph_build.AddOuterScopeNodeArg(input->Name()); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] \t" << input->Name() << " is used in this subgraph"; + + if (context && + (context->manually_added_graph_inputs.find(input->Name()) != context->manually_added_graph_inputs.end())) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] \t" << input->Name() << " is already been added as an explicit input to graph"; + continue; + } + + // Handle the case where this outer scope value is not existed in any outer scope levels of the + // newly built graph (the newly built graph is the subgraph of the original graph). Need to add + // the outer scope value as an explicit input to the top-level of newly built graph. 
+ if (!IsOuterScopeValue(graph_build, input->Name())) { + const auto& name = input->Name(); + auto graph_inputs_including_initializers = top_level_graph->GetInputsIncludingInitializers(); + auto added_graph_input = std::find_if(graph_inputs_including_initializers.begin(), + graph_inputs_including_initializers.end(), + [&name](const NodeArg* entry) { return entry->Name() == name; }); + + if (added_graph_input == graph_inputs_including_initializers.end()) { + if (context) { + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); + type_proto->copy_from(input->TypeAsProto()); + auto& n_input = top_level_graph->GetOrCreateNodeArg(name, type_proto.get()); + context->manually_added_graph_inputs[n_input.Name()] = &n_input; + LOGS_DEFAULT(VERBOSE) << "[Nv EP] \t" << n_input.Name() << " is added as an explicit input into the newly built graph"; + } + } + } + } + } + } +} + +// If ORT TRT manually sets graph input in NvExecutionProvider::SetGraphOuterScopeValuesAndInputs(), +// we have to manully set all the graph inputs in order to pass Graph::Resolve() +void NvExecutionProvider::SetAllGraphInputs(Graph& graph) const { + // If ORT TRT doesn't manully set graph input in NvExecutionProvider::SetGraphOuterScopeValuesAndInputs(), + // Graph::Resolve() will help set graph inputs in Graph::SetGraphInputsOutputs(), so no need to set graph inputs here. + std::string unique_graph_name = GetUniqueGraphName(graph); + if (subgraph_context_map_.find(unique_graph_name) == subgraph_context_map_.end() || + subgraph_context_map_[unique_graph_name].get()->manually_added_graph_inputs.size() == 0) { + return; + } + + SubGraphContext* context = subgraph_context_map_[unique_graph_name].get(); + std::vector graph_inputs_including_initializers; + std::unordered_set graph_inputs_including_initializers_set; + + for (const auto& entry : context->inputs_and_initializers) { + graph_inputs_including_initializers.push_back(entry.second); + graph_inputs_including_initializers_set.insert(entry.first); + } + + for (const auto& entry : context->manually_added_graph_inputs) { + if (graph_inputs_including_initializers_set.find(entry.first) == graph_inputs_including_initializers_set.end()) { + graph_inputs_including_initializers.push_back(entry.second); + graph_inputs_including_initializers_set.insert(entry.first); + } + } + + for (const auto& node_arg : graph.GetInputsIncludingInitializers()) { + if (graph_inputs_including_initializers_set.find(node_arg->Name()) == graph_inputs_including_initializers_set.end()) { + graph_inputs_including_initializers.push_back(node_arg); + graph_inputs_including_initializers_set.insert(node_arg->Name()); + } + } + + graph.SetInputs(graph_inputs_including_initializers); +} +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.cc new file mode 100644 index 0000000000000..05e5f7659efac --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.cc @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h" +#include "core/providers/nv_tensorrt_rtx/nv_provider_options.h" + +#include "core/common/make_string.h" +#include "core/common/parse_string.h" +#include "core/framework/provider_options_utils.h" +#include "core/providers/cuda/cuda_common.h" + +namespace onnxruntime { +NvExecutionProviderInfo NvExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) { + NvExecutionProviderInfo info{}; + void* user_compute_stream = nullptr; + void* onnx_bytestream = nullptr; + ORT_THROW_IF_ERROR( + ProviderOptionsParser{} + .AddValueParser( + nv::provider_option_names::kDeviceId, + [&info](const std::string& value_str) -> Status { + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.device_id)); + int num_devices{}; + CUDA_RETURN_IF_ERROR(cudaGetDeviceCount(&num_devices)); + ORT_RETURN_IF_NOT( + 0 <= info.device_id && info.device_id < num_devices, + "Invalid device ID: ", info.device_id, + ", must be between 0 (inclusive) and ", num_devices, " (exclusive)."); + return Status::OK(); + }) + .AddAssignmentToReference(nv::provider_option_names::kHasUserComputeStream, info.has_user_compute_stream) + .AddValueParser( + nv::provider_option_names::kUserComputeStream, + [&user_compute_stream](const std::string& value_str) -> Status { + size_t address; + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, address)); + user_compute_stream = reinterpret_cast(address); + return Status::OK(); + }) + .AddAssignmentToReference(nv::provider_option_names::kMaxWorkspaceSize, info.max_workspace_size) + .AddAssignmentToReference(nv::provider_option_names::kDumpSubgraphs, info.dump_subgraphs) + .AddAssignmentToReference(nv::provider_option_names::kDetailedBuildLog, info.detailed_build_log) + .AddAssignmentToReference(nv::provider_option_names::kProfilesMinShapes, info.profile_min_shapes) + .AddAssignmentToReference(nv::provider_option_names::kProfilesMaxShapes, info.profile_max_shapes) + .AddAssignmentToReference(nv::provider_option_names::kProfilesOptShapes, info.profile_opt_shapes) + .AddAssignmentToReference(nv::provider_option_names::kCudaGraphEnable, info.cuda_graph_enable) + .AddValueParser( + nv::provider_option_names::kONNXBytestream, + [&onnx_bytestream](const std::string& value_str) -> Status { + size_t address; + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, address)); + onnx_bytestream = reinterpret_cast(address); + return Status::OK(); + }) + .AddAssignmentToReference(nv::provider_option_names::kONNXBytestreamSize, info.onnx_bytestream_size) + .Parse(options)); // add new provider option here. 
+ + info.user_compute_stream = user_compute_stream; + info.has_user_compute_stream = (user_compute_stream != nullptr); + info.onnx_bytestream = onnx_bytestream; + return info; +} + +ProviderOptions NvExecutionProviderInfo::ToProviderOptions(const NvExecutionProviderInfo& info) { + const ProviderOptions options{ + {nv::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)}, + {nv::provider_option_names::kHasUserComputeStream, MakeStringWithClassicLocale(info.has_user_compute_stream)}, + {nv::provider_option_names::kUserComputeStream, MakeStringWithClassicLocale(reinterpret_cast(info.user_compute_stream))}, + {nv::provider_option_names::kMaxWorkspaceSize, MakeStringWithClassicLocale(info.max_workspace_size)}, + {nv::provider_option_names::kDumpSubgraphs, MakeStringWithClassicLocale(info.dump_subgraphs)}, + {nv::provider_option_names::kDetailedBuildLog, MakeStringWithClassicLocale(info.detailed_build_log)}, + {nv::provider_option_names::kProfilesMinShapes, MakeStringWithClassicLocale(info.profile_min_shapes)}, + {nv::provider_option_names::kProfilesMaxShapes, MakeStringWithClassicLocale(info.profile_max_shapes)}, + {nv::provider_option_names::kProfilesOptShapes, MakeStringWithClassicLocale(info.profile_opt_shapes)}, + {nv::provider_option_names::kCudaGraphEnable, MakeStringWithClassicLocale(info.cuda_graph_enable)}, + {nv::provider_option_names::kONNXBytestream, MakeStringWithClassicLocale(info.onnx_bytestream)}, + {nv::provider_option_names::kONNXBytestreamSize, MakeStringWithClassicLocale(info.onnx_bytestream_size)}, + }; + return options; +} +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h new file mode 100644 index 0000000000000..c3c4dba1ed982 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h @@ -0,0 +1,69 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +#include "core/framework/ortdevice.h" +#include "core/framework/provider_options.h" +#include "core/framework/framework_provider_common.h" +#include "core/session/onnxruntime_c_api.h" +#include "core/framework/library_handles.h" + +#define TRT_DEFAULT_OPTIMIZER_LEVEL 3 + +namespace onnxruntime { +// Information needed to construct trt execution providers. 
+struct NvExecutionProviderInfo { + int device_id{0}; + bool has_user_compute_stream{false}; + void* user_compute_stream{nullptr}; + bool has_trt_options{false}; + int max_partition_iterations{1000}; + int min_subgraph_size{1}; + size_t max_workspace_size{0}; + bool fp16_enable{false}; + bool int8_enable{false}; + std::string int8_calibration_table_name{""}; + bool int8_use_native_calibration_table{false}; + bool dla_enable{false}; + int dla_core{0}; + bool dump_subgraphs{false}; + bool engine_cache_enable{false}; + std::string engine_cache_path{""}; + bool weight_stripped_engine_enable{false}; + std::string onnx_model_folder_path{""}; + const void* onnx_bytestream{nullptr}; + size_t onnx_bytestream_size{0}; + bool engine_decryption_enable{false}; + std::string engine_decryption_lib_path{""}; + bool force_sequential_engine_build{false}; + bool context_memory_sharing_enable{false}; + bool layer_norm_fp32_fallback{false}; + bool timing_cache_enable{false}; + std::string timing_cache_path{""}; + bool force_timing_cache{false}; + bool detailed_build_log{false}; + bool build_heuristics_enable{false}; + bool sparsity_enable{false}; + int builder_optimization_level{3}; + int auxiliary_streams{-1}; + std::string tactic_sources{""}; + std::string extra_plugin_lib_paths{""}; + std::string profile_min_shapes{""}; + std::string profile_max_shapes{""}; + std::string profile_opt_shapes{""}; + bool cuda_graph_enable{false}; + bool dump_ep_context_model{false}; + std::string ep_context_file_path{""}; + int ep_context_embed_mode{0}; + std::string engine_cache_prefix{""}; + bool engine_hw_compatible{false}; + std::string op_types_to_exclude{""}; + + static NvExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); + static ProviderOptions ToProviderOptions(const NvExecutionProviderInfo& info); + std::vector custom_op_domain_list; +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h new file mode 100644 index 0000000000000..169127f222949 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h @@ -0,0 +1,685 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include +#include +#include +#include +#include +#include "flatbuffers/idl.h" +#include +#include "core/providers/cuda/cuda_pch.h" +#include "core/common/path_string.h" +#include "core/framework/murmurhash3.h" + +namespace fs = std::filesystem; + +namespace onnxruntime { + +/* + * Get number of profile setting. + * + * profile_min_shapes/profile_max_shapes/profile_opt_shapes may contain multiple profile settings. + * Note: TRT EP currently only supports one profile setting. + * + * { + * tensor_a: [[dim_0_value_0, dim_1_value_1, dim_2_value_2]], + * tensor_b: [[dim_0_value_3, dim_1_value_4, dim_2_value_5]] + * } + * + */ +int GetNumProfiles(std::unordered_map>>& profile_shapes) { + int num_profile = 0; + for (auto it = profile_shapes.begin(); it != profile_shapes.end(); it++) { + num_profile = static_cast(it->second.size()); + if (num_profile > 0) { + break; + } + } + return num_profile; +} + +/* + * Seralize engine profile + * The profile contains min/max shape ranges of dynamic shape dimensions of each input tensor + * For example, assume tensor_a has two dynamic shape dimensions: dim_0 and dim_2, and tensor_b + * has one dynamic shape dimension: dim_1. 
The data in profile will be, + * key: tensor_a, value: dim_0 min_shape max_shape dim_2 min_shape max_shape + * key: tensor_b, value: dim_1 min_shape max_shape + * + * [Deprecated] Use SerializeProfileV2 + */ +void SerializeProfile(const std::string& file_name, std::unordered_map>>& shape_ranges) { + // Serialize profile + flexbuffers::Builder builder; + auto profile_start = builder.StartMap(); + for (auto outer_it = shape_ranges.begin(); outer_it != shape_ranges.end(); ++outer_it) { + builder.TypedVector(outer_it->first.c_str(), [&] { + for (auto inner_it = outer_it->second.begin(); inner_it != outer_it->second.end(); ++inner_it) { + builder.Int(inner_it->first); + builder.Int(inner_it->second.first); + builder.Int(inner_it->second.second); + } + }); + } + builder.EndMap(profile_start); + builder.Finish(); + + // Save flexbuffer + std::ofstream file(file_name, std::ios::binary | std::ios::out); + auto buf = builder.GetBuffer(); + size_t size = builder.GetSize(); + file.write(reinterpret_cast(&buf[0]), size); + file.close(); +} + +// Deserialize engine profile +// [Deprecated] Use DeserializeProfileV2 +std::unordered_map>> DeserializeProfile(std::ifstream& infile) { + // Load flexbuffer + infile.seekg(0, std::ios::end); + size_t length = infile.tellg(); + infile.seekg(0, std::ios::beg); + std::unique_ptr data{new char[length]}; + infile.read((char*)data.get(), length); + infile.close(); + + // Deserialize profile + std::unordered_map>> shape_ranges; + auto tensors_range_entries = flexbuffers::GetRoot((const uint8_t*)data.get(), length).AsMap(); + auto keys = tensors_range_entries.Keys(); + auto values = tensors_range_entries.Values(); + for (size_t i = 0, i_end = keys.size(); i < i_end; ++i) { + auto dim_range_vectors = values[i].AsTypedVector(); + std::unordered_map> inner_map; + for (size_t j = 0, j_end = dim_range_vectors.size() / 3; j < j_end; ++j) { + size_t idx = 3 * j; + inner_map[dim_range_vectors[idx].AsInt64()] = std::make_pair(dim_range_vectors[idx + 1].AsInt64(), dim_range_vectors[idx + 2].AsInt64()); + } + shape_ranges[keys[i].AsString().c_str()] = inner_map; + } + return shape_ranges; +} + +/* + * Seralize engine profile. (This function starts from ORT 1.15) + * + * + * (1) Single profile case: + * Assume tensor_a has two dynamic shape dimensions: dim_0 and dim_2, + * and tensor_b has one dynamic shape dimension: dim_1. 
+ * + * The data before serialization will be: + * { + * tensor_a: { + * dim_0: [[min_shape_0, max_shape_0, opt_shape_0]], + * dim_2: [[min_shape_2, max_shape_2, opt_shape_2]] + * }, + * tensor_b: { + * dim_1: [[min_shape_1, max_shape_1, opt_shape_1]] + * } + * } + * + * The data after serialization will be: + * { + * tensor_a: [dim_0, min_shape_0, max_shape_0, opt_shape_0, dim_2, min_shape_2, max_shape_2, opt_shape_2] + * tensor_b: [dim_1, min_shape_1, max_shape_1, opt_shape_1] + * } + * + * + * (2) Multiple profiles case: + * For example, if the data before serialization is: + * { + * tensor_a: { + * dim_0: [[min_shape_0, max_shape_0, opt_shape_0], [min_shape_1, max_shape_1, opt_shape_1]] + * }, + * tensor_b: { + * dim_1: [[min_shape_2, max_shape_2, opt_shape_2], [min_shape_3, max_shape_3, opt_shape_3]] + * } + * } + * + * The data after serialization will be: + * { + * tensor_a: [dim_0, min_shape_0, max_shape_0, opt_shape_0, dim_0, min_shape_1, max_shape_1, opt_shape_1] + * | | | | + * ---------------- profile 0 ----------------- ---------------- profile 1 ----------------- + * + * tensor_b: [dim_1, min_shape_2, max_shape_2, opt_shape_2, dim_1, min_shape_3, max_shape_3, opt_shape_3] + * | | | | + * ---------------- profile 0 ----------------- ---------------- profile 1 ----------------- + * } + * + */ +void SerializeProfileV2(const std::string& file_name, std::unordered_map>>>& shape_ranges) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] In SerializeProfileV2()"; + // Serialize profile + flexbuffers::Builder builder; + auto tensor_map_start = builder.StartMap(); + for (auto tensor_it = shape_ranges.begin(); tensor_it != shape_ranges.end(); tensor_it++) { // iterate tensors + LOGS_DEFAULT(VERBOSE) << "[Nv EP] input tensor is '" << tensor_it->first.c_str() << "'"; + builder.TypedVector(tensor_it->first.c_str(), [&] { + for (auto dim_it = tensor_it->second.begin(); dim_it != tensor_it->second.end(); dim_it++) { + size_t num_profiles = dim_it->second.size(); + for (size_t i = 0; i < num_profiles; i++) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] profile #" << i << ", dim is " << dim_it->first; + builder.Int(dim_it->first); + builder.Int(dim_it->second[i][0]); + builder.Int(dim_it->second[i][1]); + builder.Int(dim_it->second[i][2]); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] " << dim_it->first << ", " << dim_it->second[i][0] << ", " << dim_it->second[i][1] << ", " << dim_it->second[i][2]; + } + } + }); + } + builder.EndMap(tensor_map_start); + builder.Finish(); + + // Save flexbuffer + std::ofstream file(file_name, std::ios::binary | std::ios::out); + auto buf = builder.GetBuffer(); + size_t size = builder.GetSize(); + file.write(reinterpret_cast(&buf[0]), size); + file.close(); +} + +/* + * Deserialize engine profile. (This function starts from ORT 1.15) + * + * + * (1) Single profile case: + * Assume tensor_a has two dynamic shape dimensions: dim_0 and dim_2, + * and tensor_b has one dynamic shape dimension: dim_1. 
+ * + * The data in profile file will be: + * { + * tensor_a: [dim_0, min_shape_0, max_shape_0, opt_shape_0, dim_2, min_shape_2, max_shape_2, opt_shape_2] + * tensor_b: [dim_1, min_shape_1, max_shape_1, opt_shape_1] + * } + * + * The data after deserialization will be: + * { + * tensor_a: { + * dim_0: [[min_shape_0, max_shape_0, opt_shape_0]], + * dim_2: [[min_shape_2, max_shape_2, opt_shape_2]] + * }, + * tensor_b: { + * dim_1: [[min_shape_1, max_shape_1, opt_shape_1]] + * } + * } + * + * + * (2) Multiple profiles case: + * For example, if the data in profile file is: + * { + * tensor_a: [dim_0, min_shape_0, max_shape_0, opt_shape_0, dim_0, min_shape_1, max_shape_1, opt_shape_1] + * | | | | + * ---------------- profile 0 ----------------- ---------------- profile 1 ----------------- + * + * tensor_b: [dim_1, min_shape_2, max_shape_2, opt_shape_2, dim_1, min_shape_3, max_shape_3, opt_shape_3] + * | | | | + * ---------------- profile 0 ----------------- ---------------- profile 1 ----------------- + * } + * + * The data after deserialization will be: + * { + * tensor_a: { + * dim_0: [[min_shape_0, max_shape_0, opt_shape_0], [min_shape_1, max_shape_1, opt_shape_1]] + * }, + * tensor_b: { + * dim_1: [[min_shape_2, max_shape_2, opt_shape_2], [min_shape_3, max_shape_3, opt_shape_3]] + * } + * } + */ +std::unordered_map>>> DeserializeProfileV2(std::ifstream& infile) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] In DeserializeProfileV2()"; + // Load flexbuffer + infile.seekg(0, std::ios::end); + size_t length = infile.tellg(); + infile.seekg(0, std::ios::beg); + std::unique_ptr data{new char[length]}; + infile.read((char*)data.get(), length); + infile.close(); + + // Deserialize profile + std::unordered_map>>> shape_ranges; + auto tensors_range_entries = flexbuffers::GetRoot((const uint8_t*)data.get(), length).AsMap(); + auto keys = tensors_range_entries.Keys(); + auto values = tensors_range_entries.Values(); + for (size_t i = 0, end = keys.size(); i < end; ++i) { // iterate tensors + LOGS_DEFAULT(VERBOSE) << "[Nv EP] input tensor is '" << keys[i].AsString().c_str() << "'"; + auto dim_range_vector = values[i].AsTypedVector(); + std::unordered_map>> inner_map; + std::vector> profile_vector; + + for (size_t k = 0; k < (dim_range_vector.size() / 4); k++) { // iterate dim, min, max, opt for all profiles + std::vector shape_vector; + auto idx = 4 * k; + auto dim = dim_range_vector[idx].AsInt64(); + shape_vector.push_back(dim_range_vector[idx + 1].AsInt64()); // min shape + shape_vector.push_back(dim_range_vector[idx + 2].AsInt64()); // max shape + shape_vector.push_back(dim_range_vector[idx + 3].AsInt64()); // opt shape + + if (inner_map.find(dim) == inner_map.end()) { + inner_map[dim] = profile_vector; + } + inner_map[dim].push_back(shape_vector); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] " << dim << ", " << shape_vector[0] << ", " << shape_vector[1] << ", " << shape_vector[2]; + } + shape_ranges[keys[i].AsString().c_str()] = inner_map; + } + return shape_ranges; +} + +/* + * Compare profile shapes from profile file (.profile) with explicit profile min/max/opt shapes. + * Return false meaning no need to rebuild engine if everything is same. + * Otherwise return true and engine needs to be rebuilt. 
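+ * Note: the profile file is expected to be in the V2 layout produced by
+ * SerializeProfileV2() and is read back here via DeserializeProfileV2().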
+ */ +bool CompareProfiles(const std::string& file_name, + std::unordered_map>>& profile_min_shapes, + std::unordered_map>>& profile_max_shapes, + std::unordered_map>>& profile_opt_shapes) { + std::ifstream profile_file(file_name, std::ios::binary | std::ios::in); + if (!profile_file) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] " << file_name << " doesn't exist."; + return true; + } + + std::unordered_map>>> shape_ranges; + shape_ranges = DeserializeProfileV2(profile_file); + + /* The format of the two data structures are below, for example: + * + * shape_ranges: + * { + * tensor_a: { + * dim_0: [[min_shape, max_shape, opt_shape]], + * dim_2: [[min_shape, max_shape, opt_shape]] + * }, + * tensor_b: { + * dim_1: [[min_shape, max_shape, opt_shape]] + * } + * } + * + * profile_min_shapes: + * { + * tensor_a: [[dim_0_value_0, dim_1_value_1, dim_2_value_2]], + * tensor_b: [[dim_0_value_3, dim_1_value_4, dim_2_value_5]] + * } + * + */ + + // Check number of dynamic shape inputs + if (profile_min_shapes.size() != shape_ranges.size()) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Numbers of dynamic shape inputs are not the same."; + return true; + } + + // Iterate through shape_ranges map + for (auto tensor_it = shape_ranges.begin(); tensor_it != shape_ranges.end(); tensor_it++) { // iterate tensors + auto tensor_name = tensor_it->first; + if (profile_min_shapes.find(tensor_name) == profile_min_shapes.end()) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Tensor name '" << tensor_name << "' doesn't exist in trt_profile_min_shapes."; + return true; + } + + for (auto dim_it = tensor_it->second.begin(); dim_it != tensor_it->second.end(); dim_it++) { // iterate dimensions + auto dim = dim_it->first; + auto num_profiles = GetNumProfiles(profile_min_shapes); + + if (dim_it->second.size() != static_cast(num_profiles)) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Numbers of profiles are not the same."; + return true; + } + + for (size_t i = 0; i < dim_it->second.size(); i++) { // iterate (multiple) profile(s) + auto shape_values = dim_it->second[i]; + if (dim > (profile_min_shapes[tensor_name][i].size() - 1)) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] dimension " << dim << " of '" << tensor_name << "' in " << file_name << " exceeds the total dimension of trt_profile_min_shapes."; + return true; + } + + LOGS_DEFAULT(VERBOSE) << "[Nv EP] min shape value of dimension " << dim << " of '" << tensor_name << "' is " << profile_min_shapes[tensor_name][i][dim]; + LOGS_DEFAULT(VERBOSE) << "[Nv EP] min shape value of dimension " << dim << " of '" << tensor_name << "' is " << shape_values[0] << " in " << file_name; + if (profile_min_shapes[tensor_name][i][dim] != shape_values[0]) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] min shape values of dimension " << dim << " of '" << tensor_name << "' are not the same"; + return true; + } + + LOGS_DEFAULT(VERBOSE) << "[Nv EP] max shape value of dimension " << dim << " of '" << tensor_name << "' is " << profile_max_shapes[tensor_name][i][dim]; + LOGS_DEFAULT(VERBOSE) << "[Nv EP] max shape value of dimension " << dim << " of '" << tensor_name << "' is " << shape_values[1] << " in " << file_name; + if (profile_max_shapes[tensor_name][i][dim] != shape_values[1]) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] max shape values of dimension " << dim << " of '" << tensor_name << "' are not the same"; + return true; + } + + LOGS_DEFAULT(VERBOSE) << "[Nv EP] opt shape value of dimension " << dim << " of '" << tensor_name << "' is " << profile_opt_shapes[tensor_name][i][dim]; + LOGS_DEFAULT(VERBOSE) << "[Nv EP] opt shape value of 
dimension " << dim << " of '" << tensor_name << "' is " << shape_values[2] << " in " << file_name; + if (profile_opt_shapes[tensor_name][i][dim] != shape_values[2]) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] opt shape values of dimension " << dim << " of '" << tensor_name << "' are not the same"; + return true; + } + } + } + } + return false; +} + +/* + * Get cache by name + * + */ +std::string GetCachePath(const std::string& root, const std::string& name) { + if (root.empty()) { + return name; + } else { + fs::path path = root; + path.append(name); + return path.string(); + } +} + +/* + * Get compute capability + * + */ +std::string GetComputeCapacity(const cudaDeviceProp& prop) { + const std::string compute_capability = std::to_string(prop.major * 10 + prop.minor); + return compute_capability; +} + +/* + * Get Timing by compute capability + * + */ +std::string GetTimingCachePath(const std::string& root, std::string& compute_cap) { + // append compute capability of the GPU as this invalidates the cache and TRT will throw when loading the cache + const std::string timing_cache_name = "NvExecutionProvider_cache_sm" + + compute_cap + ".timing"; + return GetCachePath(root, timing_cache_name); +} + +/* + * Get cache by type + * + * \param root root path of the cache + * \param file_extension It could be ".engine", ".profile" or ".timing" + */ +std::vector GetCachesByType(const std::string& root, std::string file_extension) { + std::vector cache_files; + for (const auto& entry : fs::directory_iterator(root)) { + if (fs::path(file_extension) == fs::path(entry).extension()) { + cache_files.push_back(fs::path(entry)); + } + } + return cache_files; +} + +bool IsCacheExistedByType(const std::string& root, std::string file_extension) { + auto cache_files = GetCachesByType(root, file_extension); + if (cache_files.size() == 0) { + return false; + } + return true; +} + +void RemoveCachesByType(const std::string& root, std::string file_extension) { + auto cache_files = GetCachesByType(root, file_extension); + for (const auto& entry : cache_files) { + fs::remove(entry); + } +} + +/** + * + * Helper class to generate engine id via model name/model content/env metadata + * + * + * The TensorRT Execution Provider is used in multiple sessions and the underlying infrastructure caches + * compiled kernels, so the name must be unique and deterministic across models and sessions. 
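+ * For example (illustrative): the model file name (repeated up to ~500 chars for
+ * short names), the graph input names, every node's output names, an OS tag and
+ * the ORT/CUDA/TRT versions are fed into MurmurHash3, and the 128-bit digest is
+ * folded into the 64-bit id as hash[0] | (uint64_t(hash[1]) << 32).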
+ * + */ +HashValue TRTGenerateId(const GraphViewer& graph_viewer, std::string trt_version, std::string cuda_version) { + HashValue model_hash = 0; + + // find the top level graph + const Graph* cur_graph = &graph_viewer.GetGraph(); + while (cur_graph->IsSubgraph()) { + cur_graph = cur_graph->ParentGraph(); + } + + const Graph& main_graph = *cur_graph; + uint32_t hash[4] = {0, 0, 0, 0}; + + auto hash_str = [&hash](const std::string& str) { + MurmurHash3::x86_128(str.data(), gsl::narrow_cast(str.size()), hash[0], &hash); + }; + + // Use the model's file name instead of the entire path to avoid cache regeneration if path changes + if (main_graph.ModelPath().has_filename()) { + std::string model_name = PathToUTF8String(main_graph.ModelPath().filename()); + + LOGS_DEFAULT(INFO) << "[Nv EP] Model name is " << model_name; + // Ensure enough characters are hashed in case model names are too short + const size_t model_name_length = model_name.size(); + constexpr size_t hash_string_length = 500; + std::string repeat_model_name = model_name; + for (size_t i = model_name_length; i > 0 && i < hash_string_length; i += model_name_length) { + repeat_model_name += model_name; + } + hash_str(repeat_model_name); + } else { + LOGS_DEFAULT(INFO) << "[Nv EP] Model path is empty"; + } + + // fingerprint current graph by hashing graph inputs + for (const auto* node_arg : graph_viewer.GetInputsIncludingInitializers()) { + hash_str(node_arg->Name()); + } + + // hashing output of each node + const int number_of_ort_nodes = graph_viewer.NumberOfNodes(); + std::vector nodes_vector(number_of_ort_nodes); + std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0); + const std::vector& node_index = graph_viewer.GetNodesInTopologicalOrder(); + for (const auto& index : nodes_vector) { + const auto& node = graph_viewer.GetNode(node_index[index]); + for (const auto* node_arg : node->OutputDefs()) { + if (node_arg->Exists()) { + hash_str(node_arg->Name()); + } + } + } + +#ifdef __linux__ + hash_str("LINUX"); +#elif defined(_WIN32) + hash_str("WINDOWS"); +#endif + +#ifdef ORT_VERSION + hash_str(ORT_VERSION); +#endif + +#ifdef CUDA_VERSION + hash_str(cuda_version); +#endif + +#if defined(NV_TENSORRT_MAJOR) && defined(NV_TENSORRT_MINOR) + hash_str(trt_version); +#endif + + model_hash = hash[0] | (uint64_t(hash[1]) << 32); + + // return the current unique id + return model_hash; +} + +bool ValidateProfileShapes(std::unordered_map>>& profile_min_shapes, + std::unordered_map>>& profile_max_shapes, + std::unordered_map>>& profile_opt_shapes) { + if (profile_min_shapes.empty() && profile_max_shapes.empty() && profile_opt_shapes.empty()) { + return true; + } + + if ((profile_min_shapes.size() != profile_max_shapes.size()) && + (profile_min_shapes.size() != profile_opt_shapes.size()) && + (profile_max_shapes.size() != profile_opt_shapes.size())) { + return false; + } + + std::unordered_map>>::iterator it; + for (it = profile_min_shapes.begin(); it != profile_min_shapes.end(); it++) { + auto input_name = it->first; + auto num_profile = it->second.size(); + + // input_name must also be in max/opt profile + if ((profile_max_shapes.find(input_name) == profile_max_shapes.end()) || + (profile_opt_shapes.find(input_name) == profile_opt_shapes.end())) { + return false; + } + + // number of profiles should be the same + if ((num_profile != profile_max_shapes[input_name].size()) || + (num_profile != profile_opt_shapes[input_name].size())) { + return false; + } + } + + return true; +} + +/* + * Make input-name and shape as a pair. 
+ * This helper function is being used by ParseProfileShapes(). + * + * For example: + * The input string is "input_id:32x1", + * after the string is being parsed, the pair object is returned as below. + * pair("input_id", [32, 1]) + * + * Return true if string can be successfully parsed or false if string has wrong format. + */ +bool MakeInputNameShapePair(std::string pair_string, std::pair>& pair) { + if (pair_string.empty()) { + return true; + } + + LOGS_DEFAULT(VERBOSE) << "[Nv EP] " << pair_string; + + std::stringstream input_string_stream(pair_string); + char first_delim = ':'; + char second_delim = 'x'; + std::string input_name; + std::string shape; + std::getline(input_string_stream, input_name, first_delim); + std::getline(input_string_stream, shape, first_delim); + + std::vector shapes; + std::stringstream shape_string_stream(shape); + std::string value; + while (std::getline(shape_string_stream, value, second_delim)) { + shapes.push_back(std::stoi(value)); + } + + // wrong input string + if (input_name.empty() || shapes.empty()) { + return false; + } + + pair.first = input_name; + pair.second = shapes; + + return true; +} + +/* + * Parse explicit profile min/max/opt shapes from Nv EP provider options. + * + * For example: + * The provider option is --trt_profile_min_shapes="input_id:32x1,attention_mask:32x1,input_id:32x41,attention_mask:32x41", + * after string is being parsed, the profile shapes has two profiles and is being represented as below. + * {"input_id": [[32, 1], [32, 41]], "attention_mask": [[32, 1], [32, 41]]} + * + * Return true if string can be successfully parsed or false if string has wrong format. + */ +bool ParseProfileShapes(std::string profile_shapes_string, std::unordered_map>>& profile_shapes) { + if (profile_shapes_string.empty()) { + return true; + } + + std::stringstream input_string_stream(profile_shapes_string); + char delim = ','; + std::string input_name_with_shape; // input_name:shape, ex: "input_id:32x1" + while (std::getline(input_string_stream, input_name_with_shape, delim)) { + std::pair> pair; + if (!MakeInputNameShapePair(input_name_with_shape, pair)) { + return false; + } + + std::string input_name = pair.first; + if (profile_shapes.find(input_name) == profile_shapes.end()) { + std::vector> profile_shape_vector; + profile_shapes[input_name] = profile_shape_vector; + } + profile_shapes[input_name].push_back(pair.second); + + LOGS_DEFAULT(VERBOSE) << "[Nv EP] " << input_name; + std::string shape_string = ""; + for (auto v : pair.second) { + shape_string += std::to_string(v); + shape_string += ", "; + } + LOGS_DEFAULT(VERBOSE) << "[Nv EP] " << shape_string; + } + + return true; +} + +std::vector split(const std::string& str, char delimiter) { + std::vector tokens; + std::string token; + std::istringstream tokenStream(str); + while (std::getline(tokenStream, token, delimiter)) { + tokens.push_back(token); + } + return tokens; +} + +std::string join(const std::vector& vec, const std::string& delimiter) { + std::string result; + for (size_t i = 0; i < vec.size(); ++i) { + result += vec[i]; + if (i < vec.size() - 1) { + result += delimiter; + } + } + return result; +} + +/* + * Parse engine cache name suffix when user customizes prefix for engine cache name + * + * For example: + * When default subgraph name is "NvExecutionProvider_TRTKernel_graph_torch-jit-export_2068723788287043730_189_189_fp16" + * This func will generate the suffix "2068723788287043730_189_fp16" + * + */ +std::string GetCacheSuffix(const std::string& fused_node_name, const 
std::string& trt_node_name_with_precision) { + std::vector split_fused_node_name = split(fused_node_name, '_'); + if (split_fused_node_name.size() >= 3) { + // Get index of model hash from fused_node_name + std::string model_hash = split_fused_node_name[split_fused_node_name.size() - 3]; + size_t index = fused_node_name.find(model_hash); + // Parse suffix from trt_node_name_with_precision, as it has additional precision info + std::vector suffix_group = split(trt_node_name_with_precision.substr(index), '_'); + if (suffix_group.size() > 2) { + suffix_group.erase(suffix_group.begin() + 2); + } + return join(suffix_group, "_"); + } + return ""; +} +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_includes.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_includes.h new file mode 100644 index 0000000000000..047f325f49b70 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_includes.h @@ -0,0 +1,19 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#pragma once + +// File to include the required TRT headers with workarounds for warnings we can't fix or not fixed yet. +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4100) // Ignore warning C4100: unreferenced formal parameter +#pragma warning(disable : 4996) // Ignore warning C4996: 'nvinfer1::IPluginV2' was declared deprecated +#endif + +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc new file mode 100644 index 0000000000000..ec353590810b7 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc @@ -0,0 +1,122 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
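+// Implements the shared-library entry point (GetProvider) plus the
+// IExecutionProviderFactory and Provider objects for the Nv TensorRT RTX EP.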
+ +#include "core/providers/shared_library/provider_api.h" +#include "nv_provider_factory.h" +#include +#include "nv_execution_provider.h" +#include "nv_provider_factory_creator.h" +#include "core/framework/provider_options.h" +#include "core/providers/nv_tensorrt_rtx/nv_provider_options.h" +#include "core/providers/nv_tensorrt_rtx/nv_execution_provider_custom_ops.h" +#include + +using namespace onnxruntime; + +namespace onnxruntime { + +void InitializeRegistry(); +void DeleteRegistry(); + +struct ProviderInfo_Nv_Impl final : ProviderInfo_Nv { + OrtStatus* GetCurrentGpuDeviceId(_In_ int* device_id) override { + auto cuda_err = cudaGetDevice(device_id); + if (cuda_err != cudaSuccess) { + return CreateStatus(ORT_FAIL, "Failed to get device id."); + } + return nullptr; + } + + OrtStatus* GetTensorRTCustomOpDomainList(std::vector& domain_list, const std::string extra_plugin_lib_paths) override { + common::Status status = CreateTensorRTCustomOpDomainList(domain_list, extra_plugin_lib_paths); + if (!status.IsOK()) { + return CreateStatus(ORT_FAIL, "[Nv EP] Can't create custom ops for TRT plugins."); + } + return nullptr; + } + + OrtStatus* ReleaseCustomOpDomainList(std::vector& domain_list) override { + ReleaseTensorRTCustomOpDomainList(domain_list); + return nullptr; + } +} g_info; + +struct NvProviderFactory : IExecutionProviderFactory { + NvProviderFactory(const NvExecutionProviderInfo& info) : info_{info} {} + ~NvProviderFactory() override {} + + std::unique_ptr CreateProvider() override; + std::unique_ptr CreateProvider(const OrtSessionOptions& session_options, + const OrtLogger& session_logger); + + private: + NvExecutionProviderInfo info_; +}; + +std::unique_ptr NvProviderFactory::CreateProvider() { + return std::make_unique(info_); +} + +std::unique_ptr NvProviderFactory::CreateProvider(const OrtSessionOptions& session_options, const OrtLogger& session_logger) { + const ConfigOptions& config_options = session_options.GetConfigOptions(); + const std::unordered_map& config_options_map = config_options.GetConfigOptionsMap(); + + // The implementation of the SessionOptionsAppendExecutionProvider C API function automatically adds EP options to + // the session option configurations with the key prefix "ep..". + // We extract those EP options to create a new "provider options" key/value map. 
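+  // For example (given the registered EP name "NvTensorRTRTXExecutionProvider"),
+  // the session config entry
+  //   "ep.nvtensorrtrtxexecutionprovider.device_id" -> "0"
+  // is forwarded below as the provider option
+  //   "device_id" -> "0"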
+ std::string lowercase_ep_name = kNvTensorRTRTXExecutionProvider; + std::transform(lowercase_ep_name.begin(), lowercase_ep_name.end(), lowercase_ep_name.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + + ProviderOptions provider_options; + std::string key_prefix = "ep."; + key_prefix += lowercase_ep_name; + key_prefix += "."; + + for (const auto& [key, value] : config_options_map) { + if (key.rfind(key_prefix, 0) == 0) { + provider_options[key.substr(key_prefix.size())] = value; + } + } + NvExecutionProviderInfo info = onnxruntime::NvExecutionProviderInfo::FromProviderOptions(provider_options); + + auto ep = std::make_unique(info); + ep->SetLogger(reinterpret_cast(&session_logger)); + return ep; +} + +struct Nv_Provider : Provider { + void* GetInfo() override { return &g_info; } + std::shared_ptr CreateExecutionProviderFactory(int device_id) override { + NvExecutionProviderInfo info; + info.device_id = device_id; + info.has_trt_options = false; + + return std::make_shared(info); + } + + std::shared_ptr CreateExecutionProviderFactory(const void* options) { + const ProviderOptions* provider_options = reinterpret_cast(options); + NvExecutionProviderInfo info = onnxruntime::NvExecutionProviderInfo::FromProviderOptions(*provider_options); + return std::make_shared(info); + } + + void Initialize() override { + InitializeRegistry(); + } + + void Shutdown() override { + DeleteRegistry(); + } + +} g_provider; + +} // namespace onnxruntime + +extern "C" { + +ORT_API(onnxruntime::Provider*, GetProvider) { + return &onnxruntime::g_provider; +} +} diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.h new file mode 100644 index 0000000000000..928874475735f --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.h @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "onnxruntime_c_api.h" +#include "core/framework/provider_options.h" + +namespace onnxruntime { +struct ProviderInfo_Nv { + virtual OrtStatus* GetCurrentGpuDeviceId(_In_ int* device_id) = 0; + virtual OrtStatus* GetTensorRTCustomOpDomainList(std::vector& domain_list, const std::string extra_plugin_lib_paths) = 0; + virtual OrtStatus* ReleaseCustomOpDomainList(std::vector& domain_list) = 0; + + protected: + ~ProviderInfo_Nv() = default; // Can only be destroyed through a subclass instance +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory_creator.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory_creator.h new file mode 100644 index 0000000000000..7eeb6cce4fa03 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory_creator.h @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include + +#include "core/framework/provider_options.h" +#include "core/providers/providers.h" + +namespace onnxruntime { +// defined in provider_bridge_ort.cc +struct NvProviderFactoryCreator { + static std::shared_ptr Create(int device_id); + static std::shared_ptr Create(const ProviderOptions& provider_options); +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/onnx_ctx_model_helper.cc new file mode 100644 index 0000000000000..4f84e853f999c --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/onnx_ctx_model_helper.cc @@ -0,0 +1,420 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include + +#include "onnx_ctx_model_helper.h" +#include "core/providers/cuda/shared_inc/cuda_call.h" +#include "core/framework/execution_provider.h" +#include "nv_execution_provider.h" + +namespace onnxruntime { +extern TensorrtLogger& GetTensorrtLogger(bool verbose_log); + +/* + * Check whether the graph has the EP context contrib op. + * The op can contain the precompiled engine info for TRT EP to directly load the engine. + * + * Note: Please see more details about "EPContext" contrib op in contrib_defs.cc + */ +bool GraphHasCtxNode(const GraphViewer& graph_viewer) { + for (int i = 0; i < graph_viewer.MaxNodeIndex(); ++i) { + auto node = graph_viewer.GetNode(i); + if (node != nullptr && node->OpType() == EPCONTEXT_OP) { + return true; + } + } + return false; +} + +const std::filesystem::path& GetModelPath(const GraphViewer& graph_viewer) { + // find the top level graph + const Graph* cur_graph = &graph_viewer.GetGraph(); + while (cur_graph->IsSubgraph()) { + cur_graph = cur_graph->ParentGraph(); + } + + const Graph& main_graph = *cur_graph; + return main_graph.ModelPath(); +} + +/* + * Update ep_cache_context attribute of the EP context node with the given engine binary data + */ +void UpdateCtxNodeModelEngineContext(ONNX_NAMESPACE::ModelProto* model_proto, + char* engine_data, + size_t size) { + ONNX_NAMESPACE::GraphProto* graph_proto = model_proto->mutable_graph(); + ONNX_NAMESPACE::NodeProto* node_proto = graph_proto->mutable_node(0); + + for (int i = 0; i < node_proto->attribute_size(); ++i) { + ONNX_NAMESPACE::AttributeProto* attribute_proto = node_proto->mutable_attribute(i); + if (attribute_proto->name() == EP_CACHE_CONTEXT) { + std::string engine_data_str = ""; + if (size > 0) { + engine_data_str.assign(engine_data, size); + } + attribute_proto->set_s(engine_data_str); + } + } +} + +/* + * Create "EP context node" model where engine information is embedded + */ +ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer, + const std::string engine_cache_path, + char* engine_data, + size_t size, + const int64_t embed_mode, + const std::string compute_capability, + const std::string onnx_model_path, + const logging::Logger* logger) { + auto model_build = graph_viewer.CreateModel(*logger); + auto& graph_build = model_build->MainGraph(); + + // Get graph inputs and outputs + std::vector inputs, outputs; + for (auto input : graph_viewer.GetInputs()) { + auto& n_input = graph_build.GetOrCreateNodeArg(input->Name(), input->TypeAsProto()); + inputs.push_back(&n_input); + } + + for (auto output : graph_viewer.GetOutputs()) { + auto& n_output = graph_build.GetOrCreateNodeArg(output->Name(), output->TypeAsProto()); + outputs.push_back(&n_output); + } + + // Create EP context node 
attributes + auto attr_0 = ONNX_NAMESPACE::AttributeProto::Create(); // embed_mode + auto attr_1 = ONNX_NAMESPACE::AttributeProto::Create(); // ep_cache_context + auto attr_2 = ONNX_NAMESPACE::AttributeProto::Create(); // hardware_architecture + auto attr_3 = ONNX_NAMESPACE::AttributeProto::Create(); // onnx_model_filename + std::string engine_data_str = ""; + attr_0->set_name(EMBED_MODE); + attr_0->set_type(onnx::AttributeProto_AttributeType_INT); + attr_0->set_i(embed_mode); + attr_1->set_name(EP_CACHE_CONTEXT); + attr_1->set_type(onnx::AttributeProto_AttributeType_STRING); + if (embed_mode) { + if (size > 0) { + engine_data_str.assign(engine_data, size); + } + attr_1->set_s(engine_data_str); + // TODO(maximilianm) we might want to disable this warning as we only support weightless engines that are really small + // the reason we had this was that the field will be hashed and storing a large bytestream has significant overhead + LOGS_DEFAULT(WARNING) << EPCONTEXT_WARNING; + } else { + attr_1->set_s(engine_cache_path); + } + attr_2->set_name(COMPUTE_CAPABILITY); + attr_2->set_type(onnx::AttributeProto_AttributeType_STRING); + attr_2->set_s(compute_capability); + attr_3->set_name(ONNX_MODEL_FILENAME); + attr_3->set_type(onnx::AttributeProto_AttributeType_STRING); + attr_3->set_s(std::filesystem::path(onnx_model_path).filename().string()); + + auto node_attributes = ONNX_NAMESPACE::NodeAttributes::Create(); + constexpr int num_attributes = 4; + node_attributes->reserve(num_attributes); + node_attributes->emplace(EMBED_MODE, *attr_0); + node_attributes->emplace(EP_CACHE_CONTEXT, *attr_1); + node_attributes->emplace(COMPUTE_CAPABILITY, *attr_2); + node_attributes->emplace(ONNX_MODEL_FILENAME, *attr_3); + + // Create EP context node + graph_build.AddNode(EPCONTEXT_OP, EPCONTEXT_OP, "", inputs, outputs, node_attributes.get(), EPCONTEXT_OP_DOMAIN); + ORT_ENFORCE(graph_build.Resolve().IsOK()); + + // Serialize modelproto to string + auto new_graph_viewer = graph_build.CreateGraphViewer(); + auto& metadata = graph_viewer.GetGraph().GetModel().MetaData(); + auto model = new_graph_viewer->CreateModel(*logger, metadata); + auto model_proto = model->ToProto(); + new_graph_viewer->ToProto(*model_proto->mutable_graph(), true, true); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + + return model_proto.release(); +} + +/* + * Return the directory where the ep context model locates + */ +std::filesystem::path GetPathOrParentPathOfCtxModel(const std::string& ep_context_file_path) { + if (ep_context_file_path.empty()) { + return std::filesystem::path(); + } + std::filesystem::path ctx_path(ep_context_file_path); + if (std::filesystem::is_directory(ep_context_file_path)) { + return ctx_path; + } else { + return ctx_path.parent_path(); + } +} + +/* + * Get "EP context" model path. + * + * Function logic: + * If ep_context_file_path is provided, + * - If ep_context_file_path is a file, return "ep_context_file_path". + * - If ep_context_file_path is a directory, return "ep_context_file_path/original_model_name_ctx.onnx". + * If ep_context_file_path is not provided, + * - Return "original_model_name_ctx.onnx". + * + * TRT EP has rules about context model path and engine cache path (see tensorrt_execution_provider.cc): + * - If dump_ep_context_model_ and engine_cache_enabled_ is enabled, TRT EP will dump context model and save engine cache + * to the same directory provided by ep_context_file_path_. (i.e. 
engine_cache_path_ = ep_context_file_path_) + * + * Example 1: + * ep_context_file_path = "/home/user/ep_context_model_directory" + * original_model_path = "model.onnx" + * => return "/home/user/ep_context_model_folder/model_ctx.onnx" + * + * Example 2: + * ep_context_file_path = "my_ctx_model.onnx" + * original_model_path = "model.onnx" + * => return "my_ctx_model.onnx" + * + * Example 3: + * ep_context_file_path = "/home/user2/ep_context_model_directory/my_ctx_model.onnx" + * original_model_path = "model.onnx" + * => return "/home/user2/ep_context_model_directory/my_ctx_model.onnx" + * + */ +std::string GetCtxModelPath(const std::string& ep_context_file_path, + const std::string& original_model_path) { + std::string ctx_model_path; + + if (!ep_context_file_path.empty() && !std::filesystem::is_directory(ep_context_file_path)) { + ctx_model_path = ep_context_file_path; + } else { + std::filesystem::path model_path = original_model_path; + std::filesystem::path model_name_stem = model_path.stem(); // model_name.onnx -> model_name + std::string ctx_model_name = model_name_stem.string() + "_ctx.onnx"; + + if (std::filesystem::is_directory(ep_context_file_path)) { + std::filesystem::path model_directory = ep_context_file_path; + ctx_model_path = model_directory.append(ctx_model_name).string(); + } else { + ctx_model_path = ctx_model_name; + } + } + return ctx_model_path; +} + +/* + * Dump "EP context" model + * + */ +void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto, + const std::string& ctx_model_path) { + std::fstream dump(ctx_model_path, std::ios::out | std::ios::trunc | std::ios::binary); + model_proto->SerializeToOstream(dump); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Dumped " + ctx_model_path; +} + +bool IsAbsolutePath(const std::string& path_string) { +#ifdef _WIN32 + onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string); + auto path = std::filesystem::path(ort_path_string.c_str()); + return path.is_absolute(); +#else + if (!path_string.empty() && path_string[0] == '/') { + return true; + } + return false; +#endif +} + +// Like "../file_path" +bool IsRelativePathToParentPath(const std::string& path_string) { +#ifdef _WIN32 + onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string); + auto path = std::filesystem::path(ort_path_string.c_str()); + auto relative_path = path.lexically_normal().make_preferred().wstring(); + if (relative_path.find(L"..", 0) != std::string::npos) { + return true; + } + return false; +#else + if (!path_string.empty() && path_string.find("..", 0) != std::string::npos) { + return true; + } + return false; +#endif +} + +/* + * Get the weight-refitted engine cache path from a weight-stripped engine cache path + * + * Weight-stipped engine: + * An engine with weights stripped and its size is smaller than a regualr engine. + * The cache name of weight-stripped engine is NvExecutionProvider_TRTKernel_XXXXX.stripped.engine + * + * Weight-refitted engine: + * An engine that its weights have been refitted and it's simply a regular engine. 
+ * The cache name of weight-refitted engine is NvExecutionProvider_TRTKernel_XXXXX.engine + */ +std::string GetWeightRefittedEnginePath(std::string stripped_engine_cache) { + std::filesystem::path stripped_engine_cache_path(stripped_engine_cache); + std::string refitted_engine_cache_path = stripped_engine_cache_path.stem().stem().string() + ".engine"; + return refitted_engine_cache_path; +} + +bool IsWeightStrippedEngineCache(std::filesystem::path& engine_cache_path) { + // The weight-stripped engine cache has the naming of xxx.stripped.engine + return engine_cache_path.stem().extension().string() == ".stripped"; +} + +Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph_viewer) { + if (!ValidateEPCtxNode(graph_viewer)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "It's not a valid EP Context node"); + } + auto node = graph_viewer.GetNode(0); + auto& attrs = node->GetAttributes(); + + const int64_t embed_mode = attrs.at(EMBED_MODE).i(); + // Only make path checks if model not provided as byte buffer + bool make_secure_path_checks = !GetModelPath(graph_viewer).empty(); + + if (embed_mode) { + // Get engine from byte stream. + const std::string& context_binary = attrs.at(EP_CACHE_CONTEXT).s(); + *(trt_engine_) = std::unique_ptr(trt_runtime_->deserializeCudaEngine(const_cast(context_binary.c_str()), + static_cast(context_binary.length()))); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] Read engine as binary data from \"ep_cache_context\" attribute of ep context node and deserialized it"; + if (!(*trt_engine_)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP could not deserialize engine from binary data"); + } + + if (weight_stripped_engine_refit_) { + const std::string onnx_model_filename = attrs.at(ONNX_MODEL_FILENAME).s(); + std::string placeholder; + auto status = NvExecutionProvider::RefitEngine(onnx_model_filename, + onnx_model_folder_path_, + placeholder, + make_secure_path_checks, + onnx_model_bytestream_, + onnx_model_bytestream_size_, + (*trt_engine_).get(), + false /* serialize refitted engine to disk */, + detailed_build_log_); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } + } else { + // Get engine from cache file. + std::string cache_path = attrs.at(EP_CACHE_CONTEXT).s(); + + // For security purpose, in the case of running context model, TRT EP won't allow + // engine cache path to be the relative path like "../file_path" or the absolute path. + // It only allows the engine cache to be in the same directory or sub directory of the context model. + if (IsAbsolutePath(cache_path)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "For security purpose, the ep_cache_context attribute should be set with a relative path, but it is an absolute path: " + cache_path); + } + if (IsRelativePathToParentPath(cache_path)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "The file path in ep_cache_context attribute has '..'. 
For security purpose, it's not allowed to point outside the directory."); + } + + // The engine cache and context model (current model) should be in the same directory + std::filesystem::path ctx_model_dir(GetPathOrParentPathOfCtxModel(ep_context_model_path_)); + auto engine_cache_path = ctx_model_dir.append(cache_path); + LOGS_DEFAULT(VERBOSE) << "[Nv EP] GetEpContextFromGraph engine_cache_path: " + engine_cache_path.string(); + + // If it's a weight-stripped engine cache, it needs to be refitted even though the refit flag is not enabled + if (!weight_stripped_engine_refit_) { + weight_stripped_engine_refit_ = IsWeightStrippedEngineCache(engine_cache_path); + } + + // If the serialized refitted engine is present, use it directly without refitting the engine again + if (weight_stripped_engine_refit_) { + const std::filesystem::path refitted_engine_cache_path = GetWeightRefittedEnginePath(engine_cache_path.string()); + if (std::filesystem::exists(refitted_engine_cache_path)) { + LOGS_DEFAULT(VERBOSE) << "[Nv EP] " + refitted_engine_cache_path.string() + " exists."; + engine_cache_path = refitted_engine_cache_path.string(); + weight_stripped_engine_refit_ = false; + } + } + + if (!std::filesystem::exists(engine_cache_path)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP can't find engine cache: " + engine_cache_path.string() + + ". Please make sure engine cache is in the same directory or sub-directory of context model."); + } + + std::ifstream engine_file(engine_cache_path.string(), std::ios::binary | std::ios::in); + engine_file.seekg(0, std::ios::end); + size_t engine_size = engine_file.tellg(); + engine_file.seekg(0, std::ios::beg); + std::unique_ptr engine_buf{new char[engine_size]}; + engine_file.read((char*)engine_buf.get(), engine_size); + *(trt_engine_) = std::unique_ptr(trt_runtime_->deserializeCudaEngine(engine_buf.get(), engine_size)); + if (!(*trt_engine_)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "Nv EP could not deserialize engine from cache: " + engine_cache_path.string()); + } + LOGS_DEFAULT(VERBOSE) << "[Nv EP] DeSerialized " + engine_cache_path.string(); + + if (weight_stripped_engine_refit_) { + const std::string onnx_model_filename = attrs.at(ONNX_MODEL_FILENAME).s(); + std::string weight_stripped_engine_cache = engine_cache_path.string(); + auto status = NvExecutionProvider::RefitEngine(onnx_model_filename, + onnx_model_folder_path_, + weight_stripped_engine_cache, + make_secure_path_checks, + onnx_model_bytestream_, + onnx_model_bytestream_size_, + (*trt_engine_).get(), + true /* serialize refitted engine to disk */, + detailed_build_log_); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } + } + return Status::OK(); +} + +/* + * The sanity check for EP context contrib op. 
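+ * It asserts that the single EPContext node carries "embed_mode" and
+ * "ep_cache_context", and warns when the stored "hardware_architecture" does not
+ * match the compute capability of the current GPU.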
+ */ +bool TensorRTCacheModelHandler::ValidateEPCtxNode(const GraphViewer& graph_viewer) { + assert(graph_viewer.NumberOfNodes() == 1); + assert(graph_viewer.GetNode(0)->OpType() == EPCONTEXT_OP); + auto node = graph_viewer.GetNode(0); + auto& attrs = node->GetAttributes(); + + // Show the warning if compute capability is not matched + if (attrs.count(COMPUTE_CAPABILITY) > 0) { + std::string model_compute_capability = attrs.at(COMPUTE_CAPABILITY).s(); + // Verify if engine was compiled with ampere+ hardware compatibility enabled + if (model_compute_capability == "80+") { + LOGS_DEFAULT(WARNING) << "[Nv EP] Engine is compatible to all Ampere+ GPU (except Jetson)"; + if (std::stoi(compute_capability_) < 80) { + LOGS_DEFAULT(WARNING) << "[Nv EP] However, this GPU doesn't match. The compute capability of the GPU: " << compute_capability_; + } + } else if (model_compute_capability != compute_capability_) { + LOGS_DEFAULT(WARNING) << "[Nv EP] Engine was compiled for a different compatibility level and might not work or perform suboptimal"; + LOGS_DEFAULT(WARNING) << "[Nv EP] The compute capability of the engine: " << model_compute_capability; + LOGS_DEFAULT(WARNING) << "[Nv EP] The compute capability of the GPU: " << compute_capability_; + } + } + + // "embed_mode" attr and "ep_cache_context" attr should be present + assert(attrs.count(EMBED_MODE) > 0); + assert(attrs.count(EP_CACHE_CONTEXT) > 0); + + const int64_t embed_mode = attrs.at(EMBED_MODE).i(); + if (embed_mode == 1) { + // engine binary data + LOGS_DEFAULT(WARNING) << EPCONTEXT_WARNING; + } + + return true; +} +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/onnx_ctx_model_helper.h b/onnxruntime/core/providers/nv_tensorrt_rtx/onnx_ctx_model_helper.h new file mode 100644 index 0000000000000..ccd06750692fc --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/onnx_ctx_model_helper.h @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
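+// Helpers for creating, dumping and loading "EPContext" models that either embed
+// a serialized TensorRT engine (embed_mode 1) or reference an engine cache on disk.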
+ +#pragma once + +#include +#include +#include + +#include "core/providers/nv_tensorrt_rtx/nv_includes.h" +#include "core/providers/shared_library/provider_api.h" + +namespace onnxruntime { + +static const std::string EPCONTEXT_OP = "EPContext"; +static const std::string EMBED_MODE = "embed_mode"; +static const std::string EP_CACHE_CONTEXT = "ep_cache_context"; +static const std::string COMPUTE_CAPABILITY = "hardware_architecture"; +static const std::string ONNX_MODEL_FILENAME = "onnx_model_filename"; +static const std::string EPCONTEXT_OP_DOMAIN = "com.microsoft"; +static const std::string EPCONTEXT_WARNING = + "It's suggested to set the ORT graph optimization level to 0 and \ + make \"embed_mode\" to 0 (\"ep_cache_context\" is the cache path)\ + for the best model loading time"; + +bool GraphHasCtxNode(const GraphViewer& graph_viewer); +const std::filesystem::path& GetModelPath(const GraphViewer& graph_viewer); +std::filesystem::path GetPathOrParentPathOfCtxModel(const std::string& ep_context_file_path); +ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer, + const std::string engine_cache_path, + char* engine_data, + size_t size, + const int64_t embed_mode, + const std::string compute_capability, + const std::string onnx_model_path, + const logging::Logger* logger); +std::string GetCtxModelPath(const std::string& ep_context_file_path, + const std::string& original_model_path); +bool IsAbsolutePath(const std::string& path_string); +bool IsRelativePathToParentPath(const std::string& path_string); +void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto, + const std::string& ctx_model_path); +void UpdateCtxNodeModelEngineContext(ONNX_NAMESPACE::ModelProto* model_proto, + char* engine_data, + size_t size); + +class TensorRTCacheModelHandler { + public: + TensorRTCacheModelHandler(std::unique_ptr* trt_engine, + nvinfer1::IRuntime* trt_runtime, + std::string ep_context_model_path, + std::string compute_capability, + bool weight_stripped_engine_refit, + std::string onnx_model_folder_path, + const void* onnx_model_bytestream, + size_t onnx_model_bytestream_size, + bool detailed_build_log) + : trt_engine_(trt_engine), + trt_runtime_(trt_runtime), + ep_context_model_path_(ep_context_model_path), + compute_capability_(compute_capability), + weight_stripped_engine_refit_(weight_stripped_engine_refit), + onnx_model_folder_path_(onnx_model_folder_path), + onnx_model_bytestream_(onnx_model_bytestream), + onnx_model_bytestream_size_(onnx_model_bytestream_size), + detailed_build_log_(detailed_build_log) { + } + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TensorRTCacheModelHandler); + + bool ValidateEPCtxNode(const GraphViewer& graph_viewer); + + Status GetEpContextFromGraph(const GraphViewer& graph_viewer); + + private: + std::unique_ptr* trt_engine_; + nvinfer1::IRuntime* trt_runtime_; + std::string ep_context_model_path_; // If using context model, it implies context model and engine cache is in the same directory + std::string compute_capability_; + bool weight_stripped_engine_refit_; + std::string onnx_model_folder_path_; + const void* onnx_model_bytestream_; + size_t onnx_model_bytestream_size_; + bool detailed_build_log_; +}; // TRTCacheModelHandler +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/symbols.def b/onnxruntime/core/providers/nv_tensorrt_rtx/symbols.def new file mode 100644 index 0000000000000..4ec2f7914c208 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/symbols.def @@ -0,0 +1,2 @@ +EXPORTS + GetProvider diff --git 
a/onnxruntime/core/providers/nv_tensorrt_rtx/symbols.txt b/onnxruntime/core/providers/nv_tensorrt_rtx/symbols.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/version_script.lds b/onnxruntime/core/providers/nv_tensorrt_rtx/version_script.lds new file mode 100644 index 0000000000000..094abb3329781 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/version_script.lds @@ -0,0 +1,9 @@ +#_init and _fini should be local +VERS_1.0 { + global: + GetProvider; + + # Hide everything else. + local: + *; +}; diff --git a/onnxruntime/core/providers/provider_factory_creators.h b/onnxruntime/core/providers/provider_factory_creators.h index 1c62c1a7a8d0b..9f33df54a4330 100644 --- a/onnxruntime/core/providers/provider_factory_creators.h +++ b/onnxruntime/core/providers/provider_factory_creators.h @@ -78,6 +78,10 @@ #include "core/providers/tensorrt/tensorrt_provider_factory_creator.h" #endif +#if defined(USE_NV) +#include "core/providers/nv_tensorrt_rtx/nv_provider_factory_creator.h" +#endif + #if defined(USE_VITISAI) #include "core/providers/vitisai/vitisai_provider_factory_creator.h" #endif @@ -101,3 +105,7 @@ #if defined(USE_AZURE) #include "core/providers/azure/azure_provider_factory_creator.h" #endif + +#if defined(USE_NV) +#include "core/providers/nv_tensorrt_rtx/nv_provider_factory_creator.h" +#endif diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc index 27b0c86827531..5acfcb859b63a 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc @@ -1,7 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include #include +#include #include #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" @@ -29,8 +31,63 @@ class CastOpBuilder : public BaseOpBuilder { std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const override ORT_MUST_USE_RESULT; + + private: + // QNN HTP currently does not support casting FP16/FP32 to Bool, and thus such Cast will be replaced by NotEqual with + // an additional static input 0.f to achieve the idential functional. 
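+  // In effect, Cast(x: fp16/fp32 -> bool) is lowered to ElementWiseNotEqual(x, 0.f);
+  // ProcessExtraInputForNotEqual() appends the extra static scalar 0 input.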
+ bool IsFpToBoolCast(const NodeUnit& node_unit) const; + Status ProcessExtraInputForNotEqual(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector& input_names, + const logging::Logger& logger) const; }; +bool CastOpBuilder::IsFpToBoolCast(const NodeUnit& node_unit) const { + const auto* input_type_proto = node_unit.Inputs()[0].node_arg.TypeAsProto(); + const auto* output_type_proto = node_unit.Outputs()[0].node_arg.TypeAsProto(); + + Qnn_DataType_t input_qnn_dtype = QNN_DATATYPE_UNDEFINED; + Qnn_DataType_t output_qnn_dtype = QNN_DATATYPE_UNDEFINED; + + if (utils::GetQnnDataType(false, input_type_proto, input_qnn_dtype) != Status::OK() || + utils::GetQnnDataType(false, output_type_proto, output_qnn_dtype) != Status::OK()) { + return false; + } + + return ((input_qnn_dtype == QNN_DATATYPE_FLOAT_16 || input_qnn_dtype == QNN_DATATYPE_FLOAT_32) && + output_qnn_dtype == QNN_DATATYPE_BOOL_8); +} + +Status CastOpBuilder::ProcessExtraInputForNotEqual(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector& input_names, + const logging::Logger& logger) const { + const auto& input = node_unit.Inputs()[0]; + if (input.quant_param.has_value()) { + return Status::OK(); + } + + // Build additional static input with value 0. + const std::string& input_name = utils::GetNodeName(node_unit) + "_notequal_zero"; + + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_UNDEFINED; + const auto* type_proto = input.node_arg.TypeAsProto(); + ORT_RETURN_IF_ERROR(utils::GetQnnDataType(false, type_proto, qnn_data_type)); + + QnnTensorWrapper input_tensor_wrapper(input_name, + QNN_TENSOR_TYPE_STATIC, + qnn_data_type, + QnnQuantParamsWrapper(), + std::vector{1}, + std::vector(utils::GetElementSizeByType(qnn_data_type), 0)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensor_wrapper)), + "Failed to add additional input tensor for QNN Cast node that will be replaced by NotEqual."); + input_names.push_back(input_name); + + LOGS(logger, VERBOSE) << "FP-to-Bool Cast node " << utils::GetNodeName(node_unit) << " is replaced by NotEqual."; + return Status::OK(); +} + Status CastOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const logging::Logger& logger, @@ -47,7 +104,9 @@ Status CastOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, if (qnn_model_wrapper.IsQnnTensorWrapperExist(input_name)) { LOGS(logger, VERBOSE) << "Tensor already added, skip it: " << input_name; input_names.push_back(input_name); - return Status::OK(); + return IsFpToBoolCast(node_unit) + ? ProcessExtraInputForNotEqual(qnn_model_wrapper, node_unit, input_names, logger) + : Status::OK(); } std::vector unpacked_tensor; @@ -75,7 +134,9 @@ Status CastOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, "Failed to add input tensor for QNN Cast node."); input_names.push_back(input_name); - return Status::OK(); + return IsFpToBoolCast(node_unit) + ? ProcessExtraInputForNotEqual(qnn_model_wrapper, node_unit, input_names, logger) + : Status::OK(); } Status CastOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, @@ -113,14 +174,17 @@ Status CastOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(output_tensorwrapper)), "Failed to add output tensor for QNN Cast node."); + const std::string qnn_op_type = IsFpToBoolCast(node_unit) + ? 
QNN_OP_ELEMENT_WISE_NOT_EQUAL + : GetQnnOpType(node_unit.OpType()); ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(utils::GetNodeName(node_unit), QNN_OP_PACKAGE_NAME_QTI_AISW, - GetQnnOpType(node_unit.OpType()), + qnn_op_type, std::move(input_names), {output_name}, {}, do_op_validation), - "Failed to create QNN Cast node."); + "Failed to create " + qnn_op_type + " node."); return Status::OK(); } diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 9d61e1f12f5b6..26adc0aaa8686 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -297,6 +297,7 @@ constexpr const char* kOpenVINOExecutionProvider = "OpenVINOExecutionProvider"; constexpr const char* kVitisAIExecutionProvider = "VitisAIExecutionProvider"; constexpr const char* kRocmExecutionProvider = "ROCMExecutionProvider"; constexpr const char* kTensorrtExecutionProvider = "TensorrtExecutionProvider"; +constexpr const char* kNvTensorRTRTXExecutionProvider = "NvTensorRTRTXExecutionProvider"; constexpr const char* kMIGraphXExecutionProvider = "MIGraphXExecutionProvider"; constexpr const char* kQnnExecutionProvider = "QNNExecutionProvider"; constexpr const char* kCpuExecutionProvider = "CPUExecutionProvider"; diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index 0f6cf56cd951f..ff29de6aa71db 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -347,7 +347,7 @@ common::Status IExecutionProvider::Compile(const std::vector& return g_host->IExecutionProvider__Compile(this, fused_nodes_and_graphs, node_compute_funcs); } -#ifdef USE_TENSORRT +#if defined(USE_TENSORRT) || defined(USE_NV) std::unique_ptr CreateCUDAAllocator(int16_t device_id, const char* name) { return g_host->CreateCUDAAllocator(device_id, name); } diff --git a/onnxruntime/core/providers/webgpu/nn/instance_norm.cc b/onnxruntime/core/providers/webgpu/nn/instance_norm.cc index 0cab454a5a530..f3bccec4872fc 100644 --- a/onnxruntime/core/providers/webgpu/nn/instance_norm.cc +++ b/onnxruntime/core/providers/webgpu/nn/instance_norm.cc @@ -13,23 +13,25 @@ namespace onnxruntime { namespace webgpu { Status ComputeChannelScaleShiftProgram::GenerateShaderCode(ShaderHelper& shader) const { - const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseElementTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseIndicesTypeAlias); + const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); const auto& scale = shader.AddInput("scale", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); const auto& bias = shader.AddInput("bias", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); - const ShaderVariableHelper& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + const ShaderVariableHelper& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); - shader.AdditionalImplementation() << "var workgroup_shared_sum : array;\n" - << "var workgroup_shared_squared_sum : array;\n" + shader.AdditionalImplementation() << "alias 
f32_val_t = " << (components_ == 4 ? "vec4" : (components_ == 2 ? "vec2" : "f32")) << ";\n" + << "var workgroup_shared_sum : array;\n" + << "var workgroup_shared_squared_sum : array;\n" << "const workgroup_size = " << workgroup_size_ << ";\n"; + shader.MainFunctionBody() << " let batch = workgroup_idx / uniforms.x_shape[1];\n" << " let channel = workgroup_idx % uniforms.x_shape[1];\n" << " let hight = uniforms.x_shape[2];\n" << " // initialize workgroup memory<< \n" - << " var sum = x_value_t(0);\n" - << " var squared_sum = x_value_t(0);\n" + << " var sum = f32_val_t(0);\n" + << " var squared_sum = f32_val_t(0);\n" << " for (var h = local_idx; h < hight; h += workgroup_size) {\n" << " let indices = x_indices_t(batch, channel, h);\n" - << " let value =" << input.GetByIndices("indices") << ";\n" + << " let value = f32_val_t(" << input.GetByIndices("indices") << ");\n" << " sum += value;\n" << " squared_sum += value * value;\n" << " }\n" @@ -44,12 +46,12 @@ Status ComputeChannelScaleShiftProgram::GenerateShaderCode(ShaderHelper& shader) << " workgroupBarrier();\n" << " }\n" << " if (local_idx == 0) {\n" - << " let sum_final = " << SumVector("workgroup_shared_sum[0]", components_) << " / x_element_t(hight * " << components_ << ");\n" - << " let squared_sum_final = " << SumVector("workgroup_shared_squared_sum[0]", components_) << " / x_element_t(hight * " << components_ << ");\n" - << " let inv_std_dev = inverseSqrt(squared_sum_final - sum_final * sum_final + x_element_t(" << std::to_string(epsilon_) << "));\n" - << " let channel_scale = inv_std_dev * " << scale.GetByOffset("channel") << ";\n" - << " let channel_shift = " << bias.GetByOffset("channel") << " - sum_final * channel_scale;\n" - << " " << output.SetByOffset("workgroup_idx", "output_value_t(channel_scale, channel_shift)") << ";\n" + << " let sum_final = " << SumVector("workgroup_shared_sum[0]", components_) << " / f32(hight * " << components_ << ");\n" + << " let squared_sum_final = " << SumVector("workgroup_shared_squared_sum[0]", components_) << " / f32(hight * " << components_ << ");\n" + << " let inv_std_dev = inverseSqrt(squared_sum_final - sum_final * sum_final + f32(" << std::to_string(epsilon_) << "));\n" + << " let channel_scale = inv_std_dev * f32(" << scale.GetByOffset("channel") << ");\n" + << " let channel_shift = f32(" << bias.GetByOffset("channel") << ") - sum_final * channel_scale;\n" + << " " << output.SetByOffset("workgroup_idx", "output_value_t(output_element_t(channel_scale), output_element_t(channel_shift))") << ";\n" << " }\n"; return Status::OK(); } @@ -110,7 +112,7 @@ Status InstanceNormProgramNHWC::GenerateShaderCode(ShaderHelper& shader) const { << "let input_value = " << input.GetByOffset("global_idx") << ";\n"; if (components_ > 1) { shader.MainFunctionBody() << "for (var i : u32 = 0; i < uniforms.components; i = i + 1) {\n" - << " let scale_sift = " << channel_scale_shift.GetByOffset("scale_offset + i") << ";\n" + << " let scale_sift = " << channel_scale_shift.GetByOffset("uniforms.components * scale_offset + i") << ";\n" << " scale[i] = input_element_t(scale_sift.x);\n" << " shift[i] = input_element_t(scale_sift.y);\n" << "}\n"; diff --git a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc new file mode 100644 index 0000000000000..e7f902cc08b40 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc @@ -0,0 +1,147 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. + +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/providers/webgpu/tensor/depth_to_space.h" +#include "core/providers/webgpu/webgpu_utils.h" + +namespace onnxruntime { +namespace webgpu { + +#define WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(start, end, domain, is_nhwc) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ + DepthToSpace, \ + domain, \ + start, \ + end, \ + kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", WebGpuSupportedFloatTypes()), \ + DepthToSpace); + +#define WEBGPU_DEPTH_TO_SPACE_KERNEL(version, domain, is_nhwc) \ + ONNX_OPERATOR_KERNEL_EX( \ + DepthToSpace, \ + domain, \ + version, \ + kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", WebGpuSupportedFloatTypes()), \ + DepthToSpace); + +WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(11, 12, kOnnxDomain, false) +WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kOnnxDomain, false) + +WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(11, 12, kMSInternalNHWCDomain, true) +WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kMSInternalNHWCDomain, true) + +void AppendPermFunction(std::ostream& os, const ShaderVariableHelper& input, const int64_t* perm) { + os << "fn perm(i: input_indices_t) -> input_indices_t {\n" + << " var a: input_indices_t;\n"; + for (int idx = 0; idx < input.Rank(); ++idx) { + os << " " << input.IndicesSet("a", std::to_string(perm[idx]), "i[" + std::to_string(idx) + "]") << "\n"; + } + os << " return a;\n" + << "}\n"; +} + +Status DepthToSpaceProgram::GenerateShaderCode(ShaderHelper& shader) const { + const ShaderVariableHelper& input = shader.AddInput("input"); + const ShaderVariableHelper& output = shader.AddOutput("output"); + + AppendPermFunction(shader.AdditionalImplementation(), input, perm_); + + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + << " let indices = " << output.OffsetToIndices("global_idx") << ";\n" + << " let aIndices = perm(indices);\n" + << " " << output.SetByOffset("global_idx", input.GetByIndices("aIndices")); + + return Status::OK(); +} + +template +Status DepthToSpace::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { + const auto* input = context.Input(0); + const TensorShape input_shape = input->Shape(); + int64_t input_rank = input_shape.NumDimensions(); + ORT_ENFORCE(input_rank == 4, "Input must be rank 4."); + + int64_t n, c, h, w; + int64_t shape[6]; + int64_t perm[6]; + if (is_nhwc) { + n = input_shape[0]; + h = input_shape[1]; + w = input_shape[2]; + c = input_shape[3]; + + if (is_dcr_) { + int64_t shape_values[] = {n, h, w, blocksize_, blocksize_, c / (blocksize_ * blocksize_)}; + int64_t perm_values[] = {0, 1, 3, 2, 4, 5}; + std::copy(shape_values, shape_values + 6, shape); + std::copy(perm_values, perm_values + 6, perm); + } else { + int64_t shape_values[] = {n, h, w, c / (blocksize_ * blocksize_), blocksize_, blocksize_}; + int64_t perm_values[] = {0, 1, 4, 2, 5, 3}; + std::copy(shape_values, shape_values + 6, shape); + std::copy(perm_values, perm_values + 6, perm); + } + } else { + n = input_shape[0]; + h = input_shape[2]; + w = input_shape[3]; + c = input_shape[1]; + + if (is_dcr_) { + int64_t shape_values[] = {n, blocksize_, blocksize_, c / (blocksize_ * blocksize_), h, w}; + int64_t perm_values[] = {0, 3, 4, 1, 5, 2}; + std::copy(shape_values, shape_values + 6, shape); + std::copy(perm_values, perm_values + 6, perm); + } else { + int64_t shape_values[] = {n, c / (blocksize_ * 
blocksize_), blocksize_, blocksize_, h, w}; + int64_t perm_values[] = {0, 1, 4, 2, 5, 3}; + std::copy(shape_values, shape_values + 6, shape); + std::copy(perm_values, perm_values + 6, perm); + } + } + + std::vector shape_vec(shape, shape + 6); + TensorShape input_override_shape(shape_vec); + + // Calculate the final 4D output shape + int64_t output_shape[4]; + if (is_nhwc) { + int64_t output_shape_values[] = {n, h * blocksize_, w * blocksize_, c / (blocksize_ * blocksize_)}; + std::copy(output_shape_values, output_shape_values + 4, output_shape); + } else { + int64_t output_shape_values[] = {n, c / (blocksize_ * blocksize_), h * blocksize_, w * blocksize_}; + std::copy(output_shape_values, output_shape_values + 4, output_shape); + } + TensorShape final_output_shape(gsl::make_span(output_shape)); + + auto* output = context.Output(0, final_output_shape); + int64_t output_size = output->Shape().Size(); + + if (output_size == 0) { + return Status::OK(); + } + + std::vector shape_after_permutation_vec(6); + for (int i = 0; i < 6; i++) { + shape_after_permutation_vec[i] = shape[perm[i]]; + } + TensorShape output_override_shape(shape_after_permutation_vec); + + DepthToSpaceProgram program{perm}; + program + .AddInput({input, ProgramTensorMetadataDependency::TypeAndRank, input_override_shape, 1}) + .AddOutput({output, ProgramTensorMetadataDependency::None, output_override_shape, 1}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .CacheHint(absl::StrJoin(input_shape.GetDims(), "-"), blocksize_, is_dcr_ ? "DCR" : "CRD") + .AddUniformVariable({static_cast(output_size)}); + return context.RunProgram(program); +} + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.h b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.h new file mode 100644 index 0000000000000..153618b5d0237 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.h @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
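+//
+// WebGPU DepthToSpace. DepthToSpaceProgram generates a gather shader driven by a fixed 6-D
+// permutation, and the DepthToSpace kernel computes that permutation for both layouts
+// (selected via the is_nhwc argument of the registration macros) and for both DCR and CRD
+// modes; see depth_to_space.cc for the shape/permutation tables.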
+ +#pragma once + +#include "core/providers/webgpu/program.h" +#include "core/providers/webgpu/webgpu_kernel.h" + +namespace onnxruntime { +namespace webgpu { + +class DepthToSpaceProgram final : public Program { + public: + DepthToSpaceProgram(int64_t* perm) : Program{"DepthToSpace"}, perm_{perm} {} + Status GenerateShaderCode(ShaderHelper& sh) const override; + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}); + + private: + int64_t* perm_; +}; + +template +class DepthToSpace final : public WebGpuKernel { + public: + DepthToSpace(const OpKernelInfo& info) : WebGpuKernel(info) { + blocksize_ = info.GetAttr("blocksize"); + std::string mode = info.GetAttrOrDefault("mode", "DCR"); + is_dcr_ = (mode == "DCR"); + } + + Status ComputeInternal(ComputeContext& context) const override; + + private: + int64_t blocksize_; + bool is_dcr_; +}; + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index b126ca823970a..f5f108121cb8d 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -581,10 +581,10 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/session/abi_key_value_pairs.h b/onnxruntime/core/session/abi_key_value_pairs.h index 28de183fde405..3242be817881a 100644 --- a/onnxruntime/core/session/abi_key_value_pairs.h +++ b/onnxruntime/core/session/abi_key_value_pairs.h @@ -19,10 +19,17 @@ struct OrtKeyValuePairs { Sync(); } void Add(const char* key, const char* value) { - return Add(std::string(key), std::string(value)); + // ignore if either are nullptr. + if (key && value) { + Add(std::string(key), std::string(value)); + } } void Add(const std::string& key, const std::string& value) { + if (key.empty()) { // ignore empty keys + return; + } + auto iter_inserted = entries.insert({key, value}); bool inserted = iter_inserted.second; if (inserted) { @@ -37,6 +44,10 @@ struct OrtKeyValuePairs { // we don't expect this to be common. reconsider using std::vector if it turns out to be. 
void Remove(const char* key) { + if (key == nullptr) { + return; + } + auto iter = entries.find(key); if (iter != entries.end()) { auto key_iter = std::find(keys.begin(), keys.end(), iter->first.c_str()); diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 06fa3d4827366..f0fcf9841c2c1 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -104,6 +104,7 @@ using EtwRegistrationManager_EtwInternalCallback = EtwRegistrationManager::EtwIn #include "core/providers/migraphx/migraphx_provider_factory_creator.h" #include "core/providers/openvino/openvino_provider_factory_creator.h" #include "core/providers/tensorrt/tensorrt_provider_factory_creator.h" +#include "core/providers/nv_tensorrt_rtx/nv_provider_factory_creator.h" #include "core/providers/vitisai/vitisai_provider_factory_creator.h" #include "core/providers/qnn/qnn_provider_factory_creator.h" @@ -118,8 +119,10 @@ using EtwRegistrationManager_EtwInternalCallback = EtwRegistrationManager::EtwIn #include "core/providers/cuda/cuda_provider_options.h" #include "core/providers/cann/cann_provider_options.h" #include "core/providers/dnnl/dnnl_provider_options.h" +#include "core/providers/nv_tensorrt_rtx/nv_provider_factory.h" +#include "core/providers/nv_tensorrt_rtx/nv_provider_options.h" -#if !defined(ORT_MINIMAL_BUILD) && defined(USE_TENSORRT) +#if !defined(ORT_MINIMAL_BUILD) && (defined(USE_TENSORRT) || defined(USE_NV)) #include "core/session/onnxruntime_session_options_config_keys.h" #endif @@ -154,6 +157,9 @@ ProviderInfo_ROCM& GetProviderInfo_ROCM(); ProviderHostCPU& GetProviderHostCPU(); ProviderInfo_MIGraphX* TryGetProviderInfo_MIGraphX(); ProviderInfo_MIGraphX& GetProviderInfo_MIGraphX(); +ProviderInfo_Nv* TryGetProviderInfo_Nv(); +ProviderInfo_Nv& GetProviderInfo_Nv(); + ONNX_NAMESPACE::OpSchema CreateSchema(const std::string& domain, const std::vector& ops); struct TensorShapeProto_Dimension_Iterator_Impl : TensorShapeProto_Dimension_Iterator { TensorShapeProto_Dimension_Iterator_Impl(google::protobuf::internal::RepeatedPtrIterator&& v) : v_{std::move(v)} {} @@ -1788,6 +1794,7 @@ void ProviderLibrary::Unload() { } } + initialized_ = false; handle_ = nullptr; provider_ = nullptr; } @@ -1849,6 +1856,7 @@ static ProviderLibrary s_library_tensorrt(LIBRARY_PREFIX ORT_TSTR("onnxruntime_p false #endif ); +static ProviderLibrary s_library_nv(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_nv_tensorrt_rtx") LIBRARY_EXTENSION); static ProviderLibrary s_library_migraphx(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_migraphx") LIBRARY_EXTENSION); // QNN EP can be built either as a static library or a shared library. Can safely define s_library_qnn even if static. 
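For orientation, a minimal usage sketch of the path these registrations enable, assuming a build with onnxruntime_USE_NV=ON and the onnxruntime_providers_nv_tensorrt_rtx library available at runtime; the provider name and the AppendExecutionProvider call follow the tests added later in this diff, and model.onnx is a placeholder path:

    // Sketch only: appending the NV TensorRT RTX EP by name routes through
    // provider_registration.cc into NvProviderFactoryCreator::Create(), which
    // loads the onnxruntime_providers_nv_tensorrt_rtx shared library on demand.
    #include <onnxruntime_cxx_api.h>

    int main() {
      Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "nv_tensorrt_rtx_demo");
      Ort::SessionOptions so;
      so.AppendExecutionProvider("NvTensorRtRtx", {});
      Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // placeholder model path
      return 0;
    }

The perftest tool in this diff exposes the same provider via -e nvtensorrtrtx (see command_args_parser.cc and ort_test_session.cc below).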
@@ -1866,6 +1874,7 @@ void UnloadSharedProviders() { s_library_shared.Unload(); s_library_migraphx.Unload(); s_library_qnn.Unload(); + s_library_nv.Unload(); } // Used by test code @@ -1996,6 +2005,14 @@ std::shared_ptr TensorrtProviderFactoryCreator::Creat return s_library_tensorrt.Get().CreateExecutionProviderFactory(provider_options); } +std::shared_ptr NvProviderFactoryCreator::Create(int device_id) { + return s_library_nv.Get().CreateExecutionProviderFactory(device_id); +} + +std::shared_ptr NvProviderFactoryCreator::Create(const ProviderOptions& provider_options) { + return s_library_nv.Get().CreateExecutionProviderFactory(&provider_options); +} + std::shared_ptr MIGraphXProviderFactoryCreator::Create(const OrtMIGraphXProviderOptions* provider_options) { return s_library_migraphx.Get().CreateExecutionProviderFactory(provider_options); } @@ -2091,6 +2108,20 @@ ProviderInfo_TensorRT& GetProviderInfo_TensorRT() { ORT_THROW("TensorRT Provider not available, can't get interface for it"); } +ProviderInfo_Nv* TryGetProviderInfo_Nv() try { + return reinterpret_cast(s_library_nv.Get().GetInfo()); +} catch (const std::exception& exception) { + LOGS_DEFAULT(ERROR) << exception.what(); + return nullptr; +} + +ProviderInfo_Nv& GetProviderInfo_Nv() { + if (auto* info = TryGetProviderInfo_Nv()) + return *info; + + ORT_THROW("NV Provider not available, can't get interface for it"); +} + ProviderInfo_CUDA* TryGetProviderInfo_CUDA() try { return reinterpret_cast(s_library_cuda.Get().GetInfo()); } catch (const std::exception& exception) { @@ -2553,7 +2584,7 @@ ORT_API_STATUS_IMPL(OrtApis::UpdateTensorRTProviderOptions, API_IMPL_END } -#if defined(USE_TENSORRT) || defined(USE_CUDA) || defined(USE_CANN) || defined(USE_DNNL) || defined(USE_ROCM) +#if defined(USE_TENSORRT) || defined(USE_CUDA) || defined(USE_CANN) || defined(USE_DNNL) || defined(USE_ROCM) || defined(USE_NV) static std::string BuildOptionsString(const onnxruntime::ProviderOptions::iterator& begin, const onnxruntime::ProviderOptions::iterator& end) { std::ostringstream options; diff --git a/onnxruntime/core/session/provider_registration.cc b/onnxruntime/core/session/provider_registration.cc index be3a9ff9ef62e..82201741cb047 100644 --- a/onnxruntime/core/session/provider_registration.cc +++ b/onnxruntime/core/session/provider_registration.cc @@ -26,6 +26,9 @@ #include "core/providers/dml/dml_provider_factory_creator.h" #endif +#if defined(USE_NV) +#include "core/providers/nv_tensorrt_rtx/nv_provider_options.h" +#endif using namespace onnxruntime; namespace onnxruntime { @@ -97,6 +100,7 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider, JS, VitisAI, CoreML, + NvTensorRtRtx, // TensorRt EP for RTX GPUs. 
}; struct EpToAppend { @@ -105,7 +109,7 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider, const char* canonical_name = nullptr; }; - static std::array supported_eps = { + static std::array supported_eps = { EpToAppend{EpID::DML, "DML", kDmlExecutionProvider}, EpToAppend{EpID::QNN, "QNN", kQnnExecutionProvider}, EpToAppend{EpID::OpenVINO, "OpenVINO", kOpenVINOExecutionProvider}, @@ -117,7 +121,7 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider, EpToAppend{EpID::JS, "JS", kJsExecutionProvider}, EpToAppend{EpID::VitisAI, "VitisAI", kVitisAIExecutionProvider}, EpToAppend{EpID::CoreML, "CoreML", kCoreMLExecutionProvider}, - }; + EpToAppend{EpID::NvTensorRtRtx, "NvTensorRtRtx", kNvTensorRTRTXExecutionProvider}}; ProviderOptions provider_options; OrtStatus* status = ParseProviderOptions(provider_options_keys, @@ -280,6 +284,18 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider, options->provider_factories.push_back(CoreMLProviderFactoryCreator::Create(provider_options)); #else status = create_not_supported_status(); +#endif + break; + } + case EpID::NvTensorRtRtx: { +#if defined(USE_NV) + auto factory = onnxruntime::NvProviderFactoryCreator::Create(provider_options); + if (!factory) { + return OrtApis::CreateStatus(ORT_FAIL, "SessionOptionsAppendExecutionProvider_Nv_TensorRT_RTX: Failed to load shared library"); + } + options->provider_factories.push_back(factory); +#else + status = create_not_supported_status(); #endif break; } diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index 785eb9c485d25..ed0298a85b8e7 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -507,6 +507,23 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] else: self._fallback_providers = ["CPUExecutionProvider"] + if "NvTensorRTRTXExecutionProvider" in available_providers: + if ( + providers + and any( + provider == "CUDAExecutionProvider" + or (isinstance(provider, tuple) and provider[0] == "CUDAExecutionProvider") + for provider in providers + ) + and any( + provider == "NvTensorRTRTXExecutionProvider" + or (isinstance(provider, tuple) and provider[0] == "NvExecutionProvider") + for provider in providers + ) + ): + self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + else: + self._fallback_providers = ["CPUExecutionProvider"] # MIGraphX can fall back to ROCM if it's explicitly assigned. All others fall back to CPU. 
elif "MIGraphXExecutionProvider" in available_providers: if providers and any( @@ -582,6 +599,15 @@ def _register_ep_custom_ops(self, session_options, providers, provider_options, ): C.register_tensorrt_plugins_as_custom_ops(session_options, providers[i][1]) + if providers[i] in available_providers and providers[i] == "NvTensorRTRTXExecutionProvider": + C.register_nv_tensorrt_rtx_plugins_as_custom_ops(session_options, provider_options[i]) + elif ( + isinstance(providers[i], tuple) + and providers[i][0] in available_providers + and providers[i][0] == "NvTensorrtRTXExecutionProvider" + ): + C.register_nv_tensorrt_rtx_plugins_as_custom_ops(session_options, providers[i][1]) + class IOBinding: """ diff --git a/onnxruntime/python/onnxruntime_pybind_schema.cc b/onnxruntime/python/onnxruntime_pybind_schema.cc index 958da26f4faf0..cd1d2a8da10aa 100644 --- a/onnxruntime/python/onnxruntime_pybind_schema.cc +++ b/onnxruntime/python/onnxruntime_pybind_schema.cc @@ -48,6 +48,9 @@ void addGlobalSchemaFunctions(pybind11::module& m) { #ifdef USE_TENSORRT onnxruntime::TensorrtProviderFactoryCreator::Create(0), #endif +#ifdef USE_NV + onnxruntime::NvProviderFactoryCreator::Create(0), +#endif #ifdef USE_MIGRAPHX onnxruntime::MIGraphXProviderFactoryCreator::Create(0), #endif diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 22914c9dec7fe..60bc6865b2ccf 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -507,6 +507,38 @@ void RegisterTensorRTPluginsAsCustomOps(PySessionOptions& so, const ProviderOpti } #endif +#ifdef USE_NV +void RegisterNvTensorRTRtxPluginsAsCustomOps(PySessionOptions& so, const ProviderOptions& options) { + if (auto* nv_tensorrt_rtx_provider_info = TryGetProviderInfo_Nv()) { + auto is_already_in_domains = [&](std::string& domain_name, std::vector& domains) { + for (auto ptr : domains) { + if (domain_name == ptr->domain_) { + return true; + } + } + return false; + }; + + std::string extra_plugin_lib_paths = ""; + const auto it = options.find("extra_plugin_lib_paths"); + if (it != options.end()) { + extra_plugin_lib_paths = it->second; + } + std::vector custom_op_domains; + nv_tensorrt_rtx_provider_info->GetTensorRTCustomOpDomainList(custom_op_domains, extra_plugin_lib_paths); + for (auto ptr : custom_op_domains) { + if (!is_already_in_domains(ptr->domain_, so.custom_op_domains_)) { + so.custom_op_domains_.push_back(ptr); + } else { + LOGS_DEFAULT(WARNING) << "The custom op domain name " << ptr->domain_ << " is already in session option."; + } + } + } else { + ORT_THROW("Please install TensorRT libraries as mentioned in the GPU requirements page, make sure they're in the PATH or LD_LIBRARY_PATH, and that your GPU is supported."); + } +} +#endif + std::unique_ptr CreateExecutionProviderInstance( const SessionOptions& session_options, const std::string& type, @@ -851,6 +883,28 @@ std::unique_ptr CreateExecutionProviderInstance( << "https://onnxruntime.ai/docs/execution-providers/" << "TensorRT-ExecutionProvider.html#requirements to ensure all dependencies are met."; #endif + + } else if (type == kNvTensorRTRTXExecutionProvider) { +#ifdef USE_NV + if (Env::Default().GetEnvironmentVar("ORT_NV_TENSORRT_RTX_UNAVAILABLE").empty()) { + auto it = provider_options_map.find(type); + if (it != provider_options_map.end()) { + ProviderOptions info = it->second; + if (std::shared_ptr nv_tensorrt_rtx_provider_factory = onnxruntime::NvProviderFactoryCreator::Create(info)) { + return 
nv_tensorrt_rtx_provider_factory->CreateProvider(); + } + } else { + if (std::shared_ptr nv_tensorrt_rtx_provider_factory = onnxruntime::NvProviderFactoryCreator::Create(cuda_device_id)) { + return nv_tensorrt_rtx_provider_factory->CreateProvider(); + } + } + } + LOGS_DEFAULT(WARNING) << "Failed to create " + << type + << ". Please reference " + << "https://onnxruntime.ai/docs/execution-providers/" + << "TensorRT-ExecutionProvider.html#requirements to ensure all dependencies are met."; +#endif } else if (type == kMIGraphXExecutionProvider) { #ifdef USE_MIGRAPHX std::string calibration_table; @@ -1535,6 +1589,12 @@ void addGlobalMethods(py::module& m) { "Register TensorRT plugins as custom ops."); #endif +#ifdef USE_NV + m.def( + "register_nv_tensorrt_rtx_plugins_as_custom_ops", [](PySessionOptions& so, const ProviderOptions& options) { RegisterNvTensorRTRtxPluginsAsCustomOps(so, options); }, + "Register NV TensorRT RTX plugins as custom ops."); +#endif + #ifdef ENABLE_ATEN m.def("register_aten_op_executor", [](const std::string& is_tensor_argument_address_str, const std::string& aten_op_executor_address_str) -> void { diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.h b/onnxruntime/python/onnxruntime_pybind_state_common.h index 8d4a882b140ac..168880517c3a5 100644 --- a/onnxruntime/python/onnxruntime_pybind_state_common.h +++ b/onnxruntime/python/onnxruntime_pybind_state_common.h @@ -29,6 +29,7 @@ struct OrtStatus { #include "core/providers/providers.h" #include "core/providers/provider_factory_creators.h" #include "core/providers/tensorrt/tensorrt_provider_options.h" +#include "core/providers/nv_tensorrt_rtx/nv_provider_options.h" #if defined(USE_CUDA) || defined(USE_ROCM) #define BACKEND_PROC "GPU" @@ -122,6 +123,9 @@ struct OrtStatus { #ifdef USE_TENSORRT #include "core/providers/tensorrt/tensorrt_provider_factory.h" #endif +#ifdef USE_NV +#include "core/providers/nv_tensorrt_rtx/nv_provider_factory.h" +#endif #ifdef USE_MIGRAPHX #include "core/providers/migraphx/migraphx_provider_factory.h" #endif @@ -173,6 +177,13 @@ ProviderInfo_TensorRT& GetProviderInfo_TensorRT(); } // namespace onnxruntime #endif +#ifdef USE_NV +namespace onnxruntime { +ProviderInfo_Nv* TryGetProviderInfo_Nv(); +ProviderInfo_Nv& GetProviderInfo_Nv(); +} // namespace onnxruntime +#endif + #ifdef USE_CANN namespace onnxruntime { ProviderInfo_CANN* TryGetProviderInfo_CANN(); diff --git a/onnxruntime/python/tools/transformers/fusion_conformer_attention.py b/onnxruntime/python/tools/transformers/fusion_conformer_attention.py index 0f0c12b0e0200..2b7fbffa842f7 100644 --- a/onnxruntime/python/tools/transformers/fusion_conformer_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_conformer_attention.py @@ -126,8 +126,14 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): [1, 0, 0, 0, 0], ) if k_nodes is None: - logger.debug("fuse_conformer_attention: failed to match k path") - return + k_nodes = self.model.match_parent_path( + matmul_qk, + ["Transpose", "Reshape", "Add", "MatMul"], + [1, 0, 0, 0], + ) + if k_nodes is None: + logger.debug("fuse_conformer_attention: failed to match k path") + return else: concat_k = k_nodes[1] concat_parent = self.model.get_parent(concat_k, 0, None) @@ -188,7 +194,6 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): logger.debug("fuse_conformer_attention: MultiHeadAttention node creation failed") return - self.increase_counter(new_node.op_type) self.nodes_to_add.append(new_node) 
self.node_name_to_graph_name[new_node.name] = self.this_graph_name diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py index 5343c77adb97a..e31bc7e6c1bcb 100644 --- a/onnxruntime/python/tools/transformers/fusion_utils.py +++ b/onnxruntime/python/tools/transformers/fusion_utils.py @@ -309,11 +309,9 @@ def to_array(tensor: TensorProto, fill_zeros: bool = False) -> ndarray: # When weights are in external data format but not presented, we can still test the optimizer with two changes: # (1) set fill_zeros = True (2) change load_external_data=False in optimizer.py if fill_zeros: - from onnx import mapping - return ndarray( shape=tensor.dims, - dtype=mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.data_type], + dtype=helper.tensor_dtype_to_np_dtype(tensor.data_type), ) return numpy_helper.to_array(tensor) diff --git a/onnxruntime/test/autoep/test_autoep_selection.cc b/onnxruntime/test/autoep/test_autoep_selection.cc index b5d9c81f250c2..f9c50bf5069f1 100644 --- a/onnxruntime/test/autoep/test_autoep_selection.cc +++ b/onnxruntime/test/autoep/test_autoep_selection.cc @@ -58,7 +58,7 @@ template & model_uri, const std::string& ep_to_select, std::optional library_path, - const OrtKeyValuePairs& provider_options, + const Ort::KeyValuePairs& ep_options, const std::vector& inputs, const char* output_name, const std::vector& expected_dims_y, @@ -75,13 +75,15 @@ static void TestInference(Ort::Env& env, const std::basic_string& mod if (auto_select) { // manually specify EP to select for now - ASSERT_ORTSTATUS_OK(Ort::GetApi().AddSessionConfigEntry(session_options, "test.ep_to_select", - ep_to_select.c_str())); + session_options.AddConfigEntry("test.ep_to_select", ep_to_select.c_str()); + // add the provider options to the session options with the required prefix const std::string option_prefix = OrtSessionOptions::GetProviderOptionPrefix(ep_to_select.c_str()); - for (const auto& [key, value] : provider_options.entries) { + std::vector keys, values; + ep_options.GetKeyValuePairs(keys, values); + for (size_t i = 0, end = keys.size(); i < end; ++i) { // add the default value with prefix - session_options.AddConfigEntry((option_prefix + key).c_str(), value.c_str()); + session_options.AddConfigEntry((option_prefix + keys[i]).c_str(), values[i]); } } else { std::vector devices; @@ -92,9 +94,17 @@ static void TestInference(Ort::Env& env, const std::basic_string& mod DefaultDeviceSelection(ep_to_select, devices); } - ASSERT_ORTSTATUS_OK(Ort::GetApi().SessionOptionsAppendExecutionProvider_V2( - session_options, env, devices.data(), devices.size(), - provider_options.keys.data(), provider_options.values.data(), provider_options.entries.size())); + // C API. Test the C++ API because if it works the C API must also work. 
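+  // (The commented-out block below shows the equivalent C API call.)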
+ // ASSERT_ORTSTATUS_OK(Ort::GetApi().SessionOptionsAppendExecutionProvider_V2( + // session_options, env, devices.data(), devices.size(), + // provider_options.keys.data(), provider_options.values.data(), provider_options.entries.size())); + std::vector ep_devices; + ep_devices.reserve(devices.size()); + for (const auto* device : devices) { + ep_devices.emplace_back(device); + } + + session_options.AppendExecutionProvider_V2(*ort_env, ep_devices, ep_options); } // if session creation passes, model loads fine @@ -115,7 +125,7 @@ static void TestInference(Ort::Env& env, const std::basic_string& mod namespace { void RunBasicTest(const std::string& ep_name, std::optional library_path, - const OrtKeyValuePairs& provider_options = {}, + const Ort::KeyValuePairs& provider_options = Ort::KeyValuePairs{}, const std::function&)>& select_devices = nullptr) { const auto run_test = [&](bool auto_select) { std::vector> inputs(1); @@ -149,7 +159,7 @@ TEST(AutoEpSelection, CpuEP) { #if defined(USE_CUDA) TEST(AutoEpSelection, CudaEP) { - OrtKeyValuePairs provider_options; + Ort::KeyValuePairs provider_options; provider_options.Add("prefer_nhwc", "1"); RunBasicTest(kCudaExecutionProvider, "onnxruntime_providers_cuda", provider_options); } @@ -157,7 +167,7 @@ TEST(AutoEpSelection, CudaEP) { #if defined(USE_DML) TEST(AutoEpSelection, DmlEP) { - OrtKeyValuePairs provider_options; + Ort::KeyValuePairs provider_options; provider_options.Add("disable_metacommands", "true"); // checking options are passed through const auto select_devices = [&](std::vector& devices) { @@ -172,6 +182,7 @@ TEST(AutoEpSelection, DmlEP) { if (strcmp(c_api->EpDevice_EpName(ep_device), kDmlExecutionProvider) == 0) { const auto* device = c_api->EpDevice_Device(ep_device); const OrtKeyValuePairs* kvps = c_api->HardwareDevice_Metadata(device); + if (devices.empty()) { // add the first device devices.push_back(ep_device); @@ -179,13 +190,7 @@ TEST(AutoEpSelection, DmlEP) { // if this is available, 0 == best performance auto* perf_index = c_api->GetKeyValue(kvps, "HighPerformanceIndex"); if (perf_index && strcmp(perf_index, "0") == 0) { - devices.push_back(ep_device); - } else { - // let an NVIDIA device override the first device - if (strcmp(c_api->EpDevice_EpVendor(ep_device), "NVIDIA") == 0) { - devices.clear(); - devices[0] = ep_device; - } + devices[0] = ep_device; // replace as this is the higher performance device } } } @@ -204,16 +209,71 @@ TEST(AutoEpSelection, WebGpuEP) { } #endif -TEST(OrtEpLibrary, LoadUnloadPluginLibrary) { +// tests for AutoEP selection related things in the API that aren't covered by the other tests. 
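+// Covers nullptr/empty-key handling in the OrtKeyValuePairs C API, construction of
+// Ort::KeyValuePairs from std::unordered_map, and AppendExecutionProvider_V2 with both
+// Ort::KeyValuePairs and std::unordered_map option forms.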
+TEST(AutoEpSelection, MiscApiTests) { + const OrtApi* c_api = &Ort::GetApi(); + + // nullptr and empty input to OrtKeyValuePairs + { + OrtKeyValuePairs* kvps = nullptr; + c_api->CreateKeyValuePairs(&kvps); + c_api->AddKeyValuePair(kvps, "key1", nullptr); // should be ignored + c_api->AddKeyValuePair(kvps, nullptr, "value1"); // should be ignored + c_api->RemoveKeyValuePair(kvps, nullptr); // should be ignored + + c_api->AddKeyValuePair(kvps, "", "value2"); // empty key should be ignored + ASSERT_EQ(c_api->GetKeyValue(kvps, ""), nullptr); + + c_api->AddKeyValuePair(kvps, "key2", ""); // empty value is allowed + ASSERT_EQ(c_api->GetKeyValue(kvps, "key2"), std::string("")); + } + + // construct KVP from std::unordered_map + { + std::unordered_map kvps; + kvps["key1"] = "value1"; + kvps["key2"] = "value2"; + Ort::KeyValuePairs ort_kvps(kvps); + ASSERT_EQ(ort_kvps.GetValue("key1"), std::string("value1")); + ASSERT_EQ(ort_kvps.GetValue("key2"), std::string("value2")); + } + + std::vector ep_devices = ort_env->GetEpDevices(); + + // explicit EP selection with Ort::KeyValuePairs for options + { + Ort::SessionOptions session_options; + Ort::KeyValuePairs ep_options; + ep_options.Add("option1", "true"); + session_options.AppendExecutionProvider_V2(*ort_env, {ep_devices[0]}, ep_options); + } + + // explicit EP selection with for options + { + Ort::SessionOptions session_options; + std::unordered_map ep_options; + ep_options["option1"] = "true"; + session_options.AppendExecutionProvider_V2(*ort_env, {ep_devices[0]}, ep_options); + } +} + +namespace { +struct ExamplePluginInfo { + const std::filesystem::path library_path = #if _WIN32 - std::filesystem::path library_path = "example_plugin_ep.dll"; + "example_plugin_ep.dll"; #else - std::filesystem::path library_path = "libexample_plugin_ep.so"; + "libexample_plugin_ep.so"; #endif - const std::string registration_name = "example_ep"; +}; - Ort::SessionOptions session_options; +static const ExamplePluginInfo example_plugin_info; +} // namespace + +TEST(OrtEpLibrary, LoadUnloadPluginLibrary) { + const std::filesystem::path& library_path = example_plugin_info.library_path; + const std::string& registration_name = example_plugin_info.registration_name; OrtEnv* c_api_env = *ort_env; const OrtApi* c_api = &Ort::GetApi(); @@ -238,6 +298,48 @@ TEST(OrtEpLibrary, LoadUnloadPluginLibrary) { ASSERT_ORTSTATUS_OK(Ort::GetApi().UnregisterExecutionProviderLibrary(c_api_env, registration_name.c_str())); } + +TEST(OrtEpLibrary, LoadUnloadPluginLibraryCxxApi) { + const std::filesystem::path& library_path = example_plugin_info.library_path; + const std::string& registration_name = example_plugin_info.registration_name; + + // this should load the library and create OrtEpDevice + ort_env->RegisterExecutionProviderLibrary(registration_name.c_str(), library_path.c_str()); + + std::vector ep_devices = ort_env->GetEpDevices(); + + // should be one device for the example EP + auto test_ep_device = std::find_if(ep_devices.begin(), ep_devices.end(), + [®istration_name](Ort::ConstEpDevice& device) { + // the example uses the registration name for the EP name + // but that is not a requirement and the two can differ. + return device.EpName() == registration_name; + }); + ASSERT_NE(test_ep_device, ep_devices.end()) << "Expected an OrtEpDevice to have been created by the test library."; + + // test all the C++ getters. 
expected values are from \onnxruntime\test\autoep\library\example_plugin_ep.cc + ASSERT_STREQ(test_ep_device->EpVendor(), "Contoso"); + + auto metadata = test_ep_device->EpMetadata(); + ASSERT_STREQ(metadata.GetValue("version"), "0.1"); + + auto options = test_ep_device->EpOptions(); + ASSERT_STREQ(options.GetValue("run_really_fast"), "true"); + + // the CPU device info will vary by machine so check for the lowest common denominator values + Ort::ConstHardwareDevice device = test_ep_device->Device(); + ASSERT_EQ(device.Type(), OrtHardwareDeviceType_CPU); + ASSERT_GE(device.VendorId(), 0); + ASSERT_GE(device.DeviceId(), 0); + ASSERT_NE(device.Vendor(), nullptr); + Ort::ConstKeyValuePairs device_metadata = device.Metadata(); + std::unordered_map metadata_entries = device_metadata.GetKeyValuePairs(); + ASSERT_GT(metadata_entries.size(), 0); // should have at least SPDRP_HARDWAREID on Windows + + // and this should unload it without throwing + ort_env->UnregisterExecutionProviderLibrary(registration_name.c_str()); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/onnx/gen_test_models.py b/onnxruntime/test/onnx/gen_test_models.py index a5224925251cf..8790010e45310 100644 --- a/onnxruntime/test/onnx/gen_test_models.py +++ b/onnxruntime/test/onnx/gen_test_models.py @@ -94,7 +94,7 @@ def generate_size_op_test(type, X, test_folder): def generate_reducesum_op_test(X, test_folder): - type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[X.dtype] + type = helper.np_dtype_to_tensor_dtype(X.dtype) data_dir = os.path.join(test_folder, "test_data_0") os.makedirs(data_dir, exist_ok=True) # Create one output (ValueInfoProto) diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 77800505df9b7..f7760c49d4e79 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -5349,6 +5349,62 @@ TEST(QDQTransformerTests, WeightBiasQuantization_Conv_Weight_Bias) { #endif } +// Tests that the WeightBiasQuantization optimizer does not process nodes that do not +// already have an output that is consumed by a single QuantizeLinear node. +TEST(QDQTransformerTests, WeightBiasQuantization_SkipIfOutputNotQuantized) { + auto test_case = [](bool add_final_reshape) { + auto build_test_case = [&](ModelTestBuilder& builder) { + NodeArg* input_arg = builder.MakeInput({1, 24, 67, 67}, std::numeric_limits::min(), + std::numeric_limits::max()); + NodeArg* weight_arg = builder.MakeInitializer({24, 1, 5, 5}, -0.1f, 0.1f); + NodeArg* bias_arg = builder.MakeInitializer({24}, -0.1f, 0.1f); + NodeArg* input_dq_arg = builder.MakeIntermediate(); + NodeArg* conv_output_arg = add_final_reshape ? builder.MakeIntermediate() : builder.MakeOutput(); + + builder.AddDequantizeLinearNode(input_arg, 0.014f, static_cast(127), input_dq_arg); + auto& conv_node = builder.AddNode("Conv", {input_dq_arg, weight_arg, bias_arg}, {conv_output_arg}); + conv_node.AddAttribute("dilations", std::vector{1, 1}); + conv_node.AddAttribute("kernel_shape", std::vector{5, 5}); + conv_node.AddAttribute("strides", std::vector{2, 2}); + conv_node.AddAttribute("group", static_cast(24)); + conv_node.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + // Make adding a final Reshape node configurable to test two cases: + // - Conv produces a graph output + // - Conv output is consumed by some node that is NOT a QuantizeLinear + // In either case, the WeightBiasQuantization optimizer should skip this node. 
+ if (add_final_reshape) { + NodeArg* reshape_output_arg = builder.MakeOutput(); + NodeArg* new_shape_arg = builder.Make1DInitializer({1, -1}); + builder.AddNode("Reshape", {conv_output_arg, new_shape_arg}, {reshape_output_arg}); + } + }; + + auto check_graph = [add_final_reshape](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + + // Should retain the same nodes in the original graph. + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 1); + EXPECT_EQ(op_to_count["Conv"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count["Reshape"], static_cast(add_final_reshape)); + }; + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Default, + TransformerLevel::Level1, + 21, + /*per_sample_tolerance*/ 0.0, + /*relative_per_sample_tolerance*/ 0.0, + std::make_unique()); + }; + + test_case(false); // Conv produces a graph output directly + test_case(true); // Conv -> Reshape -> graph_output +} + TEST(QDQTransformerTests, WeightBiasQuantization_ConvTranspose_Weight) { auto test_case = [](bool use_contrib_qdq) { auto build_test_case = [&](ModelTestBuilder& builder) { diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 591e277b2bbca..103da5f534ea7 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -40,7 +40,7 @@ namespace perftest { "\t-I: Generate tensor input binding. Free dimensions are treated as 1 unless overridden using -f.\n" "\t-c [parallel runs]: Specifies the (max) number of runs to invoke simultaneously. Default:1.\n" "\t-e [cpu|cuda|dnnl|tensorrt|openvino|dml|acl|nnapi|coreml|qnn|snpe|rocm|migraphx|xnnpack|vitisai|webgpu]: Specifies the provider 'cpu','cuda','dnnl','tensorrt', " - "'openvino', 'dml', 'acl', 'nnapi', 'coreml', 'qnn', 'snpe', 'rocm', 'migraphx', 'xnnpack', 'vitisai' or 'webgpu'. " + "'nvtensorrtrtx', 'openvino', 'dml', 'acl', 'nnapi', 'coreml', 'qnn', 'snpe', 'rocm', 'migraphx', 'xnnpack', 'vitisai' or 'webgpu'. " "Default:'cpu'.\n" "\t-b [tf|ort]: backend to use. 
Default:ort\n" "\t-r [repeated_times]: Specifies the repeated times if running in 'times' test mode.Default:1000.\n" @@ -264,6 +264,8 @@ static bool ParseDimensionOverride(std::basic_string& dim_identifier, test_config.machine_config.provider_type_name = onnxruntime::kVitisAIExecutionProvider; } else if (!CompareCString(optarg, ORT_TSTR("webgpu"))) { test_config.machine_config.provider_type_name = onnxruntime::kWebGpuExecutionProvider; + } else if (!CompareCString(optarg, ORT_TSTR("nvtensorrtrtx"))) { + test_config.machine_config.provider_type_name = onnxruntime::kNvTensorRTRTXExecutionProvider; } else { return false; } diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index b685b170c163f..8257cbfaa7f95 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -190,6 +190,12 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device session_options.AppendExecutionProvider_CUDA(cuda_options); #else ORT_THROW("TensorRT is not supported in this build\n"); +#endif + } else if (provider_name_ == onnxruntime::kNvTensorRTRTXExecutionProvider) { +#ifdef USE_NV + session_options.AppendExecutionProvider("NvTensorRtRtx", provider_options); +#else + ORT_THROW("NV TensorRT RTX is not supported in this build\n"); #endif } else if (provider_name_ == onnxruntime::kQnnExecutionProvider) { #ifdef USE_QNN diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index 85f96206b5dba..5e9b50c537465 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -658,6 +658,7 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter, #endif kDnnlExecutionProvider, kTensorrtExecutionProvider, + kNvTensorRTRTXExecutionProvider, kOpenVINOExecutionProvider, kDmlExecutionProvider, kAclExecutionProvider, @@ -700,6 +701,8 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter, execution_provider = DefaultDnnlExecutionProvider(); else if (provider_type == onnxruntime::kOpenVINOExecutionProvider) execution_provider = DefaultOpenVINOExecutionProvider(); + else if (provider_type == onnxruntime::kNvTensorRTRTXExecutionProvider) + execution_provider = DefaultNvTensorRTRTXExecutionProvider(); else if (provider_type == onnxruntime::kTensorrtExecutionProvider) execution_provider = DefaultTensorrtExecutionProvider(); else if (provider_type == onnxruntime::kNnapiExecutionProvider) diff --git a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc index a40b85b7754a3..f97de7a54bc99 100644 --- a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc @@ -468,5 +468,111 @@ TEST(TensorOpTest, DepthToSpaceTest_CRD_Batched) { test.Run(); } +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DefaultMode1) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + + constexpr int64_t N = 1, C = 8, H = 1, W = 1; + std::vector X = {0, 9, 18, 27, 36, 45, 54, 63}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 18, 36, 54, 9, 27, 45, 63}; + + test.AddOutput("output", {1, 2, 2, 2}, result); + test.Run(); +} + +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DefaultMode2) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + 
test.AddAttribute("blocksize", blocksize); + + constexpr int64_t N = 2, C = 8, H = 1, W = 2; + std::vector X = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15, 16, 20, 17, 21, 24, 28, 25, 29, 18, 22, 19, 23, 26, + 30, 27, 31}; + + test.AddOutput("output", {2, 2, 2, 4}, result); + test.Run(); +} + +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DCR1) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("mode", "DCR"); + + constexpr int64_t N = 1, C = 8, H = 1, W = 1; + std::vector X = {0, 9, 18, 27, 36, 45, 54, 63}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 18, 36, 54, 9, 27, 45, 63}; + + test.AddOutput("output", {1, 2, 2, 2}, result); + test.Run(); +} + +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DCR2) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("mode", "DCR"); + + constexpr int64_t N = 2, C = 8, H = 1, W = 2; + std::vector X = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15, 16, 20, 17, 21, 24, 28, 25, 29, 18, 22, 19, 23, 26, + 30, 27, 31}; + + test.AddOutput("output", {2, 2, 2, 4}, result); + test.Run(); +} + +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_CRD1) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("mode", "CRD"); + + constexpr int64_t N = 1, C = 8, H = 1, W = 1; + std::vector X = {0, 9, 18, 27, 36, 45, 54, 63}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 9, 18, 27, 36, 45, 54, 63}; + + test.AddOutput("output", {1, 2, 2, 2}, result); + test.Run(); +} + +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_CRD2) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("mode", "CRD"); + + constexpr int64_t N = 2, C = 8, H = 1, W = 2; + std::vector X = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15, 16, 18, 17, 19, 20, 22, 21, 23, 24, 26, 25, 27, 28, + 30, 29, 31}; + + test.AddOutput("output", {2, 2, 2, 4}, result); + test.Run(); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc b/onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc new file mode 100644 index 0000000000000..9515c8eb78ed6 --- /dev/null +++ b/onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc @@ -0,0 +1,317 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+#include "core/graph/onnx_protobuf.h" +#include "core/session/inference_session.h" +#include "test/providers/provider_test_utils.h" +#include "test/framework/test_utils.h" +#include "gtest/gtest.h" +#include "test/util/include/scoped_env_vars.h" +#include "test/common/trt_op_test_utils.h" + +#include +#include +#include +#include +#include + +using namespace std; +using namespace ONNX_NAMESPACE; +using namespace ::onnxruntime::logging; + +namespace onnxruntime { + +namespace test { + +std::string WideToUTF8(const std::wstring& wstr) { + std::wstring_convert> converter; + return converter.to_bytes(wstr); +} + +template +void VerifyOutputs(const std::vector& fetches, const std::vector& expected_dims, + const std::vector& expected_values) { + ASSERT_EQ(1, fetches.size()); + auto& rtensor = fetches.front().Get(); + TensorShape expected_shape(expected_dims); + ASSERT_EQ(expected_shape, rtensor.Shape()); + const std::vector found(rtensor.Data(), rtensor.Data() + expected_values.size()); + ASSERT_EQ(expected_values, found); +} + +/** + * Create a simple model with dynamic or non-dynamic input shape. + * \param model_name - model name + * \param graph_name - graph name + * \param dims - input dimensions + * \param add_fast_gelu - add FastGelu node which makes the whole model partition into TRT EP and CUDA EP subgraphs. + * + * input: "X", "Y" and "Z" + * you can specify input dimensions, for example (1, 3, 2), (1, 2) or (1, -1, -1)). Note: -1 means the dimension is dynamic. + * All three inputs have the same dimensions. + * output: "M" + * + * "X" "Y" + * \ / + * "Z" Add + * \ / + * Add + * / + * Add (+ float scalar "S") + * / + * "O" + * + * or + * + * "X" "Y" + * \ / + * "Z" Add + * \ / + * Add + * / + * FastGelu (This node will be placed on CUDA EP) + * / + * * Add (+ float scalar "S") + * / + * "O" + */ +void CreateBaseModel(const PathString& model_name, + std::string graph_name, + std::vector dims, + bool add_fast_gelu = false) { + onnxruntime::Model model(graph_name, false, DefaultLoggingManager().DefaultLogger()); + auto& graph = model.MainGraph(); + std::vector inputs; + std::vector outputs; + + // FLOAT tensor + ONNX_NAMESPACE::TypeProto float_tensor; + float_tensor.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + for (auto dim : dims) { + float_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(dim); + } + ONNX_NAMESPACE::TypeProto dyn_float_tensor; + dyn_float_tensor.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + auto& input_arg_1 = graph.GetOrCreateNodeArg("X", &float_tensor); + auto& input_arg_2 = graph.GetOrCreateNodeArg("Y", &float_tensor); + inputs.push_back(&input_arg_1); + inputs.push_back(&input_arg_2); + auto& output_arg = graph.GetOrCreateNodeArg("node_1_out_1", &float_tensor); + outputs.push_back(&output_arg); + graph.AddNode("node_1", "Add", "node 1.", inputs, outputs); + + auto& input_arg_3 = graph.GetOrCreateNodeArg("Z", &float_tensor); + inputs.clear(); + inputs.push_back(&output_arg); + inputs.push_back(&input_arg_3); + + auto& output_arg_2 = graph.GetOrCreateNodeArg("node_2_out_1", &float_tensor); + outputs.clear(); + outputs.push_back(&output_arg_2); + graph.AddNode("node_2", "Add", "node 2.", inputs, outputs); + + inputs.clear(); + inputs.push_back(&output_arg_2); + + if (add_fast_gelu) { + auto& output_arg_3 = graph.GetOrCreateNodeArg("node_3_out_1", &dyn_float_tensor); + outputs.clear(); + outputs.push_back(&output_arg_3); + + graph.AddNode("node_3", "FastGelu", "node 
3.", inputs, outputs, + /* attributes */ nullptr, kMSDomain); + + inputs.clear(); + inputs.push_back(&output_arg_3); + } + + ONNX_NAMESPACE::TypeProto float_scalar; + float_scalar.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + float_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + auto& input_scalar = graph.GetOrCreateNodeArg("S", &float_scalar); + inputs.push_back(&input_scalar); + + auto& output_arg_4 = graph.GetOrCreateNodeArg("O", &dyn_float_tensor); + + outputs.clear(); + outputs.push_back(&output_arg_4); + graph.AddNode("node_5", "Add", "node 5.", inputs, outputs); + + auto status = graph.Resolve(); + ASSERT_TRUE(status.IsOK()); + status = onnxruntime::Model::Save(model, model_name); +} + +Ort::IoBinding generate_io_binding(Ort::Session& session, std::map> shape_overwrites = {}) { + Ort::IoBinding binding(session); + auto allocator = Ort::AllocatorWithDefaultOptions(); + for (int input_idx = 0; input_idx < int(session.GetInputCount()); ++input_idx) { + auto input_name = session.GetInputNameAllocated(input_idx, Ort::AllocatorWithDefaultOptions()); + auto full_tensor_info = session.GetInputTypeInfo(input_idx); + auto tensor_info = full_tensor_info.GetTensorTypeAndShapeInfo(); + auto shape = tensor_info.GetShape(); + auto type = tensor_info.GetElementType(); + if (shape_overwrites.find(input_name.get()) == shape_overwrites.end()) { + for (auto& v : shape) { + if (v == -1) { + v = 1; + } + } + } else { + shape = shape_overwrites[input_name.get()]; + } + auto input_value = Ort::Value::CreateTensor(allocator, + shape.data(), + shape.size(), + type); + binding.BindInput(input_name.get(), input_value); + } + + for (int output_idx = 0; output_idx < int(session.GetOutputCount()); ++output_idx) { + auto output_name = session.GetOutputNameAllocated(output_idx, Ort::AllocatorWithDefaultOptions()); + binding.BindOutput(output_name.get(), allocator.GetInfo()); + } + return binding; +} + +TEST(NvExecutionProviderTest, ContextEmbedAndReload) { + PathString model_name = ORT_TSTR("nv_execution_provider_test.onnx"); + PathString model_name_ctx = ORT_TSTR("nv_execution_provider_test_ctx.onnx"); + std::string graph_name = "test"; + std::vector dims = {1, 3, 2}; + + CreateBaseModel(model_name, graph_name, dims); + + auto env = Ort::Env(); + auto logging_level = OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING; + env.UpdateEnvWithCustomLogLevel(logging_level); + + // AOT time + { + auto start = std::chrono::high_resolution_clock::now(); + Ort::SessionOptions so; + Ort::RunOptions run_options; + so.AddConfigEntry("ep.context_enable", "1"); + so.AddConfigEntry("ep.context_file_path", WideToUTF8(model_name_ctx).c_str()); + so.AppendExecutionProvider("NvTensorRtRtx", {}); + Ort::Session session_object(env, model_name.c_str(), so); + auto stop = std::chrono::high_resolution_clock::now(); + std::cout << "Session creation AOT: " << std::chrono::duration_cast((stop - start)).count() << " ms" << std::endl; + + auto io_binding = generate_io_binding(session_object); + session_object.Run(run_options, io_binding); + } + + // JIT time + { + auto start = std::chrono::high_resolution_clock::now(); + Ort::SessionOptions so; + Ort::RunOptions run_options; + so.AddConfigEntry("ep.context_enable", "1"); + so.AppendExecutionProvider("NvTensorRtRtx", {}); + Ort::Session session_object(env, model_name.c_str(), so); + auto stop = std::chrono::high_resolution_clock::now(); + std::cout << "Session creation JIT: " << std::chrono::duration_cast((stop - start)).count() << " 
ms" << std::endl; + + auto io_binding = generate_io_binding(session_object); + session_object.Run(run_options, io_binding); + } +} + +TEST(NvExecutionProviderTest, ContextEmbedAndReloadDynamic) { + PathString model_name = ORT_TSTR("nv_execution_provider_dyn_test.onnx"); + PathString model_name_ctx = ORT_TSTR("nv_execution_provider_dyn_test_ctx.onnx"); + std::string graph_name = "test"; + std::vector dims = {1, -1, -1}; + + CreateBaseModel(model_name, graph_name, dims); + + auto env = Ort::Env(); + auto logging_level = OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING; + env.UpdateEnvWithCustomLogLevel(logging_level); + + // AOT time + { + auto start = std::chrono::high_resolution_clock::now(); + Ort::SessionOptions so; + Ort::RunOptions run_options; + so.AddConfigEntry("ep.context_enable", "1"); + so.AddConfigEntry("ep.context_file_path", WideToUTF8(model_name_ctx).c_str()); + so.AppendExecutionProvider("NvTensorRtRtx", {}); + Ort::Session session_object(env, model_name.c_str(), so); + auto stop = std::chrono::high_resolution_clock::now(); + std::cout << "Session creation AOT: " << std::chrono::duration_cast((stop - start)).count() << " ms" << std::endl; + + auto io_binding = generate_io_binding(session_object); + session_object.Run(run_options, io_binding); + } + + // JIT time + { + auto start = std::chrono::high_resolution_clock::now(); + Ort::SessionOptions so; + Ort::RunOptions run_options; + so.AddConfigEntry("ep.context_enable", "1"); + so.AppendExecutionProvider("NvTensorRtRtx", {}); + Ort::Session session_object(env, model_name.c_str(), so); + auto stop = std::chrono::high_resolution_clock::now(); + std::cout << "Session creation JIT: " << std::chrono::duration_cast((stop - start)).count() << " ms" << std::endl; + + std::map> shape_overwrites; + shape_overwrites["X"] = {1, 5, 5}; + shape_overwrites["Y"] = {1, 5, 1}; + auto io_binding = generate_io_binding(session_object, shape_overwrites); + session_object.Run(run_options, io_binding); + } +} + +TEST(NvExecutionProviderTest, ContextEmbedAndReloadDataDynamic) { + PathString model_name = ORT_TSTR("nv_execution_provider_data_dyn_test.onnx"); + PathString model_name_ctx = ORT_TSTR("nv_execution_provider_data_dyn_test_ctx.onnx"); + std::string graph_name = "test"; + std::vector dims = {1, -1, -1}; + + CreateBaseModel(model_name, graph_name, dims, true); + + auto env = Ort::Env(); + auto logging_level = OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING; + env.UpdateEnvWithCustomLogLevel(logging_level); + + // AOT time + { + auto start = std::chrono::high_resolution_clock::now(); + Ort::SessionOptions so; + Ort::RunOptions run_options; + so.AddConfigEntry("ep.context_enable", "1"); + so.AddConfigEntry("ep.context_file_path", WideToUTF8(model_name_ctx).c_str()); + so.AppendExecutionProvider("NvTensorRtRtx", {}); + Ort::Session session_object(env, model_name.c_str(), so); + auto stop = std::chrono::high_resolution_clock::now(); + std::cout << "Session creation AOT: " << std::chrono::duration_cast((stop - start)).count() << " ms" << std::endl; + + auto io_binding = generate_io_binding(session_object); + session_object.Run(run_options, io_binding); + } + + // JIT time + { + auto start = std::chrono::high_resolution_clock::now(); + Ort::SessionOptions so; + Ort::RunOptions run_options; + so.AddConfigEntry("ep.context_enable", "1"); + so.AppendExecutionProvider("NvTensorRtRtx", {}); + Ort::Session session_object(env, model_name.c_str(), so); + auto stop = std::chrono::high_resolution_clock::now(); + std::cout << "Session creation JIT: " << 
std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count() << " ms" << std::endl; + + std::map<std::string, std::vector<int64_t>> shape_overwrites; + shape_overwrites["X"] = {1, 5, 5}; + shape_overwrites["Y"] = {1, 5, 5}; + auto io_binding = generate_io_binding(session_object, shape_overwrites); + session_object.Run(run_options, io_binding); + } +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/qnn/cast_test.cc b/onnxruntime/test/providers/qnn/cast_test.cc index 2326b2949a6bd..fa26c764c1b7a 100644 --- a/onnxruntime/test/providers/qnn/cast_test.cc +++ b/onnxruntime/test/providers/qnn/cast_test.cc @@ -5,13 +5,14 @@ #include #include +#include -#include "test/optimizer/qdq_test_utils.h" -#include "test/providers/qnn/qnn_test_utils.h" +#include "gtest/gtest.h" +#include "core/framework/float16.h" #include "core/graph/onnx_protobuf.h" - -#include "gtest/gtest.h" +#include "test/optimizer/qdq_test_utils.h" +#include "test/providers/qnn/qnn_test_utils.h" namespace onnxruntime { namespace test { @@ -67,6 +68,31 @@ static void RunCastOpTest(const std::vector& shape, ONNX_NAMESPACE::Ten expected_ep_assignment); } +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) +static void RunCastFP16HTPTest(const std::vector<int64_t>& shape, + ONNX_NAMESPACE::TensorProto_DataType dst_type, + ExpectedEPNodeAssignment expected_ep_assignment) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + auto testcase = [shape, dst_type](ModelTestBuilder& builder) { + auto input_def_fp = TestInputDef<float>(shape, false, static_cast<float>(0), static_cast<float>(20)); + auto input_def = ConvertToFP16InputDef(input_def_fp); + auto input = MakeTestInput(builder, input_def); + + auto* output = builder.MakeOutput(); + Node& cast_node = builder.AddNode("Cast", {input}, {output}); + cast_node.AddAttribute("to", static_cast<int64_t>(dst_type)); + }; + + RunQnnModelTest(testcase, provider_options, /* opset */ 13, expected_ep_assignment); +} +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + // // CPU tests: // @@ -125,6 +151,21 @@ TEST_F(QnnHTPBackendTests, TestCastInt32ToInt64HTP) { RunCastOpTest({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT64, ExpectedEPNodeAssignment::All, true); } + +// Cast float to bool on HTP. +TEST_F(QnnHTPBackendTests, TestCastFloatToBoolHTP) { + RunCastOpTest({3, 3}, + ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BOOL, + ExpectedEPNodeAssignment::All, + true); +} + +// Cast float16 to bool on HTP. 
+TEST_F(QnnHTPBackendTests, TestCastFloat16ToBoolHTP) { + RunCastFP16HTPTest({3, 3}, + ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BOOL, + ExpectedEPNodeAssignment::All); +} #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) } // namespace test diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index b75751f89a6c7..f736abcd3006d 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -27,6 +27,8 @@ using namespace onnxruntime::logging; // in test_main.cc extern std::unique_ptr<Ort::Env> ort_env; +extern "C" void ortenv_setup(); +extern "C" void ortenv_teardown(); namespace onnxruntime { namespace test { @@ -1232,6 +1234,37 @@ TEST_F(QnnHTPBackendTests, UseHtpSharedMemoryAllocatorForInputs) { } #endif // BUILD_QNN_EP_STATIC_LIB +#if !BUILD_QNN_EP_STATIC_LIB +// Tests that loading and unloading of an EP library in the same process does not cause a segfault. +TEST_F(QnnHTPBackendTests, LoadingAndUnloadingOfQnnLibrary_FixSegFault) { + const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "nhwc_resize_sizes_opset18.quant.onnx"; + + onnxruntime::ProviderOptions options; + options["backend_type"] = "htp"; + options["offload_graph_io_quantization"] = "0"; + + // This first session will load the QNN EP library for the first time. + { + Ort::SessionOptions so; + so.AppendExecutionProvider("QNN", options); + + EXPECT_NO_THROW(Ort::Session session(*ort_env, ort_model_path, so)); + } + + { + ortenv_teardown(); // Destroy Env to force unloading of EP libraries. + ortenv_setup(); + + // This next session will reload the QNN EP library. + // Should not get a segfault. + Ort::SessionOptions so; + so.AppendExecutionProvider("QNN", options); + + EXPECT_NO_THROW(Ort::Session session(*ort_env, ort_model_path, so)); + } +} +#endif // !BUILD_QNN_EP_STATIC_LIB + #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) #endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py b/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py index 1459dfc61c84c..73b096a694054 100644 --- a/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py +++ b/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py @@ -23,7 +23,7 @@ def prepare_dir(path): def _extract_value_info(arr, name, ele_type=None): return onnx.helper.make_tensor_value_info( name=name, - elem_type=ele_type if ele_type else onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[arr.dtype], + elem_type=ele_type if ele_type else onnx.helper.np_dtype_to_tensor_dtype(arr.dtype), shape=arr.shape, ) diff --git a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py index 77f9e6f5cf39c..843b27c102fa7 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py +++ b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py @@ -9,7 +9,6 @@ from numpy.testing import assert_almost_equal from onnx import TensorProto, helper from onnx.defs import onnx_opset_version -from onnx.mapping import TENSOR_TYPE_MAP import onnxruntime as onnxrt from onnxruntime.capi._pybind_state import OrtDevice as C_OrtDevice # pylint: disable=E0611 @@ -168,8 +167,7 @@ def test_bind_onnx_types_supported_by_numpy(self): TensorProto.UINT64, ]: with self.subTest(onnx_dtype=onnx_dtype, inner_device=str(inner_device)): - assert onnx_dtype in TENSOR_TYPE_MAP - np_dtype = 
TENSOR_TYPE_MAP[onnx_dtype].np_dtype + np_dtype = helper.tensor_dtype_to_np_dtype(onnx_dtype) x = np.arange(8).reshape((-1, 2)).astype(np_dtype) # create onnx graph diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index c1564997c42b8..9b1c1608ea25d 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -50,6 +50,14 @@ std::unique_ptr<IExecutionProvider> DefaultTensorrtExecutionProvider() { return nullptr; } +std::unique_ptr<IExecutionProvider> DefaultNvTensorRTRTXExecutionProvider() { +#ifdef USE_NV + if (auto factory = NvProviderFactoryCreator::Create(0)) + return factory->CreateProvider(); +#endif + return nullptr; +} + std::unique_ptr<IExecutionProvider> TensorrtExecutionProviderWithOptions(const OrtTensorRTProviderOptions* params) { #ifdef USE_TENSORRT if (auto factory = TensorrtProviderFactoryCreator::Create(params)) diff --git a/onnxruntime/test/util/include/default_providers.h b/onnxruntime/test/util/include/default_providers.h index 9b44150d972db..3595c6f71633a 100644 --- a/onnxruntime/test/util/include/default_providers.h +++ b/onnxruntime/test/util/include/default_providers.h @@ -44,6 +44,7 @@ std::unique_ptr<IExecutionProvider> DefaultDnnlExecutionProvider(); std::unique_ptr<IExecutionProvider> DnnlExecutionProviderWithOptions(const OrtDnnlProviderOptions* provider_options); // std::unique_ptr<IExecutionProvider> DefaultTvmExecutionProvider(); std::unique_ptr<IExecutionProvider> DefaultTensorrtExecutionProvider(); +std::unique_ptr<IExecutionProvider> DefaultNvTensorRTRTXExecutionProvider(); std::unique_ptr<IExecutionProvider> TensorrtExecutionProviderWithOptions(const OrtTensorRTProviderOptions* params); std::unique_ptr<IExecutionProvider> TensorrtExecutionProviderWithOptions(const OrtTensorRTProviderOptionsV2* params); std::unique_ptr<IExecutionProvider> DefaultMIGraphXExecutionProvider(); diff --git a/setup.py b/setup.py index 5fc78963eca9a..1e426ea8e060b 100644 --- a/setup.py +++ b/setup.py @@ -191,6 +191,17 @@ def _rewrite_ld_preload_tensorrt(self, to_preload): f.write(" import os\n") f.write(' os.environ["ORT_TENSORRT_UNAVAILABLE"] = "1"\n') + def _rewrite_ld_preload_nv_tensorrt_rtx(self, to_preload): + with open("onnxruntime/capi/_ld_preload.py", "a", encoding="ascii") as f: + if len(to_preload) > 0: + f.write("from ctypes import CDLL, RTLD_GLOBAL\n") + f.write("try:\n") + for library in to_preload: + f.write(' _{} = CDLL("{}", mode=RTLD_GLOBAL)\n'.format(library.split(".")[0], library)) + f.write("except OSError:\n") + f.write(" import os\n") + f.write(' os.environ["ORT_NV_TENSORRT_RTX_UNAVAILABLE"] = "1"\n') + def run(self): if is_manylinux: source = "onnxruntime/capi/onnxruntime_pybind11_state.so" @@ -201,6 +212,7 @@ def run(self): to_preload = [] to_preload_cuda = [] to_preload_tensorrt = [] + to_preload_nv_tensorrt_rtx = [] to_preload_cann = [] cuda_dependencies = [ @@ -268,6 +280,7 @@ def run(self): self._rewrite_ld_preload(to_preload) self._rewrite_ld_preload_cuda(to_preload_cuda) self._rewrite_ld_preload_tensorrt(to_preload_tensorrt) + self._rewrite_ld_preload_nv_tensorrt_rtx(to_preload_nv_tensorrt_rtx) self._rewrite_ld_preload(to_preload_cann) else: @@ -303,6 +316,7 @@ def finalize_options(self): providers_cuda_or_rocm = "onnxruntime_providers_" + ("rocm" if is_rocm else "cuda") providers_tensorrt_or_migraphx = "onnxruntime_providers_" + ("migraphx" if is_migraphx else "tensorrt") +providers_nv_tensorrt_rtx = "onnxruntime_providers_nv_tensorrt_rtx" providers_openvino = "onnxruntime_providers_openvino" providers_cann = "onnxruntime_providers_cann" providers_qnn = "onnxruntime_providers_qnn" @@ -316,6 +330,7 @@ def finalize_options(self): elif platform.system() == "Windows": 
providers_cuda_or_rocm = providers_cuda_or_rocm + ".dll" providers_tensorrt_or_migraphx = providers_tensorrt_or_migraphx + ".dll" + providers_nv_tensorrt_rtx = providers_nv_tensorrt_rtx + ".dll" providers_openvino = providers_openvino + ".dll" providers_cann = providers_cann + ".dll" providers_qnn = providers_qnn + ".dll" @@ -384,6 +399,7 @@ def finalize_options(self): "libiomp5md.dll", providers_cuda_or_rocm, providers_tensorrt_or_migraphx, + providers_nv_tensorrt_rtx, providers_cann, "onnxruntime.dll", ] @@ -391,6 +407,7 @@ def finalize_options(self): libs.extend(["onnxruntime_providers_shared.dll"]) libs.extend(["onnxruntime_providers_dnnl.dll"]) libs.extend(["onnxruntime_providers_tensorrt.dll"]) + libs.extend(["onnxruntime_providers_nv_tensorrt_rtx.dll"]) libs.extend(["onnxruntime_providers_openvino.dll"]) libs.extend(["onnxruntime_providers_cuda.dll"]) libs.extend(["onnxruntime_providers_vitisai.dll"]) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index e8625e77e9a63..0a6af27da8bc2 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -434,6 +434,7 @@ def generate_build_tree( "-Donnxruntime_ENABLE_MICROSOFT_INTERNAL=" + ("ON" if args.enable_msinternal else "OFF"), "-Donnxruntime_USE_VITISAI=" + ("ON" if args.use_vitisai else "OFF"), "-Donnxruntime_USE_TENSORRT=" + ("ON" if args.use_tensorrt else "OFF"), + "-Donnxruntime_USE_NV=" + ("ON" if args.use_nv_tensorrt_rtx else "OFF"), "-Donnxruntime_USE_TENSORRT_BUILTIN_PARSER=" + ("ON" if args.use_tensorrt_builtin_parser and not args.use_tensorrt_oss_parser else "OFF"), # interface variables are used only for building onnxruntime/onnxruntime_shared.dll but not EPs @@ -715,7 +716,7 @@ def generate_build_tree( if args.use_rocm: cmake_args.append("-Donnxruntime_ROCM_HOME=" + rocm_home) cmake_args.append("-Donnxruntime_ROCM_VERSION=" + args.rocm_version) - if args.use_tensorrt: + if args.use_tensorrt or args.use_nv_tensorrt_rtx: cmake_args.append("-Donnxruntime_TENSORRT_HOME=" + tensorrt_home) if args.use_cuda: @@ -1323,7 +1324,7 @@ def setup_cann_vars(args): def setup_tensorrt_vars(args): tensorrt_home = "" - if args.use_tensorrt: + if args.use_tensorrt or args.use_nv_tensorrt_rtx: tensorrt_home = args.tensorrt_home if args.tensorrt_home else os.getenv("TENSORRT_HOME") tensorrt_home_valid = tensorrt_home is not None and os.path.exists(tensorrt_home) if not tensorrt_home_valid: @@ -1607,7 +1608,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): run_ios_tests(args, source_dir, config, cwd) continue dll_path_list = [] - if args.use_tensorrt: + if args.use_tensorrt or args.use_nv_tensorrt_rtx: dll_path_list.append(os.path.join(args.tensorrt_home, "lib")) dll_path = None @@ -2179,11 +2180,15 @@ def main(): # shared lib being build in a separate process. 
So we skip the testing if none of the primary EPs are built with ONNXRuntime # shared lib if args.enable_generic_interface and not ( - args.use_tensorrt or args.use_openvino or args.use_vitisai or (args.use_qnn and args.use_qnn != "static_lib") + args.use_nv_tensorrt_rtx + or args.use_tensorrt + or args.use_openvino + or args.use_vitisai + or (args.use_qnn and args.use_qnn != "static_lib") ): args.test = False - if args.use_tensorrt: + if args.use_tensorrt or args.use_nv_tensorrt_rtx: args.use_cuda = True if args.build_wheel or args.gen_doc or args.enable_training: @@ -2287,7 +2292,7 @@ def main(): # if using tensorrt, setup tensorrt paths tensorrt_home = "" - if args.use_tensorrt: + if args.use_tensorrt or args.use_nv_tensorrt_rtx: tensorrt_home = setup_tensorrt_vars(args) # if using migraphx, setup migraphx paths diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index a54500c176a87..215ad77335083 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -661,6 +661,10 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None: trt_group.add_argument("--use_tensorrt_oss_parser", action="store_true", help="Use TensorRT OSS ONNX parser.") trt_group.add_argument("--tensorrt_home", help="Path to TensorRT installation directory.") + # --- Nv --- + nv_group = parser.add_argument_group("Nv Execution Provider") + nv_group.add_argument("--use_nv_tensorrt_rtx", action="store_true", help="Enable Nv EP.") + # --- DirectML --- dml_group = parser.add_argument_group("DirectML Execution Provider (Windows)") dml_group.add_argument("--use_dml", action="store_true", help="Enable DirectML EP (Windows).") diff --git a/tools/ci_build/gen_def.py b/tools/ci_build/gen_def.py index 76d9c9499c478..526cc7bde519e 100755 --- a/tools/ci_build/gen_def.py +++ b/tools/ci_build/gen_def.py @@ -81,6 +81,7 @@ def parse_arguments(): "tensorrt", "azure", "webgpu", + "nv_tensorrt_rtx", ): file.write(f"#include \n") file.write("void* GetFunctionEntryByName(const char* name){\n") diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 5313af8e7e2d5..2cb64733f6f6c 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -54,10 +54,13 @@ parameters: displayName: Build flags to append to build command type: string default: '--use_azure' + +# Do not update this to a version that does not exist for the qnn-runtime Maven package: +# https://mvnrepository.com/artifact/com.qualcomm.qti/qnn-runtime - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.33.2.250410 + default: 2.33.0.250327 resources: repositories: diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml index b8095a4e8c45c..3b307abe5fcef 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml @@ -93,11 +93,11 @@ stages: - ${{ if notIn(parameters['sln_platform'], 'Win32', 'x64') }}: - powershell: | - python tools\ci_build\build.py ${{ parameters.BuildCommand }} --use_binskim_compliant_compile_flags --parallel --build_csharp --build --update --config $(BuildConfig) --msbuild_extra_options IncludeMobileTargets=false ${{ variables.build_py_lto_flag }} + python 
tools\ci_build\build.py ${{ parameters.BuildCommand }} --use_binskim_compliant_compile_flags --parallel --build_csharp --build --update --config $(BuildConfig) --build_nuget --msbuild_extra_options IncludeMobileTargets=false ${{ variables.build_py_lto_flag }} - ${{ else }}: - powershell: | - python tools\ci_build\build.py ${{ parameters.BuildCommand }} --use_binskim_compliant_compile_flags --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --build_csharp --build --update --config $(BuildConfig) --msbuild_extra_options IncludeMobileTargets=false ${{ variables.build_py_lto_flag }} + python tools\ci_build\build.py ${{ parameters.BuildCommand }} --use_binskim_compliant_compile_flags --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --build_csharp --build --update --config $(BuildConfig) --build_nuget --msbuild_extra_options IncludeMobileTargets=false ${{ variables.build_py_lto_flag }} - ${{ if notIn(parameters['sln_platform'], 'Win32', 'x64') }}: # Use cross-compiled protoc diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml index da6263cc56975..d1fa72d7e4413 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml @@ -14,10 +14,12 @@ parameters: type: string default: '' +# Do not update this to a version that does not exist for the qnn-runtime Maven package: +# https://mvnrepository.com/artifact/com.qualcomm.qti/qnn-runtime - name: QnnSDKVersion displayName: QNN SDK Version type: string - default: '2.33.2.250410' + default: '2.33.0.250327' - name: enableWebGpu displayName: Enable WebGPU test diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml index 03839f8ac6282..4474a6b45ef58 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml @@ -48,10 +48,12 @@ parameters: type: string default: '' +# Do not update this to a version that does not exist for the qnn-runtime Maven package: +# https://mvnrepository.com/artifact/com.qualcomm.qti/qnn-runtime - name: QnnSDKVersion displayName: QNN SDK Version type: string - default: '2.33.2.250410' + default: '2.33.0.250327' - name: is1ES displayName: Is 1ES pipeline diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index a949817c3a43c..9f65fc8891e94 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -42,10 +42,12 @@ parameters: type: string default: '0' +# Do not update this to a version that does not exist for the qnn-runtime Maven package: +# https://mvnrepository.com/artifact/com.qualcomm.qti/qnn-runtime - name: QnnSDKVersion displayName: QNN SDK Version type: string - default: 2.33.2.250410 + default: 2.33.0.250327 - name: is1ES displayName: Is 1ES pipeline diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index 1a580b848a55a..3ca025514ea3d 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -8,4 +8,4 @@ protobuf==4.21.12 sympy==1.12 flatbuffers psutil -onnxscript==0.2.3 
+onnxscript==0.2.3 ; python_version < '3.13' diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 1b7b5f5bc7092..3b1da942c5e52 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -552,10 +552,12 @@ def generate_files(line_list, args): if is_qnn_package: files_list.append("") - files_list.append("") files_list.append("") files_list.append("") if args.target_architecture != "x64": + files_list.append( + "" + ) files_list.append( "" ) diff --git a/tools/python/ort_test_dir_utils.py b/tools/python/ort_test_dir_utils.py index 59bb6670c8794..b81927d8ec11e 100644 --- a/tools/python/ort_test_dir_utils.py +++ b/tools/python/ort_test_dir_utils.py @@ -10,12 +10,12 @@ import onnxruntime as ort -def _get_numpy_type(model_info, name): +def _get_numpy_type(model_info, name) -> np.dtype: for i in model_info: if i.name == name: type_name = i.type.WhichOneof("value") if type_name == "tensor_type": - return onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[i.type.tensor_type.elem_type] + return onnx.helper.tensor_dtype_to_np_dtype(i.type.tensor_type.elem_type) else: raise ValueError(f"Type is not handled: {type_name}") @@ -65,7 +65,7 @@ def _create_missing_input_data(model_inputs, name_input_map, symbolic_dim_values if onnx_type not in [TensorProto.FLOAT, TensorProto.BFLOAT16, TensorProto.DOUBLE, TensorProto.FLOAT16]: data *= 256 - np_type = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[onnx_type] + np_type = onnx.helper.tensor_dtype_to_np_dtype(onnx_type) data = data.astype(np_type) name_input_map[input.name] = data
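
Usage note (not part of the patch): the tests added above show the intended application-level flow for the new execution provider, so the following is a minimal sketch of how a client would opt in once ONNX Runtime is built with onnxruntime_USE_NV (build.py flag --use_nv_tensorrt_rtx). The provider name "NvTensorRtRtx" and the "ep.context_enable" / "ep.context_file_path" session config keys are taken from the tests in this diff; the model and context file paths are placeholders.

#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "nv_tensorrt_rtx_demo");
  Ort::SessionOptions so;
  // Optionally capture a compiled EPContext model so a later session can skip
  // engine compilation (the AOT vs. JIT split exercised by the tests above).
  so.AddConfigEntry("ep.context_enable", "1");
  so.AddConfigEntry("ep.context_file_path", "model_ctx.onnx");  // placeholder output path
  // Register the NV TensorRT RTX execution provider by name.
  so.AppendExecutionProvider("NvTensorRtRtx", {});
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // placeholder model path
  return 0;
}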