diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 3e713b69671e7..fb0c379522691 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -455,7 +455,7 @@ if (onnxruntime_USE_CUDA) list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/x64/lib64) else() if(onnxruntime_CUDNN_HOME) - list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib64) + list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib64) endif() list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/lib64) endif() diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h index e7e989cf17f20..ddf390db3d1c3 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h @@ -5,7 +5,7 @@ /// /// Options for the TensorRT provider that are passed to SessionOptionsAppendExecutionProvider_TensorRT_V2. -/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally. +/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally. /// Going forward, new trt provider options are to be supported via this struct and usage of the publicly defined /// OrtTensorRTProviderOptions will be deprecated over time. /// User can only get the instance of OrtTensorRTProviderOptionsV2 via CreateTensorRTProviderOptions. @@ -31,4 +31,7 @@ struct OrtTensorRTProviderOptionsV2 { int trt_force_sequential_engine_build; // force building TensorRT engine sequentially. Default 0 = false, nonzero = true int trt_context_memory_sharing_enable; // enable context memory sharing between subgraphs. Default 0 = false, nonzero = true int trt_layer_norm_fp32_fallback; // force Pow + Reduce ops in layer norm to FP32. Default 0 = false, nonzero = true + int trt_timing_cache_enable; // enable TensorRT timing cache. Default 0 = false, nonzero = true + int trt_force_timing_cache; // force the TensorRT cache to be used even if device profile does not match. Default 0 = false, nonzero = true + int trt_detailed_build_log; // Enable detailed build step logging on TensorRT EP with timing for each engine build. Default 0 = false, nonzero = true }; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 7d14f43064f00..10c67963b85cc 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -117,6 +117,32 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map<std::string, float>& dynamic_range_map) { +std::vector<char> loadTimingCacheFile(const std::string inFileName) { + std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); + if (!iFile) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not read timing cache from: " << inFileName + << ". 
A new timing cache will be generated and written."; + return std::vector<char>(); + } + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector<char> content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + return content; +} + +inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob) { + std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); + if (!oFile) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not write timing cache to: " << outFileName; + return; + } + oFile.write((char*)blob->data(), blob->size()); + oFile.close(); +} } // namespace namespace google { @@ -312,7 +338,10 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv } dump_subgraphs_ = info.dump_subgraphs; engine_cache_enable_ = info.engine_cache_enable; - if (engine_cache_enable_ || int8_enable_) { + timing_cache_enable_ = info.timing_cache_enable; + force_timing_cache_match_ = info.force_timing_cache; + detailed_build_log_ = info.detailed_build_log; + if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) { cache_path_ = info.engine_cache_path; } engine_decryption_enable_ = info.engine_decryption_enable; @@ -386,7 +415,22 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv engine_cache_enable_ = (std::stoi(engine_cache_enable_env) == 0 ? false : true); } - if (engine_cache_enable_ || int8_enable_) { + const std::string timing_cache_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kTimingCacheEnable); + if (!timing_cache_enable_env.empty()) { + timing_cache_enable_ = (std::stoi(timing_cache_enable_env) == 0 ? false : true); + } + + const std::string detailed_build_log_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDetailedBuildLog); + if (!detailed_build_log_env.empty()) { + detailed_build_log_ = (std::stoi(detailed_build_log_env) == 0 ? false : true); + } + + const std::string timing_force_match_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kForceTimingCache); + if (!timing_force_match_env.empty()) { + force_timing_cache_match_ = (std::stoi(timing_force_match_env) == 0 ? 
false : true); + } + + if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) { const std::string engine_cache_path = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEngineCachePath); cache_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kCachePath); if (!engine_cache_path.empty() && cache_path_.empty()) { @@ -438,7 +482,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv dla_core_ = 0; } - if (engine_cache_enable_ || int8_enable_) { + if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) { if (!cache_path_.empty() && !fs::is_directory(cache_path_)) { if (!fs::create_directory(cache_path_)) { throw std::runtime_error("Failed to create directory " + cache_path_); @@ -1373,6 +1417,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector + std::string timing_cache_path = ""; + if (timing_cache_enable_) { + cudaDeviceProp prop; + CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_)); + timing_cache_path = GetTimingCachePath(cache_path_, prop); + } + // Load timing cache from file. Create a fresh cache if the file doesn't exist + std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr; + if (timing_cache_enable_) { + std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path); + timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size())); + if (timing_cache == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not create timing cache: " + timing_cache_path); + } + trt_config->setTimingCache(*timing_cache, force_timing_cache_match_); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path; + } + } + // Build engine + std::chrono::steady_clock::time_point engine_build_start; + if (detailed_build_log_) { + engine_build_start = std::chrono::steady_clock::now(); + } trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(trt_builder->buildEngineWithConfig(*trt_network, *trt_config)); if (trt_engine == nullptr) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP could not build engine for fused node: " + fused_node.Name()); } + if (detailed_build_log_) { + auto engine_build_stop = std::chrono::steady_clock::now(); + LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_node_name_with_precision << " took: " << + std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl; + } if (engine_cache_enable_) { std::unique_ptr<nvinfer1::IHostMemory> serializedModel(trt_engine->serialize()); size_t engine_size = serializedModel->size(); @@ -1438,7 +1512,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector(serializedModel->data()), engine_size); } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path; + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized engine " + engine_cache_path; + } + // serialize and save timing cache + if (timing_cache_enable_) { + auto timing_cache = trt_config->getTimingCache(); + std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()}; + if (timingCacheHostData == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not serialize timing cache: " + timing_cache_path); + } + saveTimingCacheFile(timing_cache_path, timingCacheHostData.get()); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path; + } } } } @@ -1504,7 +1591,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectornode_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), nullptr, allocator_, context_memory_sharing_enable_, &max_ctx_mem_size_, &context_memory_, - dynamic_range_map, 
engine_decryption_enable_, engine_decryption_, engine_encryption_}; + dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_, + force_timing_cache_match_, detailed_build_log_}; *state = p.release(); return 0; }; @@ -1545,6 +1633,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision); const std::string engine_cache_path = cache_path + ".engine"; const std::string profile_cache_path = cache_path + ".profile"; + std::string timing_cache_path = ""; + if (timing_cache_enable_) { + cudaDeviceProp prop; + CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_)); + timing_cache_path = GetTimingCachePath(cache_path_, prop); + } if (trt_state->engine_cache_enable && trt_engine == nullptr) { std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); @@ -1779,11 +1873,35 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector trt_config->setDLACore(trt_state->dla_core); } + // Load timing cache from file. Create a fresh cache if the file doesn't exist + std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr; + if (trt_state->timing_cache_enable) { + std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path); + timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size())); + if (timing_cache == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not create timing cache: " + timing_cache_path); + } + trt_config->setTimingCache(*timing_cache, force_timing_cache_match_); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path; + } + } + // Build engine { auto lock = GetApiLock(); + std::chrono::steady_clock::time_point engine_build_start; + if (detailed_build_log_) { + engine_build_start = std::chrono::steady_clock::now(); + } *(trt_state->engine) = std::unique_ptr<nvinfer1::ICudaEngine>( trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config)); + if (detailed_build_log_) { + auto engine_build_stop = std::chrono::steady_clock::now(); + LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " << + std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl; + } } if (trt_state->engine == nullptr) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); @@ -1809,6 +1927,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector + // serialize and save timing cache + if (trt_state->timing_cache_enable) { + auto timing_cache = trt_config->getTimingCache(); + std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()}; + if (timingCacheHostData == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not serialize timing cache: " + timing_cache_path); + } + saveTimingCacheFile(timing_cache_path, timingCacheHostData.get()); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path; + } + } + // Build context if (trt_state->context_memory_sharing_enable) { *(trt_state->context) = std::unique_ptr<nvinfer1::IExecutionContext>( diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 784657e54dd44..cb87b31e01b96 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ 
b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -30,6 +30,9 @@ static const std::string kDecryptionLibPath = "ORT_TENSORRT_ENGINE_DECRYPTION_LI static const std::string kForceSequentialEngineBuild = "ORT_TENSORRT_FORCE_SEQUENTIAL_ENGINE_BUILD"; static const std::string kContextMemorySharingEnable = "ORT_TENSORRT_CONTEXT_MEMORY_SHARING_ENABLE"; static const std::string kLayerNormFP32Fallback = "ORT_TENSORRT_LAYER_NORM_FP32_FALLBACK"; +static const std::string kTimingCacheEnable = "ORT_TENSORRT_TIMING_CACHE_ENABLE"; +static const std::string kForceTimingCache = "ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE"; +static const std::string kDetailedBuildLog = "ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE"; // Old env variable for backward compatibility static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH"; } // namespace tensorrt_env_vars @@ -114,6 +117,9 @@ struct TensorrtFuncState { bool engine_decryption_enable = false; int (*engine_decryption)(const char*, char*, size_t*) = nullptr; int (*engine_encryption)(const char*, char*, size_t) = nullptr; + bool timing_cache_enable = true; + bool force_timing_cache = false; + bool detailed_build_log = false; }; // Logical device representation. @@ -176,6 +182,9 @@ class TensorrtExecutionProvider : public IExecutionProvider { bool engine_decryption_enable_ = false; int (*engine_decryption_)(const char*, char*, size_t*) = nullptr; int (*engine_encryption_)(const char*, char*, size_t) = nullptr; + bool timing_cache_enable_ = false; + bool force_timing_cache_match_ = false; + bool detailed_build_log_ = false; std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"}; std::unordered_map> parsers_; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc index 2db405d5120b6..ae06e6ce1338d 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc @@ -27,11 +27,14 @@ constexpr const char* kCachePath = "trt_engine_cache_path"; constexpr const char* kDecryptionEnable = "trt_engine_decryption_enable"; constexpr const char* kDecryptionLibPath = "trt_engine_decryption_lib_path"; constexpr const char* kForceSequentialEngineBuild = "trt_force_sequential_engine_build"; -// add new provider option name here. +// add new provider option name here. 
constexpr const char* kContextMemorySharingEnable = "trt_context_memory_sharing_enable"; constexpr const char* kLayerNormFP32Fallback = "trt_layer_norm_fp32_fallback"; +constexpr const char* kTimingCacheEnable = "trt_timing_cache_enable"; +constexpr const char* kForceTimingCacheMatch = "trt_force_timing_cache_match"; +constexpr const char* kDetailedBuildLog = "trt_detailed_build_log"; } // namespace provider_option_names -} // namespace tensorrt +} // namespace tensorrt TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) { TensorrtExecutionProviderInfo info{}; @@ -57,15 +60,17 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions .AddAssignmentToReference(tensorrt::provider_option_names::kInt8CalibTable, info.int8_calibration_table_name) .AddAssignmentToReference(tensorrt::provider_option_names::kInt8UseNativeCalibTable, info.int8_use_native_calibration_table) .AddAssignmentToReference(tensorrt::provider_option_names::kDLAEnable, info.dla_enable) - .AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core) + .AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core) .AddAssignmentToReference(tensorrt::provider_option_names::kDumpSubgraphs, info.dump_subgraphs) .AddAssignmentToReference(tensorrt::provider_option_names::kEngineCacheEnable, info.engine_cache_enable) .AddAssignmentToReference(tensorrt::provider_option_names::kCachePath, info.engine_cache_path) .AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionEnable, info.engine_decryption_enable) - .AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path) + .AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path) .AddAssignmentToReference(tensorrt::provider_option_names::kForceSequentialEngineBuild, info.force_sequential_engine_build) .AddAssignmentToReference(tensorrt::provider_option_names::kContextMemorySharingEnable, info.context_memory_sharing_enable) .AddAssignmentToReference(tensorrt::provider_option_names::kLayerNormFP32Fallback, info.layer_norm_fp32_fallback) + .AddAssignmentToReference(tensorrt::provider_option_names::kTimingCacheEnable, info.timing_cache_enable) + .AddAssignmentToReference(tensorrt::provider_option_names::kForceTimingCacheMatch, info.force_timing_cache) .Parse(options)); // add new provider option here. return info; @@ -93,6 +98,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE // add new provider option here. 
{tensorrt::provider_option_names::kContextMemorySharingEnable, MakeStringWithClassicLocale(info.context_memory_sharing_enable)}, {tensorrt::provider_option_names::kLayerNormFP32Fallback, MakeStringWithClassicLocale(info.layer_norm_fp32_fallback)}, + {tensorrt::provider_option_names::kTimingCacheEnable, MakeStringWithClassicLocale(info.timing_cache_enable)}, }; return options; } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h index 1f1fdb679f2a9..1a2e5e01af464 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h @@ -17,10 +17,10 @@ struct TensorrtExecutionProviderInfo { void* user_compute_stream{nullptr}; bool has_trt_options{false}; int max_partition_iterations{1000}; - int min_subgraph_size{1}; + int min_subgraph_size{1}; size_t max_workspace_size{1 << 30}; bool fp16_enable{false}; - bool int8_enable{false}; + bool int8_enable{false}; std::string int8_calibration_table_name{""}; bool int8_use_native_calibration_table{false}; bool dla_enable{false}; @@ -33,6 +33,9 @@ struct TensorrtExecutionProviderInfo { bool force_sequential_engine_build{false}; bool context_memory_sharing_enable{false}; bool layer_norm_fp32_fallback{false}; + bool timing_cache_enable{false}; + bool force_timing_cache{false}; + bool detailed_build_log{false}; static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info); diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h index ad1fdf00227ec..22702bec3b504 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h @@ -168,6 +168,17 @@ std::string GetCachePath(const std::string& root, const std::string& name) { } } +/* + * Get the timing cache file path, named by GPU compute capability + * + */ +std::string GetTimingCachePath(const std::string& root, cudaDeviceProp prop) { + // append the GPU compute capability, since a timing cache built for a different capability is invalid and TRT will throw when loading it + const std::string timing_cache_name = "TensorrtExecutionProvider_cache_cc" + + std::to_string(prop.major * 10 + prop.minor) + ".timing"; + return GetCachePath(root, timing_cache_name); +} + /* * Get cache by type * diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc index dd6915878eff5..9b4b8236e0b23 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc @@ -70,6 +70,8 @@ struct Tensorrt_Provider : Provider { info.force_sequential_engine_build = options.trt_force_sequential_engine_build != 0; info.context_memory_sharing_enable = options.trt_context_memory_sharing_enable != 0; info.layer_norm_fp32_fallback = options.trt_layer_norm_fp32_fallback != 0; + info.timing_cache_enable = options.trt_timing_cache_enable != 0; + info.detailed_build_log = options.trt_detailed_build_log != 0; return std::make_shared<TensorrtProviderFactory>(info); } @@ -137,6 +139,8 @@ struct Tensorrt_Provider : Provider { trt_options.trt_force_sequential_engine_build = internal_options.force_sequential_engine_build; trt_options.trt_context_memory_sharing_enable 
= internal_options.context_memory_sharing_enable; trt_options.trt_layer_norm_fp32_fallback = internal_options.layer_norm_fp32_fallback; + trt_options.trt_timing_cache_enable = internal_options.timing_cache_enable; + trt_options.trt_force_timing_cache = internal_options.force_timing_cache; } ProviderOptions GetProviderOptions(const void* provider_options) override { diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 0e82fdb0097b2..8e70dd24ac10a 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1274,7 +1274,10 @@ OrtTensorRTProviderOptionsV2 OrtTensorRTProviderOptionsToOrtTensorRTProviderOpti trt_options_converted.trt_engine_decryption_lib_path = legacy_trt_options->trt_engine_decryption_lib_path; trt_options_converted.trt_force_sequential_engine_build = legacy_trt_options->trt_force_sequential_engine_build; // Add new provider option below - // Use default value as this field is not available in OrtTensorRTProviderOptionsV + // Use default value as this field is not available in OrtTensorRTProviderOptions + trt_options_converted.trt_timing_cache_enable = 0; + trt_options_converted.trt_force_timing_cache = 0; + trt_options_converted.trt_detailed_build_log = 0; trt_options_converted.trt_context_memory_sharing_enable = 0; trt_options_converted.trt_layer_norm_fp32_fallback = 0; return trt_options_converted; @@ -1601,6 +1604,9 @@ ORT_API_STATUS_IMPL(OrtApis::CreateTensorRTProviderOptions, _Outptr_ OrtTensorRT (*out)->trt_force_sequential_engine_build = false; (*out)->trt_context_memory_sharing_enable = false; (*out)->trt_layer_norm_fp32_fallback = false; + (*out)->trt_timing_cache_enable = false; + (*out)->trt_force_timing_cache = false; + (*out)->trt_detailed_build_log = false; return nullptr; #else ORT_UNUSED_PARAMETER(out); diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index dd24ce51e1111..667073063d4cc 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -366,6 +366,9 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance( nullptr, 0, 0, + 0, + 0, + 0, 0}; for (auto option : it->second) { if (option.first == "device_id") { diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index f6ad3b5d75f77..552274b77bbfb 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -121,6 +121,9 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device bool trt_force_sequential_engine_build = false; bool trt_context_memory_sharing_enable = false; bool trt_layer_norm_fp32_fallback = false; + bool trt_timing_cache_enable = false; + bool trt_force_timing_cache = false; + bool trt_detailed_build_log = false; #ifdef _MSC_VER std::string ov_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string); @@ -268,6 +271,30 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device } else { ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_layer_norm_fp32_fallback' should be a boolean i.e. true or false. 
Default value is false.\n"); + } else if (key == "trt_timing_cache_enable") { + if (value == "true" || value == "True") { + trt_timing_cache_enable = true; + } else if (value == "false" || value == "False") { + trt_timing_cache_enable = false; + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_timing_cache_enable' should be a boolean i.e. true or false. Default value is false.\n"); + } + } else if (key == "trt_force_timing_cache") { + if (value == "true" || value == "True") { + trt_force_timing_cache = true; + } else if (value == "false" || value == "False") { + trt_force_timing_cache = false; + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_force_timing_cache' should be a boolean i.e. true or false. Default value is false.\n"); + } + } else if (key == "trt_detailed_build_log") { + if (value == "true" || value == "True") { + trt_detailed_build_log = true; + } else if (value == "false" || value == "False") { + trt_detailed_build_log = false; + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_detailed_build_log' should be a boolean i.e. true or false. Default value is false.\n"); + } } else { ORT_THROW("[ERROR] [TensorRT] wrong key type entered. Choose from the following runtime key options that are available for TensorRT. ['device_id', 'trt_max_partition_iterations', 'trt_min_subgraph_size', 'trt_max_workspace_size', 'trt_fp16_enable', 'trt_int8_enable', 'trt_int8_calibration_table_name', 'trt_int8_use_native_calibration_table', 'trt_dla_enable', 'trt_dla_core', 'trt_dump_subgraphs', 'trt_engine_cache_enable', 'trt_engine_cache_path', 'trt_engine_decryption_enable', 'trt_engine_decryption_lib_path', 'trt_force_sequential_engine_build', 'trt_context_memory_sharing_enable', 'trt_layer_norm_fp32_fallback'] \n"); } @@ -293,6 +320,9 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device tensorrt_options.trt_force_sequential_engine_build = trt_force_sequential_engine_build; tensorrt_options.trt_context_memory_sharing_enable = trt_context_memory_sharing_enable; tensorrt_options.trt_layer_norm_fp32_fallback = trt_layer_norm_fp32_fallback; + tensorrt_options.trt_timing_cache_enable = trt_timing_cache_enable; + tensorrt_options.trt_force_timing_cache = trt_force_timing_cache; + tensorrt_options.trt_detailed_build_log = trt_detailed_build_log; session_options.AppendExecutionProvider_TensorRT_V2(tensorrt_options); OrtCUDAProviderOptions cuda_options; diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc index 2039c65b53aa6..b27ded96d85a9 100644 --- a/onnxruntime/test/perftest/performance_runner.cc +++ b/onnxruntime/test/perftest/performance_runner.cc @@ -114,7 +114,9 @@ Status PerformanceRunner::Run() { } // warm up + initial_inference_result_.start = std::chrono::high_resolution_clock::now(); ORT_RETURN_IF_ERROR(RunOneIteration()); + initial_inference_result_.end = std::chrono::high_resolution_clock::now(); // TODO: start profiling // if (!performance_test_config_.run_config.profile_file.empty()) @@ -139,9 +141,12 @@ Status PerformanceRunner::Run() { std::chrono::duration<double> session_create_duration = session_create_end_ - session_create_start_; // TODO: end profiling // if (!performance_test_config_.run_config.profile_file.empty()) session_object->EndProfiling(); + auto first_inference_duration = + std::chrono::duration_cast<std::chrono::milliseconds>(initial_inference_result_.end - initial_inference_result_.start).count(); std::chrono::duration<double> inference_duration = 
performance_result_.end - performance_result_.start; std::cout << "Session creation time cost: " << session_create_duration.count() << " s\n" + << "First inference time cost: " << first_inference_duration << " ms\n" << "Total inference time cost: " << performance_result_.total_time_cost << " s\n" // sum of time taken by each request << "Total inference requests: " << performance_result_.time_costs.size() << "\n" << "Average inference time cost: " << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms\n" diff --git a/onnxruntime/test/perftest/performance_runner.h b/onnxruntime/test/perftest/performance_runner.h index aae68fd2d289f..da2df9c39f44c 100644 --- a/onnxruntime/test/perftest/performance_runner.h +++ b/onnxruntime/test/perftest/performance_runner.h @@ -106,6 +106,7 @@ class PerformanceRunner { private: std::chrono::time_point<std::chrono::high_resolution_clock> session_create_start_; std::chrono::time_point<std::chrono::high_resolution_clock> session_create_end_; + PerformanceResult initial_inference_result_; PerformanceResult performance_result_; PerformanceTestConfig performance_test_config_; std::unique_ptr<TestModelInfo> test_model_info_; diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index 2073beb0cfc83..1da491fe4f9b0 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -683,7 +683,7 @@ TEST_P(ModelTest, Run) { if (test_case_name.find(ORT_TSTR("FLOAT16")) != std::string::npos) { OrtTensorRTProviderOptionsV2 params{0, 0, nullptr, 1000, 1, 1 << 30, 1, // enable fp16 - 0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0}; + 0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0, 0, 0, 0}; ortso.AppendExecutionProvider_TensorRT_V2(params); } else { OrtTensorRTProviderOptionsV2* ep_option = nullptr; diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index 2e139e0f57e68..2d38bf7b4b3ba 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -11,6 +11,8 @@ #include "core/providers/tensorrt/tensorrt_execution_provider_utils.h" #include #include +#include +#include using namespace std; using namespace ONNX_NAMESPACE; @@ -151,6 +153,9 @@ void RunWithOneSessionSingleThreadInference(std::string model_name, std::string nullptr, 0, 0, + 0, + 0, + 0, 0}; params.trt_engine_cache_enable = 1; @@ -222,6 +227,9 @@ void RunWithOneSessionMultiThreadsInference(std::string model_name, std::string nullptr, 0, 0, + 0, + 0, + 0, 0}; params.trt_engine_cache_enable = 1; @@ -386,6 +394,9 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { nullptr, 0, 0, + 0, + 0, + 0, 0}; if (cache_type.compare("engine") == 0) { @@ -498,9 +509,68 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { } } else if (cache_type.compare("timing") == 0) { // add test code here + + /* The following code block tests the timing cache functionality of ORT TRT, including: + * - timing cache serialization/de-serialization + * - the benefit of using a timing cache for both dynamic and static input shapes + */ + uint64_t compilation_without_cache_ms, compilation_with_cache_ms; + + params.trt_timing_cache_enable = 1; + // std::chrono + { + auto start = chrono::steady_clock::now(); + std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params); + EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + auto status = 
session_object.Load(model_name); + ASSERT_TRUE(status.IsOK()); + status = session_object.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // run inference + // TRT timing cache should be created whether the input shape is static or dynamic + status = session_object.Run(run_options, feeds, output_names, &fetches); + auto end = chrono::steady_clock::now(); + ASSERT_TRUE(status.IsOK()); + VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m); + ASSERT_TRUE(IsCacheExistedByType("./", ".timing")); + compilation_without_cache_ms = chrono::duration_cast<chrono::milliseconds>(end - start).count(); + } + + // get new session and reinitialize model + // a second identical inference should reuse the cache and therefore build faster + if (input_type.compare("static") == 0) { + { + InferenceSession session_object_new{so, GetEnvironment()}; + { + auto start = chrono::steady_clock::now(); + std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params); + EXPECT_TRUE(session_object_new.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + auto status = session_object_new.Load(model_name); + ASSERT_TRUE(status.IsOK()); + status = session_object_new.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // run inference + // TRT timing cache should be created whether the input shape is static or dynamic + status = session_object_new.Run(run_options, feeds, output_names, &fetches); + // TODO narrow down actual compilation section + auto end = chrono::steady_clock::now(); + + ASSERT_TRUE(status.IsOK()); + VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m); + ASSERT_TRUE(IsCacheExistedByType("./", ".timing")); + compilation_with_cache_ms = chrono::duration_cast<chrono::milliseconds>(end - start).count(); + } + } + ASSERT_TRUE(compilation_with_cache_ms <= compilation_without_cache_ms); + } else { + // TODO test dynamic shapes + } } // clean up caches + RemoveCachesByType("./", ".timing"); RemoveCachesByType("./", ".engine"); RemoveCachesByType("./", ".profile"); } @@ -515,11 +585,13 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { * We have following test parameters: * - engine_static: engine cache enabled with non-dynamic input shape * - engine_dynamic: engine cache enabled with dynamic input shape - * - timing_static: will be added - * - timing_dynamic: will be added + * - timing_static: timing cache enabled, static input shape + * - timing_dynamic: timing cache enabled, dynamic input shape */ INSTANTIATE_TEST_SUITE_P(TensorrtExecutionProviderCacheTests, TensorrtExecutionProviderCacheTest, testing::Values("engine_static", - "engine_dynamic"), + "engine_dynamic", + "timing_static", + "timing_dynamic"), [](const ::testing::TestParamInfo<TensorrtExecutionProviderCacheTest::ParamType>& info) {return info.param;}); TEST(TensorrtExecutionProviderTest, FunctionTest) {
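
For reviewers who want to exercise the new options, below is a minimal usage sketch (not part of the patch). It mirrors the ort_test_session.cc pattern above: the trt_timing_cache_enable, trt_force_timing_cache and trt_detailed_build_log fields and the ORT_TENSORRT_TIMING_CACHE_ENABLE / ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE / ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE environment variables come from this diff; the model path, cache directory and the remaining option values are placeholders chosen for illustration.

// Usage sketch only: enable the TensorRT timing cache through the V2 provider options,
// following onnxruntime/test/perftest/ort_test_session.cc above. Paths are placeholders.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "trt_timing_cache_demo");
  Ort::SessionOptions session_options;

  OrtTensorRTProviderOptionsV2 trt_options{};          // remaining fields left zero/null for brevity
  trt_options.device_id = 0;
  trt_options.trt_max_partition_iterations = 1000;
  trt_options.trt_min_subgraph_size = 1;
  trt_options.trt_max_workspace_size = 1 << 30;
  trt_options.trt_engine_cache_enable = 1;             // existing option; the timing cache is stored under the same cache path
  trt_options.trt_engine_cache_path = "./trt_cache";   // placeholder directory
  trt_options.trt_timing_cache_enable = 1;             // new: load/save a per-compute-capability .timing file
  trt_options.trt_force_timing_cache = 0;              // new: only reuse a cache recorded on a matching device profile
  trt_options.trt_detailed_build_log = 1;              // new: log per-engine build times

  session_options.AppendExecutionProvider_TensorRT_V2(trt_options);
  Ort::Session session(env, ORT_TSTR("model.onnx"), session_options);  // placeholder model
  return 0;
}

The same three switches can also be toggled per process via the ORT_TENSORRT_TIMING_CACHE_ENABLE, ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE and ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE environment variables added in tensorrt_execution_provider.h.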