microsoft · qjia7 · Jan 14, 2026 · Jan 7, 2026 · Jan 12, 2026
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -628,17 +628,15 @@ void WebGpuContext::CollectProfilingData(profiling::Events& events) {
 
       for (size_t i = 0; i < pending_kernels.size(); i++) {
         const PendingKernelInfo& pending_kernel_info = pending_kernels[i];
-        const auto& inputs = pending_kernel_info.inputs;
-        const auto& outputs = pending_kernel_info.outputs;
+        const auto& input_shapes = pending_kernel_info.input_shapes;
+        const auto& output_shapes = pending_kernel_info.output_shapes;
 
         SS(shapes, 128);
-        for (size_t s = 0; s < inputs.size(); s++) {
-          const auto& input = inputs[s];
-          shapes << "inputs[" << s << "] = " << input.override_shape.ToString() << " ";
+        for (size_t s = 0; s < input_shapes.size(); s++) {
+          shapes << "inputs[" << s << "] = " << input_shapes[s].ToString() << " ";
         }
-        for (size_t s = 0; s < outputs.size(); s++) {
-          const auto& output = outputs[s];
-          shapes << "outputs[" << s << "] = " << output.override_shape.ToString() << " ";
+        for (size_t s = 0; s < output_shapes.size(); s++) {
+          shapes << "outputs[" << s << "] = " << output_shapes[s].ToString() << " ";
         }
 
         if (gpu_timestamp_offset_ == 0) {

diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h
@@ -268,16 +268,26 @@ class WebGpuContext final {
                       std::string_view cache_key,
                       const std::vector<ProgramInput>& inputs,
                       const std::vector<ProgramOutput>& outputs)
-        : name{absl::StrJoin({kernel_name, kernel_type, program_name}, "&")}, cache_key{cache_key}, inputs{inputs}, outputs{outputs} {}
+        : name{absl::StrJoin({kernel_name, kernel_type, program_name}, "&")}, cache_key{cache_key} {
+      // Copy the shapes now rather than keeping tensor pointers: this info is read later when profiling data is collected, and the tensors may already be released by then.
+      input_shapes.reserve(inputs.size());
+      for (const auto& input : inputs) {
+        input_shapes.emplace_back(input.use_override_shape ? input.override_shape : input.tensor->Shape());
+      }
+      output_shapes.reserve(outputs.size());
+      for (const auto& output : outputs) {
+        output_shapes.emplace_back(output.use_override_shape ? output.override_shape : output.tensor->Shape());
+      }
+    }
 
     PendingKernelInfo(PendingKernelInfo&&) = default;
     PendingKernelInfo& operator=(PendingKernelInfo&&) = default;
     ORT_DISALLOW_COPY_AND_ASSIGNMENT(PendingKernelInfo);
 
     std::string name;
     std::string cache_key;
-    std::vector<ProgramInput> inputs;
-    std::vector<ProgramOutput> outputs;
+    std::vector<TensorShape> input_shapes;
+    std::vector<TensorShape> output_shapes;
   };
 
   struct PendingQueryInfo {