pytorch · lsy323 · Mar 26, 2025 · Mar 18, 2025 · Mar 19, 2025 · Mar 19, 2025
diff --git a/torch_xla/csrc/runtime/tensor_source.h b/torch_xla/csrc/runtime/tensor_source.h
@@ -54,13 +54,23 @@ class AtenSource : public TensorSource {
     if (target_torch_type != tensor.type().scalarType()) {
       TORCH_LAZY_COUNTER("AtenSourceDowncasts", 1);
     }
-    // TODO(ysiraichi): check, first, if tensor lives in a device that the
-    // current PjRt client has access. If so, we don't need to go through the
-    // CPU.
-    tensor_ = std::move(
-        tensor.to(at::TensorOptions().device(at::kCPU).dtype(target_torch_type),
-                  /*non_blocking=*/false,
-                  /*copy=*/true, at::MemoryFormat::Contiguous));
+    // The copy serves three purposes:
+    // 1. Ensure the memory is contiguous, which is expected by PJRT.
+    // 2. Move CUDA tensor to CPU since we cannot pass CUDA memory to PJRT now.
+    // 3. Cast data type.
+    // Skip the copy when none of the above applies.
+    if (tensor.device() == at::kCPU && tensor.is_contiguous() &&
+        tensor.dtype() == target_torch_type) {
+      tensor_ = std::move(tensor);
+    } else {
+      // TODO(ysiraichi): check, first, if tensor lives in a device that the
+      // current PjRt client has access. If so, we don't need to go through the
+      // CPU.
+      tensor_ = std::move(tensor.to(
+          at::TensorOptions().device(at::kCPU).dtype(target_torch_type),
+          /*non_blocking=*/false,
+          /*copy=*/true, at::MemoryFormat::Contiguous));
+    }
   }
 
   const void* data() const override { return tensor_.const_data_ptr(); }