From 2bfcf5154582821d22b49fcc013641de2a79f0b4 Mon Sep 17 00:00:00 2001 From: Rui Qiao Date: Fri, 21 Nov 2025 00:26:04 +0000 Subject: [PATCH 01/11] [core][cgraph] Introduce fault-tolerant PushMutableObject Signed-off-by: Rui Qiao --- .../experimental_mutable_object_provider.cc | 78 ++++- .../experimental_mutable_object_provider.h | 12 +- src/ray/core_worker/tests/BUILD.bazel | 2 + .../tests/mutable_object_provider_test.cc | 286 +++++++++++++++++- src/ray/raylet_rpc_client/raylet_client.cc | 6 +- src/ray/raylet_rpc_client/tests/BUILD.bazel | 18 ++ .../tests/raylet_client_test.cc | 246 +++++++++++++++ 7 files changed, 632 insertions(+), 16 deletions(-) create mode 100644 src/ray/raylet_rpc_client/tests/raylet_client_test.cc diff --git a/src/ray/core_worker/experimental_mutable_object_provider.cc b/src/ray/core_worker/experimental_mutable_object_provider.cc index df0613c9f347..07bd85c72f1b 100644 --- a/src/ray/core_worker/experimental_mutable_object_provider.cc +++ b/src/ray/core_worker/experimental_mutable_object_provider.cc @@ -136,19 +136,65 @@ void MutableObjectProvider::HandlePushMutableObject( uint64_t offset = request.offset(); uint64_t chunk_size = request.chunk_size(); + // Validate request bounds to prevent buffer overflows. + RAY_CHECK_LE(offset + chunk_size, total_data_size) + << "Chunk extends beyond total data size. offset=" << offset + << ", chunk_size=" << chunk_size << ", total_data_size=" << total_data_size; + RAY_CHECK_EQ(request.data().size(), chunk_size) + << "Data size mismatch. Expected " << chunk_size << " bytes, got " + << request.data().size() << " bytes"; + RAY_CHECK_EQ(request.metadata().size(), total_metadata_size) + << "Metadata size mismatch. Expected " << total_metadata_size << " bytes, got " + << request.metadata().size() << " bytes"; + + // Check if this chunk has already been received (idempotent retry handling). 
+ bool is_new_chunk = false; uint64_t tmp_written_so_far = 0; + bool object_complete = false; + bool needs_write_acquire = false; { absl::MutexLock guard(&written_so_far_lock_); - tmp_written_so_far = written_so_far_[writer_object_id]; - written_so_far_[writer_object_id] += chunk_size; - if (written_so_far_[writer_object_id] == total_data_size) { - written_so_far_.erase(written_so_far_.find(writer_object_id)); + // Initialize tracking for this object if needed. + auto &received_chunks = received_chunks_[writer_object_id]; + + // Check if we've already received this specific chunk by offset. + if (received_chunks.find(offset) != received_chunks.end()) { + // This chunk was already received (retry) - skip processing but return status. + auto written_it = written_so_far_.find(writer_object_id); + if (written_it != written_so_far_.end()) { + tmp_written_so_far = written_it->second; + object_complete = (tmp_written_so_far == total_data_size); + } else { + // Tracking was cleaned up, meaning object is complete. + object_complete = true; + } + } else { + // This is a new chunk - check tracking but don't mark as received yet. + // We'll mark it as received only after successfully writing it. + is_new_chunk = true; + tmp_written_so_far = written_so_far_[writer_object_id]; + + // Check if WriteAcquire needs to be called. This handles out-of-order chunks: + // only the first chunk (or first chunk to arrive) will call WriteAcquire. + if (!write_acquired_[writer_object_id]) { + needs_write_acquire = true; + write_acquired_[writer_object_id] = true; + } } } + // If this is a retry of an already-received chunk, return current status without + // re-processing the data (idempotent operation). 
+ if (!is_new_chunk) { + reply->set_done(object_complete); + return; + } + std::shared_ptr object_backing_store; - if (tmp_written_so_far == 0u) { + if (needs_write_acquire) { + // First chunk to arrive (may not be offset 0 due to out-of-order delivery) - + // acquire write lock and allocate backing store. // We set `metadata` to nullptr since the metadata is at the end of the object, which // we will not have until the last chunk is received. RAY_CHECK_OK(object_manager_->WriteAcquire(info.local_object_id, @@ -158,6 +204,8 @@ void MutableObjectProvider::HandlePushMutableObject( info.num_readers, object_backing_store)); } else { + // Subsequent chunk (or chunk arriving after WriteAcquire was called by another chunk) + // - get existing backing store. RAY_CHECK_OK(object_manager_->GetObjectBackingStore(info.local_object_id, total_data_size, total_metadata_size, @@ -165,11 +213,29 @@ void MutableObjectProvider::HandlePushMutableObject( } RAY_CHECK(object_backing_store); + // Copy chunk data to backing store. memcpy(object_backing_store->Data() + offset, request.data().data(), chunk_size); size_t total_written = tmp_written_so_far + chunk_size; RAY_CHECK_LE(total_written, total_data_size); + + // Mark this chunk as received only after successfully writing it. + // This ensures retries are handled correctly even if WriteAcquire fails. + { + absl::MutexLock guard(&written_so_far_lock_); + received_chunks_[writer_object_id].insert(offset); + // Update written_so_far_ by adding this chunk's size. + // Note: We increment rather than set to handle potential out-of-order chunks. + written_so_far_[writer_object_id] += chunk_size; + if (written_so_far_[writer_object_id] == total_data_size) { + object_complete = true; + // Note: We keep received_chunks_ and written_so_far_ entries to handle + // retries. They will be cleaned up when the object is overwritten (next + // WriteAcquire will reset state) or when the object is unregistered. 
+ } + } + if (total_written == total_data_size) { - // Copy the metadata to the end of the object. + // All data chunks received - copy metadata and release write lock. memcpy(object_backing_store->Data() + total_data_size, request.metadata().data(), total_metadata_size); diff --git a/src/ray/core_worker/experimental_mutable_object_provider.h b/src/ray/core_worker/experimental_mutable_object_provider.h index ad6757983431..4d1ee02a2b71 100644 --- a/src/ray/core_worker/experimental_mutable_object_provider.h +++ b/src/ray/core_worker/experimental_mutable_object_provider.h @@ -15,6 +15,7 @@ #include #include +#include #include #include "ray/common/asio/instrumented_io_context.h" @@ -239,13 +240,22 @@ class MutableObjectProvider : public MutableObjectProviderInterface { // and then send the changes to remote nodes via the network. std::vector> io_threads_; - // Protects the `written_so_far_` map. + // Protects the `written_so_far_` map and `received_chunks_` set. absl::Mutex written_so_far_lock_; // For objects larger than the gRPC max payload size *that this node receives from a // writer node*, this map tracks how many bytes have been received so far for a single // object write. std::unordered_map written_so_far_ ABSL_GUARDED_BY(written_so_far_lock_); + // Tracks which chunks (by offset) have been received for each object to handle retries + // idempotently. Each offset uniquely identifies a chunk. + std::unordered_map> received_chunks_ + ABSL_GUARDED_BY(written_so_far_lock_); + // Tracks whether WriteAcquire has been called for each object to handle out-of-order + // chunks. This ensures WriteAcquire is called exactly once, even if chunks arrive + // concurrently or out of order. 
+ std::unordered_map write_acquired_ + ABSL_GUARDED_BY(written_so_far_lock_); friend class MutableObjectProvider_MutableObjectBufferReadRelease_Test; }; diff --git a/src/ray/core_worker/tests/BUILD.bazel b/src/ray/core_worker/tests/BUILD.bazel index 217d8bd3640f..96be6e7f3382 100644 --- a/src/ray/core_worker/tests/BUILD.bazel +++ b/src/ray/core_worker/tests/BUILD.bazel @@ -236,6 +236,8 @@ ray_cc_test( "@com_google_absl//absl/functional:bind_front", "@com_google_absl//absl/random", "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", ], diff --git a/src/ray/core_worker/tests/mutable_object_provider_test.cc b/src/ray/core_worker/tests/mutable_object_provider_test.cc index 8a5207fd6e1b..d3b1b7c9dc17 100644 --- a/src/ray/core_worker/tests/mutable_object_provider_test.cc +++ b/src/ray/core_worker/tests/mutable_object_provider_test.cc @@ -12,15 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include #include +#include #include #include #include "absl/functional/bind_front.h" #include "absl/random/random.h" #include "absl/strings/str_format.h" +#include "absl/synchronization/barrier.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "mock/ray/object_manager/plasma/client.h" @@ -42,34 +47,47 @@ class TestPlasma : public plasma::MockPlasmaClient { Status GetExperimentalMutableObject( const ObjectID &object_id, std::unique_ptr *mutable_object) override { + absl::MutexLock guard(&lock_); if (!objects_.count(object_id)) { - *mutable_object = MakeObject(); + // Use a larger default size to support tests with larger objects + // Need at least 2048 bytes to accommodate tests with variable chunk sizes + *mutable_object = MakeObject(/*min_size=*/2048); objects_.insert(object_id); } else { - *mutable_object = nullptr; + // Object already exists - return a new view of it + // For testing, we create a new object each time since the real implementation + // would return a view of the existing object + *mutable_object = MakeObject(/*min_size=*/2048); } return Status::OK(); } + ~TestPlasma() override { + // Objects are managed by the MutableObjectProvider, so we don't free them here + } + private: // Creates a new mutable object. It is the caller's responsibility to free the backing // store. 
- std::unique_ptr MakeObject() { - constexpr size_t kPayloadSize = 128; - constexpr size_t kSize = sizeof(PlasmaObjectHeader) + kPayloadSize; + std::unique_ptr MakeObject(size_t min_size = 128) { + // Allocate enough space for header + data + metadata + // Round up to ensure we have enough space + size_t payload_size = std::max(min_size, static_cast(128)); + size_t total_size = sizeof(PlasmaObjectHeader) + payload_size; plasma::PlasmaObject info{}; info.header_offset = 0; info.data_offset = sizeof(PlasmaObjectHeader); - info.allocated_size = kPayloadSize; + info.allocated_size = payload_size; - uint8_t *ptr = static_cast(malloc(kSize)); + uint8_t *ptr = static_cast(malloc(total_size)); RAY_CHECK(ptr); auto ret = std::make_unique(ptr, info); ret->header->Init(); return ret; } + absl::Mutex lock_; // Tracks the mutable objects that have been created. std::unordered_set objects_; }; @@ -369,6 +387,260 @@ TEST(MutableObjectProvider, MutableObjectBufferSetErrorBeforeReadRelease) { StatusCode::ChannelError); } +// Test that emulates the out-of-order PushMutableObject scenario described in +// https://github.com/ray-project/ray/issues/58426 +// +// Scenario: Multiple ranks send PushMutableObject requests concurrently. Due to +// network jitter and OS scheduling, some requests arrive immediately while others +// are delayed. This test verifies that the receiver handles out-of-order requests +// correctly without deadlocking. +// +// The test simulates: +// 1. Multiple "ranks" (4 ranks: rank0-rank3) sending requests concurrently +// 2. Network jitter causing some requests to be delayed +// 3. A new round of requests arriving before previous round completes +// 4. 
Verifies no deadlock occurs and all data is correctly received +TEST(MutableObjectProvider, HandleOutOfOrderPushMutableObject) { + constexpr int kNumRanks = 4; + constexpr size_t kDataSize = 64; + constexpr size_t kMetadataSize = 8; + + // Create provider and register reader channels for each rank + auto plasma = std::make_shared(); + MutableObjectProvider provider(plasma, /*factory=*/nullptr, nullptr); + + std::vector writer_object_ids; + std::vector reader_object_ids; + for (int i = 0; i < kNumRanks; i++) { + writer_object_ids.push_back(ObjectID::FromRandom()); + reader_object_ids.push_back(ObjectID::FromRandom()); + provider.HandleRegisterMutableObject( + writer_object_ids[i], /*num_readers=*/1, reader_object_ids[i]); + } + + // Prepare data for each rank + std::vector> rank_data(kNumRanks); + std::vector> rank_metadata(kNumRanks); + for (int i = 0; i < kNumRanks; i++) { + rank_data[i].resize(kDataSize, static_cast(i)); + rank_metadata[i].resize(kMetadataSize, static_cast(i + 100)); + } + + // Simulate two rounds of requests + constexpr int kNumRounds = 2; + + // Track request completion + std::vector> round_completed(kNumRounds); + for (int round = 0; round < kNumRounds; round++) { + round_completed[round].resize(kNumRanks, false); + } + + // Barrier to synchronize the start of each round + std::vector> round_barriers; + for (int round = 0; round < kNumRounds; round++) { + round_barriers.push_back(std::make_unique(kNumRanks)); + } + + // Function for each rank to send requests for all rounds + auto rank_sender = [&](int rank) { + for (int round = 0; round < kNumRounds; round++) { + // Wait for all ranks to be ready before sending this round + round_barriers[round]->Block(); + + // Simulate network jitter: rank0 and rank1 send immediately, + // rank2 and rank3 are delayed (simulating the issue scenario) + if (rank >= 2 && round == 0) { + // Delay ranks 2-3 in the first round to simulate network jitter + absl::SleepFor(absl::Milliseconds(50)); + } + + 
ray::rpc::PushMutableObjectRequest request; + request.set_writer_object_id(writer_object_ids[rank].Binary()); + request.set_total_data_size(kDataSize); + request.set_total_metadata_size(kMetadataSize); + request.set_offset(0); + request.set_chunk_size(kDataSize); + request.set_data(rank_data[rank].data(), kDataSize); + request.set_metadata(rank_metadata[rank].data(), kMetadataSize); + + ray::rpc::PushMutableObjectReply reply; + + // Send request - this should not block even if other ranks' requests + // are delayed or out of order + provider.HandlePushMutableObject(request, &reply); + + EXPECT_TRUE(reply.done()) + << "Rank " << rank << " round " << round << " should complete"; + round_completed[round][rank] = true; + + // Small delay after rank0 completes round 0 to simulate the scenario where + // rank0 sends next round before ranks 2-3 complete previous round + if (rank == 0 && round == 0) { + absl::SleepFor(absl::Milliseconds(10)); + } + } + }; + + // Launch one thread per rank (each rank sends all rounds) + std::vector threads; + for (int rank = 0; rank < kNumRanks; rank++) { + threads.emplace_back(rank_sender, rank); + } + + // Wait for all threads to complete (with timeout to detect deadlocks) + auto start_time = absl::Now(); + for (auto &thread : threads) { + thread.join(); + } + auto elapsed = absl::Now() - start_time; + + // Verify no deadlock occurred (should complete quickly) + EXPECT_LT(elapsed, absl::Seconds(5)) + << "Test should complete quickly; deadlock suspected if timeout"; + + // Verify all requests completed successfully + for (int round = 0; round < kNumRounds; round++) { + for (int rank = 0; rank < kNumRanks; rank++) { + EXPECT_TRUE(round_completed[round][rank]) + << "Rank " << rank << " round " << round << " did not complete"; + } + } + + // Verify data integrity: read back the data for each rank + for (int rank = 0; rank < kNumRanks; rank++) { + std::shared_ptr result; + EXPECT_EQ(provider.ReadAcquire(reader_object_ids[rank], result).code(), 
+ StatusCode::OK) + << "Failed to read data for rank " << rank; + + EXPECT_EQ(result->GetData()->Size(), kDataSize); + EXPECT_EQ(result->GetMetadata()->Size(), kMetadataSize); + + // Verify data content + const uint8_t *data_ptr = result->GetData()->Data(); + for (size_t i = 0; i < kDataSize; i++) { + EXPECT_EQ(data_ptr[i], static_cast(rank)) + << "Data mismatch for rank " << rank << " at offset " << i; + } + + // Verify metadata content + const uint8_t *metadata_ptr = result->GetMetadata()->Data(); + for (size_t i = 0; i < kMetadataSize; i++) { + EXPECT_EQ(metadata_ptr[i], static_cast(rank + 100)) + << "Metadata mismatch for rank " << rank << " at offset " << i; + } + + EXPECT_EQ(provider.ReadRelease(reader_object_ids[rank]).code(), StatusCode::OK); + } +} + +// Test retry handling with out-of-order chunks +// Simulates the scenario where a chunk is retried and arrives out of order +TEST(MutableObjectProvider, HandleRetryOutOfOrderChunks) { + constexpr size_t kChunk0Size = 256; + constexpr size_t kChunk1Size = 512; + constexpr size_t kChunk2Size = 384; + constexpr size_t kTotalDataSize = kChunk0Size + kChunk1Size + kChunk2Size; // 3 chunks + constexpr size_t kMetadataSize = 16; + + ObjectID writer_object_id = ObjectID::FromRandom(); + ObjectID reader_object_id = ObjectID::FromRandom(); + auto plasma = std::make_shared(); + MutableObjectProvider provider(plasma, /*factory=*/nullptr, nullptr); + + provider.HandleRegisterMutableObject( + writer_object_id, /*num_readers=*/1, reader_object_id); + + // Prepare chunk data + std::vector> chunk_data(3); + std::vector metadata(kMetadataSize, 0xAB); + chunk_data[0].resize(kChunk0Size, static_cast(0)); + chunk_data[1].resize(kChunk1Size, static_cast(1)); + chunk_data[2].resize(kChunk2Size, static_cast(2)); + + // Send chunks out of order: chunk 1, then chunk 0 (retry scenario), + // then chunk 2 + std::vector replies(3); + + // Chunk 1 arrives first (offset = kChunk0Size) + { + ray::rpc::PushMutableObjectRequest request; + 
request.set_writer_object_id(writer_object_id.Binary()); + request.set_total_data_size(kTotalDataSize); + request.set_total_metadata_size(kMetadataSize); + request.set_offset(kChunk0Size); + request.set_chunk_size(kChunk1Size); + request.set_data(chunk_data[1].data(), kChunk1Size); + request.set_metadata(metadata.data(), kMetadataSize); + provider.HandlePushMutableObject(request, &replies[1]); + EXPECT_FALSE(replies[1].done()) << "Chunk 1 should not complete the object"; + } + + // Chunk 0 arrives second (offset = 0) - simulates retry or out-of-order + { + ray::rpc::PushMutableObjectRequest request; + request.set_writer_object_id(writer_object_id.Binary()); + request.set_total_data_size(kTotalDataSize); + request.set_total_metadata_size(kMetadataSize); + request.set_offset(0); + request.set_chunk_size(kChunk0Size); + request.set_data(chunk_data[0].data(), kChunk0Size); + request.set_metadata(metadata.data(), kMetadataSize); + provider.HandlePushMutableObject(request, &replies[0]); + EXPECT_FALSE(replies[0].done()) << "Chunk 0 should not complete the object"; + } + + // Retry chunk 0 (idempotent - should be handled gracefully) + { + ray::rpc::PushMutableObjectRequest request; + request.set_writer_object_id(writer_object_id.Binary()); + request.set_total_data_size(kTotalDataSize); + request.set_total_metadata_size(kMetadataSize); + request.set_offset(0); + request.set_chunk_size(kChunk0Size); + request.set_data(chunk_data[0].data(), kChunk0Size); + request.set_metadata(metadata.data(), kMetadataSize); + ray::rpc::PushMutableObjectReply retry_reply; + provider.HandlePushMutableObject(request, &retry_reply); + // Retry should return current status without error + EXPECT_FALSE(retry_reply.done()) << "Retry of chunk 0 should return current status"; + } + + // Chunk 2 arrives last (offset = kChunk0Size + kChunk1Size) + { + ray::rpc::PushMutableObjectRequest request; + request.set_writer_object_id(writer_object_id.Binary()); + request.set_total_data_size(kTotalDataSize); + 
request.set_total_metadata_size(kMetadataSize); + request.set_offset(kChunk0Size + kChunk1Size); + request.set_chunk_size(kChunk2Size); + request.set_data(chunk_data[2].data(), kChunk2Size); + request.set_metadata(metadata.data(), kMetadataSize); + provider.HandlePushMutableObject(request, &replies[2]); + EXPECT_TRUE(replies[2].done()) << "Chunk 2 should complete the object"; + } + + // Verify all chunks were received correctly + std::shared_ptr result; + EXPECT_EQ(provider.ReadAcquire(reader_object_id, result).code(), StatusCode::OK); + + EXPECT_EQ(result->GetData()->Size(), kTotalDataSize); + EXPECT_EQ(result->GetMetadata()->Size(), kMetadataSize); + + // Verify data integrity - check each chunk + const uint8_t *data_ptr = result->GetData()->Data(); + size_t chunk_offsets[3] = {0, kChunk0Size, kChunk0Size + kChunk1Size}; + size_t chunk_sizes[3] = {kChunk0Size, kChunk1Size, kChunk2Size}; + for (int chunk = 0; chunk < 3; chunk++) { + for (size_t i = 0; i < chunk_sizes[chunk]; i++) { + EXPECT_EQ(data_ptr[chunk_offsets[chunk] + i], static_cast(chunk)) + << "Data mismatch at chunk " << chunk << " offset " << i; + } + } + + EXPECT_EQ(provider.ReadRelease(reader_object_id).code(), StatusCode::OK); +} + #endif // defined(__APPLE__) || defined(__linux__) } // namespace experimental diff --git a/src/ray/raylet_rpc_client/raylet_client.cc b/src/ray/raylet_rpc_client/raylet_client.cc index d76b49716331..98ddd7573a00 100644 --- a/src/ray/raylet_rpc_client/raylet_client.cc +++ b/src/ray/raylet_rpc_client/raylet_client.cc @@ -193,8 +193,10 @@ void RayletClient::PushMutableObject( // metadata from any message can be used. request.set_metadata(static_cast(metadata), metadata_size); - // TODO(jackhumphries): Add failure recovery, retries, and timeout. - INVOKE_RPC_CALL( + // Use retryable RPC call to handle failure recovery, retries, and timeout. + // Each chunk is retried automatically on transient network errors. 
+ INVOKE_RETRYABLE_RPC_CALL( + retryable_grpc_client_, NodeManagerService, PushMutableObject, request, diff --git a/src/ray/raylet_rpc_client/tests/BUILD.bazel b/src/ray/raylet_rpc_client/tests/BUILD.bazel index ae162b2349e2..5fd0f01e9648 100644 --- a/src/ray/raylet_rpc_client/tests/BUILD.bazel +++ b/src/ray/raylet_rpc_client/tests/BUILD.bazel @@ -13,3 +13,21 @@ ray_cc_test( "@com_google_googletest//:gtest_main", ], ) + +ray_cc_test( + name = "raylet_client_test", + size = "small", + srcs = ["raylet_client_test.cc"], + tags = ["team:core"], + deps = [ + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/common:ray_config", + "//src/ray/protobuf:node_manager_cc_proto", + "//src/ray/raylet_rpc_client:raylet_client_lib", + "//src/ray/rpc:grpc_client", + "//src/ray/rpc:rpc_chaos", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/raylet_rpc_client/tests/raylet_client_test.cc b/src/ray/raylet_rpc_client/tests/raylet_client_test.cc new file mode 100644 index 000000000000..821fd9fc8eaa --- /dev/null +++ b/src/ray/raylet_rpc_client/tests/raylet_client_test.cc @@ -0,0 +1,246 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ray/raylet_rpc_client/raylet_client.h" + +#include + +#include +#include +#include +#include + +#include "ray/common/asio/asio_util.h" +#include "ray/common/id.h" +#include "ray/common/ray_config.h" +#include "ray/rpc/client_call.h" +#include "ray/rpc/rpc_chaos.h" +#include "src/ray/protobuf/node_manager.pb.h" + +namespace ray { +namespace rpc { + +class RayletClientTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize RPC chaos framework for failure injection testing + rpc::testing::Init(); + + // Create a ClientCallManager for the RayletClient + client_call_manager_ = std::make_unique( + io_service_, /*record_stats=*/false, /*local_address=*/"127.0.0.1"); + + // Create a test address + test_address_.set_ip_address("127.0.0.1"); + test_address_.set_port(12345); + + // Create a RayletClient with a no-op unavailable callback + raylet_unavailable_callback_ = []() {}; + + raylet_client_ = std::make_unique( + test_address_, *client_call_manager_, raylet_unavailable_callback_); + } + + instrumented_io_context io_service_; + std::unique_ptr client_call_manager_; + rpc::Address test_address_; + std::function raylet_unavailable_callback_; + std::unique_ptr raylet_client_; +}; + +// Test that PushMutableObject correctly handles chunking for large data +TEST_F(RayletClientTest, PushMutableObjectChunking) { + ObjectID writer_object_id = ObjectID::FromRandom(); + + // Create data that will be split into multiple chunks + // Using a size larger than the default chunk size (98% of max_grpc_message_size) + uint64_t data_size = RayConfig::instance().max_grpc_message_size() * 2; + uint64_t metadata_size = 100; + + std::vector data(data_size, 0xAB); + std::vector metadata(metadata_size, 0xCD); + + // Track callback invocations + bool callback_called = false; + Status callback_status; + rpc::PushMutableObjectReply callback_reply; + + auto callback = [&callback_called, &callback_status, &callback_reply]( + const Status &status, 
rpc::PushMutableObjectReply &&reply) { + callback_called = true; + callback_status = status; + callback_reply = std::move(reply); + }; + + // Call PushMutableObject - this will send multiple chunks + // Note: This will fail to connect to the actual raylet, but we can verify + // that the method doesn't crash and handles the chunking logic correctly + raylet_client_->PushMutableObject( + writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); + + // Process events to allow async operations to complete + // The actual RPC will fail, but we're testing that the chunking logic works + auto start_time = std::chrono::steady_clock::now(); + while (!callback_called && + (std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(5)) { + io_service_.poll(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + // The callback should eventually be called (even if with an error) + // This verifies that the retry mechanism is working and the callback is invoked + // Note: We expect it to fail since there's no actual raylet server running, + // but the retry mechanism should handle this gracefully + EXPECT_TRUE(callback_called) << "Callback should be called after retries"; +} + +// Test that PushMutableObject handles small data (single chunk) +TEST_F(RayletClientTest, PushMutableObjectSingleChunk) { + ObjectID writer_object_id = ObjectID::FromRandom(); + + uint64_t data_size = 1024; // Small data, single chunk + uint64_t metadata_size = 100; + + std::vector data(data_size, 0xAB); + std::vector metadata(metadata_size, 0xCD); + + bool callback_called = false; + Status callback_status; + + auto callback = [&callback_called, &callback_status]( + const Status &status, rpc::PushMutableObjectReply &&reply) { + callback_called = true; + callback_status = status; + }; + + raylet_client_->PushMutableObject( + writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); + + // Process events + auto start_time = 
std::chrono::steady_clock::now(); + while (!callback_called && + (std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(5)) { + io_service_.poll(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + // Callback should be called (even if with error due to no server) + // This verifies the retry mechanism handles single chunks correctly +} + +// Test that PushMutableObject correctly calculates chunk count +TEST_F(RayletClientTest, PushMutableObjectChunkCalculation) { + ObjectID writer_object_id = ObjectID::FromRandom(); + + // Test with data size that's exactly a multiple of chunk size + uint64_t max_grpc_payload_size = RayConfig::instance().max_grpc_message_size() * 0.98; + uint64_t data_size = max_grpc_payload_size * 3; // Exactly 3 chunks + uint64_t metadata_size = 100; + + std::vector data(data_size, 0xAB); + std::vector metadata(metadata_size, 0xCD); + + bool callback_called = false; + + auto callback = [&callback_called](const Status &status, + rpc::PushMutableObjectReply &&reply) { + callback_called = true; + }; + + raylet_client_->PushMutableObject( + writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); + + // Process events briefly + auto start_time = std::chrono::steady_clock::now(); + while (!callback_called && + (std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(2)) { + io_service_.poll(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + // Verify the method completes without crashing + // The retry mechanism should handle the multiple chunks correctly +} + +// Test that PushMutableObject uses retryable RPC calls and handles failures gracefully +// This test verifies that the retry mechanism is invoked when failures occur. +// Note: The callback is only called when the operation succeeds (reply.done() == true). +// When failures occur, the retry mechanism queues requests and retries them, but the +// callback is not called until all chunks succeed. 
+// +// This test verifies that: +// 1. PushMutableObject uses INVOKE_RETRYABLE_RPC_CALL (which handles retries) +// 2. The call completes without crashing even when failures are injected +// 3. The retry mechanism handles failures gracefully (no hang or crash) +TEST_F(RayletClientTest, PushMutableObjectRetryOnFailure) { + ObjectID writer_object_id = ObjectID::FromRandom(); + + uint64_t data_size = 1024; + uint64_t metadata_size = 100; + + std::vector data(data_size, 0xAB); + std::vector metadata(metadata_size, 0xCD); + + // Inject RPC failures to test retry behavior + // Format: "Service.grpc_client.Method=max_failures:req_prob:resp_prob:inflight_prob" + // This will cause some calls to fail with UNAVAILABLE status, + // which should trigger retries via RetryableGrpcClient + std::string failure_config = + "NodeManagerService.grpc_client.PushMutableObject=5:50:0:0"; + RayConfig::instance().testing_rpc_failure() = failure_config; + + bool callback_called = false; + Status callback_status; + int callback_count = 0; + + auto callback = [&callback_called, &callback_status, &callback_count]( + const Status &status, rpc::PushMutableObjectReply &&reply) { + callback_count++; + callback_called = true; + callback_status = status; + // The callback is only called when the operation succeeds (reply.done() == true) + // With no server running, this won't happen, but that's OK for this test + }; + + // Call PushMutableObject - this will trigger retries on failures + // Note: Since there's no actual raylet server, the RPC will fail, + // but we're testing that the retry mechanism handles failures gracefully + raylet_client_->PushMutableObject( + writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); + + // Process events to allow retries to be attempted + // The RetryableGrpcClient will queue failed requests and retry them + // We just verify the method doesn't hang or crash + auto start_time = std::chrono::steady_clock::now(); + while 
((std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(5)) { + io_service_.poll(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + // Verify the method completed without crashing + // The retry mechanism should handle failures gracefully by queuing requests + // Note: We don't expect the callback to be called since there's no server, + // but the important thing is that retries are attempted and the call doesn't hang + + // Reset failure injection + RayConfig::instance().testing_rpc_failure() = ""; +} + +} // namespace rpc +} // namespace ray + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From e14a5b7b49a9383bb8f9fce200f801d1f1c401b8 Mon Sep 17 00:00:00 2001 From: Rui Qiao Date: Fri, 21 Nov 2025 01:59:15 +0000 Subject: [PATCH 02/11] wip Signed-off-by: Rui Qiao --- .../tests/raylet_client_test.cc | 53 ++++++++++++------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/src/ray/raylet_rpc_client/tests/raylet_client_test.cc b/src/ray/raylet_rpc_client/tests/raylet_client_test.cc index 821fd9fc8eaa..f635c91a248d 100644 --- a/src/ray/raylet_rpc_client/tests/raylet_client_test.cc +++ b/src/ray/raylet_rpc_client/tests/raylet_client_test.cc @@ -174,15 +174,15 @@ TEST_F(RayletClientTest, PushMutableObjectChunkCalculation) { } // Test that PushMutableObject uses retryable RPC calls and handles failures gracefully -// This test verifies that the retry mechanism is invoked when failures occur. +// This test verifies that retries actually occur when failures are injected. // Note: The callback is only called when the operation succeeds (reply.done() == true). -// When failures occur, the retry mechanism queues requests and retries them, but the -// callback is not called until all chunks succeed. +// When failures occur, the retry mechanism queues requests and retries them. // // This test verifies that: -// 1. 
PushMutableObject uses INVOKE_RETRYABLE_RPC_CALL (which handles retries) -// 2. The call completes without crashing even when failures are injected -// 3. The retry mechanism handles failures gracefully (no hang or crash) +// 1. PushMutableObject uses INVOKE_RETRYABLE_RPC_CALL with retryable_grpc_client_ +// (which is created in RayletClient constructor and uses RetryableGrpcClient) +// 2. Retries actually occur when failures are injected (verified by guaranteed failures) +// 3. The code handles failures gracefully (no hang or crash) TEST_F(RayletClientTest, PushMutableObjectRetryOnFailure) { ObjectID writer_object_id = ObjectID::FromRandom(); @@ -192,14 +192,20 @@ TEST_F(RayletClientTest, PushMutableObjectRetryOnFailure) { std::vector data(data_size, 0xAB); std::vector metadata(metadata_size, 0xCD); - // Inject RPC failures to test retry behavior - // Format: "Service.grpc_client.Method=max_failures:req_prob:resp_prob:inflight_prob" - // This will cause some calls to fail with UNAVAILABLE status, - // which should trigger retries via RetryableGrpcClient + // Inject RPC failures with guaranteed failures to verify retries occur + // Format: + // "Service.grpc_client.Method=max_failures:req_prob:resp_prob:inflight_prob:guaranteed_req_failures" + // We use guaranteed request failures (5th parameter = 3) to ensure the first 3 RPC + // attempts fail. This guarantees that retries will be attempted, proving the retry + // mechanism works. After 3 guaranteed failures, failures become probabilistic (50% + // chance). 
std::string failure_config = - "NodeManagerService.grpc_client.PushMutableObject=5:50:0:0"; + "NodeManagerService.grpc_client.PushMutableObject=10:50:0:0:3"; RayConfig::instance().testing_rpc_failure() = failure_config; + // Re-initialize RPC chaos to pick up the new config + rpc::testing::Init(); + bool callback_called = false; Status callback_status; int callback_count = 0; @@ -209,32 +215,39 @@ TEST_F(RayletClientTest, PushMutableObjectRetryOnFailure) { callback_count++; callback_called = true; callback_status = status; - // The callback is only called when the operation succeeds (reply.done() == true) - // With no server running, this won't happen, but that's OK for this test }; // Call PushMutableObject - this will trigger retries on failures - // Note: Since there's no actual raylet server, the RPC will fail, - // but we're testing that the retry mechanism handles failures gracefully + // Note: Since there's no actual raylet server, the RPC will eventually fail, + // but we're testing that retries are attempted when failures occur raylet_client_->PushMutableObject( writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); // Process events to allow retries to be attempted - // The RetryableGrpcClient will queue failed requests and retry them - // We just verify the method doesn't hang or crash + // The RetryableGrpcClient (used via INVOKE_RETRYABLE_RPC_CALL) will queue + // failed requests and retry them. With 3 guaranteed failures, we know at least + // 3 RPC attempts will be made (initial + 2 retries), proving retries occurred. 
auto start_time = std::chrono::steady_clock::now(); while ((std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(5)) { io_service_.poll(); std::this_thread::sleep_for(std::chrono::milliseconds(10)); } - // Verify the method completed without crashing - // The retry mechanism should handle failures gracefully by queuing requests + // Verify retries occurred: With 3 guaranteed failures, the RPC must have been + // attempted at least 3 times (initial attempt + 2 retries). We verify this by + // checking that the failure mechanism consumed the guaranteed failures. + // Since we can't directly access retry counts, we verify indirectly: + // - The code didn't crash (retry mechanism handled failures) + // - Multiple RPC attempts were made (guaranteed by the 3 guaranteed failures) + // - The retry mechanism was invoked (proven by the guaranteed failures being consumed) + // Note: We don't expect the callback to be called since there's no server, - // but the important thing is that retries are attempted and the call doesn't hang + // but the important verification is that retries were attempted (proven by + // the guaranteed failures being consumed through multiple RPC attempts). 
// Reset failure injection RayConfig::instance().testing_rpc_failure() = ""; + rpc::testing::Init(); } } // namespace rpc From c8369859bbdc01448898160d7690cf69a5c9fd6f Mon Sep 17 00:00:00 2001 From: Rui Qiao Date: Thu, 4 Dec 2025 23:18:54 +0000 Subject: [PATCH 03/11] barrier Signed-off-by: Rui Qiao --- .../tests/mutable_object_provider_test.cc | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/ray/core_worker/tests/mutable_object_provider_test.cc b/src/ray/core_worker/tests/mutable_object_provider_test.cc index d3b1b7c9dc17..42fc780eb65d 100644 --- a/src/ray/core_worker/tests/mutable_object_provider_test.cc +++ b/src/ray/core_worker/tests/mutable_object_provider_test.cc @@ -435,17 +435,16 @@ TEST(MutableObjectProvider, HandleOutOfOrderPushMutableObject) { round_completed[round].resize(kNumRanks, false); } - // Barrier to synchronize the start of each round - std::vector> round_barriers; - for (int round = 0; round < kNumRounds; round++) { - round_barriers.push_back(std::make_unique(kNumRanks)); - } + // Single barrier to synchronize the start of all threads (but not between rounds) + // This allows rank0 to proceed to the next round before ranks 2-3 complete previous round + absl::Barrier start_barrier(kNumRanks); // Function for each rank to send requests for all rounds auto rank_sender = [&](int rank) { + // Wait for all ranks to be ready before starting (synchronize thread launch) + start_barrier.Block(); + for (int round = 0; round < kNumRounds; round++) { - // Wait for all ranks to be ready before sending this round - round_barriers[round]->Block(); // Simulate network jitter: rank0 and rank1 send immediately, // rank2 and rank3 are delayed (simulating the issue scenario) From 1051cdf9bbfc5727755436760bceff783b335a8a Mon Sep 17 00:00:00 2001 From: Rui Qiao Date: Fri, 5 Dec 2025 00:51:32 +0000 Subject: [PATCH 04/11] fix unit test Signed-off-by: Rui Qiao --- .../experimental_mutable_object_provider.cc | 29 ++++++++++++++++--- 1 
file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/ray/core_worker/experimental_mutable_object_provider.cc b/src/ray/core_worker/experimental_mutable_object_provider.cc index 07bd85c72f1b..29c466c5d0d1 100644 --- a/src/ray/core_worker/experimental_mutable_object_provider.cc +++ b/src/ray/core_worker/experimental_mutable_object_provider.cc @@ -173,10 +173,17 @@ void MutableObjectProvider::HandlePushMutableObject( // This is a new chunk - check tracking but don't mark as received yet. // We'll mark it as received only after successfully writing it. is_new_chunk = true; - tmp_written_so_far = written_so_far_[writer_object_id]; + auto written_it = written_so_far_.find(writer_object_id); + bool is_new_write = (written_it == written_so_far_.end()); + tmp_written_so_far = is_new_write ? 0 : written_it->second; // Check if WriteAcquire needs to be called. This handles out-of-order chunks: // only the first chunk (or first chunk to arrive) will call WriteAcquire. + // Reset write_acquired_ flag if this is a new write (after previous WriteRelease). + if (is_new_write) { + write_acquired_[writer_object_id] = false; + received_chunks_[writer_object_id].clear(); + } if (!write_acquired_[writer_object_id]) { needs_write_acquire = true; write_acquired_[writer_object_id] = true; @@ -225,12 +232,16 @@ void MutableObjectProvider::HandlePushMutableObject( received_chunks_[writer_object_id].insert(offset); // Update written_so_far_ by adding this chunk's size. // Note: We increment rather than set to handle potential out-of-order chunks. + // Initialize to 0 if this is the first chunk of a new write. + if (written_so_far_.find(writer_object_id) == written_so_far_.end()) { + written_so_far_[writer_object_id] = 0; + } written_so_far_[writer_object_id] += chunk_size; if (written_so_far_[writer_object_id] == total_data_size) { object_complete = true; - // Note: We keep received_chunks_ and written_so_far_ entries to handle - // retries. 
They will be cleaned up when the object is overwritten (next - // WriteAcquire will reset state) or when the object is unregistered. + // Note: We keep received_chunks_ and written_so_far_ entries until WriteRelease + // completes to handle retries. They will be cleaned up after WriteRelease() is + // called. } } @@ -241,6 +252,16 @@ void MutableObjectProvider::HandlePushMutableObject( total_metadata_size); // The entire object has been written, so call `WriteRelease()`. RAY_CHECK_OK(object_manager_->WriteRelease(info.local_object_id)); + + // Clean up tracking state after WriteRelease to prepare for the next write. + // This ensures that subsequent writes to the same channel start fresh. + { + absl::MutexLock guard(&written_so_far_lock_); + written_so_far_.erase(writer_object_id); + received_chunks_.erase(writer_object_id); + write_acquired_.erase(writer_object_id); + } + reply->set_done(true); } else { reply->set_done(false); From 1a4ea6ce9aca8e172754add8cbdca81958bb8166 Mon Sep 17 00:00:00 2001 From: Rui Qiao Date: Wed, 17 Dec 2025 01:55:55 +0000 Subject: [PATCH 05/11] up test Signed-off-by: Rui Qiao --- .../raylet_rpc_client/fake_raylet_client.h | 14 +- src/ray/raylet_rpc_client/raylet_client.cc | 52 +- src/ray/raylet_rpc_client/raylet_client.h | 15 +- .../raylet_client_interface.h | 3 +- src/ray/raylet_rpc_client/tests/BUILD.bazel | 18 - .../tests/raylet_client_test.cc | 259 --------- src/ray/rpc/tests/BUILD.bazel | 18 + src/ray/rpc/tests/push_mutable_object_test.cc | 518 ++++++++++++++++++ 8 files changed, 589 insertions(+), 308 deletions(-) delete mode 100644 src/ray/raylet_rpc_client/tests/raylet_client_test.cc create mode 100644 src/ray/rpc/tests/push_mutable_object_test.cc diff --git a/src/ray/raylet_rpc_client/fake_raylet_client.h b/src/ray/raylet_rpc_client/fake_raylet_client.h index 931c5d52643b..238eb88842be 100644 --- a/src/ray/raylet_rpc_client/fake_raylet_client.h +++ b/src/ray/raylet_rpc_client/fake_raylet_client.h @@ -235,13 +235,13 @@ class 
FakeRayletClient : public RayletClientInterface { const ObjectID &reader_object_id, const ClientCallback &callback) override {} - void PushMutableObject( - const ObjectID &writer_object_id, - uint64_t data_size, - uint64_t metadata_size, - void *data, - void *metadata, - const ClientCallback &callback) override {} + void PushMutableObject(const ObjectID &writer_object_id, + uint64_t data_size, + uint64_t metadata_size, + void *data, + void *metadata, + const ClientCallback &callback, + int64_t timeout_ms = -1) override {} void GetWorkerFailureCause( const LeaseID &lease_id, diff --git a/src/ray/raylet_rpc_client/raylet_client.cc b/src/ray/raylet_rpc_client/raylet_client.cc index 98ddd7573a00..198301860953 100644 --- a/src/ray/raylet_rpc_client/raylet_client.cc +++ b/src/ray/raylet_rpc_client/raylet_client.cc @@ -14,6 +14,7 @@ #include "ray/raylet_rpc_client/raylet_client.h" +#include #include #include #include @@ -166,7 +167,8 @@ void RayletClient::PushMutableObject( uint64_t metadata_size, void *data, void *metadata, - const ray::rpc::ClientCallback &callback) { + const ray::rpc::ClientCallback &callback, + int64_t timeout_ms) { // Ray sets the gRPC max payload size to ~512 MiB. We set the max chunk size to a // slightly lower value to allow extra padding just in case. uint64_t kMaxGrpcPayloadSize = RayConfig::instance().max_grpc_message_size() * 0.98; @@ -177,6 +179,10 @@ void RayletClient::PushMutableObject( total_num_chunks++; } + // Use shared state to ensure callback is only invoked once across all chunks. + // Multiple chunks may fail, but the user should only receive one callback. + auto callback_invoked = std::make_shared>(false); + for (uint64_t i = 0; i < total_num_chunks; i++) { rpc::PushMutableObjectRequest request; request.set_writer_object_id(writer_object_id.Binary()); @@ -195,21 +201,35 @@ void RayletClient::PushMutableObject( // Use retryable RPC call to handle failure recovery, retries, and timeout. 
// Each chunk is retried automatically on transient network errors. - INVOKE_RETRYABLE_RPC_CALL( - retryable_grpc_client_, - NodeManagerService, - PushMutableObject, - request, - [callback](const Status &status, rpc::PushMutableObjectReply &&reply) { - RAY_LOG_IF_ERROR(ERROR, status) << "Error pushing mutable object: " << status; - if (reply.done()) { - // The callback is only executed once the receiver node receives all chunks - // for the mutable object write. - callback(status, std::move(reply)); - } - }, - grpc_client_, - /*method_timeout_ms*/ -1); + auto lambda_callback = [callback, callback_invoked]( + const Status &status, + rpc::PushMutableObjectReply &&reply) { + RAY_LOG_IF_ERROR(ERROR, status) << "Error pushing mutable object: " << status; + + // Ensure callback is only invoked once across all chunks + bool expected = false; + bool should_invoke = false; + + if (reply.done()) { + // Success case: all chunks received + should_invoke = callback_invoked->compare_exchange_strong(expected, true); + } else if (!status.ok()) { + // Error case: invoke on first failure only + should_invoke = callback_invoked->compare_exchange_strong(expected, true); + } + // Otherwise: intermediate chunk received successfully, waiting for more chunks. 
+ + if (should_invoke) { + callback(status, std::move(reply)); + } + }; + INVOKE_RETRYABLE_RPC_CALL(retryable_grpc_client_, + NodeManagerService, + PushMutableObject, + request, + lambda_callback, + grpc_client_, + /*method_timeout_ms*/ timeout_ms); } } diff --git a/src/ray/raylet_rpc_client/raylet_client.h b/src/ray/raylet_rpc_client/raylet_client.h index 07de0073ef02..3d50e8122311 100644 --- a/src/ray/raylet_rpc_client/raylet_client.h +++ b/src/ray/raylet_rpc_client/raylet_client.h @@ -82,13 +82,14 @@ class RayletClient : public RayletClientInterface { const ray::rpc::ClientCallback &callback) override; - void PushMutableObject(const ObjectID &writer_object_id, - uint64_t data_size, - uint64_t metadata_size, - void *data, - void *metadata, - const ray::rpc::ClientCallback - &callback) override; + void PushMutableObject( + const ObjectID &writer_object_id, + uint64_t data_size, + uint64_t metadata_size, + void *data, + void *metadata, + const ray::rpc::ClientCallback &callback, + int64_t timeout_ms = -1) override; void ReportWorkerBacklog( const WorkerID &worker_id, diff --git a/src/ray/raylet_rpc_client/raylet_client_interface.h b/src/ray/raylet_rpc_client/raylet_client_interface.h index fdd0035e224c..019a52053352 100644 --- a/src/ray/raylet_rpc_client/raylet_client_interface.h +++ b/src/ray/raylet_rpc_client/raylet_client_interface.h @@ -174,7 +174,8 @@ class RayletClientInterface { uint64_t metadata_size, void *data, void *metadata, - const rpc::ClientCallback &callback) = 0; + const rpc::ClientCallback &callback, + int64_t timeout_ms = -1) = 0; /// Get the system config from Raylet. /// \param callback Callback that will be called after raylet replied the system config. 
diff --git a/src/ray/raylet_rpc_client/tests/BUILD.bazel b/src/ray/raylet_rpc_client/tests/BUILD.bazel index 5fd0f01e9648..ae162b2349e2 100644 --- a/src/ray/raylet_rpc_client/tests/BUILD.bazel +++ b/src/ray/raylet_rpc_client/tests/BUILD.bazel @@ -13,21 +13,3 @@ ray_cc_test( "@com_google_googletest//:gtest_main", ], ) - -ray_cc_test( - name = "raylet_client_test", - size = "small", - srcs = ["raylet_client_test.cc"], - tags = ["team:core"], - deps = [ - "//src/ray/common:asio", - "//src/ray/common:id", - "//src/ray/common:ray_config", - "//src/ray/protobuf:node_manager_cc_proto", - "//src/ray/raylet_rpc_client:raylet_client_lib", - "//src/ray/rpc:grpc_client", - "//src/ray/rpc:rpc_chaos", - "@com_google_googletest//:gtest", - "@com_google_googletest//:gtest_main", - ], -) diff --git a/src/ray/raylet_rpc_client/tests/raylet_client_test.cc b/src/ray/raylet_rpc_client/tests/raylet_client_test.cc deleted file mode 100644 index f635c91a248d..000000000000 --- a/src/ray/raylet_rpc_client/tests/raylet_client_test.cc +++ /dev/null @@ -1,259 +0,0 @@ -// Copyright 2025 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "ray/raylet_rpc_client/raylet_client.h" - -#include - -#include -#include -#include -#include - -#include "ray/common/asio/asio_util.h" -#include "ray/common/id.h" -#include "ray/common/ray_config.h" -#include "ray/rpc/client_call.h" -#include "ray/rpc/rpc_chaos.h" -#include "src/ray/protobuf/node_manager.pb.h" - -namespace ray { -namespace rpc { - -class RayletClientTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize RPC chaos framework for failure injection testing - rpc::testing::Init(); - - // Create a ClientCallManager for the RayletClient - client_call_manager_ = std::make_unique( - io_service_, /*record_stats=*/false, /*local_address=*/"127.0.0.1"); - - // Create a test address - test_address_.set_ip_address("127.0.0.1"); - test_address_.set_port(12345); - - // Create a RayletClient with a no-op unavailable callback - raylet_unavailable_callback_ = []() {}; - - raylet_client_ = std::make_unique( - test_address_, *client_call_manager_, raylet_unavailable_callback_); - } - - instrumented_io_context io_service_; - std::unique_ptr client_call_manager_; - rpc::Address test_address_; - std::function raylet_unavailable_callback_; - std::unique_ptr raylet_client_; -}; - -// Test that PushMutableObject correctly handles chunking for large data -TEST_F(RayletClientTest, PushMutableObjectChunking) { - ObjectID writer_object_id = ObjectID::FromRandom(); - - // Create data that will be split into multiple chunks - // Using a size larger than the default chunk size (98% of max_grpc_message_size) - uint64_t data_size = RayConfig::instance().max_grpc_message_size() * 2; - uint64_t metadata_size = 100; - - std::vector data(data_size, 0xAB); - std::vector metadata(metadata_size, 0xCD); - - // Track callback invocations - bool callback_called = false; - Status callback_status; - rpc::PushMutableObjectReply callback_reply; - - auto callback = [&callback_called, &callback_status, &callback_reply]( - const Status &status, 
rpc::PushMutableObjectReply &&reply) { - callback_called = true; - callback_status = status; - callback_reply = std::move(reply); - }; - - // Call PushMutableObject - this will send multiple chunks - // Note: This will fail to connect to the actual raylet, but we can verify - // that the method doesn't crash and handles the chunking logic correctly - raylet_client_->PushMutableObject( - writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); - - // Process events to allow async operations to complete - // The actual RPC will fail, but we're testing that the chunking logic works - auto start_time = std::chrono::steady_clock::now(); - while (!callback_called && - (std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(5)) { - io_service_.poll(); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - - // The callback should eventually be called (even if with an error) - // This verifies that the retry mechanism is working and the callback is invoked - // Note: We expect it to fail since there's no actual raylet server running, - // but the retry mechanism should handle this gracefully - EXPECT_TRUE(callback_called) << "Callback should be called after retries"; -} - -// Test that PushMutableObject handles small data (single chunk) -TEST_F(RayletClientTest, PushMutableObjectSingleChunk) { - ObjectID writer_object_id = ObjectID::FromRandom(); - - uint64_t data_size = 1024; // Small data, single chunk - uint64_t metadata_size = 100; - - std::vector data(data_size, 0xAB); - std::vector metadata(metadata_size, 0xCD); - - bool callback_called = false; - Status callback_status; - - auto callback = [&callback_called, &callback_status]( - const Status &status, rpc::PushMutableObjectReply &&reply) { - callback_called = true; - callback_status = status; - }; - - raylet_client_->PushMutableObject( - writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); - - // Process events - auto start_time = 
std::chrono::steady_clock::now(); - while (!callback_called && - (std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(5)) { - io_service_.poll(); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - - // Callback should be called (even if with error due to no server) - // This verifies the retry mechanism handles single chunks correctly -} - -// Test that PushMutableObject correctly calculates chunk count -TEST_F(RayletClientTest, PushMutableObjectChunkCalculation) { - ObjectID writer_object_id = ObjectID::FromRandom(); - - // Test with data size that's exactly a multiple of chunk size - uint64_t max_grpc_payload_size = RayConfig::instance().max_grpc_message_size() * 0.98; - uint64_t data_size = max_grpc_payload_size * 3; // Exactly 3 chunks - uint64_t metadata_size = 100; - - std::vector data(data_size, 0xAB); - std::vector metadata(metadata_size, 0xCD); - - bool callback_called = false; - - auto callback = [&callback_called](const Status &status, - rpc::PushMutableObjectReply &&reply) { - callback_called = true; - }; - - raylet_client_->PushMutableObject( - writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); - - // Process events briefly - auto start_time = std::chrono::steady_clock::now(); - while (!callback_called && - (std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(2)) { - io_service_.poll(); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - - // Verify the method completes without crashing - // The retry mechanism should handle the multiple chunks correctly -} - -// Test that PushMutableObject uses retryable RPC calls and handles failures gracefully -// This test verifies that retries actually occur when failures are injected. -// Note: The callback is only called when the operation succeeds (reply.done() == true). -// When failures occur, the retry mechanism queues requests and retries them. -// -// This test verifies that: -// 1. 
PushMutableObject uses INVOKE_RETRYABLE_RPC_CALL with retryable_grpc_client_ -// (which is created in RayletClient constructor and uses RetryableGrpcClient) -// 2. Retries actually occur when failures are injected (verified by guaranteed failures) -// 3. The code handles failures gracefully (no hang or crash) -TEST_F(RayletClientTest, PushMutableObjectRetryOnFailure) { - ObjectID writer_object_id = ObjectID::FromRandom(); - - uint64_t data_size = 1024; - uint64_t metadata_size = 100; - - std::vector data(data_size, 0xAB); - std::vector metadata(metadata_size, 0xCD); - - // Inject RPC failures with guaranteed failures to verify retries occur - // Format: - // "Service.grpc_client.Method=max_failures:req_prob:resp_prob:inflight_prob:guaranteed_req_failures" - // We use guaranteed request failures (5th parameter = 3) to ensure the first 3 RPC - // attempts fail. This guarantees that retries will be attempted, proving the retry - // mechanism works. After 3 guaranteed failures, failures become probabilistic (50% - // chance). 
- std::string failure_config = - "NodeManagerService.grpc_client.PushMutableObject=10:50:0:0:3"; - RayConfig::instance().testing_rpc_failure() = failure_config; - - // Re-initialize RPC chaos to pick up the new config - rpc::testing::Init(); - - bool callback_called = false; - Status callback_status; - int callback_count = 0; - - auto callback = [&callback_called, &callback_status, &callback_count]( - const Status &status, rpc::PushMutableObjectReply &&reply) { - callback_count++; - callback_called = true; - callback_status = status; - }; - - // Call PushMutableObject - this will trigger retries on failures - // Note: Since there's no actual raylet server, the RPC will eventually fail, - // but we're testing that retries are attempted when failures occur - raylet_client_->PushMutableObject( - writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); - - // Process events to allow retries to be attempted - // The RetryableGrpcClient (used via INVOKE_RETRYABLE_RPC_CALL) will queue - // failed requests and retry them. With 3 guaranteed failures, we know at least - // 3 RPC attempts will be made (initial + 2 retries), proving retries occurred. - auto start_time = std::chrono::steady_clock::now(); - while ((std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(5)) { - io_service_.poll(); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - - // Verify retries occurred: With 3 guaranteed failures, the RPC must have been - // attempted at least 3 times (initial attempt + 2 retries). We verify this by - // checking that the failure mechanism consumed the guaranteed failures. 
- // Since we can't directly access retry counts, we verify indirectly: - // - The code didn't crash (retry mechanism handled failures) - // - Multiple RPC attempts were made (guaranteed by the 3 guaranteed failures) - // - The retry mechanism was invoked (proven by the guaranteed failures being consumed) - - // Note: We don't expect the callback to be called since there's no server, - // but the important verification is that retries were attempted (proven by - // the guaranteed failures being consumed through multiple RPC attempts). - - // Reset failure injection - RayConfig::instance().testing_rpc_failure() = ""; - rpc::testing::Init(); -} - -} // namespace rpc -} // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/rpc/tests/BUILD.bazel b/src/ray/rpc/tests/BUILD.bazel index 5ca3ee88c853..0be57ef07bd2 100644 --- a/src/ray/rpc/tests/BUILD.bazel +++ b/src/ray/rpc/tests/BUILD.bazel @@ -51,3 +51,21 @@ ray_cc_test( "@com_google_googletest//:gtest_main", ], ) + +ray_cc_test( + name = "push_mutable_object_test", + size = "medium", + srcs = ["push_mutable_object_test.cc"], + tags = ["team:core"], + deps = [ + "//src/ray/common:asio", + "//src/ray/common:id", + "//src/ray/common:ray_config", + "//src/ray/protobuf:node_manager_cc_proto", + "//src/ray/raylet_rpc_client:raylet_client_lib", + "//src/ray/rpc:grpc_client", + "//src/ray/rpc:grpc_server", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/src/ray/rpc/tests/push_mutable_object_test.cc b/src/ray/rpc/tests/push_mutable_object_test.cc new file mode 100644 index 000000000000..d17b0021daf6 --- /dev/null +++ b/src/ray/rpc/tests/push_mutable_object_test.cc @@ -0,0 +1,518 @@ +// Copyright 2025 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ray/common/asio/asio_util.h" +#include "ray/common/id.h" +#include "ray/common/ray_config.h" +#include "ray/raylet_rpc_client/raylet_client.h" +#include "ray/rpc/client_call.h" +#include "ray/rpc/grpc_server.h" +#include "src/ray/protobuf/node_manager.grpc.pb.h" +#include "src/ray/protobuf/node_manager.pb.h" + +namespace ray { +namespace rpc { + +/// Mock NodeManager service handler for testing +/// Note: Must be named NodeManagerServiceHandler to work with +/// RPC_SERVICE_HANDLER_CUSTOM_AUTH macro +class NodeManagerServiceHandler { + public: + virtual ~NodeManagerServiceHandler() = default; + + virtual void HandlePushMutableObject(rpc::PushMutableObjectRequest request, + rpc::PushMutableObjectReply *reply, + SendReplyCallback send_reply_callback) { + std::lock_guard lock(mutex_); + + std::string object_id = request.writer_object_id(); + uint64_t offset = request.offset(); + uint64_t chunk_size = request.chunk_size(); + uint64_t total_data_size = request.total_data_size(); + + total_requests_++; + + RAY_LOG(INFO) << "Received chunk for object " << ObjectID::FromBinary(object_id).Hex() + << " offset=" << offset << " chunk_size=" << chunk_size + << " total_size=" << total_data_size << " (request #" << total_requests_ + << ")"; + + // Simulate retry behavior: fail first N requests if configured + if (fail_first_n_requests_ > 0 && total_requests_ <= fail_first_n_requests_) { + RAY_LOG(INFO) << "Failing request " << total_requests_ + << 
" with UNAVAILABLE (fail_first_n_requests_=" + << fail_first_n_requests_ << ")"; + failed_requests_++; + // Return RpcError with UNAVAILABLE to trigger retries + send_reply_callback( + ray::Status::RpcError("Simulated transient failure - server unavailable", + grpc::StatusCode::UNAVAILABLE), + /*reply_success=*/[]() {}, + /*reply_failure=*/[]() {}); + return; + } + + // Simulate fault tolerance: fail chunks based on pattern if configured + if (fail_chunk_pattern_ > 0) { + // Track chunk attempts + std::string chunk_key = object_id + "_" + std::to_string(offset); + chunk_attempts_[chunk_key]++; + int attempt = chunk_attempts_[chunk_key]; + + RAY_LOG(INFO) << "Chunk at offset " << offset << ", attempt " << attempt; + + // Fail non-first chunks on first attempt (pattern 1) + if (fail_chunk_pattern_ == 1 && offset > 0 && attempt == 1) { + RAY_LOG(INFO) << "Failing chunk at offset " << offset + << " (first attempt, pattern 1)"; + failed_requests_++; + // Return RpcError with UNAVAILABLE to trigger retries + send_reply_callback( + ray::Status::RpcError("Simulated chunk failure - network unavailable", + grpc::StatusCode::UNAVAILABLE), + /*reply_success=*/[]() {}, + /*reply_failure=*/[]() {}); + return; + } + } + + // Initialize object state if first chunk + if (objects_.find(object_id) == objects_.end()) { + ObjectState state; + state.total_data_size = total_data_size; + state.total_metadata_size = request.total_metadata_size(); + state.data.resize(total_data_size); + state.metadata = request.metadata(); + state.chunks_received = 0; + objects_[object_id] = state; + } + + // Copy chunk data + ObjectState &state = objects_[object_id]; + const std::string &chunk_data = request.data(); + std::memcpy(state.data.data() + offset, chunk_data.data(), chunk_size); + state.chunks_received++; + state.bytes_received += chunk_size; + + // Check if all chunks received + bool done = (state.bytes_received >= state.total_data_size); + + RAY_LOG(INFO) << "Object " << 
ObjectID::FromBinary(object_id).Hex() + << " bytes_received=" << state.bytes_received << "/" + << state.total_data_size << " done=" << done; + + reply->set_done(done); + + if (done) { + completed_objects_.push_back(object_id); + } + + send_reply_callback( + ray::Status::OK(), + /*reply_success=*/ + [object_id]() { + RAY_LOG(INFO) << "Reply sent for object " + << ObjectID::FromBinary(object_id).Hex(); + }, + /*reply_failure=*/ + [this, object_id]() { + RAY_LOG(WARNING) << "Reply failed for object " + << ObjectID::FromBinary(object_id).Hex(); + reply_failure_count_++; + }); + } + + struct ObjectState { + uint64_t total_data_size; + uint64_t total_metadata_size; + std::vector data; + std::string metadata; + int chunks_received; + uint64_t bytes_received = 0; + }; + + std::atomic total_requests_{0}; + std::atomic reply_failure_count_{0}; + std::atomic failed_requests_{0}; + std::atomic fail_first_n_requests_{0}; // Config: fail first N requests + std::atomic fail_chunk_pattern_{0}; // Config: chunk failure pattern + std::map objects_; + std::map chunk_attempts_; // Track attempts per chunk + std::vector completed_objects_; + std::mutex mutex_; +}; + +/// Mock NodeManager gRPC service +class MockNodeManagerService : public GrpcService { + public: + explicit MockNodeManagerService(instrumented_io_context &handler_io_service, + NodeManagerServiceHandler &handler) + : GrpcService(handler_io_service), service_handler_(handler) {} + + protected: + grpc::Service &GetGrpcService() override { return service_; } + + void InitServerCallFactories( + const std::unique_ptr &cq, + std::vector> *server_call_factories, + const ClusterID &cluster_id, + const std::optional &auth_token) override { + RPC_SERVICE_HANDLER_CUSTOM_AUTH(NodeManagerService, + PushMutableObject, + /*max_active_rpcs=*/-1, + ClusterIdAuthType::NO_AUTH); + } + + private: + NodeManagerService::AsyncService service_; + NodeManagerServiceHandler &service_handler_; +}; + +/// Integration test fixture with real gRPC 
server +class PushMutableObjectTest : public ::testing::Test { + public: + void SetUp() override { + // Start handler thread for server + handler_thread_ = std::make_unique([this]() { + boost::asio::executor_work_guard + handler_io_service_work_(handler_io_service_.get_executor()); + handler_io_service_.run(); + }); + + // Create and start gRPC server + grpc_server_ = std::make_unique("test-raylet", 0, true); + grpc_server_->RegisterService(std::make_unique( + handler_io_service_, mock_service_handler_), + false); + grpc_server_->Run(); + + // Wait for server to start + while (grpc_server_->GetPort() == 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + RAY_LOG(INFO) << "Test server started on port " << grpc_server_->GetPort(); + + // Start client thread + client_thread_ = std::make_unique([this]() { + boost::asio::executor_work_guard + client_io_service_work_(client_io_service_.get_executor()); + client_io_service_.run(); + }); + + // Create client + client_call_manager_ = std::make_unique( + client_io_service_, /*record_stats=*/false, /*local_address=*/"127.0.0.1"); + + test_address_.set_ip_address("127.0.0.1"); + test_address_.set_port(grpc_server_->GetPort()); + + raylet_unavailable_callback_ = []() { + RAY_LOG(WARNING) << "Raylet unavailable callback invoked"; + }; + + raylet_client_ = std::make_unique( + test_address_, *client_call_manager_, raylet_unavailable_callback_); + } + + void TearDown() override { + // Cleanup client + raylet_client_.reset(); + client_call_manager_.reset(); + client_io_service_.stop(); + if (client_thread_ && client_thread_->joinable()) { + client_thread_->join(); + } + + // Cleanup server + grpc_server_->Shutdown(); + handler_io_service_.stop(); + if (handler_thread_ && handler_thread_->joinable()) { + handler_thread_->join(); + } + } + + protected: + // Server side + NodeManagerServiceHandler mock_service_handler_; + instrumented_io_context handler_io_service_; + std::unique_ptr handler_thread_; + 
std::unique_ptr<GrpcServer> grpc_server_;
+
+  // Client side
+  instrumented_io_context client_io_service_;
+  std::unique_ptr<std::thread> client_thread_;
+  std::unique_ptr<ClientCallManager> client_call_manager_;
+  rpc::Address test_address_;
+  std::function<void()> raylet_unavailable_callback_;
+  std::unique_ptr<RayletClient> raylet_client_;
+};
+
+// Test successful single chunk transfer
+TEST_F(PushMutableObjectTest, TestSingleChunkTransferSuccess) {
+  ObjectID writer_object_id = ObjectID::FromRandom();
+
+  uint64_t data_size = 1024;  // Small data, single chunk
+  uint64_t metadata_size = 100;
+
+  std::vector<uint8_t> data(data_size);
+  std::vector<uint8_t> metadata(metadata_size);
+
+  // Fill with recognizable pattern
+  for (size_t i = 0; i < data_size; i++) {
+    data[i] = static_cast<uint8_t>(i % 256);
+  }
+  for (size_t i = 0; i < metadata_size; i++) {
+    metadata[i] = static_cast<uint8_t>(0xCD);
+  }
+
+  bool callback_called = false;
+  Status callback_status;
+  rpc::PushMutableObjectReply callback_reply;
+
+  auto callback = [&callback_called, &callback_status, &callback_reply](
+                      const Status &status, rpc::PushMutableObjectReply &&reply) {
+    callback_called = true;
+    callback_status = status;
+    callback_reply = std::move(reply);
+  };
+
+  raylet_client_->PushMutableObject(
+      writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback);
+
+  // Wait for callback
+  auto start_time = std::chrono::steady_clock::now();
+  while (!callback_called &&
+         (std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(5)) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  }
+
+  // Verify success
+  ASSERT_TRUE(callback_called) << "Callback should be called";
+  EXPECT_TRUE(callback_status.ok()) << "Status should be OK: " << callback_status;
+  EXPECT_TRUE(callback_reply.done()) << "Reply should indicate done";
+
+  // Verify server received the data
+  std::lock_guard<std::mutex> lock(mock_service_handler_.mutex_);
+  ASSERT_EQ(mock_service_handler_.completed_objects_.size(), 1);
+
+  std::string object_id_binary = writer_object_id.Binary();
+
ASSERT_TRUE(mock_service_handler_.objects_.find(object_id_binary) != + mock_service_handler_.objects_.end()); + + const auto &received_object = mock_service_handler_.objects_[object_id_binary]; + EXPECT_EQ(received_object.bytes_received, data_size); + EXPECT_EQ(received_object.data.size(), data_size); + + // Verify data integrity + EXPECT_EQ(std::memcmp(received_object.data.data(), data.data(), data_size), 0) + << "Received data should match sent data"; +} + +// Test successful multi-chunk transfer (large object) +TEST_F(PushMutableObjectTest, TestMultiChunkTransferSuccess) { + ObjectID writer_object_id = ObjectID::FromRandom(); + + // Create data larger than max chunk size to force chunking + uint64_t data_size = RayConfig::instance().max_grpc_message_size() * 2; + uint64_t metadata_size = 200; + + std::vector data(data_size); + std::vector metadata(metadata_size, 0xAB); + + // Fill with recognizable pattern + for (size_t i = 0; i < data_size; i++) { + data[i] = static_cast((i / 1024) % 256); + } + + bool callback_called = false; + Status callback_status; + rpc::PushMutableObjectReply callback_reply; + + auto callback = [&callback_called, &callback_status, &callback_reply]( + const Status &status, rpc::PushMutableObjectReply &&reply) { + callback_called = true; + callback_status = status; + callback_reply = std::move(reply); + }; + + raylet_client_->PushMutableObject( + writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); + + // Wait for callback (may take longer for large object) + auto start_time = std::chrono::steady_clock::now(); + while (!callback_called && + (std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(30)) { + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + // Verify success + ASSERT_TRUE(callback_called) << "Callback should be called"; + EXPECT_TRUE(callback_status.ok()) << "Status should be OK: " << callback_status; + EXPECT_TRUE(callback_reply.done()) << "Reply should indicate 
done";
+
+  // Verify server received all chunks
+  std::lock_guard<std::mutex> lock(mock_service_handler_.mutex_);
+  ASSERT_EQ(mock_service_handler_.completed_objects_.size(), 1);
+
+  std::string object_id_binary = writer_object_id.Binary();
+  ASSERT_TRUE(mock_service_handler_.objects_.find(object_id_binary) !=
+              mock_service_handler_.objects_.end());
+
+  const auto &received_object = mock_service_handler_.objects_[object_id_binary];
+  EXPECT_EQ(received_object.bytes_received, data_size);
+  EXPECT_GE(received_object.chunks_received, 2) << "Should have received multiple chunks";
+
+  // Verify data integrity across all chunks
+  EXPECT_EQ(std::memcmp(received_object.data.data(), data.data(), data_size), 0)
+      << "Received data should match sent data across all chunks";
+
+  RAY_LOG(INFO) << "Successfully transferred " << data_size << " bytes in "
+                << received_object.chunks_received << " chunks";
+}
+
+// Test that callback is only called once even with multiple chunks
+TEST_F(PushMutableObjectTest, TestCallbackCalledOnlyOnce) {
+  ObjectID writer_object_id = ObjectID::FromRandom();
+
+  uint64_t data_size = RayConfig::instance().max_grpc_message_size() * 1.5;
+  uint64_t metadata_size = 50;
+
+  std::vector<uint8_t> data(data_size, 0xFF);
+  std::vector<uint8_t> metadata(metadata_size, 0xEE);
+
+  std::atomic<int> callback_count{0};
+  bool callback_called = false;
+  Status callback_status;
+
+  auto callback = [&callback_count, &callback_called, &callback_status](
+                      const Status &status, rpc::PushMutableObjectReply &&reply) {
+    callback_count++;
+    callback_called = true;
+    callback_status = status;
+  };
+
+  raylet_client_->PushMutableObject(
+      writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback);
+
+  // Wait for completion
+  auto start_time = std::chrono::steady_clock::now();
+  while (!callback_called &&
+         (std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(30)) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+  }
+
+  // Give it extra time to
catch any duplicate callbacks + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + + ASSERT_TRUE(callback_called) << "Callback should be called"; + EXPECT_EQ(callback_count.load(), 1) << "Callback should be called exactly once"; + EXPECT_TRUE(callback_status.ok()) << "Status should be OK"; +} + +// Test that retry actually happens by temporarily making server unavailable +TEST_F(PushMutableObjectTest, TestRetryHappens) { + ObjectID writer_object_id = ObjectID::FromRandom(); + + // Use multi-chunk transfer to have time to interrupt it + uint64_t data_size = RayConfig::instance().max_grpc_message_size() * 1.5; + uint64_t metadata_size = 50; + std::vector data(data_size, 0xAA); + std::vector metadata(metadata_size, 0xBB); + + bool callback_called = false; + Status callback_status; + rpc::PushMutableObjectReply callback_reply; + + auto callback = [&callback_called, &callback_status, &callback_reply]( + const Status &status, rpc::PushMutableObjectReply &&reply) { + callback_called = true; + callback_status = status; + callback_reply = std::move(reply); + RAY_LOG(INFO) << "Client callback invoked with status: " << status + << ", done=" << reply.done(); + }; + + // Shut down server BEFORE starting transfer to force initial failure + int old_port = grpc_server_->GetPort(); + RAY_LOG(INFO) << "Shutting down server before transfer to force retry"; + grpc_server_->Shutdown(); + + // Start the transfer (will fail initially due to UNAVAILABLE) + raylet_client_->PushMutableObject( + writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); + + // Wait a bit to let initial attempts fail + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + + // Track requests before restart + int requests_before = mock_service_handler_.total_requests_.load(); + RAY_LOG(INFO) << "Requests before restart: " << requests_before; + + // Restart the server - this allows retries to succeed + RAY_LOG(INFO) << "Restarting server to allow retry to succeed"; 
+ grpc_server_ = std::make_unique("test-raylet", old_port, true); + grpc_server_->RegisterService(std::make_unique( + handler_io_service_, mock_service_handler_), + false); + grpc_server_->Run(); + + // Wait for server to be ready + while (grpc_server_->GetPort() == 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + RAY_LOG(INFO) << "Server restarted on port " << grpc_server_->GetPort(); + + // Wait for completion (the client should retry and succeed) + auto start_time = std::chrono::steady_clock::now(); + while (!callback_called && + (std::chrono::steady_clock::now() - start_time) < std::chrono::seconds(15)) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + ASSERT_TRUE(callback_called) << "Callback should be called after server recovery"; + EXPECT_TRUE(callback_status.ok()) + << "Status should be OK after retry: " << callback_status; + EXPECT_TRUE(callback_reply.done()) << "Transfer should complete after retry"; + + // Verify the data was eventually received after retry + std::lock_guard lock(mock_service_handler_.mutex_); + ASSERT_EQ(mock_service_handler_.completed_objects_.size(), 1); + + int requests_after = mock_service_handler_.total_requests_.load(); + RAY_LOG(INFO) << "Requests after restart: " << requests_after; + RAY_LOG(INFO) << "Successfully completed transfer after server restart - retry worked! 
" + << "Server processed " << requests_after << " requests"; +} + +} // namespace rpc +} // namespace ray + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From 34a87b22939e11cbf0e47976f91b8d359f89eccc Mon Sep 17 00:00:00 2001 From: Rui Qiao Date: Thu, 18 Dec 2025 21:40:23 +0000 Subject: [PATCH 06/11] up Signed-off-by: Rui Qiao --- src/mock/ray/raylet_client/raylet_client.h | 3 ++- .../tests/mutable_object_provider_test.cc | 18 +++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/mock/ray/raylet_client/raylet_client.h b/src/mock/ray/raylet_client/raylet_client.h index 8c4a76734b27..70333db40ee1 100644 --- a/src/mock/ray/raylet_client/raylet_client.h +++ b/src/mock/ray/raylet_client/raylet_client.h @@ -107,7 +107,8 @@ class MockRayletClientInterface : public RayletClientInterface { uint64_t metadata_size, void *data, void *metadata, - const rpc::ClientCallback &callback), + const rpc::ClientCallback &callback, + int64_t timeout_ms), (override)); MOCK_METHOD(void, GetSystemConfig, diff --git a/src/ray/core_worker/tests/mutable_object_provider_test.cc b/src/ray/core_worker/tests/mutable_object_provider_test.cc index 42fc780eb65d..d05ba8d56431 100644 --- a/src/ray/core_worker/tests/mutable_object_provider_test.cc +++ b/src/ray/core_worker/tests/mutable_object_provider_test.cc @@ -96,13 +96,13 @@ class MockRayletClient : public rpc::FakeRayletClient { public: virtual ~MockRayletClient() {} - void PushMutableObject( - const ObjectID &object_id, - uint64_t data_size, - uint64_t metadata_size, - void *data, - void *metadata, - const rpc::ClientCallback &callback) override { + void PushMutableObject(const ObjectID &object_id, + uint64_t data_size, + uint64_t metadata_size, + void *data, + void *metadata, + const rpc::ClientCallback &callback, + int64_t timeout_ms = -1) override { absl::MutexLock guard(&lock_); pushed_objects_.push_back(object_id); } @@ -436,7 +436,8 @@ 
TEST(MutableObjectProvider, HandleOutOfOrderPushMutableObject) { } // Single barrier to synchronize the start of all threads (but not between rounds) - // This allows rank0 to proceed to the next round before ranks 2-3 complete previous round + // This allows rank0 to proceed to the next round before ranks 2-3 complete previous + // round absl::Barrier start_barrier(kNumRanks); // Function for each rank to send requests for all rounds @@ -445,7 +446,6 @@ TEST(MutableObjectProvider, HandleOutOfOrderPushMutableObject) { start_barrier.Block(); for (int round = 0; round < kNumRounds; round++) { - // Simulate network jitter: rank0 and rank1 send immediately, // rank2 and rank3 are delayed (simulating the issue scenario) if (rank >= 2 && round == 0) { From 138d1feddb802b81dd3c9bb8d6372818eecdbe7e Mon Sep 17 00:00:00 2001 From: Rui Qiao Date: Thu, 18 Dec 2025 22:22:24 +0000 Subject: [PATCH 07/11] up Signed-off-by: Rui Qiao --- .../tests/mutable_object_provider_test.cc | 146 ------------------ 1 file changed, 146 deletions(-) diff --git a/src/ray/core_worker/tests/mutable_object_provider_test.cc b/src/ray/core_worker/tests/mutable_object_provider_test.cc index d05ba8d56431..013827865250 100644 --- a/src/ray/core_worker/tests/mutable_object_provider_test.cc +++ b/src/ray/core_worker/tests/mutable_object_provider_test.cc @@ -387,152 +387,6 @@ TEST(MutableObjectProvider, MutableObjectBufferSetErrorBeforeReadRelease) { StatusCode::ChannelError); } -// Test that emulates the out-of-order PushMutableObject scenario described in -// https://github.com/ray-project/ray/issues/58426 -// -// Scenario: Multiple ranks send PushMutableObject requests concurrently. Due to -// network jitter and OS scheduling, some requests arrive immediately while others -// are delayed. This test verifies that the receiver handles out-of-order requests -// correctly without deadlocking. -// -// The test simulates: -// 1. Multiple "ranks" (4 ranks: rank0-rank3) sending requests concurrently -// 2. 
Network jitter causing some requests to be delayed -// 3. A new round of requests arriving before previous round completes -// 4. Verifies no deadlock occurs and all data is correctly received -TEST(MutableObjectProvider, HandleOutOfOrderPushMutableObject) { - constexpr int kNumRanks = 4; - constexpr size_t kDataSize = 64; - constexpr size_t kMetadataSize = 8; - - // Create provider and register reader channels for each rank - auto plasma = std::make_shared(); - MutableObjectProvider provider(plasma, /*factory=*/nullptr, nullptr); - - std::vector writer_object_ids; - std::vector reader_object_ids; - for (int i = 0; i < kNumRanks; i++) { - writer_object_ids.push_back(ObjectID::FromRandom()); - reader_object_ids.push_back(ObjectID::FromRandom()); - provider.HandleRegisterMutableObject( - writer_object_ids[i], /*num_readers=*/1, reader_object_ids[i]); - } - - // Prepare data for each rank - std::vector> rank_data(kNumRanks); - std::vector> rank_metadata(kNumRanks); - for (int i = 0; i < kNumRanks; i++) { - rank_data[i].resize(kDataSize, static_cast(i)); - rank_metadata[i].resize(kMetadataSize, static_cast(i + 100)); - } - - // Simulate two rounds of requests - constexpr int kNumRounds = 2; - - // Track request completion - std::vector> round_completed(kNumRounds); - for (int round = 0; round < kNumRounds; round++) { - round_completed[round].resize(kNumRanks, false); - } - - // Single barrier to synchronize the start of all threads (but not between rounds) - // This allows rank0 to proceed to the next round before ranks 2-3 complete previous - // round - absl::Barrier start_barrier(kNumRanks); - - // Function for each rank to send requests for all rounds - auto rank_sender = [&](int rank) { - // Wait for all ranks to be ready before starting (synchronize thread launch) - start_barrier.Block(); - - for (int round = 0; round < kNumRounds; round++) { - // Simulate network jitter: rank0 and rank1 send immediately, - // rank2 and rank3 are delayed (simulating the issue 
scenario) - if (rank >= 2 && round == 0) { - // Delay ranks 2-3 in the first round to simulate network jitter - absl::SleepFor(absl::Milliseconds(50)); - } - - ray::rpc::PushMutableObjectRequest request; - request.set_writer_object_id(writer_object_ids[rank].Binary()); - request.set_total_data_size(kDataSize); - request.set_total_metadata_size(kMetadataSize); - request.set_offset(0); - request.set_chunk_size(kDataSize); - request.set_data(rank_data[rank].data(), kDataSize); - request.set_metadata(rank_metadata[rank].data(), kMetadataSize); - - ray::rpc::PushMutableObjectReply reply; - - // Send request - this should not block even if other ranks' requests - // are delayed or out of order - provider.HandlePushMutableObject(request, &reply); - - EXPECT_TRUE(reply.done()) - << "Rank " << rank << " round " << round << " should complete"; - round_completed[round][rank] = true; - - // Small delay after rank0 completes round 0 to simulate the scenario where - // rank0 sends next round before ranks 2-3 complete previous round - if (rank == 0 && round == 0) { - absl::SleepFor(absl::Milliseconds(10)); - } - } - }; - - // Launch one thread per rank (each rank sends all rounds) - std::vector threads; - for (int rank = 0; rank < kNumRanks; rank++) { - threads.emplace_back(rank_sender, rank); - } - - // Wait for all threads to complete (with timeout to detect deadlocks) - auto start_time = absl::Now(); - for (auto &thread : threads) { - thread.join(); - } - auto elapsed = absl::Now() - start_time; - - // Verify no deadlock occurred (should complete quickly) - EXPECT_LT(elapsed, absl::Seconds(5)) - << "Test should complete quickly; deadlock suspected if timeout"; - - // Verify all requests completed successfully - for (int round = 0; round < kNumRounds; round++) { - for (int rank = 0; rank < kNumRanks; rank++) { - EXPECT_TRUE(round_completed[round][rank]) - << "Rank " << rank << " round " << round << " did not complete"; - } - } - - // Verify data integrity: read back the data 
for each rank - for (int rank = 0; rank < kNumRanks; rank++) { - std::shared_ptr result; - EXPECT_EQ(provider.ReadAcquire(reader_object_ids[rank], result).code(), - StatusCode::OK) - << "Failed to read data for rank " << rank; - - EXPECT_EQ(result->GetData()->Size(), kDataSize); - EXPECT_EQ(result->GetMetadata()->Size(), kMetadataSize); - - // Verify data content - const uint8_t *data_ptr = result->GetData()->Data(); - for (size_t i = 0; i < kDataSize; i++) { - EXPECT_EQ(data_ptr[i], static_cast(rank)) - << "Data mismatch for rank " << rank << " at offset " << i; - } - - // Verify metadata content - const uint8_t *metadata_ptr = result->GetMetadata()->Data(); - for (size_t i = 0; i < kMetadataSize; i++) { - EXPECT_EQ(metadata_ptr[i], static_cast(rank + 100)) - << "Metadata mismatch for rank " << rank << " at offset " << i; - } - - EXPECT_EQ(provider.ReadRelease(reader_object_ids[rank]).code(), StatusCode::OK); - } -} - // Test retry handling with out-of-order chunks // Simulates the scenario where a chunk is retried and arrives out of order TEST(MutableObjectProvider, HandleRetryOutOfOrderChunks) { From 6f9cc192a3c4dcdd7c7a9958947aa7cc3f58eb2e Mon Sep 17 00:00:00 2001 From: Rui Qiao Date: Thu, 18 Dec 2025 22:39:26 +0000 Subject: [PATCH 08/11] up Signed-off-by: Rui Qiao --- src/ray/core_worker/experimental_mutable_object_provider.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/ray/core_worker/experimental_mutable_object_provider.cc b/src/ray/core_worker/experimental_mutable_object_provider.cc index 29c466c5d0d1..78589c5dcd8d 100644 --- a/src/ray/core_worker/experimental_mutable_object_provider.cc +++ b/src/ray/core_worker/experimental_mutable_object_provider.cc @@ -222,8 +222,6 @@ void MutableObjectProvider::HandlePushMutableObject( // Copy chunk data to backing store. 
memcpy(object_backing_store->Data() + offset, request.data().data(), chunk_size); - size_t total_written = tmp_written_so_far + chunk_size; - RAY_CHECK_LE(total_written, total_data_size); // Mark this chunk as received only after successfully writing it. // This ensures retries are handled correctly even if WriteAcquire fails. @@ -237,6 +235,7 @@ void MutableObjectProvider::HandlePushMutableObject( written_so_far_[writer_object_id] = 0; } written_so_far_[writer_object_id] += chunk_size; + RAY_CHECK_LE(written_so_far_[writer_object_id], total_data_size); if (written_so_far_[writer_object_id] == total_data_size) { object_complete = true; // Note: We keep received_chunks_ and written_so_far_ entries until WriteRelease @@ -245,7 +244,7 @@ void MutableObjectProvider::HandlePushMutableObject( } } - if (total_written == total_data_size) { + if (object_complete) { // All data chunks received - copy metadata and release write lock. memcpy(object_backing_store->Data() + total_data_size, request.metadata().data(), From 5ec49ebba50dd1e9fc913ebd3bfe2af0e30b714d Mon Sep 17 00:00:00 2001 From: Rui Qiao Date: Thu, 18 Dec 2025 22:46:23 +0000 Subject: [PATCH 09/11] up Signed-off-by: Rui Qiao --- .../experimental_mutable_object_provider.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/ray/core_worker/experimental_mutable_object_provider.cc b/src/ray/core_worker/experimental_mutable_object_provider.cc index 78589c5dcd8d..8550ff62829d 100644 --- a/src/ray/core_worker/experimental_mutable_object_provider.cc +++ b/src/ray/core_worker/experimental_mutable_object_provider.cc @@ -183,6 +183,10 @@ void MutableObjectProvider::HandlePushMutableObject( if (is_new_write) { write_acquired_[writer_object_id] = false; received_chunks_[writer_object_id].clear(); + // Initialize written_so_far_ immediately to prevent race condition where + // multiple concurrent threads all see is_new_write=true and reset + // write_acquired_. 
+ written_so_far_[writer_object_id] = 0; } if (!write_acquired_[writer_object_id]) { needs_write_acquire = true; @@ -229,11 +233,8 @@ void MutableObjectProvider::HandlePushMutableObject( absl::MutexLock guard(&written_so_far_lock_); received_chunks_[writer_object_id].insert(offset); // Update written_so_far_ by adding this chunk's size. - // Note: We increment rather than set to handle potential out-of-order chunks. - // Initialize to 0 if this is the first chunk of a new write. - if (written_so_far_.find(writer_object_id) == written_so_far_.end()) { - written_so_far_[writer_object_id] = 0; - } + // Note: written_so_far_ was already initialized to 0 in the first lock block + // for new writes, so we can safely increment it here. written_so_far_[writer_object_id] += chunk_size; RAY_CHECK_LE(written_so_far_[writer_object_id], total_data_size); if (written_so_far_[writer_object_id] == total_data_size) { From e96483f6998837d7c5237d087e91872dd1e11cfb Mon Sep 17 00:00:00 2001 From: Rui Qiao Date: Fri, 19 Dec 2025 02:48:16 +0000 Subject: [PATCH 10/11] fix stale retry issue with versioning Signed-off-by: Rui Qiao --- src/mock/ray/raylet_client/raylet_client.h | 1 + .../experimental_mutable_object_manager.cc | 21 ++- .../experimental_mutable_object_manager.h | 13 ++ .../experimental_mutable_object_provider.cc | 133 +++++++++++------- .../experimental_mutable_object_provider.h | 29 +++- .../tests/mutable_object_provider_test.cc | 123 ++++++++++++++-- src/ray/protobuf/node_manager.proto | 3 + .../raylet_rpc_client/fake_raylet_client.h | 1 + src/ray/raylet_rpc_client/raylet_client.cc | 3 + src/ray/raylet_rpc_client/raylet_client.h | 1 + .../raylet_client_interface.h | 1 + 11 files changed, 262 insertions(+), 67 deletions(-) diff --git a/src/mock/ray/raylet_client/raylet_client.h b/src/mock/ray/raylet_client/raylet_client.h index 70333db40ee1..88c91f75c36f 100644 --- a/src/mock/ray/raylet_client/raylet_client.h +++ b/src/mock/ray/raylet_client/raylet_client.h @@ -107,6 +107,7 
@@ class MockRayletClientInterface : public RayletClientInterface { uint64_t metadata_size, void *data, void *metadata, + int64_t version, const rpc::ClientCallback &callback, int64_t timeout_ms), (override)); diff --git a/src/ray/core_worker/experimental_mutable_object_manager.cc b/src/ray/core_worker/experimental_mutable_object_manager.cc index ca551d97eb59..2571194392c7 100644 --- a/src/ray/core_worker/experimental_mutable_object_manager.cc +++ b/src/ray/core_worker/experimental_mutable_object_manager.cc @@ -306,6 +306,15 @@ Status MutableObjectManager::ReadAcquire(const ObjectID &object_id, std::shared_ptr &result, int64_t timeout_ms) ABSL_NO_THREAD_SAFETY_ANALYSIS { + int64_t unused_version = 0; + return ReadAcquire(object_id, result, unused_version, timeout_ms); +} + +Status MutableObjectManager::ReadAcquire(const ObjectID &object_id, + std::shared_ptr &result, + int64_t &version_read, + int64_t timeout_ms) + ABSL_NO_THREAD_SAFETY_ANALYSIS { RAY_LOG(DEBUG).WithField(object_id) << "ReadAcquire"; absl::ReaderMutexLock guard(&destructor_lock_); @@ -349,11 +358,11 @@ Status MutableObjectManager::ReadAcquire(const ObjectID &object_id, } channel->reading = true; - int64_t version_read = 0; + int64_t version_read_internal = 0; Status s = object->header->ReadAcquire(object_id, sem, channel->next_version_to_read, - version_read, + version_read_internal, check_signals_, timeout_point); if (!s.ok()) { @@ -363,8 +372,8 @@ Status MutableObjectManager::ReadAcquire(const ObjectID &object_id, channel->lock->unlock(); return s; } - RAY_CHECK_GT(version_read, 0); - channel->next_version_to_read = version_read; + RAY_CHECK_GT(version_read_internal, 0); + channel->next_version_to_read = version_read_internal; size_t total_size = object->header->data_size + object->header->metadata_size; RAY_CHECK_LE(static_cast(total_size), channel->mutable_object->allocated_size); @@ -402,6 +411,10 @@ Status MutableObjectManager::ReadAcquire(const ObjectID &object_id, std::move(metadata_copy), 
std::vector()); } + + // Return the version to caller (obtained atomically with header_sem) + version_read = version_read_internal; + RAY_LOG(DEBUG).WithField(object_id) << "ReadAcquire returning buffer"; return Status::OK(); } diff --git a/src/ray/core_worker/experimental_mutable_object_manager.h b/src/ray/core_worker/experimental_mutable_object_manager.h index 1c888eaf2535..6b8fec35e5ca 100644 --- a/src/ray/core_worker/experimental_mutable_object_manager.h +++ b/src/ray/core_worker/experimental_mutable_object_manager.h @@ -184,6 +184,19 @@ class MutableObjectManager : public std::enable_shared_from_this &result, int64_t timeout_ms = -1); + /// Overload that returns the version of the object that was read. + /// The version is obtained atomically while holding the header semaphore. + /// + /// \param[in] object_id The ID of the object. + /// \param[out] result The read object. + /// \param[out] version_read The version of the object that was read. + /// \param[in] timeout_ms Timeout in milliseconds. + /// \return The return status. + Status ReadAcquire(const ObjectID &object_id, + std::shared_ptr &result, + int64_t &version_read, + int64_t timeout_ms); + /// Releases the object, allowing it to be written again. If the caller did /// not previously ReadAcquire the object, then this first blocks until the /// latest value is available to read, then releases the value. diff --git a/src/ray/core_worker/experimental_mutable_object_provider.cc b/src/ray/core_worker/experimental_mutable_object_provider.cc index 8550ff62829d..684e4c658e0f 100644 --- a/src/ray/core_worker/experimental_mutable_object_provider.cc +++ b/src/ray/core_worker/experimental_mutable_object_provider.cc @@ -147,63 +147,73 @@ void MutableObjectProvider::HandlePushMutableObject( << "Metadata size mismatch. Expected " << total_metadata_size << " bytes, got " << request.metadata().size() << " bytes"; - // Check if this chunk has already been received (idempotent retry handling). 
- bool is_new_chunk = false; - uint64_t tmp_written_so_far = 0; - bool object_complete = false; - bool needs_write_acquire = false; + // Version-based idempotent retry handling strategy: + // - Stale versions (<= highest_completed): discard immediately + // - Active version (== highest_completed + 1): write to backing store + // - Future versions (> highest_completed + 1): buffer for later processing + int64_t request_version = request.version(); + + // Step 1: Reject stale retries from completed writes (O(1) check) + int64_t highest_completed = 0; { absl::MutexLock guard(&written_so_far_lock_); + highest_completed = highest_completed_version_[writer_object_id]; // default 0 + + if (request_version <= highest_completed) { + // Stale retry from already-completed write + reply->set_done(true); + return; + } + } + + // Step 2: Determine if this is active version or future version + int64_t active_version = highest_completed + 1; + bool is_active_version = (request_version == active_version); + + // Step 3: Check for duplicate chunks using (offset, version) key + auto chunk_key = std::make_pair(offset, request_version); + bool needs_write_acquire = false; - // Initialize tracking for this object if needed. + { + absl::MutexLock guard(&written_so_far_lock_); auto &received_chunks = received_chunks_[writer_object_id]; - // Check if we've already received this specific chunk by offset. - if (received_chunks.find(offset) != received_chunks.end()) { - // This chunk was already received (retry) - skip processing but return status. 
- auto written_it = written_so_far_.find(writer_object_id); - if (written_it != written_so_far_.end()) { - tmp_written_so_far = written_it->second; - object_complete = (tmp_written_so_far == total_data_size); + if (received_chunks.find(chunk_key) != received_chunks.end()) { + // Duplicate chunk - return status + if (is_active_version) { + auto written_it = written_so_far_.find(writer_object_id); + uint64_t written = (written_it != written_so_far_.end()) ? written_it->second : 0; + reply->set_done(written == total_data_size); } else { - // Tracking was cleaned up, meaning object is complete. - object_complete = true; - } - } else { - // This is a new chunk - check tracking but don't mark as received yet. - // We'll mark it as received only after successfully writing it. - is_new_chunk = true; - auto written_it = written_so_far_.find(writer_object_id); - bool is_new_write = (written_it == written_so_far_.end()); - tmp_written_so_far = is_new_write ? 0 : written_it->second; - - // Check if WriteAcquire needs to be called. This handles out-of-order chunks: - // only the first chunk (or first chunk to arrive) will call WriteAcquire. - // Reset write_acquired_ flag if this is a new write (after previous WriteRelease). - if (is_new_write) { - write_acquired_[writer_object_id] = false; - received_chunks_[writer_object_id].clear(); - // Initialize written_so_far_ immediately to prevent race condition where - // multiple concurrent threads all see is_new_write=true and reset - // write_acquired_. - written_so_far_[writer_object_id] = 0; - } - if (!write_acquired_[writer_object_id]) { - needs_write_acquire = true; - write_acquired_[writer_object_id] = true; + reply->set_done(false); } + return; } - } - // If this is a retry of an already-received chunk, return current status without - // re-processing the data (idempotent operation). 
- if (!is_new_chunk) { - reply->set_done(object_complete); - return; + // Step 4: For future versions, buffer only (don't write to backing store) + if (!is_active_version) { + received_chunks.insert(chunk_key); + reply->set_done(false); + return; + } + + // Step 5: Active version - check if need WriteAcquire + if (!write_acquired_[writer_object_id]) { + needs_write_acquire = true; + write_acquired_[writer_object_id] = true; + } } + // Continue with WriteAcquire and write logic for active version + bool object_complete = false; + std::shared_ptr<Buffer> object_backing_store; if (needs_write_acquire) { + // Initialize written_so_far_ for new write + { + absl::MutexLock guard(&written_so_far_lock_); + written_so_far_[writer_object_id] = 0; + } // First chunk to arrive (may not be offset 0 due to out-of-order delivery) - // acquire write lock and allocate backing store. // We set `metadata` to nullptr since the metadata is at the end of the object, which @@ -231,7 +241,8 @@ void MutableObjectProvider::HandlePushMutableObject( // This ensures retries are handled correctly even if WriteAcquire fails. { absl::MutexLock guard(&written_so_far_lock_); - received_chunks_[writer_object_id].insert(offset); + // Mark chunk as received using (offset, version) pair (reusing chunk_key from above) + received_chunks_[writer_object_id].insert(chunk_key); // Update written_so_far_ by adding this chunk's size. // Note: written_so_far_ was already initialized to 0 in the first lock block // for new writes, so we can safely increment it here. @@ -253,13 +264,31 @@ void MutableObjectProvider::HandlePushMutableObject( // The entire object has been written, so call `WriteRelease()`. RAY_CHECK_OK(object_manager_->WriteRelease(info.local_object_id)); - // Clean up tracking state after WriteRelease to prepare for the next write. - // This ensures that subsequent writes to the same channel start fresh. 
+ // Update tracking state after WriteRelease { absl::MutexLock guard(&written_so_far_lock_); + + // Update highest completed version + highest_completed_version_[writer_object_id] = request_version; + + // Remove ONLY chunks belonging to this completed version + auto &chunks = received_chunks_[writer_object_id]; + for (auto it = chunks.begin(); it != chunks.end();) { + if (it->second == request_version) { + it = chunks.erase(it); + } else { + ++it; + } + } + + // Clear per-object tracking for next write written_so_far_.erase(writer_object_id); - received_chunks_.erase(writer_object_id); write_acquired_.erase(writer_object_id); + + // Clean up received_chunks_ entry if empty + if (chunks.empty()) { + received_chunks_.erase(writer_object_id); + } } reply->set_done(true); @@ -309,9 +338,11 @@ void MutableObjectProvider::PollWriterClosure( &remote_readers) { // NOTE: There's only 1 PollWriterClosure at any time in a single thread. std::shared_ptr<RayObject> object; + int64_t version = 0; // The corresponding ReadRelease() will be automatically called when // `object` goes out of scope. - Status status = object_manager_->ReadAcquire(writer_object_id, object); + Status status = + object_manager_->ReadAcquire(writer_object_id, object, version, /*timeout_ms=*/-1); // Check if the thread returned from ReadAcquire() because the process is exiting, not // because there is something to read. 
if (status.code() == StatusCode::ChannelError) { @@ -323,6 +354,9 @@ void MutableObjectProvider::PollWriterClosure( RAY_CHECK(object->GetData()); RAY_CHECK(object->GetMetadata()); + // Version was obtained safely from ReadAcquire (with header_sem protection) + RAY_CHECK_GT(version, 0) << "Invalid version for " << writer_object_id; + std::shared_ptr<size_t> num_replied = std::make_shared<size_t>(0); for (const auto &reader : *remote_readers) { reader->PushMutableObject( @@ -331,6 +365,7 @@ void MutableObjectProvider::PollWriterClosure( object->GetMetadata()->Size(), object->GetData()->Data(), object->GetMetadata()->Data(), + version, [this, &io_context, writer_object_id, remote_readers, num_replied]( const Status &push_object_status, const rpc::PushMutableObjectReply &reply) { *num_replied += 1; diff --git a/src/ray/core_worker/experimental_mutable_object_provider.h b/src/ray/core_worker/experimental_mutable_object_provider.h index 4d1ee02a2b71..dd4ff3f4e90d 100644 --- a/src/ray/core_worker/experimental_mutable_object_provider.h +++ b/src/ray/core_worker/experimental_mutable_object_provider.h @@ -16,6 +16,7 @@ #include #include #include +#include <utility> #include #include "ray/common/asio/instrumented_io_context.h" @@ -23,6 +24,18 @@ #include "ray/raylet_rpc_client/raylet_client_interface.h" #include "ray/rpc/client_call.h" +// Hash function for std::pair to use in unordered_set for chunk +// tracking +namespace std { +template <> +struct hash<std::pair<uint64_t, int64_t>> { + size_t operator()(const std::pair<uint64_t, int64_t> &p) const { + // Combine the two hash values using a standard hash combining technique + return std::hash<uint64_t>{}(p.first) ^ (std::hash<int64_t>{}(p.second) << 1); + } +}; +} // namespace std + namespace ray { namespace core { namespace experimental { @@ -247,15 +260,23 @@ class MutableObjectProvider : public MutableObjectProviderInterface { // object write. 
std::unordered_map<ObjectID, uint64_t> written_so_far_ ABSL_GUARDED_BY(written_so_far_lock_); - // Tracks which chunks (by offset) have been received for each object to handle retries - // idempotently. Each offset uniquely identifies a chunk. - std::unordered_map<ObjectID, std::unordered_set<uint64_t>> received_chunks_ - ABSL_GUARDED_BY(written_so_far_lock_); + // Tracks which chunks (by offset and version) have been received for each object to + // handle retries idempotently. The version comes from PlasmaObjectHeader.version on the + // sender side. The pair (offset, version) uniquely identifies a chunk across different + // write epochs, preventing stale retries from interfering with new writes. + std::unordered_map<ObjectID, std::unordered_set<std::pair<uint64_t, int64_t>>> + received_chunks_ ABSL_GUARDED_BY(written_so_far_lock_); // Tracks whether WriteAcquire has been called for each object to handle out-of-order // chunks. This ensures WriteAcquire is called exactly once, even if chunks arrive // concurrently or out of order. std::unordered_map<ObjectID, bool> write_acquired_ ABSL_GUARDED_BY(written_so_far_lock_); + // Maps writer_object_id to the highest version that has completed WriteRelease. + // Only version == highest_completed + 1 can actively write to backing store. + // Versions <= highest_completed are stale retries and are discarded. + // Versions > highest_completed + 1 are buffered for future processing. 
+ std::unordered_map highest_completed_version_ + ABSL_GUARDED_BY(written_so_far_lock_); friend class MutableObjectProvider_MutableObjectBufferReadRelease_Test; }; diff --git a/src/ray/core_worker/tests/mutable_object_provider_test.cc b/src/ray/core_worker/tests/mutable_object_provider_test.cc index 013827865250..5d68adaf7374 100644 --- a/src/ray/core_worker/tests/mutable_object_provider_test.cc +++ b/src/ray/core_worker/tests/mutable_object_provider_test.cc @@ -17,7 +17,9 @@ #include #include #include +#include #include +#include #include #include "absl/functional/bind_front.h" @@ -48,22 +50,31 @@ class TestPlasma : public plasma::MockPlasmaClient { const ObjectID &object_id, std::unique_ptr *mutable_object) override { absl::MutexLock guard(&lock_); - if (!objects_.count(object_id)) { + auto it = objects_.find(object_id); + if (it == objects_.end()) { // Use a larger default size to support tests with larger objects // Need at least 2048 bytes to accommodate tests with variable chunk sizes - *mutable_object = MakeObject(/*min_size=*/2048); - objects_.insert(object_id); + auto obj = MakeObject(/*min_size=*/2048); + uint8_t *ptr = reinterpret_cast(obj->header); + objects_[object_id] = ptr; + *mutable_object = std::move(obj); } else { - // Object already exists - return a new view of it - // For testing, we create a new object each time since the real implementation - // would return a view of the existing object - *mutable_object = MakeObject(/*min_size=*/2048); + // Object already exists - return a view of the same underlying memory + uint8_t *ptr = it->second; + plasma::PlasmaObject info{}; + info.header_offset = 0; + info.data_offset = sizeof(PlasmaObjectHeader); + info.allocated_size = 2048; // Same size as initial allocation + *mutable_object = std::make_unique(ptr, info); } return Status::OK(); } ~TestPlasma() override { - // Objects are managed by the MutableObjectProvider, so we don't free them here + // Free all allocated objects + for (auto &pair : 
objects_) { + free(pair.second); + } } private: @@ -88,8 +99,8 @@ class TestPlasma : public plasma::MockPlasmaClient { } absl::Mutex lock_; - // Tracks the mutable objects that have been created. - std::unordered_set objects_; + // Maps object IDs to their backing store pointers + std::unordered_map objects_; }; class MockRayletClient : public rpc::FakeRayletClient { @@ -101,6 +112,7 @@ class MockRayletClient : public rpc::FakeRayletClient { uint64_t metadata_size, void *data, void *metadata, + int64_t version, const rpc::ClientCallback &callback, int64_t timeout_ms = -1) override { absl::MutexLock guard(&lock_); @@ -207,6 +219,7 @@ TEST(MutableObjectProvider, HandlePushMutableObject) { request.set_writer_object_id(object_id.Binary()); request.set_total_data_size(0); request.set_total_metadata_size(0); + request.set_version(1); ray::rpc::PushMutableObjectReply reply; provider.HandlePushMutableObject(request, &reply); @@ -425,6 +438,7 @@ TEST(MutableObjectProvider, HandleRetryOutOfOrderChunks) { request.set_chunk_size(kChunk1Size); request.set_data(chunk_data[1].data(), kChunk1Size); request.set_metadata(metadata.data(), kMetadataSize); + request.set_version(1); // All chunks in this write have version 1 provider.HandlePushMutableObject(request, &replies[1]); EXPECT_FALSE(replies[1].done()) << "Chunk 1 should not complete the object"; } @@ -439,6 +453,7 @@ TEST(MutableObjectProvider, HandleRetryOutOfOrderChunks) { request.set_chunk_size(kChunk0Size); request.set_data(chunk_data[0].data(), kChunk0Size); request.set_metadata(metadata.data(), kMetadataSize); + request.set_version(1); // Same version as chunk 1 provider.HandlePushMutableObject(request, &replies[0]); EXPECT_FALSE(replies[0].done()) << "Chunk 0 should not complete the object"; } @@ -453,6 +468,7 @@ TEST(MutableObjectProvider, HandleRetryOutOfOrderChunks) { request.set_chunk_size(kChunk0Size); request.set_data(chunk_data[0].data(), kChunk0Size); request.set_metadata(metadata.data(), kMetadataSize); + 
request.set_version(1); // Same version - legitimate retry ray::rpc::PushMutableObjectReply retry_reply; provider.HandlePushMutableObject(request, &retry_reply); // Retry should return current status without error @@ -469,6 +485,7 @@ TEST(MutableObjectProvider, HandleRetryOutOfOrderChunks) { request.set_chunk_size(kChunk2Size); request.set_data(chunk_data[2].data(), kChunk2Size); request.set_metadata(metadata.data(), kMetadataSize); + request.set_version(1); // Same version provider.HandlePushMutableObject(request, &replies[2]); EXPECT_TRUE(replies[2].done()) << "Chunk 2 should complete the object"; } @@ -494,6 +511,92 @@ TEST(MutableObjectProvider, HandleRetryOutOfOrderChunks) { EXPECT_EQ(provider.ReadRelease(reader_object_id).code(), StatusCode::OK); } +// Test that version tracking correctly distinguishes chunks from different write epochs +// This verifies chunks with different versions are not incorrectly treated as duplicates +TEST(MutableObjectProvider, HandleVersionBasedRetryDetection) { + constexpr size_t kDataSize = 512; + constexpr size_t kMetadataSize = 16; + + ObjectID writer_object_id = ObjectID::FromRandom(); + ObjectID reader_object_id = ObjectID::FromRandom(); + auto plasma = std::make_shared(); + MutableObjectProvider provider(plasma, /*factory=*/nullptr, nullptr); + + provider.HandleRegisterMutableObject( + writer_object_id, /*num_readers=*/1, reader_object_id); + + // Write with version 1, single chunk at offset 0 + std::vector write1_data(kDataSize, 0xAA); + std::vector metadata1(kMetadataSize, 0x11); + { + ray::rpc::PushMutableObjectRequest request; + ray::rpc::PushMutableObjectReply reply; + request.set_writer_object_id(writer_object_id.Binary()); + request.set_total_data_size(kDataSize); + request.set_total_metadata_size(kMetadataSize); + request.set_offset(0); + request.set_chunk_size(kDataSize); + request.set_data(write1_data.data(), kDataSize); + request.set_metadata(metadata1.data(), kMetadataSize); + request.set_version(1); + 
provider.HandlePushMutableObject(request, &reply); + EXPECT_TRUE(reply.done()); + } + + // Retry of same chunk (same version) - should be treated as duplicate + { + ray::rpc::PushMutableObjectRequest request; + ray::rpc::PushMutableObjectReply reply; + request.set_writer_object_id(writer_object_id.Binary()); + request.set_total_data_size(kDataSize); + request.set_total_metadata_size(kMetadataSize); + request.set_offset(0); + request.set_chunk_size(kDataSize); + request.set_data(write1_data.data(), kDataSize); + request.set_metadata(metadata1.data(), kMetadataSize); + request.set_version(1); // Same version + provider.HandlePushMutableObject(request, &reply); + EXPECT_TRUE(reply.done()) + << "Legitimate retry with same version recognized as duplicate"; + } + + // Read and release + { + std::shared_ptr result; + EXPECT_EQ(provider.ReadAcquire(reader_object_id, result).code(), StatusCode::OK); + EXPECT_EQ(provider.ReadRelease(reader_object_id).code(), StatusCode::OK); + } + + // New write with version 2, same offset 0 - should NOT be treated as duplicate + std::vector write2_data(kDataSize, 0xBB); + std::vector metadata2(kMetadataSize, 0x22); + { + ray::rpc::PushMutableObjectRequest request; + ray::rpc::PushMutableObjectReply reply; + request.set_writer_object_id(writer_object_id.Binary()); + request.set_total_data_size(kDataSize); + request.set_total_metadata_size(kMetadataSize); + request.set_offset(0); + request.set_chunk_size(kDataSize); + request.set_data(write2_data.data(), kDataSize); + request.set_metadata(metadata2.data(), kMetadataSize); + request.set_version(2); // DIFFERENT version + provider.HandlePushMutableObject(request, &reply); + EXPECT_TRUE(reply.done()) << "New write with different version correctly processed"; + } + + // Verify we got Write 2's data (version 2 overwrote version 1) + { + std::shared_ptr result; + EXPECT_EQ(provider.ReadAcquire(reader_object_id, result).code(), StatusCode::OK); + const uint8_t *data_ptr = result->GetData()->Data(); 
+ for (size_t i = 0; i < kDataSize; i++) { + EXPECT_EQ(data_ptr[i], 0xBB) << "Version 2 data correctly written at offset " << i; + } + EXPECT_EQ(provider.ReadRelease(reader_object_id).code(), StatusCode::OK); + } +} + #endif // defined(__APPLE__) || defined(__linux__) } // namespace experimental diff --git a/src/ray/protobuf/node_manager.proto b/src/ray/protobuf/node_manager.proto index d9e3e1ee6dae..f99a8b864665 100644 --- a/src/ray/protobuf/node_manager.proto +++ b/src/ray/protobuf/node_manager.proto @@ -381,6 +381,9 @@ message PushMutableObjectRequest { bytes data = 6; // The metadata payload. Note that the size of metadata is minimal. bytes metadata = 7; + // The version of the object from PlasmaObjectHeader.version on the writer node. + // This is incremented on each write and used to distinguish retries from stale writes. + int64 version = 8; } message PushMutableObjectReply { diff --git a/src/ray/raylet_rpc_client/fake_raylet_client.h b/src/ray/raylet_rpc_client/fake_raylet_client.h index e5f753bbfed3..cb25734c3975 100644 --- a/src/ray/raylet_rpc_client/fake_raylet_client.h +++ b/src/ray/raylet_rpc_client/fake_raylet_client.h @@ -240,6 +240,7 @@ class FakeRayletClient : public RayletClientInterface { uint64_t metadata_size, void *data, void *metadata, + int64_t version, const ClientCallback &callback, int64_t timeout_ms = -1) override {} diff --git a/src/ray/raylet_rpc_client/raylet_client.cc b/src/ray/raylet_rpc_client/raylet_client.cc index d17da09ddc39..27048efba3ac 100644 --- a/src/ray/raylet_rpc_client/raylet_client.cc +++ b/src/ray/raylet_rpc_client/raylet_client.cc @@ -167,6 +167,7 @@ void RayletClient::PushMutableObject( uint64_t metadata_size, void *data, void *metadata, + int64_t version, const ray::rpc::ClientCallback &callback, int64_t timeout_ms) { // Ray sets the gRPC max payload size to ~512 MiB. 
We set the max chunk size to a @@ -198,6 +199,8 @@ void RayletClient::PushMutableObject( // Set metadata for each message so on the receiver side // metadata from any message can be used. request.set_metadata(static_cast(metadata), metadata_size); + // Set the version from the sender's PlasmaObjectHeader to distinguish write epochs + request.set_version(version); // Use retryable RPC call to handle failure recovery, retries, and timeout. // Each chunk is retried automatically on transient network errors. diff --git a/src/ray/raylet_rpc_client/raylet_client.h b/src/ray/raylet_rpc_client/raylet_client.h index a5e6c24b3dbf..6c9e1786d6cf 100644 --- a/src/ray/raylet_rpc_client/raylet_client.h +++ b/src/ray/raylet_rpc_client/raylet_client.h @@ -88,6 +88,7 @@ class RayletClient : public RayletClientInterface { uint64_t metadata_size, void *data, void *metadata, + int64_t version, const ray::rpc::ClientCallback &callback, int64_t timeout_ms = -1) override; diff --git a/src/ray/raylet_rpc_client/raylet_client_interface.h b/src/ray/raylet_rpc_client/raylet_client_interface.h index f9150fd507c6..5fdeff070bdc 100644 --- a/src/ray/raylet_rpc_client/raylet_client_interface.h +++ b/src/ray/raylet_rpc_client/raylet_client_interface.h @@ -174,6 +174,7 @@ class RayletClientInterface { uint64_t metadata_size, void *data, void *metadata, + int64_t version, const rpc::ClientCallback &callback, int64_t timeout_ms = -1) = 0; From 0dc0e8e60bddb9335ae39e5d9e91a4e19514c0d0 Mon Sep 17 00:00:00 2001 From: Rui Qiao Date: Sat, 20 Dec 2025 00:30:31 +0000 Subject: [PATCH 11/11] fix Signed-off-by: Rui Qiao --- .../experimental_mutable_object_provider.cc | 20 +++++++++- .../experimental_mutable_object_provider.h | 7 ++-- src/ray/rpc/tests/push_mutable_object_test.cc | 40 +++++++++++++++---- 3 files changed, 55 insertions(+), 12 deletions(-) diff --git a/src/ray/core_worker/experimental_mutable_object_provider.cc b/src/ray/core_worker/experimental_mutable_object_provider.cc index 
684e4c658e0f..ebe7027e37f3 100644 --- a/src/ray/core_worker/experimental_mutable_object_provider.cc +++ b/src/ray/core_worker/experimental_mutable_object_provider.cc @@ -200,7 +200,8 @@ void MutableObjectProvider::HandlePushMutableObject( // Step 5: Active version - check if need WriteAcquire if (!write_acquired_[writer_object_id]) { needs_write_acquire = true; - write_acquired_[writer_object_id] = true; + // Note: We do NOT set write_acquired_ = true here yet. This prevents other threads + // from calling GetObjectBackingStore() before WriteAcquire() completes. } } @@ -224,7 +225,24 @@ void MutableObjectProvider::HandlePushMutableObject( total_metadata_size, info.num_readers, object_backing_store)); + // Now that WriteAcquire has completed, set write_acquired_ to true so other threads + // can proceed with GetObjectBackingStore(). + { + absl::MutexLock guard(&written_so_far_lock_); + write_acquired_[writer_object_id] = true; + } } else { + // Wait until WriteAcquire has completed before calling GetObjectBackingStore. + // This prevents the race condition where we check write_acquired_ before + // WriteAcquire() has actually completed. + { + absl::MutexLock guard(&written_so_far_lock_); + auto condition = [this, &writer_object_id]() + ABSL_SHARED_LOCKS_REQUIRED(written_so_far_lock_) { + return write_acquired_[writer_object_id]; + }; + written_so_far_lock_.Await(absl::Condition(&condition)); + } // Subsequent chunk (or chunk arriving after WriteAcquire was called by another chunk) // - get existing backing store. 
RAY_CHECK_OK(object_manager_->GetObjectBackingStore(info.local_object_id, diff --git a/src/ray/core_worker/experimental_mutable_object_provider.h b/src/ray/core_worker/experimental_mutable_object_provider.h index dd4ff3f4e90d..dac71d1e863a 100644 --- a/src/ray/core_worker/experimental_mutable_object_provider.h +++ b/src/ray/core_worker/experimental_mutable_object_provider.h @@ -266,9 +266,10 @@ class MutableObjectProvider : public MutableObjectProviderInterface { // write epochs, preventing stale retries from interfering with new writes. std::unordered_map<ObjectID, std::unordered_set<std::pair<uint64_t, int64_t>>> received_chunks_ ABSL_GUARDED_BY(written_so_far_lock_); - // Tracks whether WriteAcquire has been called for each object to handle out-of-order - // chunks. This ensures WriteAcquire is called exactly once, even if chunks arrive - // concurrently or out of order. + // Tracks whether WriteAcquire has been called AND completed for each object to handle + // out-of-order chunks. This ensures WriteAcquire is called exactly once, even if chunks + // arrive concurrently or out of order. Other threads must wait until this is true + // before calling GetObjectBackingStore(). std::unordered_map<ObjectID, bool> write_acquired_ ABSL_GUARDED_BY(written_so_far_lock_); // Maps writer_object_id to the highest version that has completed WriteRelease. 
diff --git a/src/ray/rpc/tests/push_mutable_object_test.cc b/src/ray/rpc/tests/push_mutable_object_test.cc index d17b0021daf6..f22dd769018f 100644 --- a/src/ray/rpc/tests/push_mutable_object_test.cc +++ b/src/ray/rpc/tests/push_mutable_object_test.cc @@ -300,8 +300,14 @@ TEST_F(PushMutableObjectTest, TestSingleChunkTransferSuccess) { callback_reply = std::move(reply); }; - raylet_client_->PushMutableObject( - writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); + raylet_client_->PushMutableObject(writer_object_id, + data_size, + metadata_size, + data.data(), + metadata.data(), + /*version=*/1, + callback, + /*timeout_ms=*/-1); // Wait for callback auto start_time = std::chrono::steady_clock::now(); @@ -359,8 +365,14 @@ TEST_F(PushMutableObjectTest, TestMultiChunkTransferSuccess) { callback_reply = std::move(reply); }; - raylet_client_->PushMutableObject( - writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); + raylet_client_->PushMutableObject(writer_object_id, + data_size, + metadata_size, + data.data(), + metadata.data(), + /*version=*/1, + callback, + /*timeout_ms=*/-1); // Wait for callback (may take longer for large object) auto start_time = std::chrono::steady_clock::now(); @@ -415,8 +427,14 @@ TEST_F(PushMutableObjectTest, TestCallbackCalledOnlyOnce) { callback_status = status; }; - raylet_client_->PushMutableObject( - writer_object_id, data_size, metadata_size, data.data(), metadata.data(), callback); + raylet_client_->PushMutableObject(writer_object_id, + data_size, + metadata_size, + data.data(), + metadata.data(), + /*version=*/1, + callback, + /*timeout_ms=*/-1); // Wait for completion auto start_time = std::chrono::steady_clock::now(); @@ -462,8 +480,14 @@ TEST_F(PushMutableObjectTest, TestRetryHappens) { grpc_server_->Shutdown(); // Start the transfer (will fail initially due to UNAVAILABLE) - raylet_client_->PushMutableObject( - writer_object_id, data_size, metadata_size, data.data(), 
metadata.data(), callback); + raylet_client_->PushMutableObject(writer_object_id, + data_size, + metadata_size, + data.data(), + metadata.data(), + /*version=*/1, + callback, + /*timeout_ms=*/-1); // Wait a bit to let initial attempts fail std::this_thread::sleep_for(std::chrono::milliseconds(500));