-
Notifications
You must be signed in to change notification settings - Fork 7.5k
[core][cgraph] Introduce fault-tolerant PushMutableObject #58866
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
2bfcf51
e14a5b7
c836985
1051cdf
1a4ea6c
6dff21b
34a87b2
138d1fe
6f9cc19
5ec49eb
e96483f
0dc0e8e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -136,19 +136,87 @@ void MutableObjectProvider::HandlePushMutableObject( | |
| uint64_t offset = request.offset(); | ||
| uint64_t chunk_size = request.chunk_size(); | ||
|
|
||
| uint64_t tmp_written_so_far = 0; | ||
| // Validate request bounds to prevent buffer overflows. | ||
| RAY_CHECK_LE(offset + chunk_size, total_data_size) | ||
| << "Chunk extends beyond total data size. offset=" << offset | ||
| << ", chunk_size=" << chunk_size << ", total_data_size=" << total_data_size; | ||
| RAY_CHECK_EQ(request.data().size(), chunk_size) | ||
| << "Data size mismatch. Expected " << chunk_size << " bytes, got " | ||
| << request.data().size() << " bytes"; | ||
| RAY_CHECK_EQ(request.metadata().size(), total_metadata_size) | ||
| << "Metadata size mismatch. Expected " << total_metadata_size << " bytes, got " | ||
| << request.metadata().size() << " bytes"; | ||
|
|
||
| // Version-based idempotent retry handling strategy: | ||
| // - Stale versions (<= highest_completed): discard immediately | ||
| // - Active version (== highest_completed + 1): write to backing store | ||
| // - Future versions (> highest_completed + 1): buffer for later processing | ||
| int64_t request_version = request.version(); | ||
|
|
||
| // Step 1: Reject stale retries from completed writes (O(1) check) | ||
| int64_t highest_completed = 0; | ||
| { | ||
| absl::MutexLock guard(&written_so_far_lock_); | ||
| highest_completed = highest_completed_version_[writer_object_id]; // default 0 | ||
|
|
||
| tmp_written_so_far = written_so_far_[writer_object_id]; | ||
| written_so_far_[writer_object_id] += chunk_size; | ||
| if (written_so_far_[writer_object_id] == total_data_size) { | ||
| written_so_far_.erase(written_so_far_.find(writer_object_id)); | ||
| if (request_version <= highest_completed) { | ||
| // Stale retry from already-completed write | ||
| reply->set_done(true); | ||
| return; | ||
| } | ||
| } | ||
|
|
||
| // Step 2: Determine if this is active version or future version | ||
| int64_t active_version = highest_completed + 1; | ||
| bool is_active_version = (request_version == active_version); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TOCTOU race allows stale retries to corrupt state. A time-of-check-time-of-use race exists between the two lock acquisitions: the code reads `highest_completed_version_` under the first `written_so_far_lock_` critical section, releases the lock, and only later re-acquires it for the duplicate-chunk check — so another thread can complete a write (advancing the highest completed version) in between, allowing a retry that is actually stale to proceed as if it were the active version. Additional Locations (1) |
||
|
|
||
| // Step 3: Check for duplicate chunks using (offset, version) key | ||
| auto chunk_key = std::make_pair(offset, request_version); | ||
| bool needs_write_acquire = false; | ||
|
|
||
| { | ||
| absl::MutexLock guard(&written_so_far_lock_); | ||
| auto &received_chunks = received_chunks_[writer_object_id]; | ||
|
|
||
| if (received_chunks.find(chunk_key) != received_chunks.end()) { | ||
| // Duplicate chunk - return status | ||
| if (is_active_version) { | ||
| auto written_it = written_so_far_.find(writer_object_id); | ||
| uint64_t written = (written_it != written_so_far_.end()) ? written_it->second : 0; | ||
| reply->set_done(written == total_data_size); | ||
| } else { | ||
| reply->set_done(false); | ||
| } | ||
|
ruisearch42 marked this conversation as resolved.
|
||
| return; | ||
| } | ||
|
|
||
| // Step 4: For future versions, buffer only (don't write to backing store) | ||
| if (!is_active_version) { | ||
| received_chunks.insert(chunk_key); | ||
| reply->set_done(false); | ||
| return; | ||
| } | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Future version chunks tracked but data never written. When a future-version chunk arrives (version > active version), the code inserts the (offset, version) key into `received_chunks_` and returns without copying the chunk payload anywhere — so the chunk is marked as received, but its bytes are never written to the backing store, and a later retry of the same chunk will be rejected as a duplicate even though its data was dropped. Additional Locations (1)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can simply remove |
||
|
|
||
| // Step 5: Active version - check if need WriteAcquire | ||
| if (!write_acquired_[writer_object_id]) { | ||
| needs_write_acquire = true; | ||
| // Note: We do NOT set write_acquired_ = true here yet. This prevents other threads | ||
| // from calling GetObjectBackingStore() before WriteAcquire() completes. | ||
| } | ||
| } | ||
|
|
||
| // Continue with WriteAcquire and write logic for active version | ||
| bool object_complete = false; | ||
|
|
||
| std::shared_ptr<Buffer> object_backing_store; | ||
| if (tmp_written_so_far == 0u) { | ||
| if (needs_write_acquire) { | ||
| // Initialize written_so_far_ for new write | ||
| { | ||
| absl::MutexLock guard(&written_so_far_lock_); | ||
| written_so_far_[writer_object_id] = 0; | ||
| } | ||
| // First chunk to arrive (may not be offset 0 due to out-of-order delivery) - | ||
| // acquire write lock and allocate backing store. | ||
| // We set `metadata` to nullptr since the metadata is at the end of the object, which | ||
| // we will not have until the last chunk is received. | ||
| RAY_CHECK_OK(object_manager_->WriteAcquire(info.local_object_id, | ||
|
|
@@ -157,24 +225,90 @@ void MutableObjectProvider::HandlePushMutableObject( | |
| total_metadata_size, | ||
| info.num_readers, | ||
| object_backing_store)); | ||
| // Now that WriteAcquire has completed, set write_acquired_ to true so other threads | ||
| // can proceed with GetObjectBackingStore(). | ||
| { | ||
| absl::MutexLock guard(&written_so_far_lock_); | ||
| write_acquired_[writer_object_id] = true; | ||
| } | ||
| } else { | ||
| // Wait until WriteAcquire has completed before calling GetObjectBackingStore. | ||
| // This prevents the race condition where we check write_acquired_ before | ||
| // WriteAcquire() has actually completed. | ||
| { | ||
| absl::MutexLock guard(&written_so_far_lock_); | ||
| auto condition = [this, &writer_object_id]() | ||
| ABSL_SHARED_LOCKS_REQUIRED(written_so_far_lock_) { | ||
| return write_acquired_[writer_object_id]; | ||
| }; | ||
| written_so_far_lock_.Await(absl::Condition(&condition)); | ||
| } | ||
| // Subsequent chunk (or chunk arriving after WriteAcquire was called by another chunk) | ||
| // - get existing backing store. | ||
| RAY_CHECK_OK(object_manager_->GetObjectBackingStore(info.local_object_id, | ||
| total_data_size, | ||
| total_metadata_size, | ||
| object_backing_store)); | ||
| } | ||
|
ruisearch42 marked this conversation as resolved.
|
||
| RAY_CHECK(object_backing_store); | ||
|
|
||
| // Copy chunk data to backing store. | ||
| memcpy(object_backing_store->Data() + offset, request.data().data(), chunk_size); | ||
| size_t total_written = tmp_written_so_far + chunk_size; | ||
| RAY_CHECK_LE(total_written, total_data_size); | ||
| if (total_written == total_data_size) { | ||
| // Copy the metadata to the end of the object. | ||
|
|
||
| // Mark this chunk as received only after successfully writing it. | ||
| // This ensures retries are handled correctly even if WriteAcquire fails. | ||
| { | ||
| absl::MutexLock guard(&written_so_far_lock_); | ||
| // Mark chunk as received using (offset, version) pair (reusing chunk_key from above) | ||
| received_chunks_[writer_object_id].insert(chunk_key); | ||
| // Update written_so_far_ by adding this chunk's size. | ||
| // Note: written_so_far_ was already initialized to 0 in the first lock block | ||
| // for new writes, so we can safely increment it here. | ||
| written_so_far_[writer_object_id] += chunk_size; | ||
| RAY_CHECK_LE(written_so_far_[writer_object_id], total_data_size); | ||
| if (written_so_far_[writer_object_id] == total_data_size) { | ||
| object_complete = true; | ||
| // Note: We keep received_chunks_ and written_so_far_ entries until WriteRelease | ||
| // completes to handle retries. They will be cleaned up after WriteRelease() is | ||
| // called. | ||
| } | ||
| } | ||
|
|
||
| if (object_complete) { | ||
| // All data chunks received - copy metadata and release write lock. | ||
| memcpy(object_backing_store->Data() + total_data_size, | ||
| request.metadata().data(), | ||
| total_metadata_size); | ||
| // The entire object has been written, so call `WriteRelease()`. | ||
| RAY_CHECK_OK(object_manager_->WriteRelease(info.local_object_id)); | ||
|
|
||
| // Update tracking state after WriteRelease | ||
| { | ||
| absl::MutexLock guard(&written_so_far_lock_); | ||
|
|
||
| // Update highest completed version | ||
| highest_completed_version_[writer_object_id] = request_version; | ||
|
|
||
| // Remove ONLY chunks belonging to this completed version | ||
| auto &chunks = received_chunks_[writer_object_id]; | ||
| for (auto it = chunks.begin(); it != chunks.end();) { | ||
| if (it->second == request_version) { | ||
| it = chunks.erase(it); | ||
| } else { | ||
| ++it; | ||
| } | ||
| } | ||
|
|
||
| // Clear per-object tracking for next write | ||
| written_so_far_.erase(writer_object_id); | ||
| write_acquired_.erase(writer_object_id); | ||
|
|
||
| // Clean up received_chunks_ entry if empty | ||
| if (chunks.empty()) { | ||
| received_chunks_.erase(writer_object_id); | ||
| } | ||
| } | ||
|
|
||
| reply->set_done(true); | ||
| } else { | ||
| reply->set_done(false); | ||
|
|
@@ -222,9 +356,11 @@ void MutableObjectProvider::PollWriterClosure( | |
| &remote_readers) { | ||
| // NOTE: There's only 1 PollWriterClosure at any time in a single thread. | ||
| std::shared_ptr<RayObject> object; | ||
| int64_t version = 0; | ||
| // The corresponding ReadRelease() will be automatically called when | ||
| // `object` goes out of scope. | ||
| Status status = object_manager_->ReadAcquire(writer_object_id, object); | ||
| Status status = | ||
| object_manager_->ReadAcquire(writer_object_id, object, version, /*timeout_ms=*/-1); | ||
| // Check if the thread returned from ReadAcquire() because the process is exiting, not | ||
| // because there is something to read. | ||
| if (status.code() == StatusCode::ChannelError) { | ||
|
|
@@ -236,6 +372,9 @@ void MutableObjectProvider::PollWriterClosure( | |
| RAY_CHECK(object->GetData()); | ||
| RAY_CHECK(object->GetMetadata()); | ||
|
|
||
| // Version was obtained safely from ReadAcquire (with header_sem protection) | ||
| RAY_CHECK_GT(version, 0) << "Invalid version for " << writer_object_id; | ||
|
|
||
| std::shared_ptr<size_t> num_replied = std::make_shared<size_t>(0); | ||
| for (const auto &reader : *remote_readers) { | ||
| reader->PushMutableObject( | ||
|
|
@@ -244,6 +383,7 @@ void MutableObjectProvider::PollWriterClosure( | |
| object->GetMetadata()->Size(), | ||
| object->GetData()->Data(), | ||
| object->GetMetadata()->Data(), | ||
| version, | ||
| [this, &io_context, writer_object_id, remote_readers, num_replied]( | ||
| const Status &push_object_status, const rpc::PushMutableObjectReply &reply) { | ||
| *num_replied += 1; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you return this value together with the status as a StatusOr, instead of using an out parameter?