-
Notifications
You must be signed in to change notification settings - Fork 7k
[xray] Implements ray.wait #2162
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
0e18ca7
fc2572c
2e8af60
b57a548
a128698
fa5c32d
0ccf46b
b02de4f
f9a9e16
15b7f61
a22263b
8ab41f0
98bacfa
d518a89
53f33e0
8ef35f7
6e10f9e
9a95c65
aa12bd7
9e1602d
304b39c
5d63bb3
cf1fdb2
531d024
d0d3ea4
62ae832
67eef67
0796a17
dd9f0db
9d4ed2b
541b88c
58af739
d9ef29b
fa1928b
d41b1d0
aeaab5b
048f45f
0aa7525
833939f
8e1947c
83d04dd
080282f
a58f5c9
c6d8ba5
7d8d756
6b6e2f3
3a86c93
1a99f25
00eafd7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -192,12 +192,13 @@ std::pair<std::vector<ObjectID>, std::vector<ObjectID>> local_scheduler_wait( | |
| LocalSchedulerConnection *conn, | ||
| const std::vector<ObjectID> &object_ids, | ||
| int num_returns, | ||
| int64_t timeout, | ||
| int64_t timeout_milliseconds, | ||
| bool wait_local) { | ||
| // Write request. | ||
| flatbuffers::FlatBufferBuilder fbb; | ||
| auto message = ray::protocol::CreateWaitRequest( | ||
| fbb, to_flatbuf(fbb, object_ids), num_returns, timeout, wait_local); | ||
| fbb, to_flatbuf(fbb, object_ids), num_returns, timeout_milliseconds, | ||
| wait_local); | ||
| fbb.Finish(message); | ||
| write_message(conn->conn, ray::protocol::MessageType_WaitRequest, | ||
| fbb.GetSize(), fbb.GetBufferPointer()); | ||
|
|
@@ -206,6 +207,7 @@ std::pair<std::vector<ObjectID>, std::vector<ObjectID>> local_scheduler_wait( | |
| int64_t reply_size; | ||
| uint8_t *reply; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a memory leak, right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added |
||
| read_message(conn->conn, &type, &reply_size, &reply); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add RAY_CHECK(type == MessageType_WaitReply); |
||
| RAY_CHECK(type == ray::protocol::MessageType_WaitReply); | ||
| auto reply_message = flatbuffers::GetRoot<ray::protocol::WaitReply>(reply); | ||
| // Convert result. | ||
| std::pair<std::vector<ObjectID>, std::vector<ObjectID>> result; | ||
|
|
@@ -219,5 +221,7 @@ std::pair<std::vector<ObjectID>, std::vector<ObjectID>> local_scheduler_wait( | |
| ObjectID object_id = ObjectID::from_binary(remaining->Get(i)->str()); | ||
| result.second.push_back(object_id); | ||
| } | ||
| /* Free the original message from the local scheduler. */ | ||
| free(reply); | ||
| return result; | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -175,15 +175,16 @@ void local_scheduler_set_actor_frontier(LocalSchedulerConnection *conn, | |
| /// \param conn The connection information. | ||
| /// \param object_ids The objects to wait for. | ||
| /// \param num_returns The number of objects to wait for. | ||
| /// \param timeout The duration to wait before returning. | ||
| /// \param timeout_milliseconds Duration, in milliseconds, to wait before | ||
| /// returning. | ||
| /// \param wait_local Whether to wait for objects to appear on this node. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This argument doesn't seem to match the current semantics for
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| /// \return A pair with the first element containing the object ids that were | ||
| /// found, and the second element the objects that were not found. | ||
| std::pair<std::vector<ObjectID>, std::vector<ObjectID>> local_scheduler_wait( | ||
| LocalSchedulerConnection *conn, | ||
| const std::vector<ObjectID> &object_ids, | ||
| int num_returns, | ||
| int64_t timeout, | ||
| int64_t timeout_milliseconds, | ||
| bool wait_local); | ||
|
|
||
| #endif | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -355,51 +355,56 @@ ray::Status ObjectManager::Cancel(const ObjectID &object_id) { | |
| return status; | ||
| } | ||
|
|
||
| ray::Status ObjectManager::Wait(const std::vector<ObjectID> &object_ids, int64_t wait_ms, | ||
| uint64_t num_required_objects, bool wait_local, | ||
| const WaitCallback &callback) { | ||
| ray::Status ObjectManager::Wait(const std::vector<ObjectID> &object_ids, | ||
| int64_t timeout_ms, uint64_t num_required_objects, | ||
| bool wait_local, const WaitCallback &callback) { | ||
| UniqueID wait_id = UniqueID::from_random(); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We've had trouble with
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I think it was a problem for |
||
|
|
||
| if (wait_local) { | ||
| return ray::Status::NotImplemented("Wait for local objects is not yet implemented."); | ||
| } | ||
|
|
||
| RAY_CHECK(wait_ms >= 0); | ||
| RAY_CHECK(timeout_ms >= 0 || timeout_ms == -1); | ||
| RAY_CHECK(num_required_objects != 0); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here, we could return |
||
| RAY_CHECK(num_required_objects <= object_ids.size()); | ||
| if (object_ids.size() == 0) { | ||
| callback(std::vector<ObjectID>(), std::vector<ObjectID>()); | ||
| } | ||
|
|
||
| // Initialize fields. | ||
| active_wait_requests_.emplace(wait_id, WaitState(*main_service_, wait_ms, callback)); | ||
| active_wait_requests_.emplace(wait_id, WaitState(*main_service_, timeout_ms, callback)); | ||
| auto &wait_state = active_wait_requests_.find(wait_id)->second; | ||
| wait_state.object_id_order = object_ids; | ||
| wait_state.wait_ms = wait_ms; | ||
| wait_state.timeout_ms = timeout_ms; | ||
| wait_state.num_required_objects = num_required_objects; | ||
| for (auto &oid : object_ids) { | ||
| if (local_objects_.count(oid) > 0) { | ||
| wait_state.found.insert(oid); | ||
| for (auto &object_id : object_ids) { | ||
|
||
| if (local_objects_.count(object_id) > 0) { | ||
| wait_state.found.insert(object_id); | ||
| } else { | ||
| wait_state.remaining.insert(oid); | ||
| wait_state.remaining.insert(object_id); | ||
| } | ||
| } | ||
|
|
||
| if (wait_state.remaining.empty()) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if we're going to include this special case here, then the condition should probably be something like That said, I think it makes sense to handle this case after firing off the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We invoke the lookup calls immediately after checking which objects are local to obtain current information about the location of remote objects. This allows us to collect information about all given objects, regardless of their location. If we invoke when If we fire
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a comment to this |
||
| WaitComplete(wait_id); | ||
| } else { | ||
| for (auto &oid : wait_state.remaining) { | ||
| // We invoke lookup calls immediately after checking which objects are local to | ||
| // obtain current information about the location of remote objects. Thus, | ||
| // we obtain information about all given objects, regardless of their location. | ||
| // This is required to ensure we do not bias returning locally available objects | ||
| // as ready whenever Wait is invoked with a mixture of local and remote objects. | ||
| for (auto &object_id : wait_state.remaining) { | ||
| // Lookup remaining objects. | ||
| wait_state.requested_objects.insert(oid); | ||
| wait_state.requested_objects.insert(object_id); | ||
| RAY_CHECK_OK(object_directory_->LookupLocations( | ||
| oid, [this, wait_id](const std::vector<ClientID> &client_ids, | ||
| const ObjectID &object_id) { | ||
| object_id, [this, wait_id](const std::vector<ClientID> &client_ids, | ||
| const ObjectID &lookup_object_id) { | ||
| auto &wait_state = active_wait_requests_.find(wait_id)->second; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. couldn't
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The only way to progress to a point where |
||
| if (!client_ids.empty()) { | ||
| wait_state.remaining.erase(object_id); | ||
| wait_state.found.insert(object_id); | ||
| wait_state.remaining.erase(lookup_object_id); | ||
| wait_state.found.insert(lookup_object_id); | ||
| } | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What would it mean if
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It may mean that the object has been evicted. It may also mean that no entries exist for the object. In both cases, after the round of lookups, if we need to wait for more objects, |
||
| wait_state.requested_objects.erase(object_id); | ||
| wait_state.requested_objects.erase(lookup_object_id); | ||
| if (wait_state.requested_objects.empty()) { | ||
| AllWaitLookupsComplete(wait_id); | ||
| } | ||
|
|
@@ -412,47 +417,58 @@ ray::Status ObjectManager::Wait(const std::vector<ObjectID> &object_ids, int64_t | |
| void ObjectManager::AllWaitLookupsComplete(const UniqueID &wait_id) { | ||
| auto &wait_state = active_wait_requests_.find(wait_id)->second; | ||
| if (wait_state.found.size() >= wait_state.num_required_objects || | ||
| wait_state.wait_ms == 0) { | ||
| wait_state.timeout_ms == 0) { | ||
| // Requirements already satisfied. | ||
| WaitComplete(wait_id); | ||
| } else { | ||
| for (auto &oid : wait_state.remaining) { | ||
| // Subscribe to objects in order to ensure Wait-related tests are deterministic. | ||
| for (auto &object_id : wait_state.object_id_order) { | ||
|
||
| if (wait_state.remaining.count(object_id) == 0) { | ||
|
||
| continue; | ||
| } | ||
| // Subscribe to object notifications. | ||
| wait_state.requested_objects.insert(oid); | ||
| if (active_wait_requests_.find(wait_id) == active_wait_requests_.end()) { | ||
| // This is possible if an object's location is obtained immediately, | ||
| // within the current callstack. In this case, WaitComplete has been | ||
| // invoked already, so we're done. | ||
| return; | ||
| } | ||
| wait_state.requested_objects.insert(object_id); | ||
| RAY_CHECK_OK(object_directory_->SubscribeObjectLocations( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The lookup calls return immediately, regardless of whether there's a known location of the object. After the lookup calls, if the conditions of the wait call are still not satisfied, we start the timer and subscribe to each remaining object. The subscription callback will be invoked only when an object's location is found. We wait until the minimum number of objects is found or the timeout is triggered. |
||
| wait_id, oid, [this, wait_id](const std::vector<ClientID> &client_ids, | ||
| const ObjectID &object_id) { | ||
| wait_id, object_id, [this, wait_id](const std::vector<ClientID> &client_ids, | ||
| const ObjectID &subscribe_object_id) { | ||
| auto object_id_wait_state = active_wait_requests_.find(wait_id); | ||
| RAY_CHECK(object_id_wait_state != active_wait_requests_.end()); | ||
| auto &wait_state = active_wait_requests_.find(wait_id)->second; | ||
|
||
| if (wait_state.remaining.count(object_id) != 0) { | ||
| wait_state.remaining.erase(object_id); | ||
| wait_state.found.insert(object_id); | ||
| } | ||
| wait_state.requested_objects.erase(object_id); | ||
| RAY_CHECK_OK( | ||
| object_directory_->UnsubscribeObjectLocations(wait_id, object_id)); | ||
| RAY_CHECK(wait_state.remaining.erase(subscribe_object_id)); | ||
| wait_state.found.insert(subscribe_object_id); | ||
| wait_state.requested_objects.erase(subscribe_object_id); | ||
| RAY_CHECK_OK(object_directory_->UnsubscribeObjectLocations( | ||
| wait_id, subscribe_object_id)); | ||
| if (wait_state.found.size() >= wait_state.num_required_objects) { | ||
| WaitComplete(wait_id); | ||
| } | ||
| })); | ||
| } | ||
| // Set timeout. | ||
| // TODO (hme): If we need to just wait for all objects independent of time | ||
| // (i.e. infinite wait time), determine what the value of wait_ms should be and | ||
| // skip this call. WaitComplete will be invoked when all objects have locations. | ||
| wait_state.timeout_timer->async_wait( | ||
| [this, wait_id](const boost::system::error_code &error_code) { | ||
| if (error_code.value() != 0) { | ||
| return; | ||
| } | ||
| WaitComplete(wait_id); | ||
| }); | ||
| if (wait_state.timeout_ms != -1) { | ||
| wait_state.timeout_timer->async_wait( | ||
| [this, wait_id](const boost::system::error_code &error_code) { | ||
| if (error_code.value() != 0) { | ||
| return; | ||
| } | ||
| WaitComplete(wait_id); | ||
| }); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| void ObjectManager::WaitComplete(const UniqueID &wait_id) { | ||
| auto &wait_state = active_wait_requests_.find(wait_id)->second; | ||
| // If we complete with outstanding requests, then wait_ms should be non-zero. | ||
| RAY_CHECK(!(wait_state.requested_objects.size() > 0) || wait_state.wait_ms > 0); | ||
| // If we complete with outstanding requests, then timeout_ms should be non-zero or -1 | ||
| // (infinite wait time). | ||
| if (!wait_state.requested_objects.empty()) { | ||
| RAY_CHECK(wait_state.timeout_ms > 0 || wait_state.timeout_ms == -1); | ||
| } | ||
| // Unsubscribe to any objects that weren't found in the time allotted. | ||
| for (auto &object_id : wait_state.requested_objects) { | ||
| RAY_CHECK_OK(object_directory_->UnsubscribeObjectLocations(wait_id, object_id)); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -157,11 +157,13 @@ class ObjectManager : public ObjectManagerInterface { | |
| /// \param callback Invoked when either timeout_ms is satisfied OR num_ready_objects | ||
| /// is satisfied. | ||
| /// \return Status of whether the wait successfully initiated. | ||
| ray::Status Wait(const std::vector<ObjectID> &object_ids, int64_t wait_ms, | ||
| ray::Status Wait(const std::vector<ObjectID> &object_ids, int64_t timeout_ms, | ||
| uint64_t num_required_objects, bool wait_local, | ||
| const WaitCallback &callback); | ||
|
|
||
| private: | ||
| friend class TestObjectManager; | ||
|
|
||
| ClientID client_id_; | ||
| const ObjectManagerConfig config_; | ||
| std::unique_ptr<ObjectDirectoryInterface> object_directory_; | ||
|
|
@@ -197,17 +199,20 @@ class ObjectManager : public ObjectManagerInterface { | |
| /// Cache of locally available objects. | ||
| std::unordered_map<ObjectID, ObjectInfoT> local_objects_; | ||
|
|
||
| /// This is used as the callback identifier in Pull for | ||
| /// SubscribeObjectLocations. We only need one identifier because we never need to | ||
| /// subscribe multiple times to the same object during Pull. | ||
| UniqueID object_directory_pull_callback_id_ = UniqueID::from_random(); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh I see this is only called once
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this field needed?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's used to distinguish object directory
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Document this field. |
||
|
|
||
| struct WaitState { | ||
| WaitState(asio::io_service &service, int64_t wait_ms, const WaitCallback &callback) | ||
| : wait_ms(wait_ms), | ||
| WaitState(asio::io_service &service, int64_t timeout_ms, const WaitCallback &callback) | ||
| : timeout_ms(timeout_ms), | ||
| timeout_timer(std::unique_ptr<boost::asio::deadline_timer>( | ||
| new boost::asio::deadline_timer(service, | ||
| boost::posix_time::milliseconds(wait_ms)))), | ||
| new boost::asio::deadline_timer( | ||
| service, boost::posix_time::milliseconds(timeout_ms)))), | ||
| callback(callback) {} | ||
| /// The period of time to wait before invoking the callback. | ||
| int64_t wait_ms; | ||
| int64_t timeout_ms; | ||
| /// The timer used whenever wait_ms > 0. | ||
| std::unique_ptr<boost::asio::deadline_timer> timeout_timer; | ||
| /// The callback invoked when WaitCallback is complete. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
does
wait_localwork?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No; I just added the piping for the argument. The back-end returns an unimplemented status if it's set to true. I have an idea of how to implement this whenever we'd like to add it.