-
Notifications
You must be signed in to change notification settings - Fork 7.1k
[Core] Exit the Core Worker Early When an Error Is Received from the Plasma Store #53679
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
a835e26
d4b182f
efd22a2
c7ae894
a675a47
dce3714
93f4092
ea724b0
46311ad
9b788e1
f653c80
e58b022
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -97,6 +97,7 @@ struct ObjectInUseEntry { | |
| class PlasmaClient::Impl : public std::enable_shared_from_this<PlasmaClient::Impl> { | ||
| public: | ||
| Impl(); | ||
| explicit Impl(bool exit_on_connection_failure); | ||
| ~Impl(); | ||
|
|
||
| // PlasmaClient method implementations | ||
|
|
@@ -235,11 +236,17 @@ class PlasmaClient::Impl : public std::enable_shared_from_this<PlasmaClient::Imp | |
| std::unordered_set<ObjectID> deletion_cache_; | ||
| /// A mutex which protects this class. | ||
| std::recursive_mutex client_mutex_; | ||
| /// Whether the current process should exit when a read from or a write to the connection fails. | ||
| /// Currently it is only turned on when the plasma client is in a core worker. | ||
MengjinYan marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| bool exit_on_connection_failure_; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Instead of modifying low-level The
|
||
| }; | ||
|
|
||
| PlasmaBuffer::~PlasmaBuffer() { RAY_UNUSED(client_->Release(object_id_)); } | ||
|
|
||
| PlasmaClient::Impl::Impl() : store_capacity_(0) {} | ||
| PlasmaClient::Impl::Impl() : store_capacity_(0), exit_on_connection_failure_(false) {} | ||
|
|
||
| PlasmaClient::Impl::Impl(bool exit_on_connection_failure) | ||
| : store_capacity_(0), exit_on_connection_failure_(exit_on_connection_failure) {} | ||
|
|
||
| PlasmaClient::Impl::~Impl() {} | ||
|
|
||
|
|
@@ -868,7 +875,7 @@ Status PlasmaClient::Impl::Connect(const std::string &store_socket_name, | |
| /// The local stream socket that connects to store. | ||
| ray::local_stream_socket socket(main_service_); | ||
| RAY_RETURN_NOT_OK(ray::ConnectSocketRetry(socket, store_socket_name)); | ||
| store_conn_.reset(new StoreConn(std::move(socket))); | ||
| store_conn_.reset(new StoreConn(std::move(socket), exit_on_connection_failure_)); | ||
| // Send a ConnectRequest to the store to get its memory capacity. | ||
| RAY_RETURN_NOT_OK(SendConnectRequest(store_conn_)); | ||
| std::vector<uint8_t> buffer; | ||
|
|
@@ -912,6 +919,9 @@ std::string PlasmaClient::Impl::DebugString() { | |
|
|
||
| PlasmaClient::PlasmaClient() : impl_(std::make_shared<PlasmaClient::Impl>()) {} | ||
|
|
||
| PlasmaClient::PlasmaClient(bool exit_on_connection_failure) | ||
| : impl_(std::make_shared<PlasmaClient::Impl>(exit_on_connection_failure)) {} | ||
|
|
||
| Status PlasmaClient::Connect(const std::string &store_socket_name, | ||
| const std::string &manager_socket_name, | ||
| int num_retries) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -164,10 +164,30 @@ class StoreConn : public ray::ServerConnection { | |
| public: | ||
| explicit StoreConn(ray::local_stream_socket &&socket); | ||
|
|
||
| explicit StoreConn(ray::local_stream_socket &&socket, bool exit_on_connection_failure); | ||
|
|
||
| /// Receive a file descriptor for the store. | ||
| /// | ||
| /// \return A file descriptor. | ||
| ray::Status RecvFd(MEMFD_TYPE_NON_UNIQUE *fd); | ||
|
|
||
| ray::Status WriteBuffer(const std::vector<boost::asio::const_buffer> &buffer) override; | ||
|
|
||
| ray::Status ReadBuffer(const std::vector<boost::asio::mutable_buffer> &buffer) override; | ||
|
|
||
| private: | ||
| // Whether the current process should exit when WriteBuffer or ReadBuffer fails. | ||
| // Currently it is only turned on when the plasma client is in a core worker. | ||
| // TODO(myan): For better error handling, we should: (1) In the mid-term, evaluate if | ||
| // we should turn it on for the plasma client in other processes. (2) In the | ||
| // long-term, consolidate the shutdown path between core worker and raylet to make the | ||
| // shutdown procedure cleaner. | ||
|
||
| bool exit_on_connection_failure_ = false; | ||
|
|
||
| // Shut down the current process if the passed-in status is not OK and the client is | ||
| // configured to exit on failure. | ||
| // \param status The status to check. | ||
| void ShutdownWorkerIfErrorStatus(const ray::Status &status); | ||
|
||
| }; | ||
|
|
||
| std::ostream &operator<<(std::ostream &os, const std::shared_ptr<StoreConn> &store_conn); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.