Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 3 additions & 15 deletions src/mock/ray/gcs_client/accessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,30 +80,17 @@ class MockNodeInfoAccessor : public NodeInfoAccessor {
int64_t timeout_ms,
const std::vector<NodeID> &node_ids),
(override));
MOCK_METHOD(void,
AsyncSubscribeToNodeChange,
(std::function<void(NodeID, const rpc::GcsNodeInfo &)> subscribe,
StatusCallback done),
(override));
MOCK_METHOD(
void,
AsyncSubscribeToNodeAddressAndLivenessChange,
(std::function<void(NodeID, const rpc::GcsNodeAddressAndLiveness &)> subscribe,
StatusCallback done),
(override));
MOCK_METHOD(const rpc::GcsNodeInfo *,
Get,
(const NodeID &node_id, bool filter_dead_nodes),
(const, override));
MOCK_METHOD(const rpc::GcsNodeAddressAndLiveness *,
MOCK_METHOD(std::optional<rpc::GcsNodeAddressAndLiveness>,
GetNodeAddressAndLiveness,
(const NodeID &node_id, bool filter_dead_nodes),
(const, override));
MOCK_METHOD((const absl::flat_hash_map<NodeID, rpc::GcsNodeInfo> &),
GetAll,
(),
(const, override));
MOCK_METHOD((const absl::flat_hash_map<NodeID, rpc::GcsNodeAddressAndLiveness> &),
MOCK_METHOD((absl::flat_hash_map<NodeID, rpc::GcsNodeAddressAndLiveness>),
GetAllNodeAddressAndLiveness,
(),
(const, override));
Expand All @@ -114,6 +101,7 @@ class MockNodeInfoAccessor : public NodeInfoAccessor {
std::vector<bool> &nodes_alive),
(override));
MOCK_METHOD(bool, IsNodeDead, (const NodeID &node_id), (const, override));
MOCK_METHOD(bool, IsNodeAlive, (const NodeID &node_id), (const, override));
MOCK_METHOD(void, AsyncResubscribe, (), (override));
};

Expand Down
2 changes: 1 addition & 1 deletion src/ray/core_worker/core_worker.h
Original file line number Diff line number Diff line change
Expand Up @@ -1945,7 +1945,7 @@ class CoreWorker {
// the shutdown procedure without exposing additional public APIs.
friend class CoreWorkerShutdownExecutor;

/// Used to block in certain spots if the GCS node cache is needed.
/// Used to block in certain spots if the GCS node address and liveness cache is needed.
std::mutex gcs_client_node_cache_populated_mutex_;
std::condition_variable gcs_client_node_cache_populated_cv_;
bool gcs_client_node_cache_populated_ = false;
Expand Down
4 changes: 2 additions & 2 deletions src/ray/core_worker/core_worker_process.cc
Original file line number Diff line number Diff line change
Expand Up @@ -588,9 +588,9 @@ std::shared_ptr<CoreWorker> CoreWorkerProcessImpl::CreateCoreWorker(
if (object_locations.has_value()) {
locations.reserve(object_locations->size());
for (const auto &node_id : *object_locations) {
auto *node_info = core_worker->gcs_client_->Nodes().GetNodeAddressAndLiveness(
auto node_info = core_worker->gcs_client_->Nodes().GetNodeAddressAndLiveness(
node_id, /*filter_dead_nodes=*/false);
if (node_info == nullptr) {
if (!node_info) {
// Unsure if the node is dead, so we need to confirm with the GCS. This should
// be rare, the only foreseeable reasons are:
// 1. We filled our cache after the GCS cleared the node info due to
Expand Down
2 changes: 1 addition & 1 deletion src/ray/core_worker/task_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1162,7 +1162,7 @@ bool TaskManager::RetryTaskIfPossible(const TaskID &task_id,
const auto node_info =
gcs_client_->Nodes().GetNodeAddressAndLiveness(task_entry.GetNodeId(),
/*filter_dead_nodes=*/false);
is_preempted = node_info != nullptr && node_info->has_death_info() &&
is_preempted = node_info && node_info->has_death_info() &&
node_info->death_info().reason() ==
rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED;
}
Expand Down
2 changes: 1 addition & 1 deletion src/ray/core_worker/tests/task_manager_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2889,7 +2889,7 @@ TEST_F(TaskManagerTest, TestTaskRetriedOnNodePreemption) {
rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED);
EXPECT_CALL(*mock_gcs_client_->mock_node_accessor,
GetNodeAddressAndLiveness(node_id, false))
.WillOnce(::testing::Return(&node_info));
.WillOnce(::testing::Return(node_info));

// Task should be retried because the node was preempted, even with 0 retries left
rpc::RayErrorInfo node_died_error;
Expand Down
4 changes: 2 additions & 2 deletions src/ray/core_worker_rpc_client/core_worker_client_pool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ std::function<void()> CoreWorkerClientPool::GetDefaultUnavailableTimeoutCallback
};

if (gcs_client->Nodes().IsSubscribedToNodeChange()) {
auto *node_info = gcs_client->Nodes().GetNodeAddressAndLiveness(
auto node_info = gcs_client->Nodes().GetNodeAddressAndLiveness(
node_id, /*filter_dead_nodes=*/false);
if (node_info == nullptr) {
if (!node_info) {
// Node could be dead or info may have not made it to the subscriber cache yet.
// Check with the GCS to confirm if the node is dead.
gcs_check_node_alive();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,7 @@ class MockGcsClientNodeAccessor : public gcs::NodeInfoAccessor {

bool IsSubscribedToNodeChange() const override { return is_subscribed_to_node_change_; }

MOCK_METHOD(const rpc::GcsNodeInfo *, Get, (const NodeID &, bool), (const, override));

MOCK_METHOD(const rpc::GcsNodeAddressAndLiveness *,
MOCK_METHOD(std::optional<rpc::GcsNodeAddressAndLiveness>,
GetNodeAddressAndLiveness,
(const NodeID &, bool),
(const, override));
Expand Down Expand Up @@ -212,16 +210,16 @@ TEST_P(DefaultUnavailableTimeoutCallbackTest, NodeDeath) {
if (is_subscribed_to_node_change_) {
EXPECT_CALL(mock_node_accessor,
GetNodeAddressAndLiveness(worker_1_node_id, /*filter_dead_nodes=*/false))
.WillOnce(Return(nullptr))
.WillOnce(Return(&node_info_alive))
.WillOnce(Return(&node_info_dead));
.WillOnce(Return(std::nullopt))
.WillOnce(Return(node_info_alive))
.WillOnce(Return(node_info_dead));
EXPECT_CALL(
mock_node_accessor,
AsyncGetAllNodeAddressAndLiveness(_, _, std::vector<NodeID>{worker_1_node_id}))
.WillOnce(invoke_with_node_info_vector({node_info_alive}));
EXPECT_CALL(mock_node_accessor,
GetNodeAddressAndLiveness(worker_2_node_id, /*filter_dead_nodes=*/false))
.WillOnce(Return(nullptr));
.WillOnce(Return(std::nullopt));
EXPECT_CALL(
mock_node_accessor,
AsyncGetAllNodeAddressAndLiveness(_, _, std::vector<NodeID>{worker_2_node_id}))
Expand Down Expand Up @@ -279,7 +277,7 @@ TEST_P(DefaultUnavailableTimeoutCallbackTest, WorkerDeath) {
EXPECT_CALL(gcs_client_.MockNodeAccessor(),
GetNodeAddressAndLiveness(_, /*filter_dead_nodes=*/false))
.Times(2)
.WillRepeatedly(Return(&node_info_alive));
.WillRepeatedly(Return(node_info_alive));
} else {
EXPECT_CALL(gcs_client_.MockNodeAccessor(),
AsyncGetAllNodeAddressAndLiveness(_, _, _))
Expand Down
Loading