Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

set unstable pendding empty chunkserver status to retired #416

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/mds/heartbeat/chunkserver_healthy_checker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ void ChunkserverHealthyChecker::UpdateChunkServerOnlineState(

bool ChunkserverHealthyChecker::TrySetChunkServerRetiredIfNeed(
const HeartbeatInfo &info) {
// Disregard when chunkserver isn't in OFFLINE status
if (OnlineState::OFFLINE != info.state) {
// Disregard when chunkserver is in ONLINE status
if (OnlineState::ONLINE == info.state) {
Copy link
Member

@xu-chaojie xu-chaojie Jul 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would an empty chunkserver in PENDDING status be unstable?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When you stop this chunkserver, its state changes from online to unstable.

return false;
}

Expand All @@ -154,6 +154,11 @@ bool ChunkserverHealthyChecker::TrySetChunkServerRetiredIfNeed(
return true;
}

if (OnlineState::OFFLINE != info.state
&& cs.GetStatus() != ChunkServerStatus::PENDDING) {
return false;
}

// Check for any remaining copyset on a chunkserver
bool noCopyset = topo_->GetCopySetsInChunkServer(info.csId).empty();
if (!noCopyset) {
Expand Down
53 changes: 49 additions & 4 deletions test/mds/heartbeat/chunkserver_healthy_checker_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,36 +65,61 @@ TEST(ChunkserverHealthyChecker, test_checkHeartBeat_interval) {
6, steady_clock::now() - std::chrono::milliseconds(10000));
checker->UpdateLastReceivedHeartbeatTime(
7, steady_clock::now() - std::chrono::milliseconds(10000));
checker->UpdateLastReceivedHeartbeatTime(
8, steady_clock::now());
checker->UpdateLastReceivedHeartbeatTime(
9, steady_clock::now() - std::chrono::milliseconds(4000));
checker->UpdateLastReceivedHeartbeatTime(
10, steady_clock::now() - std::chrono::milliseconds(10000));
ASSERT_TRUE(checker->GetHeartBeatInfo(1, &info));
ASSERT_EQ(OnlineState::UNSTABLE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(2, &info));
ASSERT_EQ(OnlineState::UNSTABLE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(3, &info));
ASSERT_EQ(OnlineState::UNSTABLE, info.state);
ASSERT_FALSE(checker->GetHeartBeatInfo(4, &info));
ASSERT_EQ(OnlineState::UNSTABLE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(5, &info));
ASSERT_EQ(OnlineState::UNSTABLE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(6, &info));
ASSERT_EQ(OnlineState::UNSTABLE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(7, &info));
ASSERT_EQ(OnlineState::UNSTABLE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(8, &info));
ASSERT_EQ(OnlineState::UNSTABLE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(9, &info));
ASSERT_EQ(OnlineState::UNSTABLE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(10, &info));
ASSERT_EQ(OnlineState::UNSTABLE, info.state);
}

{
// chunkserver-2 心跳miss,
// chunkserver-1 更新为online
// chunkserver-2 心跳miss,保持unstable
// chunkserver-3,chunkserver-5,chunkserver-6心跳offline,
// chunkserver-3的retired状态会被更新, 从心跳map中移除
// chunkserver-5已经是retired状态,无需更新
// chunkserver-6 get info失败, 未成功更新状态
// chunnkserver-7 update失败, 未成功更新状态
// chunkserver-8, pendding && online, 更新为onLine
// chunkserver-9, pendding && unstable, 更新为retired
// chunkserver-10, pendding && offline, 更新为retired
EXPECT_CALL(*topology, UpdateChunkServerOnlineState(_, _))
.Times(7).WillRepeatedly(Return(kTopoErrCodeSuccess));
ChunkServer cs2(2, "", "", 1, "", 0, "",
ChunkServerStatus::READWRITE, OnlineState::UNSTABLE);
ChunkServer cs3(3, "", "", 1, "", 0, "",
ChunkServerStatus::READWRITE, OnlineState::UNSTABLE);
ChunkServer cs5(5, "", "", 1, "", 0, "",
ChunkServerStatus::RETIRED, OnlineState::UNSTABLE);
ChunkServer cs7(7, "", "", 1, "", 0, "",
ChunkServerStatus::READWRITE, OnlineState::UNSTABLE);
ChunkServer cs9(9, "", "", 1, "", 0, "",
ChunkServerStatus::PENDDING, OnlineState::UNSTABLE);
ChunkServer cs10(10, "", "", 1, "", 0, "",
ChunkServerStatus::PENDDING, OnlineState::UNSTABLE);
EXPECT_CALL(*topology, GetChunkServer(2, _))
.WillOnce(DoAll(SetArgPointee<1>(cs2), Return(true)));
EXPECT_CALL(*topology, GetChunkServer(3, _))
.WillOnce(DoAll(SetArgPointee<1>(cs3), Return(true)));
EXPECT_CALL(*topology, GetCopySetsInChunkServer(3, _))
Expand All @@ -107,10 +132,20 @@ TEST(ChunkserverHealthyChecker, test_checkHeartBeat_interval) {
.WillOnce(Return(false));
EXPECT_CALL(*topology, GetChunkServer(7, _))
.WillOnce(DoAll(SetArgPointee<1>(cs7), Return(true)));
EXPECT_CALL(*topology, GetChunkServer(9, _))
.WillOnce(DoAll(SetArgPointee<1>(cs9), Return(true)));
EXPECT_CALL(*topology, GetCopySetsInChunkServer(9, _))
.WillOnce(Return(std::vector<CopySetKey>{}));
EXPECT_CALL(*topology, GetChunkServer(10, _))
.WillOnce(DoAll(SetArgPointee<1>(cs10), Return(true)));
EXPECT_CALL(*topology, GetCopySetsInChunkServer(10, _))
.WillOnce(Return(std::vector<CopySetKey>{}));
EXPECT_CALL(*topology, UpdateChunkServerRwState(_, _))
.Times(2)
.Times(4)
.WillOnce(Return(kTopoErrCodeSuccess))
.WillOnce(Return(kTopoErrCodeInternalError));
.WillOnce(Return(kTopoErrCodeInternalError))
.WillOnce(Return(kTopoErrCodeSuccess))
.WillOnce(Return(kTopoErrCodeSuccess));
checker->CheckHeartBeatInterval();
ASSERT_TRUE(checker->GetHeartBeatInfo(1, &info));
ASSERT_EQ(OnlineState::ONLINE, info.state);
Expand All @@ -122,15 +157,25 @@ TEST(ChunkserverHealthyChecker, test_checkHeartBeat_interval) {
ASSERT_EQ(OnlineState::OFFLINE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(7, &info));
ASSERT_EQ(OnlineState::OFFLINE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(8, &info));
ASSERT_EQ(OnlineState::ONLINE, info.state);
ASSERT_FALSE(checker->GetHeartBeatInfo(9, &info));
ASSERT_FALSE(checker->GetHeartBeatInfo(10, &info));
}

{
// chunkserver-6, chunkserver-7 收到心跳
// chunkserver 2, 6 ,7 收到心跳
checker->UpdateLastReceivedHeartbeatTime(
2, steady_clock::now());
checker->UpdateLastReceivedHeartbeatTime(
6, steady_clock::now());
checker->UpdateLastReceivedHeartbeatTime(
7, steady_clock::now());
EXPECT_CALL(*topology, UpdateChunkServerOnlineState(_, _))
.Times(3).WillRepeatedly(Return(kTopoErrCodeSuccess));
checker->CheckHeartBeatInterval();
ASSERT_TRUE(checker->GetHeartBeatInfo(2, &info));
ASSERT_EQ(OnlineState::ONLINE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(6, &info));
ASSERT_EQ(OnlineState::ONLINE, info.state);
ASSERT_TRUE(checker->GetHeartBeatInfo(7, &info));
Expand Down