Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

issue-2674: directory sharding - shard->shard stats aggregation fixes + ut #2800

Merged
merged 1 commit into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions cloud/filestore/libs/storage/service/service_ut_sharding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4792,6 +4792,157 @@ Y_UNIT_TEST_SUITE(TStorageServiceShardingTest)
static_cast<ui32>(NProto::E_REGULAR_NODE),
getAttrResponse.GetNode().GetType());
}

SERVICE_TEST_SID_SELECT_IN_LEADER_ONLY(
ShouldAggregateFileSystemMetricsInBackgroundWithDirectoriesInShards)
{
config.SetMultiTabletForwardingEnabled(true);
config.SetDirectoryCreationInShardsEnabled(true);
TTestEnv env({}, config);
env.CreateSubDomain("nfs");

ui32 nodeIdx = env.CreateNode("nfs");

const TString fsId = "test";
const auto shard1Id = fsId + "-f1";
const auto shard2Id = fsId + "-f2";

TServiceClient service(env.GetRuntime(), nodeIdx);
service.CreateFileStore(fsId, 1'000);
service.CreateFileStore(shard1Id, 1'000);
service.CreateFileStore(shard2Id, 1'000);

TActorId shard1ActorId;
env.GetRuntime().SetEventFilter(
[&] (auto& runtime, TAutoPtr<IEventHandle>& event) {
Y_UNUSED(runtime);
switch (event->GetTypeRewrite()) {
case TEvIndexTablet::EvConfigureAsShardRequest: {
using R = TEvIndexTablet::TEvConfigureAsShardRequest;
const auto* msg = event->Get<R>();
if (shard1Id == msg->Record.GetFileSystemId()) {
shard1ActorId = event->Recipient;
}
break;
}
}

return false;
});

ConfigureShards(service, fsId, shard1Id, shard2Id, true);

auto headers = service.InitSession(fsId, "client");

// creating 2 files

auto createNodeResponse = service.CreateNode(
headers,
TCreateNodeArgs::File(RootNodeId, "file1"))->Record;

const auto nodeId1 = createNodeResponse.GetNode().GetId();
UNIT_ASSERT_VALUES_EQUAL(1, ExtractShardNo(nodeId1));

createNodeResponse = service.CreateNode(
headers,
TCreateNodeArgs::File(RootNodeId, "file2"))->Record;

const auto nodeId2 = createNodeResponse.GetNode().GetId();
UNIT_ASSERT_VALUES_EQUAL(2, ExtractShardNo(nodeId2));

ui64 handle1 = service.CreateHandle(
headers,
fsId,
nodeId1,
"",
TCreateHandleArgs::RDWR)->Record.GetHandle();

auto data1 = GenerateValidateData(256_KB);
service.WriteData(headers, fsId, nodeId1, handle1, 0, data1);

ui64 handle2 = service.CreateHandle(
headers,
fsId,
nodeId2,
"",
TCreateHandleArgs::RDWR)->Record.GetHandle();

auto data2 = GenerateValidateData(512_KB);
service.WriteData(headers, fsId, nodeId2, handle2, 0, data2);

// triggering background shard stats collection

env.GetRuntime().AdvanceCurrentTime(TDuration::Seconds(15));

{
using TRequest = TEvIndexTabletPrivate::TEvUpdateCounters;

env.GetRuntime().Send(
new IEventHandle(
shard1ActorId, // recipient
TActorId(), // sender
new TRequest(),
0, // flags
0),
0);
}

TDispatchOptions options;
options.FinalEvents = {
TDispatchOptions::TFinalEventCondition(
TEvIndexTabletPrivate::EvGetShardStatsCompleted)};
service.AccessRuntime().DispatchEvents(options);

{
NProtoPrivate::TGetStorageStatsRequest request;
request.SetFileSystemId(shard1Id);
TString buf;
google::protobuf::util::MessageToJsonString(request, &buf);
auto response = service.ExecuteAction("GetStorageStats", buf);
NProtoPrivate::TGetStorageStatsResponse record;
auto status = google::protobuf::util::JsonStringToMessage(
response->Record.GetOutput(),
&record);
auto stats = record.GetStats();
UNIT_ASSERT_VALUES_EQUAL(0, stats.ShardStatsSize());
UNIT_ASSERT_VALUES_EQUAL(
data1.size() / 4_KB,
stats.GetUsedBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
data1.size() / 4_KB,
stats.GetMixedBlocksCount());

request.SetAllowCache(true);
request.SetForceFetchShardStats(true);
buf.clear();
google::protobuf::util::MessageToJsonString(request, &buf);
response = service.ExecuteAction("GetStorageStats", buf);
record.Clear();
status = google::protobuf::util::JsonStringToMessage(
response->Record.GetOutput(),
&record);
stats = record.GetStats();
UNIT_ASSERT_VALUES_EQUAL(2, stats.ShardStatsSize());
UNIT_ASSERT_VALUES_EQUAL(
64,
stats.GetShardStats(0).GetUsedBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
1000,
stats.GetShardStats(0).GetTotalBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
128,
stats.GetShardStats(1).GetUsedBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
1000,
stats.GetShardStats(1).GetTotalBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
(data1.size() + data2.size()) / 4_KB,
stats.GetUsedBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
(data1.size() + data2.size()) / 4_KB,
stats.GetMixedBlocksCount());
}
}
}

} // namespace NCloud::NFileStore::NStorage
23 changes: 17 additions & 6 deletions cloud/filestore/libs/storage/tablet/tablet_actor_counters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,11 @@ void TGetShardStatsActor::HandleGetStorageStatsResponse(
LOG_DEBUG(
ctx,
TFileStoreComponents::TABLET_WORKER,
"%s Got storage stats for shard %s",
"%s Got storage stats for shard %s, used: %lu, aggregate used: %lu",
LogTag.c_str(),
ShardIds[ev->Cookie].c_str());
ShardIds[ev->Cookie].c_str(),
src.GetUsedBlocksCount(),
dst.GetUsedBlocksCount());

TABLET_VERIFY(ev->Cookie < ShardStats.size());
auto& ss = ShardStats[ev->Cookie];
Expand Down Expand Up @@ -762,13 +764,15 @@ void TIndexTabletActor::HandleUpdateCounters(
}

if (CachedStatsFetchingStartTs == TInstant::Zero()) {
// TODO(#2674): do shard<->shard stats exchange less often than normal
// metrics collection since it's O(n^2) (where n == shardCount)
auto response =
std::make_unique<TEvIndexTablet::TEvGetStorageStatsResponse>();
auto* stats = response->Record.MutableStats();
FillSelfStorageStats(stats);
const auto& shardIds = GetFileSystem().GetShardFileSystemIds();
// if shardIds isn't empty and current tablet is a shard, it will
// collect self stats via TGetShardStatsActor
if (shardIds.empty() || IsMainTablet()) {
FillSelfStorageStats(stats);
}
if (shardIds.empty()) {
CachedAggregateStats = std::move(*stats);
return;
Expand Down Expand Up @@ -852,9 +856,10 @@ void TIndexTabletActor::HandleGetStorageStats(
auto* stats = response->Record.MutableStats();
// shards shouldn't collect other shards' stats (unless it's background
// shard <-> shard stats exchange which is handled in HandleUpdateCounters)
const auto& shardIds = IsMainTablet()
const auto& shardIds = req.GetForceFetchShardStats() || IsMainTablet()
? GetFileSystem().GetShardFileSystemIds()
: Default<google::protobuf::RepeatedPtrField<TString>>();
req.SetForceFetchShardStats(false);
if (req.GetAllowCache()) {
*stats = CachedAggregateStats;
const ui32 shardMetricsCount =
Expand Down Expand Up @@ -911,6 +916,12 @@ void TIndexTabletActor::HandleGetStorageStats(
ev->Get()->CallContext);
requestInfo->StartedTs = ctx.Now();

if (!IsMainTablet()) {
// if current tablet is a shard, it will collect self stats via
// TGetShardStatsActor
stats->Clear();
}

auto actor = std::make_unique<TGetShardStatsActor>(
LogTag,
SelfId(),
Expand Down
4 changes: 4 additions & 0 deletions cloud/filestore/private/api/protos/tablet.proto
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@ message TGetStorageStatsRequest

// Use cached data, don't require up-to-date stats.
bool AllowCache = 6;

// Forces shard stats fetching even if this request is itself sent to a
// shard.
bool ForceFetchShardStats = 7;
}

message TGetStorageStatsResponse
Expand Down
Loading