Skip to content

Commit

Permalink
issue-2674: directory sharding - shard->shard stats aggregation fixes…
Browse files Browse the repository at this point in the history
… + ut (#2800)
  • Loading branch information
qkrorlqr authored Jan 7, 2025
1 parent b035a38 commit 4f702df
Show file tree
Hide file tree
Showing 3 changed files with 172 additions and 6 deletions.
151 changes: 151 additions & 0 deletions cloud/filestore/libs/storage/service/service_ut_sharding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4792,6 +4792,157 @@ Y_UNIT_TEST_SUITE(TStorageServiceShardingTest)
static_cast<ui32>(NProto::E_REGULAR_NODE),
getAttrResponse.GetNode().GetType());
}

SERVICE_TEST_SID_SELECT_IN_LEADER_ONLY(
ShouldAggregateFileSystemMetricsInBackgroundWithDirectoriesInShards)
{
config.SetMultiTabletForwardingEnabled(true);
config.SetDirectoryCreationInShardsEnabled(true);
TTestEnv env({}, config);
env.CreateSubDomain("nfs");

ui32 nodeIdx = env.CreateNode("nfs");

const TString fsId = "test";
const auto shard1Id = fsId + "-f1";
const auto shard2Id = fsId + "-f2";

TServiceClient service(env.GetRuntime(), nodeIdx);
service.CreateFileStore(fsId, 1'000);
service.CreateFileStore(shard1Id, 1'000);
service.CreateFileStore(shard2Id, 1'000);

TActorId shard1ActorId;
env.GetRuntime().SetEventFilter(
[&] (auto& runtime, TAutoPtr<IEventHandle>& event) {
Y_UNUSED(runtime);
switch (event->GetTypeRewrite()) {
case TEvIndexTablet::EvConfigureAsShardRequest: {
using R = TEvIndexTablet::TEvConfigureAsShardRequest;
const auto* msg = event->Get<R>();
if (shard1Id == msg->Record.GetFileSystemId()) {
shard1ActorId = event->Recipient;
}
break;
}
}

return false;
});

ConfigureShards(service, fsId, shard1Id, shard2Id, true);

auto headers = service.InitSession(fsId, "client");

// creating 2 files

auto createNodeResponse = service.CreateNode(
headers,
TCreateNodeArgs::File(RootNodeId, "file1"))->Record;

const auto nodeId1 = createNodeResponse.GetNode().GetId();
UNIT_ASSERT_VALUES_EQUAL(1, ExtractShardNo(nodeId1));

createNodeResponse = service.CreateNode(
headers,
TCreateNodeArgs::File(RootNodeId, "file2"))->Record;

const auto nodeId2 = createNodeResponse.GetNode().GetId();
UNIT_ASSERT_VALUES_EQUAL(2, ExtractShardNo(nodeId2));

ui64 handle1 = service.CreateHandle(
headers,
fsId,
nodeId1,
"",
TCreateHandleArgs::RDWR)->Record.GetHandle();

auto data1 = GenerateValidateData(256_KB);
service.WriteData(headers, fsId, nodeId1, handle1, 0, data1);

ui64 handle2 = service.CreateHandle(
headers,
fsId,
nodeId2,
"",
TCreateHandleArgs::RDWR)->Record.GetHandle();

auto data2 = GenerateValidateData(512_KB);
service.WriteData(headers, fsId, nodeId2, handle2, 0, data2);

// triggering background shard stats collection

env.GetRuntime().AdvanceCurrentTime(TDuration::Seconds(15));

{
using TRequest = TEvIndexTabletPrivate::TEvUpdateCounters;

env.GetRuntime().Send(
new IEventHandle(
shard1ActorId, // recipient
TActorId(), // sender
new TRequest(),
0, // flags
0),
0);
}

TDispatchOptions options;
options.FinalEvents = {
TDispatchOptions::TFinalEventCondition(
TEvIndexTabletPrivate::EvGetShardStatsCompleted)};
service.AccessRuntime().DispatchEvents(options);

{
NProtoPrivate::TGetStorageStatsRequest request;
request.SetFileSystemId(shard1Id);
TString buf;
google::protobuf::util::MessageToJsonString(request, &buf);
auto response = service.ExecuteAction("GetStorageStats", buf);
NProtoPrivate::TGetStorageStatsResponse record;
auto status = google::protobuf::util::JsonStringToMessage(
response->Record.GetOutput(),
&record);
auto stats = record.GetStats();
UNIT_ASSERT_VALUES_EQUAL(0, stats.ShardStatsSize());
UNIT_ASSERT_VALUES_EQUAL(
data1.size() / 4_KB,
stats.GetUsedBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
data1.size() / 4_KB,
stats.GetMixedBlocksCount());

request.SetAllowCache(true);
request.SetForceFetchShardStats(true);
buf.clear();
google::protobuf::util::MessageToJsonString(request, &buf);
response = service.ExecuteAction("GetStorageStats", buf);
record.Clear();
status = google::protobuf::util::JsonStringToMessage(
response->Record.GetOutput(),
&record);
stats = record.GetStats();
UNIT_ASSERT_VALUES_EQUAL(2, stats.ShardStatsSize());
UNIT_ASSERT_VALUES_EQUAL(
64,
stats.GetShardStats(0).GetUsedBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
1000,
stats.GetShardStats(0).GetTotalBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
128,
stats.GetShardStats(1).GetUsedBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
1000,
stats.GetShardStats(1).GetTotalBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
(data1.size() + data2.size()) / 4_KB,
stats.GetUsedBlocksCount());
UNIT_ASSERT_VALUES_EQUAL(
(data1.size() + data2.size()) / 4_KB,
stats.GetMixedBlocksCount());
}
}
}

} // namespace NCloud::NFileStore::NStorage
23 changes: 17 additions & 6 deletions cloud/filestore/libs/storage/tablet/tablet_actor_counters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,11 @@ void TGetShardStatsActor::HandleGetStorageStatsResponse(
LOG_DEBUG(
ctx,
TFileStoreComponents::TABLET_WORKER,
"%s Got storage stats for shard %s",
"%s Got storage stats for shard %s, used: %lu, aggregate used: %lu",
LogTag.c_str(),
ShardIds[ev->Cookie].c_str());
ShardIds[ev->Cookie].c_str(),
src.GetUsedBlocksCount(),
dst.GetUsedBlocksCount());

TABLET_VERIFY(ev->Cookie < ShardStats.size());
auto& ss = ShardStats[ev->Cookie];
Expand Down Expand Up @@ -762,13 +764,15 @@ void TIndexTabletActor::HandleUpdateCounters(
}

if (CachedStatsFetchingStartTs == TInstant::Zero()) {
// TODO(#2674): do shard<->shard stats exchange less often than normal
// metrics collection since it's O(n^2) (where n == shardCount)
auto response =
std::make_unique<TEvIndexTablet::TEvGetStorageStatsResponse>();
auto* stats = response->Record.MutableStats();
FillSelfStorageStats(stats);
const auto& shardIds = GetFileSystem().GetShardFileSystemIds();
// if shardIds isn't empty and current tablet is a shard, it will
// collect self stats via TGetShardStatsActor
if (shardIds.empty() || IsMainTablet()) {
FillSelfStorageStats(stats);
}
if (shardIds.empty()) {
CachedAggregateStats = std::move(*stats);
return;
Expand Down Expand Up @@ -852,9 +856,10 @@ void TIndexTabletActor::HandleGetStorageStats(
auto* stats = response->Record.MutableStats();
// shards shouldn't collect other shards' stats (unless it's background
// shard <-> shard stats exchange which is handled in HandleUpdateCounters)
const auto& shardIds = IsMainTablet()
const auto& shardIds = req.GetForceFetchShardStats() || IsMainTablet()
? GetFileSystem().GetShardFileSystemIds()
: Default<google::protobuf::RepeatedPtrField<TString>>();
req.SetForceFetchShardStats(false);
if (req.GetAllowCache()) {
*stats = CachedAggregateStats;
const ui32 shardMetricsCount =
Expand Down Expand Up @@ -911,6 +916,12 @@ void TIndexTabletActor::HandleGetStorageStats(
ev->Get()->CallContext);
requestInfo->StartedTs = ctx.Now();

if (!IsMainTablet()) {
// if current tablet is a shard, it will collect self stats via
// TGetShardStatsActor
stats->Clear();
}

auto actor = std::make_unique<TGetShardStatsActor>(
LogTag,
SelfId(),
Expand Down
4 changes: 4 additions & 0 deletions cloud/filestore/private/api/protos/tablet.proto
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@ message TGetStorageStatsRequest

// Use cached data, don't require up-to-date stats.
bool AllowCache = 6;

// Forces shard stats fetching even if this request is itself sent to a
// shard.
bool ForceFetchShardStats = 7;
}

message TGetStorageStatsResponse
Expand Down

0 comments on commit 4f702df

Please sign in to comment.