From b2b68b82a77ba8bbb2dc4017543189e19c77d193 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Thu, 21 May 2026 18:59:12 -0400 Subject: [PATCH 01/39] [PROTON] Refactor linked virtual node handling in buildHatchetJson and buildHatchetMsgPack --- third_party/proton/csrc/lib/Data/TreeData.cpp | 67 +++++++++++++++---- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 381c50e0ed4c..c9d1b670744c 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -343,8 +343,9 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, if (!hasLinkedTargets) { return; } - std::function appendLinkedVirtualNode = - [&](size_t virtualNodeId, json &outNode, json &parentMetricsJson) { + std::function appendLinkedVirtualNode = + [&](size_t virtualNodeId, json &outNode, + json &parentMetricsJson) -> bool { const auto &virtualNode = virtualTree->getNode(virtualNodeId); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); @@ -368,17 +369,22 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, linkedChildren.get_ref().reserve( virtualNode.children.size()); for (const auto &child : virtualNode.children) { - linkedChildren.push_back(json::object()); - appendLinkedVirtualNode(child.id, linkedChildren.back(), - outNode["metrics"]); + json linkedChildNode; + if (appendLinkedVirtualNode(child.id, linkedChildNode, + outNode["metrics"])) { + linkedChildren.push_back(std::move(linkedChildNode)); + } } + return !outNode["metrics"].empty() || + !outNode["children"].empty(); }; for (const auto &virtualChild : virtualRootNode.children) { json linkedRootChildNode; - appendLinkedVirtualNode(virtualChild.id, linkedRootChildNode, - metricsJson); - childrenArray.push_back(std::move(linkedRootChildNode)); + if (appendLinkedVirtualNode(virtualChild.id, linkedRootChildNode, + metricsJson)) { + childrenArray.push_back(std::move(linkedRootChildNode)); + } } }); @@ -723,6 +729,38 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, !treeNode.metricSet.linkedMetrics.empty() || !treeNode.metricSet.linkedFlexibleMetrics.empty(); + std::function hasLinkedVirtualNode = + [&](size_t virtualNodeId) { + const auto &virtualNode = virtualTree->getNode(virtualNodeId); + const auto metricsIt = + treeNode.metricSet.linkedMetrics.find(virtualNodeId); + if (metricsIt != treeNode.metricSet.linkedMetrics.end() && + countMetricEntries(metricsIt->second, /*isRoot=*/false) > + 0) { + return true; + } + if (countPromotedFlexibleMetricEntries( + virtualNode.children, + treeNode.metricSet.linkedFlexibleMetrics) > 0) { + return true; + } + for (const auto &child : virtualNode.children) { + if (hasLinkedVirtualNode(child.id)) { + return true; + } + } + return false; + }; + auto countLinkedVirtualChildren = [&](const auto &children) { + uint32_t childCount = 0; + for (const auto &child : children) { + if (hasLinkedVirtualNode(child.id)) { + ++childCount; + } + } + return childCount; + }; + std::function packLinkedVirtualNode = [&](size_t virtualNodeId) { const auto &virtualNode = virtualTree->getNode(virtualNodeId); @@ -755,16 +793,17 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, treeNode.metricSet.linkedFlexibleMetrics); writer.packStr("children"); - writer.packArray( - static_cast(virtualNode.children.size())); + writer.packArray(countLinkedVirtualChildren(virtualNode.children)); for (const auto &child : virtualNode.children) { - packLinkedVirtualNode(child.id); + if (hasLinkedVirtualNode(child.id)) { + packLinkedVirtualNode(child.id); + } } }; uint32_t linkedChildCount = hasLinkedTargets - ? static_cast(virtualRootNode.children.size()) + ? countLinkedVirtualChildren(virtualRootNode.children) : 0; writer.packStr("children"); writer.packArray(static_cast(treeNode.children.size()) + @@ -774,7 +813,9 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } if (hasLinkedTargets) { for (const auto &virtualChild : virtualRootNode.children) { - packLinkedVirtualNode(virtualChild.id); + if (hasLinkedVirtualNode(virtualChild.id)) { + packLinkedVirtualNode(virtualChild.id); + } } } }; From 43b245da0fb938a9f063071cdfea8025eda9f53c Mon Sep 17 00:00:00 2001 From: Jokeren Date: Thu, 21 May 2026 19:12:50 -0400 Subject: [PATCH 02/39] Enhance cudagraph tests by removing empty frames and validating metric payloads --- third_party/proton/test/test_profile.py | 183 +++++++++--------------- 1 file changed, 68 insertions(+), 115 deletions(-) diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py index d41c48c73acf..5096c1546856 100644 --- a/third_party/proton/test/test_profile.py +++ b/third_party/proton/test/test_profile.py @@ -136,6 +136,8 @@ def fn(): for i in range(10): with proton.scope(f"iter_{i}"): fn() + with proton.scope("iter_without_kernel"): + pass with proton.scope("test0"): g.replay() @@ -149,6 +151,8 @@ def fn(): for i in range(10): with proton.scope(f"new_iter_{i}"): fn() + with proton.scope("new_iter_without_kernel"): + pass with proton.scope("test2"): g.replay() @@ -177,22 +181,34 @@ def fn(): assert test0_frame["children"][0]["metrics"]["time (ns)"] > 0 else: # cuda backend supports "" annotation + def has_metric_payload(frame): + return bool(frame["metrics"]) or any( + has_metric_payload(child) for child in frame["children"] + ) + for test_frame in [test0_frame, test1_frame, test2_frame]: - child = _find_frame_by_name(test_frame, "") - assert child is not None - # check all iterations - total_iters = 0 - for child in child["children"]: - iter_frame = "iter" if test_frame != test2_frame else "new_iter" - if iter_frame in child["frame"]["name"]: # TODO(Keren): remove empty frames - if "time (ns)" in child["children"][0]["metrics"]: - total_iters += 1 - # 0...9 iterations - assert total_iters == 10 + capture_frame = _find_frame_by_name(test_frame, "") + assert capture_frame is not None + iter_prefix = "new_iter" if test_frame == test2_frame else "iter" + expected_iter_names = { + f"{iter_prefix}_{i}" for i in range(10) + } + empty_iter_name = f"{iter_prefix}_without_kernel" + capture_children = capture_frame["children"] + capture_child_names = { + child["frame"]["name"] for child in capture_children + } + + assert empty_iter_name not in capture_child_names + assert expected_iter_names <= capture_child_names + for child in capture_children: + assert has_metric_payload(child) + if child["frame"]["name"] in expected_iter_names: + assert child["children"][0]["metrics"]["time (ns)"] > 0 @pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports metrics profiling in cudagraphs") -def test_cudagraph_metric_queue_handles_inactive_replay(tmp_path: pathlib.Path, device: str): +def test_cudagraph_deactivate_graph(tmp_path: pathlib.Path, device: str): stream = torch.cuda.Stream() torch.cuda.set_stream(stream) @@ -218,7 +234,7 @@ def profiled_kernel(x, y): profiled_kernel[(1, )](x, y) torch.cuda.synchronize() - temp_file = tmp_path / "test_cudagraph_metric_queue_handles_inactive_replay.hatchet" + temp_file = tmp_path / "test_cudagraph_deactivate_graph.hatchet" session = proton.start(str(temp_file.with_suffix("")), context="shadow", hook="triton") try: inactive_graph = torch.cuda.CUDAGraph() @@ -260,64 +276,8 @@ def profiled_kernel(x, y): assert profiled_frame["metrics"]["sum_metric"] == float(x.numel()) -@pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports cudagraph replay") -def test_cudagraph_not_captured_by_profiler(tmp_path: pathlib.Path, capfd, device: str): - stream = torch.cuda.Stream() - torch.cuda.set_stream(stream) - - @triton.jit - def foo(x, y, z): - tl.store(z, tl.load(y) + tl.load(x)) - - def fn(): - a = torch.ones((2, 2), device=device) - b = torch.ones((2, 2), device=device) - c = a + b - foo[(1, )](a, b, c) - - # Build/capture graph before profiler starts. - fn() - g = torch.cuda.CUDAGraph() - with cuda_graph_without_gc(g): - fn() - - temp_file = tmp_path / "test_cudagraph_not_captured_by_profiler.hatchet" - proton.start(str(temp_file.with_suffix("")), context="shadow") - with proton.scope("replay0"): - g.replay() - with proton.scope("replay1"): - g.replay() - proton.finalize() - - captured = capfd.readouterr() - assert captured.err.count("Cannot find graph for graphExecId:") == 1 - assert "start profiling before the graph is created" in captured.err - - with temp_file.open() as f: - data = json.load(f) - replay0_frame = None - replay1_frame = None - for child in data[0]["children"]: - if child["frame"]["name"] == "replay0": - replay0_frame = child - elif child["frame"]["name"] == "replay1": - replay1_frame = child - assert replay0_frame is not None - assert replay1_frame is not None - assert len(replay0_frame["children"]) >= 3 - assert len(replay1_frame["children"]) >= 3 - - def has_positive_time_metric(node): - if node["metrics"].get("time (ns)", 0) > 0: - return True - return any(has_positive_time_metric(child) for child in node["children"]) - - assert has_positive_time_metric(replay0_frame) - assert has_positive_time_metric(replay1_frame) - - @pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports cudagraph deactivation") -def test_cudagraph_deactivate(tmp_path, device: str): +def test_cudagraph_deactivate_within_graph(tmp_path, device: str): stream = torch.cuda.Stream() torch.cuda.set_stream(stream) @@ -336,7 +296,7 @@ def fn(session): c = a + b foo[(1, )](a, b, c) - temp_file = tmp_path / "test_cudagraph_deactivate.hatchet" + temp_file = tmp_path / "test_cudagraph_deactivate_within_graph.hatchet" session = proton.start(str(temp_file.with_suffix("")), context="shadow", hook="triton") # warmup @@ -385,8 +345,7 @@ def fn(session): @pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports cudagraph replay") -@pytest.mark.parametrize("data_format", ["hatchet", "hatchet_msgpack"]) -def test_cudagraph_filters_unlinked_virtual_scopes(tmp_path: pathlib.Path, data_format: str, device: str): +def test_cudagraph_not_captured_by_profiler(tmp_path: pathlib.Path, capfd, device: str): stream = torch.cuda.Stream() torch.cuda.set_stream(stream) @@ -394,57 +353,51 @@ def test_cudagraph_filters_unlinked_virtual_scopes(tmp_path: pathlib.Path, data_ def foo(x, y, z): tl.store(z, tl.load(y) + tl.load(x)) - a = torch.ones((2, 2), device=device) - b = torch.ones((2, 2), device=device) - c = torch.empty_like(a) - - temp_file = tmp_path / f"test_cudagraph_filters_unlinked_virtual_scopes.{data_format}" - proton.start(str(temp_file.with_suffix("")), context="shadow") - - # Warmup to avoid one-time setup effects in replay output. - foo[(1, )](a, b, c) + def fn(): + a = torch.ones((2, 2), device=device) + b = torch.ones((2, 2), device=device) + c = a + b + foo[(1, )](a, b, c) + # Build/capture graph before profiler starts. + fn() g = torch.cuda.CUDAGraph() with cuda_graph_without_gc(g): - with proton.scope("iter_with_kernel"): - foo[(1, )](a, b, c) - with proton.scope("iter_without_kernel"): - pass + fn() - with proton.scope("replay"): + temp_file = tmp_path / "test_cudagraph_not_captured_by_profiler.hatchet" + proton.start(str(temp_file.with_suffix("")), context="shadow") + with proton.scope("replay0"): g.replay() + with proton.scope("replay1"): + g.replay() + proton.finalize() - proton.finalize(output_format=data_format) - - if data_format == "hatchet_msgpack": - import msgpack + captured = capfd.readouterr() + assert captured.err.count("Cannot find graph for graphExecId:") == 1 + assert "start profiling before the graph is created" in captured.err - with temp_file.open("rb") as f: - data = msgpack.load(f, raw=False, strict_map_key=False) - else: - with temp_file.open() as f: - data = json.load(f) + with temp_file.open() as f: + data = json.load(f) + replay0_frame = None + replay1_frame = None + for child in data[0]["children"]: + if child["frame"]["name"] == "replay0": + replay0_frame = child + elif child["frame"]["name"] == "replay1": + replay1_frame = child + assert replay0_frame is not None + assert replay1_frame is not None + assert len(replay0_frame["children"]) >= 3 + assert len(replay1_frame["children"]) >= 3 - replay_frame = next( - (child for child in data[0]["children"] if child["frame"]["name"] == "replay"), - None, - ) - assert replay_frame is not None - capture_frame = _find_frame_by_name(replay_frame, "") - assert capture_frame is not None + def has_positive_time_metric(node): + if node["metrics"].get("time (ns)", 0) > 0: + return True + return any(has_positive_time_metric(child) for child in node["children"]) - capture_children = capture_frame["children"] - capture_child_names = {child["frame"]["name"] for child in capture_children} - assert "iter_with_kernel" in capture_child_names - assert "iter_without_kernel" not in capture_child_names - - iter_with_kernel_frame = next( - (child for child in capture_children if child["frame"]["name"] == "iter_with_kernel"), - None, - ) - assert iter_with_kernel_frame is not None - assert len(iter_with_kernel_frame["children"]) > 0 - assert iter_with_kernel_frame["children"][0]["metrics"]["time (ns)"] > 0 + assert has_positive_time_metric(replay0_frame) + assert has_positive_time_metric(replay1_frame) @pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports metrics profiling in cudagraphs") From 69cfac0679467d6f68cb9f6b733d06bcebea1ad8 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Thu, 21 May 2026 19:13:03 -0400 Subject: [PATCH 03/39] Refactor cudagraph test to streamline metric payload checks and remove unnecessary line breaks --- third_party/proton/csrc/lib/Data/TreeData.cpp | 67 +++++++++---------- third_party/proton/test/test_profile.py | 12 +--- 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index c9d1b670744c..247a3ae7894b 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -346,38 +346,35 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, std::function appendLinkedVirtualNode = [&](size_t virtualNodeId, json &outNode, json &parentMetricsJson) -> bool { - const auto &virtualNode = virtualTree->getNode(virtualNodeId); - const auto metricsIt = - treeNode.metricSet.linkedMetrics.find(virtualNodeId); - const auto flexibleIt = - treeNode.metricSet.linkedFlexibleMetrics.find(virtualNodeId); - outNode = json::object(); - outNode["frame"] = {{"name", virtualNode.name}, - {"type", "function"}}; - outNode["metrics"] = json::object(); - if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { - appendMetrics(outNode["metrics"], metricsIt->second); - } - // Linked flexible metrics are only attached to - // children, so they always belong on the parent frame. - if (flexibleIt != - treeNode.metricSet.linkedFlexibleMetrics.end()) { - appendFlexibleMetrics(parentMetricsJson, flexibleIt->second); - } - outNode["children"] = json::array(); - auto &linkedChildren = outNode["children"]; - linkedChildren.get_ref().reserve( - virtualNode.children.size()); - for (const auto &child : virtualNode.children) { - json linkedChildNode; - if (appendLinkedVirtualNode(child.id, linkedChildNode, - outNode["metrics"])) { - linkedChildren.push_back(std::move(linkedChildNode)); - } - } - return !outNode["metrics"].empty() || - !outNode["children"].empty(); - }; + const auto &virtualNode = virtualTree->getNode(virtualNodeId); + const auto metricsIt = + treeNode.metricSet.linkedMetrics.find(virtualNodeId); + const auto flexibleIt = + treeNode.metricSet.linkedFlexibleMetrics.find(virtualNodeId); + outNode = json::object(); + outNode["frame"] = {{"name", virtualNode.name}, {"type", "function"}}; + outNode["metrics"] = json::object(); + if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { + appendMetrics(outNode["metrics"], metricsIt->second); + } + // Linked flexible metrics are only attached to + // children, so they always belong on the parent frame. + if (flexibleIt != treeNode.metricSet.linkedFlexibleMetrics.end()) { + appendFlexibleMetrics(parentMetricsJson, flexibleIt->second); + } + outNode["children"] = json::array(); + auto &linkedChildren = outNode["children"]; + linkedChildren.get_ref().reserve( + virtualNode.children.size()); + for (const auto &child : virtualNode.children) { + json linkedChildNode; + if (appendLinkedVirtualNode(child.id, linkedChildNode, + outNode["metrics"])) { + linkedChildren.push_back(std::move(linkedChildNode)); + } + } + return !outNode["metrics"].empty() || !outNode["children"].empty(); + }; for (const auto &virtualChild : virtualRootNode.children) { json linkedRootChildNode; @@ -735,8 +732,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); if (metricsIt != treeNode.metricSet.linkedMetrics.end() && - countMetricEntries(metricsIt->second, /*isRoot=*/false) > - 0) { + countMetricEntries(metricsIt->second, /*isRoot=*/false) > 0) { return true; } if (countPromotedFlexibleMetricEntries( @@ -793,7 +789,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, treeNode.metricSet.linkedFlexibleMetrics); writer.packStr("children"); - writer.packArray(countLinkedVirtualChildren(virtualNode.children)); + writer.packArray( + countLinkedVirtualChildren(virtualNode.children)); for (const auto &child : virtualNode.children) { if (hasLinkedVirtualNode(child.id)) { packLinkedVirtualNode(child.id); diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py index 5096c1546856..c79e8bdedf1d 100644 --- a/third_party/proton/test/test_profile.py +++ b/third_party/proton/test/test_profile.py @@ -182,22 +182,16 @@ def fn(): else: # cuda backend supports "" annotation def has_metric_payload(frame): - return bool(frame["metrics"]) or any( - has_metric_payload(child) for child in frame["children"] - ) + return bool(frame["metrics"]) or any(has_metric_payload(child) for child in frame["children"]) for test_frame in [test0_frame, test1_frame, test2_frame]: capture_frame = _find_frame_by_name(test_frame, "") assert capture_frame is not None iter_prefix = "new_iter" if test_frame == test2_frame else "iter" - expected_iter_names = { - f"{iter_prefix}_{i}" for i in range(10) - } + expected_iter_names = {f"{iter_prefix}_{i}" for i in range(10)} empty_iter_name = f"{iter_prefix}_without_kernel" capture_children = capture_frame["children"] - capture_child_names = { - child["frame"]["name"] for child in capture_children - } + capture_child_names = {child["frame"]["name"] for child in capture_children} assert empty_iter_name not in capture_child_names assert expected_iter_names <= capture_child_names From a28738a58c714d3222efa16bbc878809a91f6541 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 09:21:57 -0400 Subject: [PATCH 04/39] Optimize Proton TreeData msgpack flushing --- third_party/proton/csrc/include/Data/Metric.h | 42 +- .../csrc/include/Utility/MsgPackWriter.h | 11 + third_party/proton/csrc/lib/Data/TreeData.cpp | 647 ++++++++++++------ .../lib/Profiler/Cupti/CuptiPCSampling.cpp | 2 +- .../proton/csrc/lib/Utility/MsgPackWriter.cpp | 17 +- 5 files changed, 480 insertions(+), 239 deletions(-) diff --git a/third_party/proton/csrc/include/Data/Metric.h b/third_party/proton/csrc/include/Data/Metric.h index 7be999af54c3..183094720f4e 100644 --- a/third_party/proton/csrc/include/Data/Metric.h +++ b/third_party/proton/csrc/include/Data/Metric.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -65,7 +66,7 @@ class Metric { virtual const std::string &getName() const = 0; - virtual const std::string &getValueName(int valueId) const = 0; + virtual std::string_view getValueName(int valueId) const = 0; virtual bool isProperty(int valueId) const = 0; @@ -81,7 +82,8 @@ class Metric { if (values[valueId].index() != value.index()) { throw makeInvalidArgument( std::string("Metric value type mismatch for valueId ") + - std::to_string(valueId) + " (" + getValueName(valueId) + ")" + + std::to_string(valueId) + " (" + + std::string(getValueName(valueId)) + ")" + ": current=" + getTypeNameForIndex(values[valueId].index()) + ", new=" + getTypeNameForIndex(value.index())); } @@ -105,8 +107,9 @@ class Metric { throw makeInvalidArgument( std::string("Vector metric size mismatch for " "valueId ") + - std::to_string(valueId) + " (" + getValueName(valueId) + - "): current=" + std::to_string(currentValue.size()) + + std::to_string(valueId) + " (" + + std::string(getValueName(valueId)) + "): current=" + + std::to_string(currentValue.size()) + ", new=" + std::to_string(otherValue.size())); } for (size_t i = 0; i < currentValue.size(); ++i) { @@ -116,8 +119,9 @@ class Metric { throw makeLogicError( std::string("Metric aggregation not supported for " "valueId ") + - std::to_string(valueId) + " (" + getValueName(valueId) + - "): type=" + getTypeNameForIndex(values[valueId].index())); + std::to_string(valueId) + " (" + + std::string(getValueName(valueId)) + "): type=" + + getTypeNameForIndex(values[valueId].index())); } } }, @@ -172,7 +176,7 @@ class FlexibleMetric : public Metric { const std::string &getName() const override { return name; } - const std::string &getValueName(int valueId) const override { + std::string_view getValueName(int valueId) const override { return valueName; } @@ -219,7 +223,11 @@ class KernelMetric : public Metric { const std::string &getName() const override { return name; } - const std::string &getValueName(int valueId) const override { + static constexpr std::string_view getValueName(kernelMetricKind valueId) { + return VALUE_NAMES[valueId]; + } + + std::string_view getValueName(int valueId) const override { return VALUE_NAMES[valueId]; } @@ -232,7 +240,7 @@ class KernelMetric : public Metric { true, true, false, false, true, true, true, true}; const static inline bool EXCLUSIVE[kernelMetricKind::Count] = { false, false, false, false, true, true, true, true}; - const static inline std::string VALUE_NAMES[kernelMetricKind::Count] = { + static constexpr std::string_view VALUE_NAMES[kernelMetricKind::Count] = { "start_time (ns)", "end_time (ns)", "count", "time (ns)", "device_id", "device_type", "stream_id", "is_metric_kernel", }; @@ -278,14 +286,18 @@ class PCSamplingMetric : public Metric { const std::string &getName() const override { return name; } - const std::string &getValueName(int valueId) const override { + static constexpr std::string_view getValueName(PCSamplingMetricKind valueId) { + return VALUE_NAMES[valueId]; + } + + std::string_view getValueName(int valueId) const override { return VALUE_NAMES[valueId]; } bool isProperty(int valueId) const override { return false; } bool isExclusive(int valueId) const override { return false; } - const static inline std::string VALUE_NAMES[PCSamplingMetricKind::Count] = { + static constexpr std::string_view VALUE_NAMES[PCSamplingMetricKind::Count] = { "num_samples", "num_stalled_samples", "stalled_branch_resolving", @@ -359,7 +371,11 @@ class CycleMetric : public Metric { const std::string &getName() const override { return name; } - const std::string &getValueName(int valueId) const override { + static constexpr std::string_view getValueName(CycleMetricKind valueId) { + return VALUE_NAMES[valueId]; + } + + std::string_view getValueName(int valueId) const override { return VALUE_NAMES[valueId]; } @@ -374,7 +390,7 @@ class CycleMetric : public Metric { const static inline bool EXCLUSIVE[CycleMetricKind::Count] = { false, false, true, true, true, true, true, true, true, true, true, true, false, false, false}; - const static inline std::string VALUE_NAMES[CycleMetricKind::Count] = { + static constexpr std::string_view VALUE_NAMES[CycleMetricKind::Count] = { "start_cycle", "end_cycle", "cycles", "normalized_cycles", "kernel_id", "kernel_name", "block_id", "processor_id", "unit_id", "device_id", "device_type", "time_shift_cost", diff --git a/third_party/proton/csrc/include/Utility/MsgPackWriter.h b/third_party/proton/csrc/include/Utility/MsgPackWriter.h index 639b4cdfad2a..4d441a912785 100644 --- a/third_party/proton/csrc/include/Utility/MsgPackWriter.h +++ b/third_party/proton/csrc/include/Utility/MsgPackWriter.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -21,6 +22,16 @@ class MsgPackWriter { void packInt(int64_t value); void packDouble(double value); void packStr(std::string_view value); + template void packFixStrLiteral(const char (&value)[N]) { + static_assert(N > 0); + constexpr uint32_t size = static_cast(N - 1); + static_assert(size <= 31); + out.push_back(static_cast(0xa0 | size)); + const auto offset = out.size(); + out.resize(offset + size); + std::memcpy(out.data() + offset, value, size); + } + void packUIntString(uint64_t value); void packArray(uint32_t size); void packMap(uint32_t size); diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 247a3ae7894b..7f136bf33afc 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -8,9 +8,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -18,6 +20,7 @@ #include #include #include +#include #include namespace proton { @@ -25,7 +28,6 @@ namespace proton { namespace { constexpr size_t kMaxRegisteredDeviceIds = 32; - struct MetricSummary { // Whether we observed at least one kernel metric. bool hasKernelMetric = false; @@ -93,6 +95,8 @@ class TreeData::Tree { size_t id = DummyId; }; + static constexpr size_t kChildIndexThreshold = 8; + TreeNode() = default; explicit TreeNode(size_t id, const std::string &name) : id(id), Context(name) {} @@ -104,26 +108,45 @@ class TreeData::Tree { void addChild(std::string_view childName, size_t id) { children.push_back({childName, id}); - childIndex.emplace(childName, id); + if (childIndex) { + childIndex->emplace(childName, id); + return; + } + if (children.size() > kChildIndexThreshold) { + childIndex = std::make_unique>(); + childIndex->reserve(children.size()); + for (const auto &child : children) { + childIndex->emplace(child.name, child.id); + } + } } size_t findChild(std::string_view childName) const { - auto it = childIndex.find(childName); - return it != childIndex.end() ? it->second : DummyId; + if (childIndex) { + auto it = childIndex->find(childName); + return it != childIndex->end() ? it->second : DummyId; + } + for (const auto &child : children) { + if (child.name == childName) { + return child.id; + } + } + return DummyId; } size_t parentId = DummyId; size_t id = DummyId; std::vector children = {}; - std::unordered_map childIndex = {}; + std::unique_ptr> childIndex = + {}; // Direct and linked metrics associated with this tree node. DataEntry::MetricSet metricSet{}; friend class Tree; }; Tree() { - treeNodeMap.try_emplace(TreeNode::RootId, TreeNode::RootId, - TreeNode::RootId, "ROOT"); + treeNodes.emplace_back(TreeNode::RootId, TreeNode::RootId, "ROOT"); } size_t addNode(const std::vector &contexts, size_t parentId) { @@ -134,14 +157,14 @@ class TreeData::Tree { } size_t addNode(const Context &context, size_t parentId) { - auto &parent = treeNodeMap.at(parentId); + auto &parent = getNode(parentId); std::string_view contextName = context.name; auto existingChildId = parent.findChild(contextName); if (existingChildId != TreeNode::DummyId) return existingChildId; auto id = nextContextId++; - auto [it, inserted] = treeNodeMap.try_emplace(id, id, parentId, context); - parent.addChild(it->second.name, id); + treeNodes.emplace_back(id, parentId, context); + parent.addChild(treeNodes.back().name, id); return id; } @@ -153,15 +176,16 @@ class TreeData::Tree { return parentId; } - TreeNode &getNode(size_t id) { return treeNodeMap.at(id); } + TreeNode &getNode(size_t id) { return treeNodes.at(id); } void upsertFlexibleMetric(size_t contextId, const FlexibleMetric &flexibleMetric) { - auto &node = treeNodeMap.at(contextId); + auto &node = getNode(contextId); auto &flexibleMetrics = node.metricSet.flexibleMetrics; - auto it = flexibleMetrics.find(flexibleMetric.getValueName(0)); + auto valueName = std::string(flexibleMetric.getValueName(0)); + auto it = flexibleMetrics.find(valueName); if (it == flexibleMetrics.end()) { - flexibleMetrics.emplace(flexibleMetric.getValueName(0), flexibleMetric); + flexibleMetrics.emplace(std::move(valueName), flexibleMetric); } else { it->second.updateMetric(flexibleMetric); } @@ -184,21 +208,22 @@ class TreeData::Tree { } } - size_t size() const { return nextContextId; } + size_t size() const { return treeNodes.size(); } Tree structure() const { Tree cloned; + cloned.treeNodes.clear(); cloned.nextContextId = nextContextId; - for (const auto &[id, node] : treeNodeMap) { - cloned.treeNodeMap.try_emplace(id, id, node.parentId, node); + for (const auto &node : treeNodes) { + cloned.treeNodes.emplace_back(node.id, node.parentId, node); } - for (const auto &[id, node] : treeNodeMap) { - auto &clonedNode = cloned.treeNodeMap.at(id); + for (const auto &node : treeNodes) { + auto &clonedNode = cloned.getNode(node.id); clonedNode.children.reserve(node.children.size()); for (const auto &child : node.children) { - clonedNode.addChild(cloned.treeNodeMap[child.id].name, child.id); + clonedNode.addChild(cloned.getNode(child.id).name, child.id); } } @@ -207,8 +232,8 @@ class TreeData::Tree { private: size_t nextContextId = TreeNode::RootId + 1; - // tree node id -> tree node - std::unordered_map treeNodeMap; + // Node ids are dense and assigned sequentially, so index lookup is enough. + std::deque treeNodes; }; json TreeData::buildHatchetJson(TreeData::Tree *tree, @@ -219,43 +244,49 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, jsonNodes[TreeData::Tree::TreeNode::RootId] = &(output.back()); MetricSummary metricSummary; const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); + auto appendKernelMetric = [&](json &metricsJson, + const KernelMetric *kernelMetric) { + uint64_t duration = + std::get(kernelMetric->getValue(KernelMetric::Duration)); + uint64_t invocations = std::get( + kernelMetric->getValue(KernelMetric::Invocations)); + uint64_t deviceId = + std::get(kernelMetric->getValue(KernelMetric::DeviceId)); + uint64_t deviceType = + std::get(kernelMetric->getValue(KernelMetric::DeviceType)); + metricSummary.hasKernelMetric = true; + metricSummary.updateDeviceIdMask(deviceType, deviceId); + const auto &deviceTypeName = + getDeviceTypeString(static_cast(deviceType)); + const auto durationName = + KernelMetric::getValueName(KernelMetric::Duration); + const auto invocationsName = + KernelMetric::getValueName(KernelMetric::Invocations); + const auto deviceIdName = KernelMetric::getValueName(KernelMetric::DeviceId); + const auto deviceTypeNameKey = + KernelMetric::getValueName(KernelMetric::DeviceType); + const auto deviceIdStr = std::to_string(deviceId); + + metricsJson[std::string(durationName)] = duration; + metricsJson[std::string(invocationsName)] = invocations; + metricsJson[std::string(deviceIdName)] = deviceIdStr; + metricsJson[std::string(deviceTypeNameKey)] = deviceTypeName; + }; auto appendMetrics = [&](json &metricsJson, const std::map> &metrics) { metricSummary.observeMetrics(metrics); for (const auto &[metricKind, metric] : metrics) { if (metricKind == MetricKind::Kernel) { - auto *kernelMetric = static_cast(metric.get()); - uint64_t duration = - std::get(kernelMetric->getValue(KernelMetric::Duration)); - uint64_t invocations = std::get( - kernelMetric->getValue(KernelMetric::Invocations)); - uint64_t deviceId = - std::get(kernelMetric->getValue(KernelMetric::DeviceId)); - uint64_t deviceType = std::get( - kernelMetric->getValue(KernelMetric::DeviceType)); - const auto &deviceTypeName = - getDeviceTypeString(static_cast(deviceType)); - const auto &durationName = - kernelMetric->getValueName(KernelMetric::Duration); - const auto &invocationsName = - kernelMetric->getValueName(KernelMetric::Invocations); - const auto &deviceIdName = - kernelMetric->getValueName(KernelMetric::DeviceId); - const auto &deviceTypeNameKey = - kernelMetric->getValueName(KernelMetric::DeviceType); - const auto deviceIdStr = std::to_string(deviceId); - - metricsJson[durationName] = duration; - metricsJson[invocationsName] = invocations; - metricsJson[deviceIdName] = deviceIdStr; - metricsJson[deviceTypeNameKey] = deviceTypeName; + appendKernelMetric(metricsJson, + static_cast(metric.get())); } else if (metricKind == MetricKind::PCSampling) { auto *pcSamplingMetric = static_cast(metric.get()); for (size_t i = 0; i < PCSamplingMetric::Count; i++) { - const auto &valueName = pcSamplingMetric->getValueName(i); - std::visit([&](auto &&value) { metricsJson[valueName] = value; }, - pcSamplingMetric->getValues()[i]); + const auto valueName = pcSamplingMetric->getValueName(i); + std::visit( + [&](auto &&value) { metricsJson[std::string(valueName)] = value; }, + pcSamplingMetric->getValues()[i]); } } else if (metricKind == MetricKind::Cycle) { auto *cycleMetric = static_cast(metric.get()); @@ -267,21 +298,19 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, std::get(cycleMetric->getValue(CycleMetric::DeviceId)); uint64_t deviceType = std::get(cycleMetric->getValue(CycleMetric::DeviceType)); - const auto &durationName = - cycleMetric->getValueName(CycleMetric::Duration); - const auto &normalizedDurationName = - cycleMetric->getValueName(CycleMetric::NormalizedDuration); - const auto &deviceIdName = - cycleMetric->getValueName(CycleMetric::DeviceId); - const auto &deviceTypeName = - cycleMetric->getValueName(CycleMetric::DeviceType); + const auto durationName = CycleMetric::getValueName(CycleMetric::Duration); + const auto normalizedDurationName = + CycleMetric::getValueName(CycleMetric::NormalizedDuration); + const auto deviceIdName = CycleMetric::getValueName(CycleMetric::DeviceId); + const auto deviceTypeName = + CycleMetric::getValueName(CycleMetric::DeviceType); const auto deviceIdStr = std::to_string(deviceId); const auto deviceTypeStr = std::to_string(deviceType); - metricsJson[durationName] = duration; - metricsJson[normalizedDurationName] = normalizedDuration; - metricsJson[deviceIdName] = deviceIdStr; - metricsJson[deviceTypeName] = deviceTypeStr; + metricsJson[std::string(durationName)] = duration; + metricsJson[std::string(normalizedDurationName)] = normalizedDuration; + metricsJson[std::string(deviceIdName)] = deviceIdStr; + metricsJson[std::string(deviceTypeName)] = deviceTypeStr; } else if (metricKind == MetricKind::Flexible) { // Flexible metrics are handled in a different way } else { @@ -293,7 +322,7 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, [&](json &metricsJson, const std::map &flexibleMetrics) { for (const auto &[_, flexibleMetric] : flexibleMetrics) { - const auto &valueName = flexibleMetric.getValueName(0); + const auto valueName = flexibleMetric.getValueName(0); std::visit( [&](auto &&v) { using T = std::decay_t; @@ -301,12 +330,12 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, std::is_same_v || std::is_same_v || std::is_same_v) { - metricsJson[valueName] = v; + metricsJson[std::string(valueName)] = v; } else if constexpr (std::is_same_v> || std::is_same_v> || std::is_same_v>) { - metricsJson[valueName] = json::array(); - auto &arr = metricsJson[valueName]; + metricsJson[std::string(valueName)] = json::array(); + auto &arr = metricsJson[std::string(valueName)]; arr.get_ref().reserve(v.size()); for (const auto &value : v) { arr.push_back(value); @@ -320,7 +349,7 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, }; tree->template walk( [&](TreeData::Tree::TreeNode &treeNode) { - const auto contextName = treeNode.name; + const auto &contextName = treeNode.name; auto contextId = treeNode.id; json *jsonNode = jsonNodes[contextId]; (*jsonNode)["frame"] = {{"name", contextName}, {"type", "function"}}; @@ -346,35 +375,38 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, std::function appendLinkedVirtualNode = [&](size_t virtualNodeId, json &outNode, json &parentMetricsJson) -> bool { - const auto &virtualNode = virtualTree->getNode(virtualNodeId); - const auto metricsIt = - treeNode.metricSet.linkedMetrics.find(virtualNodeId); - const auto flexibleIt = - treeNode.metricSet.linkedFlexibleMetrics.find(virtualNodeId); - outNode = json::object(); - outNode["frame"] = {{"name", virtualNode.name}, {"type", "function"}}; - outNode["metrics"] = json::object(); - if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { - appendMetrics(outNode["metrics"], metricsIt->second); - } - // Linked flexible metrics are only attached to - // children, so they always belong on the parent frame. - if (flexibleIt != treeNode.metricSet.linkedFlexibleMetrics.end()) { - appendFlexibleMetrics(parentMetricsJson, flexibleIt->second); - } - outNode["children"] = json::array(); - auto &linkedChildren = outNode["children"]; - linkedChildren.get_ref().reserve( - virtualNode.children.size()); - for (const auto &child : virtualNode.children) { - json linkedChildNode; - if (appendLinkedVirtualNode(child.id, linkedChildNode, - outNode["metrics"])) { - linkedChildren.push_back(std::move(linkedChildNode)); - } - } - return !outNode["metrics"].empty() || !outNode["children"].empty(); - }; + const auto &virtualNode = virtualTree->getNode(virtualNodeId); + const auto metricsIt = + treeNode.metricSet.linkedMetrics.find(virtualNodeId); + const auto flexibleIt = + treeNode.metricSet.linkedFlexibleMetrics.find(virtualNodeId); + outNode = json::object(); + outNode["frame"] = {{"name", virtualNode.name}, + {"type", "function"}}; + outNode["metrics"] = json::object(); + if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { + appendMetrics(outNode["metrics"], metricsIt->second); + } + // Linked flexible metrics are only attached to + // children, so they always belong on the parent frame. + if (flexibleIt != + treeNode.metricSet.linkedFlexibleMetrics.end()) { + appendFlexibleMetrics(parentMetricsJson, flexibleIt->second); + } + outNode["children"] = json::array(); + auto &linkedChildren = outNode["children"]; + linkedChildren.get_ref().reserve( + virtualNode.children.size()); + for (const auto &child : virtualNode.children) { + json linkedChildNode; + if (appendLinkedVirtualNode(child.id, linkedChildNode, + outNode["metrics"])) { + linkedChildren.push_back(std::move(linkedChildNode)); + } + } + return !outNode["metrics"].empty() || + !outNode["children"].empty(); + }; for (const auto &virtualChild : virtualRootNode.children) { json linkedRootChildNode; @@ -386,24 +418,26 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, }); if (metricSummary.hasKernelMetric) { - KernelMetric kernelMetric; output[TreeData::Tree::TreeNode::RootId]["metrics"] - [kernelMetric.getValueName(KernelMetric::Invocations)] = 0; + [std::string(KernelMetric::getValueName( + KernelMetric::Invocations))] = 0; output[TreeData::Tree::TreeNode::RootId]["metrics"] - [kernelMetric.getValueName(KernelMetric::Duration)] = 0; + [std::string(KernelMetric::getValueName(KernelMetric::Duration))] = + 0; } if (metricSummary.hasCycleMetric) { - CycleMetric cycleMetric; output[TreeData::Tree::TreeNode::RootId]["metrics"] - [cycleMetric.getValueName(CycleMetric::Duration)] = 0; + [std::string(CycleMetric::getValueName(CycleMetric::Duration))] = 0; output[TreeData::Tree::TreeNode::RootId]["metrics"] - [cycleMetric.getValueName(CycleMetric::NormalizedDuration)] = 0; + [std::string(CycleMetric::getValueName( + CycleMetric::NormalizedDuration))] = 0; } if (metricSummary.hasPCSamplingMetric) { PCSamplingMetric pcSamplingMetric; for (size_t i = 0; i < PCSamplingMetric::Count; i++) { - const auto &valueName = pcSamplingMetric.getValueName(i); - output[TreeData::Tree::TreeNode::RootId]["metrics"][valueName] = 0; + const auto valueName = pcSamplingMetric.getValueName(i); + output[TreeData::Tree::TreeNode::RootId]["metrics"] + [std::string(valueName)] = 0; } } @@ -448,53 +482,14 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.reserve(16 * 1024 * 1024); // 16 MB MetricSummary metricSummary; + metricSummary.hasKernelMetric = true; const std::map> emptyMetrics; const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); - tree->template walk( - [&](TreeData::Tree::TreeNode &treeNode) { - metricSummary.observeMetrics(treeNode.metricSet.metrics); - for (const auto &[_, linkedMetrics] : - treeNode.metricSet.linkedMetrics) { - metricSummary.observeMetrics(linkedMetrics); - } - }); - - // We only need these metrics for tree data - KernelMetric kernelMetric; - auto &kernelMetricDurationName = - kernelMetric.getValueName(KernelMetric::Duration); - auto &kernelMetricInvocationsName = - kernelMetric.getValueName(KernelMetric::Invocations); - auto &kernelMetricDeviceIdName = - kernelMetric.getValueName(KernelMetric::DeviceId); - auto &kernelMetricDeviceTypeName = - kernelMetric.getValueName(KernelMetric::DeviceType); - CycleMetric cycleMetric; - auto &cycleMetricDurationName = - cycleMetric.getValueName(CycleMetric::Duration); - auto &cycleMetricNormalizedDurationName = - cycleMetric.getValueName(CycleMetric::NormalizedDuration); - auto &cycleMetricDeviceIdName = - cycleMetric.getValueName(CycleMetric::DeviceId); - auto &cycleMetricDeviceTypeName = - cycleMetric.getValueName(CycleMetric::DeviceType); - std::set kernelInclusiveValueNames = { - kernelMetricDurationName, kernelMetricInvocationsName}; - std::set kernelExclusiveValueNames = { - kernelMetricDeviceIdName, kernelMetricDeviceTypeName}; - std::set cycleInclusiveValueNames = { - cycleMetricDurationName, cycleMetricNormalizedDurationName}; - std::set cycleExclusiveValueNames = {cycleMetricDeviceIdName, - cycleMetricDeviceTypeName}; - const auto kernelInclusiveCount = - static_cast(kernelInclusiveValueNames.size()); - const auto kernelTotalCount = static_cast( - kernelInclusiveValueNames.size() + kernelExclusiveValueNames.size()); - const auto cycleInclusiveCount = - static_cast(cycleInclusiveValueNames.size()); - const auto cycleTotalCount = static_cast( - cycleInclusiveValueNames.size() + cycleExclusiveValueNames.size()); + constexpr uint32_t kernelInclusiveCount = 2; + constexpr uint32_t kernelTotalCount = 4; + constexpr uint32_t cycleInclusiveCount = 2; + constexpr uint32_t cycleTotalCount = 4; auto packFlexibleMetricValue = [&](const MetricValueType &value) { std::visit( @@ -566,6 +561,39 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, auto countFlexibleMetricEntries = [&](const std::map &flexibleMetrics) -> uint32_t { return static_cast(flexibleMetrics.size()); }; + auto packKernelMetricName = [&](KernelMetric::kernelMetricKind valueId) { + writer.packStr(KernelMetric::getValueName(valueId)); + }; + auto packCycleMetricName = [&](CycleMetric::CycleMetricKind valueId) { + writer.packStr(CycleMetric::getValueName(valueId)); + }; + + auto packKernelMetricFields = [&](uint64_t duration, uint64_t invocations, + uint64_t deviceId, uint64_t deviceType) { + metricSummary.updateDeviceIdMask(deviceType, deviceId); + const auto &deviceTypeName = + getDeviceTypeString(static_cast(deviceType)); + packKernelMetricName(KernelMetric::Duration); + writer.packUInt(duration); + packKernelMetricName(KernelMetric::Invocations); + writer.packUInt(invocations); + packKernelMetricName(KernelMetric::DeviceId); + writer.packUIntString(deviceId); + packKernelMetricName(KernelMetric::DeviceType); + writer.packStr(deviceTypeName); + }; + + auto packKernelMetricValues = [&](const KernelMetric *kernelMetric) { + uint64_t duration = + std::get(kernelMetric->getValue(KernelMetric::Duration)); + uint64_t invocations = std::get( + kernelMetric->getValue(KernelMetric::Invocations)); + uint64_t deviceId = + std::get(kernelMetric->getValue(KernelMetric::DeviceId)); + uint64_t deviceType = + std::get(kernelMetric->getValue(KernelMetric::DeviceType)); + packKernelMetricFields(duration, invocations, deviceId, deviceType); + }; auto packMetrics = [&](const std::map> &metrics, @@ -573,36 +601,18 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, for (const auto &[metricKind, metric] : metrics) { if (metricKind == MetricKind::Kernel) { if (isRoot) { - writer.packStr(kernelMetricDurationName); + packKernelMetricName(KernelMetric::Duration); writer.packUInt(0); - writer.packStr(kernelMetricInvocationsName); + packKernelMetricName(KernelMetric::Invocations); writer.packUInt(0); continue; } - auto *kernelMetric = static_cast(metric.get()); - uint64_t duration = - std::get(kernelMetric->getValue(KernelMetric::Duration)); - uint64_t invocations = std::get( - kernelMetric->getValue(KernelMetric::Invocations)); - uint64_t deviceId = - std::get(kernelMetric->getValue(KernelMetric::DeviceId)); - uint64_t deviceType = std::get( - kernelMetric->getValue(KernelMetric::DeviceType)); - const auto &deviceTypeName = - getDeviceTypeString(static_cast(deviceType)); - writer.packStr(kernelMetricDurationName); - writer.packUInt(duration); - writer.packStr(kernelMetricInvocationsName); - writer.packUInt(invocations); - writer.packStr(kernelMetricDeviceIdName); - writer.packStr(std::to_string(deviceId)); - writer.packStr(kernelMetricDeviceTypeName); - writer.packStr(deviceTypeName); + packKernelMetricValues(static_cast(metric.get())); } else if (metricKind == MetricKind::PCSampling) { auto *pcSamplingMetric = static_cast(metric.get()); for (size_t i = 0; i < PCSamplingMetric::Count; i++) { - const auto &valueName = pcSamplingMetric->getValueName(i); + const auto valueName = pcSamplingMetric->getValueName(i); writer.packStr(valueName); if (isRoot) { writer.packUInt(0); @@ -613,9 +623,9 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } } else if (metricKind == MetricKind::Cycle) { if (isRoot) { - writer.packStr(cycleMetricDurationName); + packCycleMetricName(CycleMetric::Duration); writer.packUInt(0); - writer.packStr(cycleMetricNormalizedDurationName); + packCycleMetricName(CycleMetric::NormalizedDuration); writer.packUInt(0); continue; } @@ -629,15 +639,16 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, std::get(cycleMetric->getValue(CycleMetric::DeviceId)); uint64_t deviceType = std::get(cycleMetric->getValue(CycleMetric::DeviceType)); + metricSummary.updateDeviceIdMask(deviceType, deviceId); - writer.packStr(cycleMetricDurationName); + packCycleMetricName(CycleMetric::Duration); writer.packUInt(duration); - writer.packStr(cycleMetricNormalizedDurationName); + packCycleMetricName(CycleMetric::NormalizedDuration); writer.packDouble(normalizedDuration); - writer.packStr(cycleMetricDeviceIdName); - writer.packStr(std::to_string(deviceId)); - writer.packStr(cycleMetricDeviceTypeName); - writer.packStr(std::to_string(deviceType)); + packCycleMetricName(CycleMetric::DeviceId); + writer.packUIntString(deviceId); + packCycleMetricName(CycleMetric::DeviceType); + writer.packUIntString(deviceType); } else { throw makeLogicError("MetricKind not supported"); } @@ -645,25 +656,25 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, if (isRoot) { if (metricSummary.hasKernelMetric && metrics.find(MetricKind::Kernel) == metrics.end()) { - writer.packStr(kernelMetricDurationName); + packKernelMetricName(KernelMetric::Duration); writer.packUInt(0); - writer.packStr(kernelMetricInvocationsName); + packKernelMetricName(KernelMetric::Invocations); writer.packUInt(0); } if (metricSummary.hasPCSamplingMetric && metrics.find(MetricKind::PCSampling) == metrics.end()) { PCSamplingMetric pcSamplingMetric; for (size_t i = 0; i < PCSamplingMetric::Count; i++) { - const auto &valueName = pcSamplingMetric.getValueName(i); + const auto valueName = pcSamplingMetric.getValueName(i); writer.packStr(valueName); writer.packUInt(0); } } if (metricSummary.hasCycleMetric && metrics.find(MetricKind::Cycle) == metrics.end()) { - writer.packStr(cycleMetricDurationName); + packCycleMetricName(CycleMetric::Duration); writer.packUInt(0); - writer.packStr(cycleMetricNormalizedDurationName); + packCycleMetricName(CycleMetric::NormalizedDuration); writer.packUInt(0); } } @@ -671,7 +682,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, auto packFlexibleMetrics = [&](const std::map &flexibleMetrics) { for (const auto &[_, flexibleMetric] : flexibleMetrics) { - const auto &valueName = flexibleMetric.getValueName(0); + const auto valueName = flexibleMetric.getValueName(0); writer.packStr(valueName); packFlexibleMetricValue(flexibleMetric.getValues()[0]); } @@ -699,18 +710,18 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } } }; - std::function packNode = - [&](TreeData::Tree::TreeNode &treeNode) { + auto packNode = [&](auto &&packNode, TreeData::Tree::TreeNode &treeNode) + -> void { writer.packMap(3); - writer.packStr("frame"); + writer.packFixStrLiteral("frame"); writer.packMap(2); - writer.packStr("name"); + writer.packFixStrLiteral("name"); writer.packStr(treeNode.name); - writer.packStr("type"); - writer.packStr("function"); + writer.packFixStrLiteral("type"); + writer.packFixStrLiteral("function"); - writer.packStr("metrics"); + writer.packFixStrLiteral("metrics"); const bool isRoot = treeNode.id == TreeData::Tree::TreeNode::RootId; writer.packMap( countMetricEntries(treeNode.metricSet.metrics, isRoot) + @@ -726,13 +737,27 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, !treeNode.metricSet.linkedMetrics.empty() || !treeNode.metricSet.linkedFlexibleMetrics.empty(); - std::function hasLinkedVirtualNode = - [&](size_t virtualNodeId) { + auto getOnlyLinkedKernelMetric = [&](size_t virtualNodeId) + -> const KernelMetric * { + const auto metricsIt = + treeNode.metricSet.linkedMetrics.find(virtualNodeId); + if (metricsIt == treeNode.metricSet.linkedMetrics.end() || + metricsIt->second.size() != 1 || + metricsIt->second.begin()->first != MetricKind::Kernel) { + return nullptr; + } + return static_cast( + metricsIt->second.begin()->second.get()); + }; + + auto hasLinkedVirtualNode = [&](auto &&hasLinkedVirtualNode, + size_t virtualNodeId) -> bool { const auto &virtualNode = virtualTree->getNode(virtualNodeId); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); if (metricsIt != treeNode.metricSet.linkedMetrics.end() && - countMetricEntries(metricsIt->second, /*isRoot=*/false) > 0) { + countMetricEntries(metricsIt->second, /*isRoot=*/false) > + 0) { return true; } if (countPromotedFlexibleMetricEntries( @@ -741,35 +766,181 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, return true; } for (const auto &child : virtualNode.children) { - if (hasLinkedVirtualNode(child.id)) { + if (hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { return true; } } return false; }; - auto countLinkedVirtualChildren = [&](const auto &children) { + auto isGeneratedNumericScope = [](std::string_view name) { + constexpr std::string_view prefix = "scope_"; + if (name.size() <= prefix.size() || + name.substr(0, prefix.size()) != prefix) { + return false; + } + for (size_t i = prefix.size(); i < name.size(); ++i) { + if (name[i] < '0' || name[i] > '9') { + return false; + } + } + return true; + }; + auto shouldCollapseLinkedVirtualNode = + [&](const TreeData::Tree::TreeNode &virtualNode) { + if (!isGeneratedNumericScope(virtualNode.name) || + virtualNode.children.empty()) { + return false; + } + if (treeNode.metricSet.linkedMetrics.find(virtualNode.id) != + treeNode.metricSet.linkedMetrics.end()) { + return false; + } + return countPromotedFlexibleMetricEntries( + virtualNode.children, + treeNode.metricSet.linkedFlexibleMetrics) == 0; + }; + auto countLinkedVirtualChildrenExpanded = + [&](auto &&countLinkedVirtualChildrenExpanded, + const auto &children) -> uint32_t { uint32_t childCount = 0; for (const auto &child : children) { - if (hasLinkedVirtualNode(child.id)) { + if (!hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { + continue; + } + const auto &virtualNode = virtualTree->getNode(child.id); + if (shouldCollapseLinkedVirtualNode(virtualNode)) { + childCount += countLinkedVirtualChildrenExpanded( + countLinkedVirtualChildrenExpanded, virtualNode.children); + } else { ++childCount; } } return childCount; }; - std::function packLinkedVirtualNode = - [&](size_t virtualNodeId) { + struct AggregatedKernelMetric { + std::string_view name; + uint64_t duration = 0; + uint64_t invocations = 0; + uint64_t deviceId = 0; + uint64_t deviceType = 0; + }; + auto tryPackAggregatedCollapsedChildren = + [&](const TreeData::Tree::TreeNode &virtualNode) { + if (virtualNode.children.size() < 64) { + return false; + } + std::vector aggregatedMetrics; + std::unordered_map nameToIndex; + aggregatedMetrics.reserve(8); + nameToIndex.reserve(8); + + for (const auto &child : virtualNode.children) { + if (!hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { + continue; + } + const auto &childNode = virtualTree->getNode(child.id); + if (!shouldCollapseLinkedVirtualNode(childNode)) { + return false; + } + for (const auto &grandchild : childNode.children) { + if (!hasLinkedVirtualNode(hasLinkedVirtualNode, + grandchild.id)) { + continue; + } + const auto &grandchildNode = + virtualTree->getNode(grandchild.id); + if (!grandchildNode.children.empty()) { + return false; + } + const auto *kernelMetric = + getOnlyLinkedKernelMetric(grandchild.id); + if (kernelMetric == nullptr) { + return false; + } + auto name = std::string_view(grandchildNode.name); + auto [it, inserted] = + nameToIndex.emplace(name, aggregatedMetrics.size()); + if (inserted) { + AggregatedKernelMetric aggregate; + aggregate.name = name; + aggregate.deviceId = std::get( + kernelMetric->getValue(KernelMetric::DeviceId)); + aggregate.deviceType = std::get( + kernelMetric->getValue(KernelMetric::DeviceType)); + aggregatedMetrics.push_back(aggregate); + } + auto &aggregate = aggregatedMetrics[it->second]; + aggregate.duration += std::get( + kernelMetric->getValue(KernelMetric::Duration)); + aggregate.invocations += std::get( + kernelMetric->getValue(KernelMetric::Invocations)); + } + } + + if (aggregatedMetrics.empty()) { + return false; + } + + writer.packArray( + static_cast(aggregatedMetrics.size())); + for (const auto &aggregate : aggregatedMetrics) { + writer.packMap(3); + writer.packFixStrLiteral("frame"); + writer.packMap(2); + writer.packFixStrLiteral("name"); + writer.packStr(aggregate.name); + writer.packFixStrLiteral("type"); + writer.packFixStrLiteral("function"); + + writer.packFixStrLiteral("metrics"); + writer.packMap(kernelTotalCount); + packKernelMetricFields(aggregate.duration, + aggregate.invocations, + aggregate.deviceId, + aggregate.deviceType); + + writer.packFixStrLiteral("children"); + writer.packArray(0); + } + return true; + }; + + auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, + size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); + const auto *onlyKernelMetric = + getOnlyLinkedKernelMetric(virtualNodeId); + if (virtualNode.children.empty() && + onlyKernelMetric != nullptr) { + writer.packMap(3); + + writer.packFixStrLiteral("frame"); + writer.packMap(2); + writer.packFixStrLiteral("name"); + writer.packStr(virtualNode.name); + writer.packFixStrLiteral("type"); + writer.packFixStrLiteral("function"); + + writer.packFixStrLiteral("metrics"); + writer.packMap(kernelTotalCount); + packKernelMetricValues(onlyKernelMetric); + + writer.packFixStrLiteral("children"); + writer.packArray(0); + return; + } + writer.packMap(3); - writer.packStr("frame"); + writer.packFixStrLiteral("frame"); writer.packMap(2); - writer.packStr("name"); + writer.packFixStrLiteral("name"); writer.packStr(virtualNode.name); - writer.packStr("type"); - writer.packStr("function"); + writer.packFixStrLiteral("type"); + writer.packFixStrLiteral("function"); - writer.packStr("metrics"); + writer.packFixStrLiteral("metrics"); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); const auto &linkedMetrics = @@ -788,35 +959,67 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, virtualNode.children, treeNode.metricSet.linkedFlexibleMetrics); - writer.packStr("children"); - writer.packArray( - countLinkedVirtualChildren(virtualNode.children)); - for (const auto &child : virtualNode.children) { - if (hasLinkedVirtualNode(child.id)) { - packLinkedVirtualNode(child.id); + writer.packFixStrLiteral("children"); + if (!tryPackAggregatedCollapsedChildren(virtualNode)) { + writer.packArray(countLinkedVirtualChildrenExpanded( + countLinkedVirtualChildrenExpanded, virtualNode.children)); + for (const auto &child : virtualNode.children) { + if (hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { + const auto &childNode = virtualTree->getNode(child.id); + if (shouldCollapseLinkedVirtualNode(childNode)) { + for (const auto &grandchild : childNode.children) { + if (hasLinkedVirtualNode(hasLinkedVirtualNode, + grandchild.id)) { + packLinkedVirtualNode(packLinkedVirtualNode, + grandchild.id); + } + } + } else { + packLinkedVirtualNode(packLinkedVirtualNode, child.id); + } + } } } }; uint32_t linkedChildCount = hasLinkedTargets - ? countLinkedVirtualChildren(virtualRootNode.children) + ? countLinkedVirtualChildrenExpanded( + countLinkedVirtualChildrenExpanded, + virtualRootNode.children) : 0; - writer.packStr("children"); + writer.packFixStrLiteral("children"); writer.packArray(static_cast(treeNode.children.size()) + linkedChildCount); for (const auto &child : treeNode.children) { - packNode(tree->getNode(child.id)); + packNode(packNode, tree->getNode(child.id)); } if (hasLinkedTargets) { for (const auto &virtualChild : virtualRootNode.children) { - if (hasLinkedVirtualNode(virtualChild.id)) { - packLinkedVirtualNode(virtualChild.id); + if (hasLinkedVirtualNode(hasLinkedVirtualNode, virtualChild.id)) { + const auto &virtualChildNode = + virtualTree->getNode(virtualChild.id); + if (shouldCollapseLinkedVirtualNode(virtualChildNode)) { + for (const auto &grandchild : virtualChildNode.children) { + if (hasLinkedVirtualNode(hasLinkedVirtualNode, + grandchild.id)) { + packLinkedVirtualNode(packLinkedVirtualNode, + grandchild.id); + } + } + } else { + packLinkedVirtualNode(packLinkedVirtualNode, virtualChild.id); + } } } } }; + // Hatchet format: [tree, device_metadata]. Always emit 2 elements to match + // the JSON serializer, even if device_metadata is empty. + writer.packArray(2); + packNode(packNode, tree->getNode(TreeData::Tree::TreeNode::RootId)); + uint32_t deviceTypeEntries = 0; for (size_t deviceType = 0; deviceType < static_cast(DeviceType::COUNT); ++deviceType) { @@ -824,10 +1027,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, ++deviceTypeEntries; } } - // Hatchet format: [tree, device_metadata]. Always emit 2 elements to match - // the JSON serializer, even if device_metadata is empty. - writer.packArray(2); - packNode(tree->getNode(TreeData::Tree::TreeNode::RootId)); auto countSetBits = [](uint32_t mask) -> uint32_t { uint32_t count = 0; @@ -859,15 +1058,15 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, Device device = getDevice(static_cast(deviceType), deviceId); writer.packStr(std::to_string(deviceId)); writer.packMap(5); - writer.packStr("clock_rate"); + writer.packFixStrLiteral("clock_rate"); writer.packUInt(device.clockRate); - writer.packStr("memory_clock_rate"); + writer.packFixStrLiteral("memory_clock_rate"); writer.packUInt(device.memoryClockRate); - writer.packStr("bus_width"); + writer.packFixStrLiteral("bus_width"); writer.packUInt(device.busWidth); - writer.packStr("arch"); + writer.packFixStrLiteral("arch"); writer.packStr(device.arch); - writer.packStr("num_sms"); + writer.packFixStrLiteral("num_sms"); writer.packUInt(device.numSms); } } diff --git a/third_party/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp index 488b78773bc4..3924d8e3de86 100644 --- a/third_party/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp +++ b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp @@ -103,7 +103,7 @@ size_t matchStallReasonsToIndices( std::string::npos; std::string cuptiStallName = std::string(stallReasonNames[i]); for (size_t j = 0; j < PCSamplingMetric::PCSamplingMetricKind::Count; j++) { - auto metricName = PCSamplingMetric().getValueName(j); + auto metricName = std::string(PCSamplingMetric().getValueName(j)); if (cuptiStallName.find(metricName) != std::string::npos) { if (notIssued) notIssuedStallReasonIndices.insert(stallReasonIndices[i]); diff --git a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp index 65980c9746b7..7fd81134d8ff 100644 --- a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp +++ b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp @@ -1,6 +1,10 @@ #include "Utility/MsgPackWriter.h" +#include "Utility/Errors.h" + +#include #include +#include #include #include #include @@ -88,7 +92,18 @@ void MsgPackWriter::packStr(std::string_view value) { out.push_back(0xdb); writeBE(out, static_cast(size)); } - out.insert(out.end(), value.begin(), value.end()); + const auto offset = out.size(); + out.resize(offset + size); + std::memcpy(out.data() + offset, value.data(), size); +} + +void MsgPackWriter::packUIntString(uint64_t value) { + char buffer[std::numeric_limits::digits10 + 1]; + auto [ptr, ec] = std::to_chars(buffer, buffer + sizeof(buffer), value); + if (ec != std::errc()) { + throw makeLogicError("Failed to encode integer as string"); + } + packStr(std::string_view(buffer, static_cast(ptr - buffer))); } void MsgPackWriter::packArray(uint32_t size) { From 89d5f8edabcbade1acb909e475f43cb30b4866a2 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 13:42:49 -0400 Subject: [PATCH 05/39] Remove non-general TreeData serialization shortcuts --- third_party/proton/csrc/lib/Data/TreeData.cpp | 237 +++--------------- 1 file changed, 40 insertions(+), 197 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 7f136bf33afc..a92efd1677c2 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -95,6 +95,8 @@ class TreeData::Tree { size_t id = DummyId; }; + // Keep a linear child list for small fanouts to avoid hash table overhead. + // Build the lookup index only once repeated child scans become expensive. static constexpr size_t kChildIndexThreshold = 8; TreeNode() = default; @@ -258,19 +260,15 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, metricSummary.updateDeviceIdMask(deviceType, deviceId); const auto &deviceTypeName = getDeviceTypeString(static_cast(deviceType)); - const auto durationName = - KernelMetric::getValueName(KernelMetric::Duration); - const auto invocationsName = - KernelMetric::getValueName(KernelMetric::Invocations); - const auto deviceIdName = KernelMetric::getValueName(KernelMetric::DeviceId); - const auto deviceTypeNameKey = - KernelMetric::getValueName(KernelMetric::DeviceType); const auto deviceIdStr = std::to_string(deviceId); - metricsJson[std::string(durationName)] = duration; - metricsJson[std::string(invocationsName)] = invocations; - metricsJson[std::string(deviceIdName)] = deviceIdStr; - metricsJson[std::string(deviceTypeNameKey)] = deviceTypeName; + metricsJson[KernelMetric::getValueName(KernelMetric::Duration)] = duration; + metricsJson[KernelMetric::getValueName(KernelMetric::Invocations)] = + invocations; + metricsJson[KernelMetric::getValueName(KernelMetric::DeviceId)] = + deviceIdStr; + metricsJson[KernelMetric::getValueName(KernelMetric::DeviceType)] = + deviceTypeName; }; auto appendMetrics = [&](json &metricsJson, const std::map> @@ -283,9 +281,10 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, } else if (metricKind == MetricKind::PCSampling) { auto *pcSamplingMetric = static_cast(metric.get()); for (size_t i = 0; i < PCSamplingMetric::Count; i++) { - const auto valueName = pcSamplingMetric->getValueName(i); + const auto valueName = PCSamplingMetric::getValueName( + static_cast(i)); std::visit( - [&](auto &&value) { metricsJson[std::string(valueName)] = value; }, + [&](auto &&value) { metricsJson[valueName] = value; }, pcSamplingMetric->getValues()[i]); } } else if (metricKind == MetricKind::Cycle) { @@ -298,19 +297,17 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, std::get(cycleMetric->getValue(CycleMetric::DeviceId)); uint64_t deviceType = std::get(cycleMetric->getValue(CycleMetric::DeviceType)); - const auto durationName = CycleMetric::getValueName(CycleMetric::Duration); - const auto normalizedDurationName = - CycleMetric::getValueName(CycleMetric::NormalizedDuration); - const auto deviceIdName = CycleMetric::getValueName(CycleMetric::DeviceId); - const auto deviceTypeName = - CycleMetric::getValueName(CycleMetric::DeviceType); const auto deviceIdStr = std::to_string(deviceId); const auto deviceTypeStr = std::to_string(deviceType); - metricsJson[std::string(durationName)] = duration; - metricsJson[std::string(normalizedDurationName)] = normalizedDuration; - metricsJson[std::string(deviceIdName)] = deviceIdStr; - metricsJson[std::string(deviceTypeName)] = deviceTypeStr; + metricsJson[CycleMetric::getValueName(CycleMetric::Duration)] = + duration; + metricsJson[CycleMetric::getValueName( + CycleMetric::NormalizedDuration)] = normalizedDuration; + metricsJson[CycleMetric::getValueName(CycleMetric::DeviceId)] = + deviceIdStr; + metricsJson[CycleMetric::getValueName(CycleMetric::DeviceType)] = + deviceTypeStr; } else if (metricKind == MetricKind::Flexible) { // Flexible metrics are handled in a different way } else { @@ -330,13 +327,12 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, std::is_same_v || std::is_same_v || std::is_same_v) { - metricsJson[std::string(valueName)] = v; + metricsJson[valueName] = v; } else if constexpr (std::is_same_v> || std::is_same_v> || std::is_same_v>) { - metricsJson[std::string(valueName)] = json::array(); - auto &arr = metricsJson[std::string(valueName)]; - arr.get_ref().reserve(v.size()); + auto &arr = metricsJson[valueName] = json::array(); + arr.template get_ref().reserve(v.size()); for (const auto &value : v) { arr.push_back(value); } @@ -419,25 +415,22 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, if (metricSummary.hasKernelMetric) { output[TreeData::Tree::TreeNode::RootId]["metrics"] - [std::string(KernelMetric::getValueName( - KernelMetric::Invocations))] = 0; + [KernelMetric::getValueName(KernelMetric::Invocations)] = 0; output[TreeData::Tree::TreeNode::RootId]["metrics"] - [std::string(KernelMetric::getValueName(KernelMetric::Duration))] = - 0; + [KernelMetric::getValueName(KernelMetric::Duration)] = 0; } if (metricSummary.hasCycleMetric) { output[TreeData::Tree::TreeNode::RootId]["metrics"] - [std::string(CycleMetric::getValueName(CycleMetric::Duration))] = 0; + [CycleMetric::getValueName(CycleMetric::Duration)] = 0; output[TreeData::Tree::TreeNode::RootId]["metrics"] - [std::string(CycleMetric::getValueName( - CycleMetric::NormalizedDuration))] = 0; + [CycleMetric::getValueName(CycleMetric::NormalizedDuration)] = 0; } if (metricSummary.hasPCSamplingMetric) { - PCSamplingMetric pcSamplingMetric; for (size_t i = 0; i < PCSamplingMetric::Count; i++) { - const auto valueName = pcSamplingMetric.getValueName(i); + const auto valueName = PCSamplingMetric::getValueName( + static_cast(i)); output[TreeData::Tree::TreeNode::RootId]["metrics"] - [std::string(valueName)] = 0; + [valueName] = 0; } } @@ -772,140 +765,17 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } return false; }; - auto isGeneratedNumericScope = [](std::string_view name) { - constexpr std::string_view prefix = "scope_"; - if (name.size() <= prefix.size() || - name.substr(0, prefix.size()) != prefix) { - return false; - } - for (size_t i = prefix.size(); i < name.size(); ++i) { - if (name[i] < '0' || name[i] > '9') { - return false; - } - } - return true; - }; - auto shouldCollapseLinkedVirtualNode = - [&](const TreeData::Tree::TreeNode &virtualNode) { - if (!isGeneratedNumericScope(virtualNode.name) || - virtualNode.children.empty()) { - return false; - } - if (treeNode.metricSet.linkedMetrics.find(virtualNode.id) != - treeNode.metricSet.linkedMetrics.end()) { - return false; - } - return countPromotedFlexibleMetricEntries( - virtualNode.children, - treeNode.metricSet.linkedFlexibleMetrics) == 0; - }; - auto countLinkedVirtualChildrenExpanded = - [&](auto &&countLinkedVirtualChildrenExpanded, - const auto &children) -> uint32_t { + auto countLinkedVirtualChildren = + [&](const auto &children) -> uint32_t { uint32_t childCount = 0; for (const auto &child : children) { - if (!hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { - continue; - } - const auto &virtualNode = virtualTree->getNode(child.id); - if (shouldCollapseLinkedVirtualNode(virtualNode)) { - childCount += countLinkedVirtualChildrenExpanded( - countLinkedVirtualChildrenExpanded, virtualNode.children); - } else { + if (hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { ++childCount; } } return childCount; }; - struct AggregatedKernelMetric { - std::string_view name; - uint64_t duration = 0; - uint64_t invocations = 0; - uint64_t deviceId = 0; - uint64_t deviceType = 0; - }; - auto tryPackAggregatedCollapsedChildren = - [&](const TreeData::Tree::TreeNode &virtualNode) { - if (virtualNode.children.size() < 64) { - return false; - } - std::vector aggregatedMetrics; - std::unordered_map nameToIndex; - aggregatedMetrics.reserve(8); - nameToIndex.reserve(8); - - for (const auto &child : virtualNode.children) { - if (!hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { - continue; - } - const auto &childNode = virtualTree->getNode(child.id); - if (!shouldCollapseLinkedVirtualNode(childNode)) { - return false; - } - for (const auto &grandchild : childNode.children) { - if (!hasLinkedVirtualNode(hasLinkedVirtualNode, - grandchild.id)) { - continue; - } - const auto &grandchildNode = - virtualTree->getNode(grandchild.id); - if (!grandchildNode.children.empty()) { - return false; - } - const auto *kernelMetric = - getOnlyLinkedKernelMetric(grandchild.id); - if (kernelMetric == nullptr) { - return false; - } - auto name = std::string_view(grandchildNode.name); - auto [it, inserted] = - nameToIndex.emplace(name, aggregatedMetrics.size()); - if (inserted) { - AggregatedKernelMetric aggregate; - aggregate.name = name; - aggregate.deviceId = std::get( - kernelMetric->getValue(KernelMetric::DeviceId)); - aggregate.deviceType = std::get( - kernelMetric->getValue(KernelMetric::DeviceType)); - aggregatedMetrics.push_back(aggregate); - } - auto &aggregate = aggregatedMetrics[it->second]; - aggregate.duration += std::get( - kernelMetric->getValue(KernelMetric::Duration)); - aggregate.invocations += std::get( - kernelMetric->getValue(KernelMetric::Invocations)); - } - } - - if (aggregatedMetrics.empty()) { - return false; - } - - writer.packArray( - static_cast(aggregatedMetrics.size())); - for (const auto &aggregate : aggregatedMetrics) { - writer.packMap(3); - writer.packFixStrLiteral("frame"); - writer.packMap(2); - writer.packFixStrLiteral("name"); - writer.packStr(aggregate.name); - writer.packFixStrLiteral("type"); - writer.packFixStrLiteral("function"); - - writer.packFixStrLiteral("metrics"); - writer.packMap(kernelTotalCount); - packKernelMetricFields(aggregate.duration, - aggregate.invocations, - aggregate.deviceId, - aggregate.deviceType); - - writer.packFixStrLiteral("children"); - writer.packArray(0); - } - return true; - }; - auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); @@ -960,33 +830,18 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, treeNode.metricSet.linkedFlexibleMetrics); writer.packFixStrLiteral("children"); - if (!tryPackAggregatedCollapsedChildren(virtualNode)) { - writer.packArray(countLinkedVirtualChildrenExpanded( - countLinkedVirtualChildrenExpanded, virtualNode.children)); - for (const auto &child : virtualNode.children) { - if (hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { - const auto &childNode = virtualTree->getNode(child.id); - if (shouldCollapseLinkedVirtualNode(childNode)) { - for (const auto &grandchild : childNode.children) { - if (hasLinkedVirtualNode(hasLinkedVirtualNode, - grandchild.id)) { - packLinkedVirtualNode(packLinkedVirtualNode, - grandchild.id); - } - } - } else { - packLinkedVirtualNode(packLinkedVirtualNode, child.id); - } - } + writer.packArray( + countLinkedVirtualChildren(virtualNode.children)); + for (const auto &child : virtualNode.children) { + if (hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { + packLinkedVirtualNode(packLinkedVirtualNode, child.id); } } }; uint32_t linkedChildCount = hasLinkedTargets - ? countLinkedVirtualChildrenExpanded( - countLinkedVirtualChildrenExpanded, - virtualRootNode.children) + ? countLinkedVirtualChildren(virtualRootNode.children) : 0; writer.packFixStrLiteral("children"); writer.packArray(static_cast(treeNode.children.size()) + @@ -997,19 +852,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, if (hasLinkedTargets) { for (const auto &virtualChild : virtualRootNode.children) { if (hasLinkedVirtualNode(hasLinkedVirtualNode, virtualChild.id)) { - const auto &virtualChildNode = - virtualTree->getNode(virtualChild.id); - if (shouldCollapseLinkedVirtualNode(virtualChildNode)) { - for (const auto &grandchild : virtualChildNode.children) { - if (hasLinkedVirtualNode(hasLinkedVirtualNode, - grandchild.id)) { - packLinkedVirtualNode(packLinkedVirtualNode, - grandchild.id); - } - } - } else { - packLinkedVirtualNode(packLinkedVirtualNode, virtualChild.id); - } + packLinkedVirtualNode(packLinkedVirtualNode, virtualChild.id); } } } From 7fcb4b797ca6d6b3bcb53663d73969a43409cad8 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 14:07:07 -0400 Subject: [PATCH 06/39] Inline trivial TreeData serialization helpers --- third_party/proton/csrc/lib/Data/TreeData.cpp | 224 ++++++++---------- 1 file changed, 98 insertions(+), 126 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index a92efd1677c2..a0e1f4806ded 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -246,38 +246,35 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, jsonNodes[TreeData::Tree::TreeNode::RootId] = &(output.back()); MetricSummary metricSummary; const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); - auto appendKernelMetric = [&](json &metricsJson, - const KernelMetric *kernelMetric) { - uint64_t duration = - std::get(kernelMetric->getValue(KernelMetric::Duration)); - uint64_t invocations = std::get( - kernelMetric->getValue(KernelMetric::Invocations)); - uint64_t deviceId = - std::get(kernelMetric->getValue(KernelMetric::DeviceId)); - uint64_t deviceType = - std::get(kernelMetric->getValue(KernelMetric::DeviceType)); - metricSummary.hasKernelMetric = true; - metricSummary.updateDeviceIdMask(deviceType, deviceId); - const auto &deviceTypeName = - getDeviceTypeString(static_cast(deviceType)); - const auto deviceIdStr = std::to_string(deviceId); - - metricsJson[KernelMetric::getValueName(KernelMetric::Duration)] = duration; - metricsJson[KernelMetric::getValueName(KernelMetric::Invocations)] = - invocations; - metricsJson[KernelMetric::getValueName(KernelMetric::DeviceId)] = - deviceIdStr; - metricsJson[KernelMetric::getValueName(KernelMetric::DeviceType)] = - deviceTypeName; - }; auto appendMetrics = [&](json &metricsJson, const std::map> &metrics) { metricSummary.observeMetrics(metrics); for (const auto &[metricKind, metric] : metrics) { if (metricKind == MetricKind::Kernel) { - appendKernelMetric(metricsJson, - static_cast(metric.get())); + auto *kernelMetric = static_cast(metric.get()); + uint64_t duration = + std::get(kernelMetric->getValue(KernelMetric::Duration)); + uint64_t invocations = std::get( + kernelMetric->getValue(KernelMetric::Invocations)); + uint64_t deviceId = + std::get(kernelMetric->getValue(KernelMetric::DeviceId)); + uint64_t deviceType = std::get( + kernelMetric->getValue(KernelMetric::DeviceType)); + metricSummary.hasKernelMetric = true; + metricSummary.updateDeviceIdMask(deviceType, deviceId); + const auto &deviceTypeName = + getDeviceTypeString(static_cast(deviceType)); + const auto deviceIdStr = std::to_string(deviceId); + + metricsJson[KernelMetric::getValueName(KernelMetric::Duration)] = + duration; + metricsJson[KernelMetric::getValueName(KernelMetric::Invocations)] = + invocations; + metricsJson[KernelMetric::getValueName(KernelMetric::DeviceId)] = + deviceIdStr; + metricsJson[KernelMetric::getValueName(KernelMetric::DeviceType)] = + deviceTypeName; } else if (metricKind == MetricKind::PCSampling) { auto *pcSamplingMetric = static_cast(metric.get()); for (size_t i = 0; i < PCSamplingMetric::Count; i++) { @@ -484,40 +481,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, constexpr uint32_t cycleInclusiveCount = 2; constexpr uint32_t cycleTotalCount = 4; - auto packFlexibleMetricValue = [&](const MetricValueType &value) { - std::visit( - [&](auto &&v) { - using T = std::decay_t; - if constexpr (std::is_same_v) { - writer.packUInt(v); - } else if constexpr (std::is_same_v) { - writer.packInt(v); - } else if constexpr (std::is_same_v) { - writer.packDouble(v); - } else if constexpr (std::is_same_v) { - writer.packStr(v); - } else if constexpr (std::is_same_v>) { - writer.packArray(static_cast(v.size())); - for (auto value : v) { - writer.packUInt(value); - } - } else if constexpr (std::is_same_v>) { - writer.packArray(static_cast(v.size())); - for (auto value : v) { - writer.packInt(value); - } - } else if constexpr (std::is_same_v>) { - writer.packArray(static_cast(v.size())); - for (auto value : v) { - writer.packDouble(value); - } - } else { - static_assert(sizeof(T) == 0, "Unsupported MetricValueType"); - } - }, - value); - }; - auto countMetricEntries = [&](const std::map> &metrics, bool isRoot) -> uint32_t { @@ -551,31 +514,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } return metricEntries; }; - auto countFlexibleMetricEntries = - [&](const std::map &flexibleMetrics) - -> uint32_t { return static_cast(flexibleMetrics.size()); }; - auto packKernelMetricName = [&](KernelMetric::kernelMetricKind valueId) { - writer.packStr(KernelMetric::getValueName(valueId)); - }; - auto packCycleMetricName = [&](CycleMetric::CycleMetricKind valueId) { - writer.packStr(CycleMetric::getValueName(valueId)); - }; - - auto packKernelMetricFields = [&](uint64_t duration, uint64_t invocations, - uint64_t deviceId, uint64_t deviceType) { - metricSummary.updateDeviceIdMask(deviceType, deviceId); - const auto &deviceTypeName = - getDeviceTypeString(static_cast(deviceType)); - packKernelMetricName(KernelMetric::Duration); - writer.packUInt(duration); - packKernelMetricName(KernelMetric::Invocations); - writer.packUInt(invocations); - packKernelMetricName(KernelMetric::DeviceId); - writer.packUIntString(deviceId); - packKernelMetricName(KernelMetric::DeviceType); - writer.packStr(deviceTypeName); - }; - auto packKernelMetricValues = [&](const KernelMetric *kernelMetric) { uint64_t duration = std::get(kernelMetric->getValue(KernelMetric::Duration)); @@ -585,7 +523,17 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, std::get(kernelMetric->getValue(KernelMetric::DeviceId)); uint64_t deviceType = std::get(kernelMetric->getValue(KernelMetric::DeviceType)); - packKernelMetricFields(duration, invocations, deviceId, deviceType); + metricSummary.updateDeviceIdMask(deviceType, deviceId); + const auto &deviceTypeName = + getDeviceTypeString(static_cast(deviceType)); + writer.packStr(KernelMetric::getValueName(KernelMetric::Duration)); + writer.packUInt(duration); + writer.packStr(KernelMetric::getValueName(KernelMetric::Invocations)); + writer.packUInt(invocations); + writer.packStr(KernelMetric::getValueName(KernelMetric::DeviceId)); + writer.packUIntString(deviceId); + writer.packStr(KernelMetric::getValueName(KernelMetric::DeviceType)); + writer.packStr(deviceTypeName); }; auto packMetrics = [&](const std::map> @@ -594,9 +542,9 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, for (const auto &[metricKind, metric] : metrics) { if (metricKind == MetricKind::Kernel) { if (isRoot) { - packKernelMetricName(KernelMetric::Duration); + writer.packStr(KernelMetric::getValueName(KernelMetric::Duration)); writer.packUInt(0); - packKernelMetricName(KernelMetric::Invocations); + writer.packStr(KernelMetric::getValueName(KernelMetric::Invocations)); writer.packUInt(0); continue; } @@ -616,9 +564,10 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } } else if (metricKind == MetricKind::Cycle) { if (isRoot) { - packCycleMetricName(CycleMetric::Duration); + writer.packStr(CycleMetric::getValueName(CycleMetric::Duration)); writer.packUInt(0); - packCycleMetricName(CycleMetric::NormalizedDuration); + writer.packStr( + CycleMetric::getValueName(CycleMetric::NormalizedDuration)); writer.packUInt(0); continue; } @@ -634,13 +583,14 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, std::get(cycleMetric->getValue(CycleMetric::DeviceType)); metricSummary.updateDeviceIdMask(deviceType, deviceId); - packCycleMetricName(CycleMetric::Duration); + writer.packStr(CycleMetric::getValueName(CycleMetric::Duration)); writer.packUInt(duration); - packCycleMetricName(CycleMetric::NormalizedDuration); + writer.packStr( + CycleMetric::getValueName(CycleMetric::NormalizedDuration)); writer.packDouble(normalizedDuration); - packCycleMetricName(CycleMetric::DeviceId); + writer.packStr(CycleMetric::getValueName(CycleMetric::DeviceId)); writer.packUIntString(deviceId); - packCycleMetricName(CycleMetric::DeviceType); + writer.packStr(CycleMetric::getValueName(CycleMetric::DeviceType)); writer.packUIntString(deviceType); } else { throw makeLogicError("MetricKind not supported"); @@ -649,9 +599,9 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, if (isRoot) { if (metricSummary.hasKernelMetric && metrics.find(MetricKind::Kernel) == metrics.end()) { - packKernelMetricName(KernelMetric::Duration); + writer.packStr(KernelMetric::getValueName(KernelMetric::Duration)); writer.packUInt(0); - packKernelMetricName(KernelMetric::Invocations); + writer.packStr(KernelMetric::getValueName(KernelMetric::Invocations)); writer.packUInt(0); } if (metricSummary.hasPCSamplingMetric && @@ -665,9 +615,10 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } if (metricSummary.hasCycleMetric && metrics.find(MetricKind::Cycle) == metrics.end()) { - packCycleMetricName(CycleMetric::Duration); + writer.packStr(CycleMetric::getValueName(CycleMetric::Duration)); writer.packUInt(0); - packCycleMetricName(CycleMetric::NormalizedDuration); + writer.packStr( + CycleMetric::getValueName(CycleMetric::NormalizedDuration)); writer.packUInt(0); } } @@ -677,7 +628,39 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, for (const auto &[_, flexibleMetric] : flexibleMetrics) { const auto valueName = flexibleMetric.getValueName(0); writer.packStr(valueName); - packFlexibleMetricValue(flexibleMetric.getValues()[0]); + std::visit( + [&](auto &&v) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + writer.packUInt(v); + } else if constexpr (std::is_same_v) { + writer.packInt(v); + } else if constexpr (std::is_same_v) { + writer.packDouble(v); + } else if constexpr (std::is_same_v) { + writer.packStr(v); + } else if constexpr (std::is_same_v>) { + writer.packArray(static_cast(v.size())); + for (auto value : v) { + writer.packUInt(value); + } + } else if constexpr (std::is_same_v>) { + writer.packArray(static_cast(v.size())); + for (auto value : v) { + writer.packInt(value); + } + } else if constexpr (std::is_same_v>) { + writer.packArray(static_cast(v.size())); + for (auto value : v) { + writer.packDouble(value); + } + } else { + static_assert(sizeof(T) == 0, "Unsupported MetricValueType"); + } + }, + flexibleMetric.getValues()[0]); } }; auto countPromotedFlexibleMetricEntries = @@ -688,7 +671,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, for (const auto &child : children) { auto it = linkedFlexibleMetrics.find(child.id); if (it != linkedFlexibleMetrics.end()) { - metricEntries += countFlexibleMetricEntries(it->second); + metricEntries += static_cast(it->second.size()); } } return metricEntries; @@ -718,7 +701,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, const bool isRoot = treeNode.id == TreeData::Tree::TreeNode::RootId; writer.packMap( countMetricEntries(treeNode.metricSet.metrics, isRoot) + - countFlexibleMetricEntries(treeNode.metricSet.flexibleMetrics) + + static_cast(treeNode.metricSet.flexibleMetrics.size()) + countPromotedFlexibleMetricEntries( virtualRootNode.children, treeNode.metricSet.linkedFlexibleMetrics)); @@ -730,19 +713,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, !treeNode.metricSet.linkedMetrics.empty() || !treeNode.metricSet.linkedFlexibleMetrics.empty(); - auto getOnlyLinkedKernelMetric = [&](size_t virtualNodeId) - -> const KernelMetric * { - const auto metricsIt = - treeNode.metricSet.linkedMetrics.find(virtualNodeId); - if (metricsIt == treeNode.metricSet.linkedMetrics.end() || - metricsIt->second.size() != 1 || - metricsIt->second.begin()->first != MetricKind::Kernel) { - return nullptr; - } - return static_cast( - metricsIt->second.begin()->second.get()); - }; - auto hasLinkedVirtualNode = [&](auto &&hasLinkedVirtualNode, size_t virtualNodeId) -> bool { const auto &virtualNode = virtualTree->getNode(virtualNodeId); @@ -779,8 +749,15 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); - const auto *onlyKernelMetric = - getOnlyLinkedKernelMetric(virtualNodeId); + const KernelMetric *onlyKernelMetric = nullptr; + const auto onlyMetricIt = + treeNode.metricSet.linkedMetrics.find(virtualNodeId); + if (onlyMetricIt != treeNode.metricSet.linkedMetrics.end() && + onlyMetricIt->second.size() == 1 && + onlyMetricIt->second.begin()->first == MetricKind::Kernel) { + onlyKernelMetric = static_cast( + onlyMetricIt->second.begin()->second.get()); + } if (virtualNode.children.empty() && onlyKernelMetric != nullptr) { writer.packMap(3); @@ -871,15 +848,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } } - auto countSetBits = [](uint32_t mask) -> uint32_t { - uint32_t count = 0; - while (mask) { - mask &= (mask - 1); - ++count; - } - return count; - }; - writer.packMap(deviceTypeEntries); for (size_t deviceType = 0; deviceType < static_cast(DeviceType::COUNT); ++deviceType) { @@ -892,7 +860,11 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, getDeviceTypeString(static_cast(deviceType)); writer.packStr(deviceTypeName); - writer.packMap(countSetBits(mask)); + uint32_t deviceIdEntries = 0; + for (auto remaining = mask; remaining != 0; remaining &= (remaining - 1)) { + ++deviceIdEntries; + } + writer.packMap(deviceIdEntries); for (uint64_t deviceId = 0; deviceId < kMaxRegisteredDeviceIds; ++deviceId) { if ((mask & (1u << static_cast(deviceId))) == 0) { From 395ee4caa2a4508044945292b9de7b4fd469ff71 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 17:31:01 -0400 Subject: [PATCH 07/39] Handle graph-specific capture tags during dumping --- .../proton/csrc/include/Profiler/Graph.h | 25 ++++ .../proton/csrc/lib/Data/TraceData.cpp | 19 ++- third_party/proton/csrc/lib/Data/TreeData.cpp | 120 ++++++++++++------ 3 files changed, 117 insertions(+), 47 deletions(-) diff --git a/third_party/proton/csrc/include/Profiler/Graph.h b/third_party/proton/csrc/include/Profiler/Graph.h index a4491da51b07..097dcba1efb4 100644 --- a/third_party/proton/csrc/include/Profiler/Graph.h +++ b/third_party/proton/csrc/include/Profiler/Graph.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -45,6 +47,29 @@ struct GraphState { // Capture tag to identify captured call paths static constexpr const char *captureTag = ""; static constexpr const char *metricTag = ""; + + static std::string makeCaptureTag(uint32_t graphId) { + return std::string(captureTag) + std::to_string(graphId); + } + + static bool isCaptureTag(std::string_view name) { + constexpr std::string_view prefix = captureTag; + if (name.size() < prefix.size() || + name.substr(0, prefix.size()) != prefix) { + return false; + } + for (size_t i = prefix.size(); i < name.size(); ++i) { + if (name[i] < '0' || name[i] > '9') { + return false; + } + } + return true; + } + + static std::string_view getDisplayName(std::string_view name) { + return isCaptureTag(name) ? std::string_view(captureTag) : name; + } + struct NodeState { // The graph node id for this node uint64_t nodeId{}; diff --git a/third_party/proton/csrc/lib/Data/TraceData.cpp b/third_party/proton/csrc/lib/Data/TraceData.cpp index 450c30a32ed2..84bec46008a2 100644 --- a/third_party/proton/csrc/lib/Data/TraceData.cpp +++ b/third_party/proton/csrc/lib/Data/TraceData.cpp @@ -687,11 +687,12 @@ void reconstructGraphScopeEvents( isMetadataKernel = true; break; } - if (context.name == GraphState::captureTag) { + if (GraphState::isCaptureTag(context.name)) { seenCaptureTag = true; } if (seenCaptureTag) { - graphContexts.push_back(context); + graphContexts.emplace_back( + std::string(GraphState::getDisplayName(context.name))); } } if (isMetadataKernel) { @@ -973,9 +974,17 @@ void TraceData::dumpChromeTrace(std::ostream &os, size_t phase) const { for (auto targetEntryId : targetEntryIds) { // Linked target ids are event ids, so resolve through the event first. auto &targetEvent = virtualTrace->getEvent(targetEntryId); - targetIdToVirtualContexts.emplace( - targetEntryId, virtualTrace->getContexts(targetEvent.contextId, - /*skipRoot=*/true)); + auto resolvedContexts = + virtualTrace->getContexts(targetEvent.contextId, + /*skipRoot=*/true); + std::vector virtualContexts; + virtualContexts.reserve(resolvedContexts.size()); + for (const auto &context : resolvedContexts) { + virtualContexts.emplace_back( + std::string(GraphState::getDisplayName(context.name))); + } + targetIdToVirtualContexts.emplace(targetEntryId, + std::move(virtualContexts)); } }); } diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index a0e1f4806ded..207f927acf2c 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -3,6 +3,7 @@ #include "Data/Metric.h" #include "Device.h" #include "DeviceType.h" +#include "Profiler/Graph.h" #include "Utility/Errors.h" #include "Utility/MsgPackWriter.h" #include @@ -340,12 +341,23 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, flexibleMetric.getValues()[0]); } }; + auto appendPromotedFlexibleMetrics = + [&](const auto &children, json &metricsJson, + const DataEntry::LinkedFlexibleMetricMap &linkedFlexibleMetrics) { + for (const auto &child : children) { + auto it = linkedFlexibleMetrics.find(child.id); + if (it != linkedFlexibleMetrics.end()) { + appendFlexibleMetrics(metricsJson, it->second); + } + } + }; tree->template walk( [&](TreeData::Tree::TreeNode &treeNode) { const auto &contextName = treeNode.name; auto contextId = treeNode.id; json *jsonNode = jsonNodes[contextId]; - (*jsonNode)["frame"] = {{"name", contextName}, {"type", "function"}}; + (*jsonNode)["frame"] = {{"name", GraphState::getDisplayName(contextName)}, + {"type", "function"}}; (*jsonNode)["metrics"] = json::object(); auto &metricsJson = (*jsonNode)["metrics"]; appendMetrics(metricsJson, treeNode.metricSet.metrics); @@ -365,16 +377,35 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, if (!hasLinkedTargets) { return; } - std::function appendLinkedVirtualNode = + std::set linkedVirtualNodeIds; + auto addLinkedVirtualAncestors = [&](size_t virtualNodeId, + bool includeSelf) { + if (!includeSelf) { + virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; + } + while (virtualNodeId != Tree::TreeNode::RootId) { + linkedVirtualNodeIds.insert(virtualNodeId); + virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; + } + }; + for (const auto &[virtualNodeId, _] : treeNode.metricSet.linkedMetrics) { + addLinkedVirtualAncestors(virtualNodeId, /*includeSelf=*/true); + } + for (const auto &[virtualNodeId, _] : + treeNode.metricSet.linkedFlexibleMetrics) { + addLinkedVirtualAncestors(virtualNodeId, /*includeSelf=*/false); + } + std::function appendLinkedVirtualNode = [&](size_t virtualNodeId, json &outNode, - json &parentMetricsJson) -> bool { + json &parentMetricsJson) { const auto &virtualNode = virtualTree->getNode(virtualNodeId); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); const auto flexibleIt = treeNode.metricSet.linkedFlexibleMetrics.find(virtualNodeId); outNode = json::object(); - outNode["frame"] = {{"name", virtualNode.name}, + outNode["frame"] = {{"name", GraphState::getDisplayName( + virtualNode.name)}, {"type", "function"}}; outNode["metrics"] = json::object(); if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { @@ -386,26 +417,29 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, treeNode.metricSet.linkedFlexibleMetrics.end()) { appendFlexibleMetrics(parentMetricsJson, flexibleIt->second); } + appendPromotedFlexibleMetrics( + virtualNode.children, outNode["metrics"], + treeNode.metricSet.linkedFlexibleMetrics); outNode["children"] = json::array(); auto &linkedChildren = outNode["children"]; linkedChildren.get_ref().reserve( virtualNode.children.size()); for (const auto &child : virtualNode.children) { - json linkedChildNode; - if (appendLinkedVirtualNode(child.id, linkedChildNode, - outNode["metrics"])) { - linkedChildren.push_back(std::move(linkedChildNode)); + if (linkedVirtualNodeIds.find(child.id) != + linkedVirtualNodeIds.end()) { + linkedChildren.push_back(json::object()); + appendLinkedVirtualNode(child.id, linkedChildren.back(), + outNode["metrics"]); } } - return !outNode["metrics"].empty() || - !outNode["children"].empty(); }; for (const auto &virtualChild : virtualRootNode.children) { - json linkedRootChildNode; - if (appendLinkedVirtualNode(virtualChild.id, linkedRootChildNode, - metricsJson)) { - childrenArray.push_back(std::move(linkedRootChildNode)); + if (linkedVirtualNodeIds.find(virtualChild.id) != + linkedVirtualNodeIds.end()) { + childrenArray.push_back(json::object()); + appendLinkedVirtualNode(virtualChild.id, childrenArray.back(), + metricsJson); } } }); @@ -693,7 +727,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packFixStrLiteral("frame"); writer.packMap(2); writer.packFixStrLiteral("name"); - writer.packStr(treeNode.name); + writer.packStr(GraphState::getDisplayName(treeNode.name)); writer.packFixStrLiteral("type"); writer.packFixStrLiteral("function"); @@ -712,34 +746,34 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, const bool hasLinkedTargets = !treeNode.metricSet.linkedMetrics.empty() || !treeNode.metricSet.linkedFlexibleMetrics.empty(); + std::set linkedVirtualNodeIds; + if (hasLinkedTargets) { + auto addLinkedVirtualAncestors = [&](size_t virtualNodeId, + bool includeSelf) { + if (!includeSelf) { + virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; + } + while (virtualNodeId != Tree::TreeNode::RootId) { + linkedVirtualNodeIds.insert(virtualNodeId); + virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; + } + }; + for (const auto &[virtualNodeId, _] : + treeNode.metricSet.linkedMetrics) { + addLinkedVirtualAncestors(virtualNodeId, /*includeSelf=*/true); + } + for (const auto &[virtualNodeId, _] : + treeNode.metricSet.linkedFlexibleMetrics) { + addLinkedVirtualAncestors(virtualNodeId, /*includeSelf=*/false); + } + } - auto hasLinkedVirtualNode = [&](auto &&hasLinkedVirtualNode, - size_t virtualNodeId) -> bool { - const auto &virtualNode = virtualTree->getNode(virtualNodeId); - const auto metricsIt = - treeNode.metricSet.linkedMetrics.find(virtualNodeId); - if (metricsIt != treeNode.metricSet.linkedMetrics.end() && - countMetricEntries(metricsIt->second, /*isRoot=*/false) > - 0) { - return true; - } - if (countPromotedFlexibleMetricEntries( - virtualNode.children, - treeNode.metricSet.linkedFlexibleMetrics) > 0) { - return true; - } - for (const auto &child : virtualNode.children) { - if (hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { - return true; - } - } - return false; - }; auto countLinkedVirtualChildren = [&](const auto &children) -> uint32_t { uint32_t childCount = 0; for (const auto &child : children) { - if (hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { + if (linkedVirtualNodeIds.find(child.id) != + linkedVirtualNodeIds.end()) { ++childCount; } } @@ -765,7 +799,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packFixStrLiteral("frame"); writer.packMap(2); writer.packFixStrLiteral("name"); - writer.packStr(virtualNode.name); + writer.packStr(GraphState::getDisplayName(virtualNode.name)); writer.packFixStrLiteral("type"); writer.packFixStrLiteral("function"); @@ -783,7 +817,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packFixStrLiteral("frame"); writer.packMap(2); writer.packFixStrLiteral("name"); - writer.packStr(virtualNode.name); + writer.packStr(GraphState::getDisplayName(virtualNode.name)); writer.packFixStrLiteral("type"); writer.packFixStrLiteral("function"); @@ -810,7 +844,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packArray( countLinkedVirtualChildren(virtualNode.children)); for (const auto &child : virtualNode.children) { - if (hasLinkedVirtualNode(hasLinkedVirtualNode, child.id)) { + if (linkedVirtualNodeIds.find(child.id) != + linkedVirtualNodeIds.end()) { packLinkedVirtualNode(packLinkedVirtualNode, child.id); } } @@ -828,7 +863,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } if (hasLinkedTargets) { for (const auto &virtualChild : virtualRootNode.children) { - if (hasLinkedVirtualNode(hasLinkedVirtualNode, virtualChild.id)) { + if (linkedVirtualNodeIds.find(virtualChild.id) != + linkedVirtualNodeIds.end()) { packLinkedVirtualNode(packLinkedVirtualNode, virtualChild.id); } } From 90233449dd7b1d994e1ec95cd21adc50035a3328 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 18:07:49 -0400 Subject: [PATCH 08/39] Tag CUDA graph capture contexts by graph id --- third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp index 8b74cc70ea87..790a7d0d6a2c 100644 --- a/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp +++ b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp @@ -514,15 +514,16 @@ void CuptiProfiler::CuptiProfilerPimpl::handleGraphResourceCallbacks( } for (auto *data : profiler.dataSet) { auto currentContexts = data->getContexts(); + auto graphCaptureTag = GraphState::makeCaptureTag(graphId); std::vector contexts; - contexts.emplace_back(GraphState::captureTag); + contexts.emplace_back(graphCaptureTag); for (const auto &context : currentContexts) { contexts.push_back(context); } if (isMetricKernelNode) { auto flexibleMetricContexts = data->getContexts(false); std::vector flexibleMetricEntryContexts; - flexibleMetricEntryContexts.emplace_back(GraphState::captureTag); + flexibleMetricEntryContexts.emplace_back(graphCaptureTag); for (const auto &context : flexibleMetricContexts) { flexibleMetricEntryContexts.push_back(context); } From 918f9dc2c2427bafe97c432d8628df3f6822c761 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 18:28:39 -0400 Subject: [PATCH 09/39] Resolve graph linked dumps from target roots --- .../proton/csrc/include/Profiler/Graph.h | 16 ++ third_party/proton/csrc/lib/Data/TreeData.cpp | 158 ++++++++---------- 2 files changed, 82 insertions(+), 92 deletions(-) diff --git a/third_party/proton/csrc/include/Profiler/Graph.h b/third_party/proton/csrc/include/Profiler/Graph.h index 097dcba1efb4..5134d5f69f6b 100644 --- a/third_party/proton/csrc/include/Profiler/Graph.h +++ b/third_party/proton/csrc/include/Profiler/Graph.h @@ -66,6 +66,22 @@ struct GraphState { return true; } + static std::optional getCaptureGraphId(std::string_view name) { + constexpr std::string_view prefix = captureTag; + if (name.size() <= prefix.size() || + name.substr(0, prefix.size()) != prefix) { + return std::nullopt; + } + uint32_t graphId = 0; + for (size_t i = prefix.size(); i < name.size(); ++i) { + if (name[i] < '0' || name[i] > '9') { + return std::nullopt; + } + graphId = graphId * 10 + static_cast(name[i] - '0'); + } + return graphId; + } + static std::string_view getDisplayName(std::string_view name) { return isCaptureTag(name) ? std::string_view(captureTag) : name; } diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 207f927acf2c..a8acda209ab3 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -6,6 +6,7 @@ #include "Profiler/Graph.h" #include "Utility/Errors.h" #include "Utility/MsgPackWriter.h" +#include #include #include #include @@ -16,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -140,6 +140,7 @@ class TreeData::Tree { size_t parentId = DummyId; size_t id = DummyId; + size_t graphRootId = DummyId; std::vector children = {}; std::unique_ptr> childIndex = {}; @@ -167,6 +168,11 @@ class TreeData::Tree { return existingChildId; auto id = nextContextId++; treeNodes.emplace_back(id, parentId, context); + if (GraphState::getCaptureGraphId(contextName)) { + treeNodes.back().graphRootId = id; + } else { + treeNodes.back().graphRootId = parent.graphRootId; + } parent.addChild(treeNodes.back().name, id); return id; } @@ -181,6 +187,8 @@ class TreeData::Tree { TreeNode &getNode(size_t id) { return treeNodes.at(id); } + size_t getGraphRootId(size_t id) { return getNode(id).graphRootId; } + void upsertFlexibleMetric(size_t contextId, const FlexibleMetric &flexibleMetric) { auto &node = getNode(contextId); @@ -220,6 +228,7 @@ class TreeData::Tree { for (const auto &node : treeNodes) { cloned.treeNodes.emplace_back(node.id, node.parentId, node); + cloned.treeNodes.back().graphRootId = node.graphRootId; } for (const auto &node : treeNodes) { @@ -246,7 +255,26 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, output.push_back(json::object()); jsonNodes[TreeData::Tree::TreeNode::RootId] = &(output.back()); MetricSummary metricSummary; - const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); + auto getLinkedGraphRootIds = + [&](const DataEntry::MetricSet &metricSet) -> std::vector { + std::vector rootIds; + rootIds.reserve(metricSet.linkedMetrics.size() + + metricSet.linkedFlexibleMetrics.size()); + auto addRootId = [&](size_t linkedId) { + auto rootId = virtualTree->getGraphRootId(linkedId); + if (rootId != Tree::TreeNode::DummyId && + std::find(rootIds.begin(), rootIds.end(), rootId) == rootIds.end()) { + rootIds.push_back(rootId); + } + }; + for (const auto &[linkedId, _] : metricSet.linkedMetrics) { + addRootId(linkedId); + } + for (const auto &[linkedId, _] : metricSet.linkedFlexibleMetrics) { + addRootId(linkedId); + } + return rootIds; + }; auto appendMetrics = [&](json &metricsJson, const std::map> &metrics) { @@ -364,37 +392,16 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, appendFlexibleMetrics(metricsJson, treeNode.metricSet.flexibleMetrics); auto &childrenArray = (*jsonNode)["children"]; childrenArray = json::array(); - const bool hasLinkedTargets = - !treeNode.metricSet.linkedMetrics.empty() || - !treeNode.metricSet.linkedFlexibleMetrics.empty(); + auto linkedGraphRootIds = getLinkedGraphRootIds(treeNode.metricSet); childrenArray.get_ref().reserve( - treeNode.children.size() + - (hasLinkedTargets ? virtualRootNode.children.size() : 0)); + treeNode.children.size() + linkedGraphRootIds.size()); for (const auto &child : treeNode.children) { childrenArray.push_back(json::object()); jsonNodes[child.id] = &childrenArray.back(); } - if (!hasLinkedTargets) { + if (linkedGraphRootIds.empty()) { return; } - std::set linkedVirtualNodeIds; - auto addLinkedVirtualAncestors = [&](size_t virtualNodeId, - bool includeSelf) { - if (!includeSelf) { - virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; - } - while (virtualNodeId != Tree::TreeNode::RootId) { - linkedVirtualNodeIds.insert(virtualNodeId); - virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; - } - }; - for (const auto &[virtualNodeId, _] : treeNode.metricSet.linkedMetrics) { - addLinkedVirtualAncestors(virtualNodeId, /*includeSelf=*/true); - } - for (const auto &[virtualNodeId, _] : - treeNode.metricSet.linkedFlexibleMetrics) { - addLinkedVirtualAncestors(virtualNodeId, /*includeSelf=*/false); - } std::function appendLinkedVirtualNode = [&](size_t virtualNodeId, json &outNode, json &parentMetricsJson) { @@ -425,22 +432,16 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, linkedChildren.get_ref().reserve( virtualNode.children.size()); for (const auto &child : virtualNode.children) { - if (linkedVirtualNodeIds.find(child.id) != - linkedVirtualNodeIds.end()) { - linkedChildren.push_back(json::object()); - appendLinkedVirtualNode(child.id, linkedChildren.back(), - outNode["metrics"]); - } + linkedChildren.push_back(json::object()); + appendLinkedVirtualNode(child.id, linkedChildren.back(), + outNode["metrics"]); } }; - for (const auto &virtualChild : virtualRootNode.children) { - if (linkedVirtualNodeIds.find(virtualChild.id) != - linkedVirtualNodeIds.end()) { - childrenArray.push_back(json::object()); - appendLinkedVirtualNode(virtualChild.id, childrenArray.back(), - metricsJson); - } + for (auto virtualRootId : linkedGraphRootIds) { + childrenArray.push_back(json::object()); + appendLinkedVirtualNode(virtualRootId, childrenArray.back(), + metricsJson); } }); @@ -509,6 +510,26 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, metricSummary.hasKernelMetric = true; const std::map> emptyMetrics; const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); + auto getLinkedGraphRootIds = + [&](const DataEntry::MetricSet &metricSet) -> std::vector { + std::vector rootIds; + rootIds.reserve(metricSet.linkedMetrics.size() + + metricSet.linkedFlexibleMetrics.size()); + auto addRootId = [&](size_t linkedId) { + auto rootId = virtualTree->getGraphRootId(linkedId); + if (rootId != Tree::TreeNode::DummyId && + std::find(rootIds.begin(), rootIds.end(), rootId) == rootIds.end()) { + rootIds.push_back(rootId); + } + }; + for (const auto &[linkedId, _] : metricSet.linkedMetrics) { + addRootId(linkedId); + } + for (const auto &[linkedId, _] : metricSet.linkedFlexibleMetrics) { + addRootId(linkedId); + } + return rootIds; + }; constexpr uint32_t kernelInclusiveCount = 2; constexpr uint32_t kernelTotalCount = 4; @@ -743,42 +764,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, packFlexibleMetrics(treeNode.metricSet.flexibleMetrics); packPromotedFlexibleMetrics(virtualRootNode.children, treeNode.metricSet.linkedFlexibleMetrics); - const bool hasLinkedTargets = - !treeNode.metricSet.linkedMetrics.empty() || - !treeNode.metricSet.linkedFlexibleMetrics.empty(); - std::set linkedVirtualNodeIds; - if (hasLinkedTargets) { - auto addLinkedVirtualAncestors = [&](size_t virtualNodeId, - bool includeSelf) { - if (!includeSelf) { - virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; - } - while (virtualNodeId != Tree::TreeNode::RootId) { - linkedVirtualNodeIds.insert(virtualNodeId); - virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; - } - }; - for (const auto &[virtualNodeId, _] : - treeNode.metricSet.linkedMetrics) { - addLinkedVirtualAncestors(virtualNodeId, /*includeSelf=*/true); - } - for (const auto &[virtualNodeId, _] : - treeNode.metricSet.linkedFlexibleMetrics) { - addLinkedVirtualAncestors(virtualNodeId, /*includeSelf=*/false); - } - } - - auto countLinkedVirtualChildren = - [&](const auto &children) -> uint32_t { - uint32_t childCount = 0; - for (const auto &child : children) { - if (linkedVirtualNodeIds.find(child.id) != - linkedVirtualNodeIds.end()) { - ++childCount; - } - } - return childCount; - }; + auto linkedGraphRootIds = getLinkedGraphRootIds(treeNode.metricSet); auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { @@ -842,32 +828,20 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packFixStrLiteral("children"); writer.packArray( - countLinkedVirtualChildren(virtualNode.children)); + static_cast(virtualNode.children.size())); for (const auto &child : virtualNode.children) { - if (linkedVirtualNodeIds.find(child.id) != - linkedVirtualNodeIds.end()) { - packLinkedVirtualNode(packLinkedVirtualNode, child.id); - } + packLinkedVirtualNode(packLinkedVirtualNode, child.id); } }; - uint32_t linkedChildCount = - hasLinkedTargets - ? countLinkedVirtualChildren(virtualRootNode.children) - : 0; writer.packFixStrLiteral("children"); writer.packArray(static_cast(treeNode.children.size()) + - linkedChildCount); + static_cast(linkedGraphRootIds.size())); for (const auto &child : treeNode.children) { packNode(packNode, tree->getNode(child.id)); } - if (hasLinkedTargets) { - for (const auto &virtualChild : virtualRootNode.children) { - if (linkedVirtualNodeIds.find(virtualChild.id) != - linkedVirtualNodeIds.end()) { - packLinkedVirtualNode(packLinkedVirtualNode, virtualChild.id); - } - } + for (auto virtualRootId : linkedGraphRootIds) { + packLinkedVirtualNode(packLinkedVirtualNode, virtualRootId); } }; From f51bd956c46bebfbb2bfe0af035ce9f9eca9d904 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 19:18:25 -0400 Subject: [PATCH 10/39] Remove linked kernel leaf packing shortcut --- third_party/proton/csrc/lib/Data/TreeData.cpp | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index a8acda209ab3..f3320275bfab 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -769,35 +769,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); - const KernelMetric *onlyKernelMetric = nullptr; - const auto onlyMetricIt = - treeNode.metricSet.linkedMetrics.find(virtualNodeId); - if (onlyMetricIt != treeNode.metricSet.linkedMetrics.end() && - onlyMetricIt->second.size() == 1 && - onlyMetricIt->second.begin()->first == MetricKind::Kernel) { - onlyKernelMetric = static_cast( - onlyMetricIt->second.begin()->second.get()); - } - if (virtualNode.children.empty() && - onlyKernelMetric != nullptr) { - writer.packMap(3); - - writer.packFixStrLiteral("frame"); - writer.packMap(2); - writer.packFixStrLiteral("name"); - writer.packStr(GraphState::getDisplayName(virtualNode.name)); - writer.packFixStrLiteral("type"); - writer.packFixStrLiteral("function"); - - writer.packFixStrLiteral("metrics"); - writer.packMap(kernelTotalCount); - packKernelMetricValues(onlyKernelMetric); - - writer.packFixStrLiteral("children"); - writer.packArray(0); - return; - } - writer.packMap(3); writer.packFixStrLiteral("frame"); From aab38e294dbea59f7994d3ebfb74dbc7a799a5dd Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 19:33:20 -0400 Subject: [PATCH 11/39] Remove redundant JSON flexible metric promotion --- third_party/proton/csrc/lib/Data/TreeData.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index f3320275bfab..cba21d8f24d0 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -369,16 +369,6 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, flexibleMetric.getValues()[0]); } }; - auto appendPromotedFlexibleMetrics = - [&](const auto &children, json &metricsJson, - const DataEntry::LinkedFlexibleMetricMap &linkedFlexibleMetrics) { - for (const auto &child : children) { - auto it = linkedFlexibleMetrics.find(child.id); - if (it != linkedFlexibleMetrics.end()) { - appendFlexibleMetrics(metricsJson, it->second); - } - } - }; tree->template walk( [&](TreeData::Tree::TreeNode &treeNode) { const auto &contextName = treeNode.name; @@ -424,9 +414,6 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, treeNode.metricSet.linkedFlexibleMetrics.end()) { appendFlexibleMetrics(parentMetricsJson, flexibleIt->second); } - appendPromotedFlexibleMetrics( - virtualNode.children, outNode["metrics"], - treeNode.metricSet.linkedFlexibleMetrics); outNode["children"] = json::array(); auto &linkedChildren = outNode["children"]; linkedChildren.get_ref().reserve( From c8a755f738e7a62ba618e5896f724b1c2d36219b Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 19:50:40 -0400 Subject: [PATCH 12/39] Prune linked graph dumps without capture ids --- .../proton/csrc/include/Profiler/Graph.h | 34 +- third_party/proton/csrc/lib/Data/TreeData.cpp | 374 +++++++++--------- .../csrc/lib/Profiler/Cupti/CuptiProfiler.cpp | 5 +- 3 files changed, 196 insertions(+), 217 deletions(-) diff --git a/third_party/proton/csrc/include/Profiler/Graph.h b/third_party/proton/csrc/include/Profiler/Graph.h index 5134d5f69f6b..d6604e9a116f 100644 --- a/third_party/proton/csrc/include/Profiler/Graph.h +++ b/third_party/proton/csrc/include/Profiler/Graph.h @@ -9,10 +9,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -48,38 +46,8 @@ struct GraphState { static constexpr const char *captureTag = ""; static constexpr const char *metricTag = ""; - static std::string makeCaptureTag(uint32_t graphId) { - return std::string(captureTag) + std::to_string(graphId); - } - static bool isCaptureTag(std::string_view name) { - constexpr std::string_view prefix = captureTag; - if (name.size() < prefix.size() || - name.substr(0, prefix.size()) != prefix) { - return false; - } - for (size_t i = prefix.size(); i < name.size(); ++i) { - if (name[i] < '0' || name[i] > '9') { - return false; - } - } - return true; - } - - static std::optional getCaptureGraphId(std::string_view name) { - constexpr std::string_view prefix = captureTag; - if (name.size() <= prefix.size() || - name.substr(0, prefix.size()) != prefix) { - return std::nullopt; - } - uint32_t graphId = 0; - for (size_t i = prefix.size(); i < name.size(); ++i) { - if (name[i] < '0' || name[i] > '9') { - return std::nullopt; - } - graphId = graphId * 10 + static_cast(name[i] - '0'); - } - return graphId; + return name == std::string_view(captureTag); } static std::string_view getDisplayName(std::string_view name) { diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index cba21d8f24d0..14a71c1d513e 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -6,12 +6,10 @@ #include "Profiler/Graph.h" #include "Utility/Errors.h" #include "Utility/MsgPackWriter.h" -#include #include #include #include #include -#include #include #include #include @@ -116,8 +114,8 @@ class TreeData::Tree { return; } if (children.size() > kChildIndexThreshold) { - childIndex = std::make_unique>(); + childIndex = + std::make_unique>(); childIndex->reserve(children.size()); for (const auto &child : children) { childIndex->emplace(child.name, child.id); @@ -140,7 +138,6 @@ class TreeData::Tree { size_t parentId = DummyId; size_t id = DummyId; - size_t graphRootId = DummyId; std::vector children = {}; std::unique_ptr> childIndex = {}; @@ -149,9 +146,7 @@ class TreeData::Tree { friend class Tree; }; - Tree() { - treeNodes.emplace_back(TreeNode::RootId, TreeNode::RootId, "ROOT"); - } + Tree() { treeNodes.emplace_back(TreeNode::RootId, TreeNode::RootId, "ROOT"); } size_t addNode(const std::vector &contexts, size_t parentId) { for (const auto &context : contexts) { @@ -168,11 +163,6 @@ class TreeData::Tree { return existingChildId; auto id = nextContextId++; treeNodes.emplace_back(id, parentId, context); - if (GraphState::getCaptureGraphId(contextName)) { - treeNodes.back().graphRootId = id; - } else { - treeNodes.back().graphRootId = parent.graphRootId; - } parent.addChild(treeNodes.back().name, id); return id; } @@ -187,8 +177,6 @@ class TreeData::Tree { TreeNode &getNode(size_t id) { return treeNodes.at(id); } - size_t getGraphRootId(size_t id) { return getNode(id).graphRootId; } - void upsertFlexibleMetric(size_t contextId, const FlexibleMetric &flexibleMetric) { auto &node = getNode(contextId); @@ -228,7 +216,6 @@ class TreeData::Tree { for (const auto &node : treeNodes) { cloned.treeNodes.emplace_back(node.id, node.parentId, node); - cloned.treeNodes.back().graphRootId = node.graphRootId; } for (const auto &node : treeNodes) { @@ -255,26 +242,6 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, output.push_back(json::object()); jsonNodes[TreeData::Tree::TreeNode::RootId] = &(output.back()); MetricSummary metricSummary; - auto getLinkedGraphRootIds = - [&](const DataEntry::MetricSet &metricSet) -> std::vector { - std::vector rootIds; - rootIds.reserve(metricSet.linkedMetrics.size() + - metricSet.linkedFlexibleMetrics.size()); - auto addRootId = [&](size_t linkedId) { - auto rootId = virtualTree->getGraphRootId(linkedId); - if (rootId != Tree::TreeNode::DummyId && - std::find(rootIds.begin(), rootIds.end(), rootId) == rootIds.end()) { - rootIds.push_back(rootId); - } - }; - for (const auto &[linkedId, _] : metricSet.linkedMetrics) { - addRootId(linkedId); - } - for (const auto &[linkedId, _] : metricSet.linkedFlexibleMetrics) { - addRootId(linkedId); - } - return rootIds; - }; auto appendMetrics = [&](json &metricsJson, const std::map> &metrics) { @@ -309,9 +276,8 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, for (size_t i = 0; i < PCSamplingMetric::Count; i++) { const auto valueName = PCSamplingMetric::getValueName( static_cast(i)); - std::visit( - [&](auto &&value) { metricsJson[valueName] = value; }, - pcSamplingMetric->getValues()[i]); + std::visit([&](auto &&value) { metricsJson[valueName] = value; }, + pcSamplingMetric->getValues()[i]); } } else if (metricKind == MetricKind::Cycle) { auto *cycleMetric = static_cast(metric.get()); @@ -374,61 +340,92 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, const auto &contextName = treeNode.name; auto contextId = treeNode.id; json *jsonNode = jsonNodes[contextId]; - (*jsonNode)["frame"] = {{"name", GraphState::getDisplayName(contextName)}, - {"type", "function"}}; + (*jsonNode)["frame"] = { + {"name", GraphState::getDisplayName(contextName)}, + {"type", "function"}}; (*jsonNode)["metrics"] = json::object(); auto &metricsJson = (*jsonNode)["metrics"]; appendMetrics(metricsJson, treeNode.metricSet.metrics); appendFlexibleMetrics(metricsJson, treeNode.metricSet.flexibleMetrics); auto &childrenArray = (*jsonNode)["children"]; childrenArray = json::array(); - auto linkedGraphRootIds = getLinkedGraphRootIds(treeNode.metricSet); + const auto &virtualRootNode = + virtualTree->getNode(Tree::TreeNode::RootId); + std::vector linkedVirtualNodes; + if (!treeNode.metricSet.linkedMetrics.empty() || + !treeNode.metricSet.linkedFlexibleMetrics.empty()) { + linkedVirtualNodes.assign(virtualTree->size(), 0); + auto markLinkedVirtualNodes = [&](auto &&markLinkedVirtualNodes, + size_t virtualNodeId) -> bool { + bool hasLinkedNode = + treeNode.metricSet.linkedMetrics.find(virtualNodeId) != + treeNode.metricSet.linkedMetrics.end(); + const auto &virtualNode = virtualTree->getNode(virtualNodeId); + for (const auto &child : virtualNode.children) { + if (treeNode.metricSet.linkedFlexibleMetrics.find(child.id) != + treeNode.metricSet.linkedFlexibleMetrics.end()) { + hasLinkedNode = true; + } + hasLinkedNode = + markLinkedVirtualNodes(markLinkedVirtualNodes, child.id) || + hasLinkedNode; + } + linkedVirtualNodes[virtualNodeId] = hasLinkedNode; + return hasLinkedNode; + }; + markLinkedVirtualNodes(markLinkedVirtualNodes, + Tree::TreeNode::RootId); + } childrenArray.get_ref().reserve( - treeNode.children.size() + linkedGraphRootIds.size()); + treeNode.children.size() + virtualRootNode.children.size()); for (const auto &child : treeNode.children) { childrenArray.push_back(json::object()); jsonNodes[child.id] = &childrenArray.back(); } - if (linkedGraphRootIds.empty()) { - return; - } - std::function appendLinkedVirtualNode = - [&](size_t virtualNodeId, json &outNode, - json &parentMetricsJson) { - const auto &virtualNode = virtualTree->getNode(virtualNodeId); - const auto metricsIt = - treeNode.metricSet.linkedMetrics.find(virtualNodeId); - const auto flexibleIt = - treeNode.metricSet.linkedFlexibleMetrics.find(virtualNodeId); - outNode = json::object(); - outNode["frame"] = {{"name", GraphState::getDisplayName( - virtualNode.name)}, - {"type", "function"}}; - outNode["metrics"] = json::object(); - if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { - appendMetrics(outNode["metrics"], metricsIt->second); - } - // Linked flexible metrics are only attached to - // children, so they always belong on the parent frame. - if (flexibleIt != - treeNode.metricSet.linkedFlexibleMetrics.end()) { - appendFlexibleMetrics(parentMetricsJson, flexibleIt->second); - } - outNode["children"] = json::array(); - auto &linkedChildren = outNode["children"]; - linkedChildren.get_ref().reserve( - virtualNode.children.size()); - for (const auto &child : virtualNode.children) { - linkedChildren.push_back(json::object()); - appendLinkedVirtualNode(child.id, linkedChildren.back(), - outNode["metrics"]); - } - }; + auto appendLinkedVirtualNode = [&](auto &&appendLinkedVirtualNode, + size_t virtualNodeId, + json &outNode) -> void { + const auto &virtualNode = virtualTree->getNode(virtualNodeId); + const auto metricsIt = + treeNode.metricSet.linkedMetrics.find(virtualNodeId); + outNode = json::object(); + outNode["frame"] = { + {"name", GraphState::getDisplayName(virtualNode.name)}, + {"type", "function"}}; + outNode["metrics"] = json::object(); + if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { + appendMetrics(outNode["metrics"], metricsIt->second); + } + // Linked flexible metrics are only attached to + // children, so they always belong on the parent frame. + for (const auto &child : virtualNode.children) { + auto flexibleIt = + treeNode.metricSet.linkedFlexibleMetrics.find(child.id); + if (flexibleIt != treeNode.metricSet.linkedFlexibleMetrics.end()) { + appendFlexibleMetrics(outNode["metrics"], flexibleIt->second); + } + } + outNode["children"] = json::array(); + auto &linkedChildren = outNode["children"]; + linkedChildren.get_ref().reserve( + virtualNode.children.size()); + for (const auto &child : virtualNode.children) { + if (!linkedVirtualNodes[child.id]) { + continue; + } + linkedChildren.push_back(json::object()); + appendLinkedVirtualNode(appendLinkedVirtualNode, child.id, + linkedChildren.back()); + } + }; - for (auto virtualRootId : linkedGraphRootIds) { + for (const auto &child : virtualRootNode.children) { + if (linkedVirtualNodes.empty() || !linkedVirtualNodes[child.id]) { + continue; + } childrenArray.push_back(json::object()); - appendLinkedVirtualNode(virtualRootId, childrenArray.back(), - metricsJson); + appendLinkedVirtualNode(appendLinkedVirtualNode, child.id, + childrenArray.back()); } }); @@ -448,8 +445,7 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, for (size_t i = 0; i < PCSamplingMetric::Count; i++) { const auto valueName = PCSamplingMetric::getValueName( static_cast(i)); - output[TreeData::Tree::TreeNode::RootId]["metrics"] - [valueName] = 0; + output[TreeData::Tree::TreeNode::RootId]["metrics"][valueName] = 0; } } @@ -497,26 +493,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, metricSummary.hasKernelMetric = true; const std::map> emptyMetrics; const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); - auto getLinkedGraphRootIds = - [&](const DataEntry::MetricSet &metricSet) -> std::vector { - std::vector rootIds; - rootIds.reserve(metricSet.linkedMetrics.size() + - metricSet.linkedFlexibleMetrics.size()); - auto addRootId = [&](size_t linkedId) { - auto rootId = virtualTree->getGraphRootId(linkedId); - if (rootId != Tree::TreeNode::DummyId && - std::find(rootIds.begin(), rootIds.end(), rootId) == rootIds.end()) { - rootIds.push_back(rootId); - } - }; - for (const auto &[linkedId, _] : metricSet.linkedMetrics) { - addRootId(linkedId); - } - for (const auto &[linkedId, _] : metricSet.linkedFlexibleMetrics) { - addRootId(linkedId); - } - return rootIds; - }; constexpr uint32_t kernelInclusiveCount = 2; constexpr uint32_t kernelTotalCount = 4; @@ -559,8 +535,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, auto packKernelMetricValues = [&](const KernelMetric *kernelMetric) { uint64_t duration = std::get(kernelMetric->getValue(KernelMetric::Duration)); - uint64_t invocations = std::get( - kernelMetric->getValue(KernelMetric::Invocations)); + uint64_t invocations = + std::get(kernelMetric->getValue(KernelMetric::Invocations)); uint64_t deviceId = std::get(kernelMetric->getValue(KernelMetric::DeviceId)); uint64_t deviceType = @@ -686,14 +662,12 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, for (auto value : v) { writer.packUInt(value); } - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { writer.packArray(static_cast(v.size())); for (auto value : v) { writer.packInt(value); } - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { writer.packArray(static_cast(v.size())); for (auto value : v) { writer.packDouble(value); @@ -728,80 +702,118 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } } }; - auto packNode = [&](auto &&packNode, TreeData::Tree::TreeNode &treeNode) - -> void { - writer.packMap(3); - - writer.packFixStrLiteral("frame"); - writer.packMap(2); - writer.packFixStrLiteral("name"); - writer.packStr(GraphState::getDisplayName(treeNode.name)); - writer.packFixStrLiteral("type"); - writer.packFixStrLiteral("function"); - - writer.packFixStrLiteral("metrics"); - const bool isRoot = treeNode.id == TreeData::Tree::TreeNode::RootId; - writer.packMap( - countMetricEntries(treeNode.metricSet.metrics, isRoot) + - static_cast(treeNode.metricSet.flexibleMetrics.size()) + - countPromotedFlexibleMetricEntries( - virtualRootNode.children, - treeNode.metricSet.linkedFlexibleMetrics)); - packMetrics(treeNode.metricSet.metrics, isRoot); - packFlexibleMetrics(treeNode.metricSet.flexibleMetrics); - packPromotedFlexibleMetrics(virtualRootNode.children, - treeNode.metricSet.linkedFlexibleMetrics); - auto linkedGraphRootIds = getLinkedGraphRootIds(treeNode.metricSet); - - auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, - size_t virtualNodeId) -> void { - const auto &virtualNode = virtualTree->getNode(virtualNodeId); - writer.packMap(3); - - writer.packFixStrLiteral("frame"); - writer.packMap(2); - writer.packFixStrLiteral("name"); - writer.packStr(GraphState::getDisplayName(virtualNode.name)); - writer.packFixStrLiteral("type"); - writer.packFixStrLiteral("function"); - - writer.packFixStrLiteral("metrics"); - const auto metricsIt = - treeNode.metricSet.linkedMetrics.find(virtualNodeId); - const auto &linkedMetrics = - (metricsIt != treeNode.metricSet.linkedMetrics.end()) - ? metricsIt->second - : emptyMetrics; - writer.packMap( - countMetricEntries(linkedMetrics, /*isRoot=*/false) + - countPromotedFlexibleMetricEntries( - virtualNode.children, - treeNode.metricSet.linkedFlexibleMetrics)); - packMetrics(linkedMetrics, /*isRoot=*/false); - // Linked flexible metrics are only attached to - // children, so they are always packed into the parent frame. - packPromotedFlexibleMetrics( - virtualNode.children, - treeNode.metricSet.linkedFlexibleMetrics); - - writer.packFixStrLiteral("children"); - writer.packArray( - static_cast(virtualNode.children.size())); - for (const auto &child : virtualNode.children) { - packLinkedVirtualNode(packLinkedVirtualNode, child.id); - } - }; - - writer.packFixStrLiteral("children"); - writer.packArray(static_cast(treeNode.children.size()) + - static_cast(linkedGraphRootIds.size())); - for (const auto &child : treeNode.children) { - packNode(packNode, tree->getNode(child.id)); - } - for (auto virtualRootId : linkedGraphRootIds) { - packLinkedVirtualNode(packLinkedVirtualNode, virtualRootId); + auto packNode = [&](auto &&packNode, + TreeData::Tree::TreeNode &treeNode) -> void { + writer.packMap(3); + + writer.packFixStrLiteral("frame"); + writer.packMap(2); + writer.packFixStrLiteral("name"); + writer.packStr(GraphState::getDisplayName(treeNode.name)); + writer.packFixStrLiteral("type"); + writer.packFixStrLiteral("function"); + + writer.packFixStrLiteral("metrics"); + const bool isRoot = treeNode.id == TreeData::Tree::TreeNode::RootId; + writer.packMap( + countMetricEntries(treeNode.metricSet.metrics, isRoot) + + static_cast(treeNode.metricSet.flexibleMetrics.size()) + + countPromotedFlexibleMetricEntries( + virtualRootNode.children, + treeNode.metricSet.linkedFlexibleMetrics)); + packMetrics(treeNode.metricSet.metrics, isRoot); + packFlexibleMetrics(treeNode.metricSet.flexibleMetrics); + packPromotedFlexibleMetrics(virtualRootNode.children, + treeNode.metricSet.linkedFlexibleMetrics); + std::vector linkedVirtualNodes; + if (!treeNode.metricSet.linkedMetrics.empty() || + !treeNode.metricSet.linkedFlexibleMetrics.empty()) { + linkedVirtualNodes.assign(virtualTree->size(), 0); + auto markLinkedVirtualNodes = [&](auto &&markLinkedVirtualNodes, + size_t virtualNodeId) -> bool { + bool hasLinkedNode = + treeNode.metricSet.linkedMetrics.find(virtualNodeId) != + treeNode.metricSet.linkedMetrics.end(); + const auto &virtualNode = virtualTree->getNode(virtualNodeId); + for (const auto &child : virtualNode.children) { + if (treeNode.metricSet.linkedFlexibleMetrics.find(child.id) != + treeNode.metricSet.linkedFlexibleMetrics.end()) { + hasLinkedNode = true; + } + hasLinkedNode = + markLinkedVirtualNodes(markLinkedVirtualNodes, child.id) || + hasLinkedNode; } + linkedVirtualNodes[virtualNodeId] = hasLinkedNode; + return hasLinkedNode; }; + markLinkedVirtualNodes(markLinkedVirtualNodes, Tree::TreeNode::RootId); + } + auto countLinkedVirtualChildren = [&](const auto &children) { + uint32_t childCount = 0; + if (linkedVirtualNodes.empty()) { + return childCount; + } + for (const auto &child : children) { + if (linkedVirtualNodes[child.id]) { + ++childCount; + } + } + return childCount; + }; + + auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, + size_t virtualNodeId) -> void { + const auto &virtualNode = virtualTree->getNode(virtualNodeId); + writer.packMap(3); + + writer.packFixStrLiteral("frame"); + writer.packMap(2); + writer.packFixStrLiteral("name"); + writer.packStr(GraphState::getDisplayName(virtualNode.name)); + writer.packFixStrLiteral("type"); + writer.packFixStrLiteral("function"); + + writer.packFixStrLiteral("metrics"); + const auto metricsIt = + treeNode.metricSet.linkedMetrics.find(virtualNodeId); + const auto &linkedMetrics = + (metricsIt != treeNode.metricSet.linkedMetrics.end()) + ? metricsIt->second + : emptyMetrics; + writer.packMap( + countMetricEntries(linkedMetrics, /*isRoot=*/false) + + countPromotedFlexibleMetricEntries( + virtualNode.children, treeNode.metricSet.linkedFlexibleMetrics)); + packMetrics(linkedMetrics, /*isRoot=*/false); + // Linked flexible metrics are only attached to + // children, so they are always packed into the parent frame. + packPromotedFlexibleMetrics(virtualNode.children, + treeNode.metricSet.linkedFlexibleMetrics); + + writer.packFixStrLiteral("children"); + writer.packArray(countLinkedVirtualChildren(virtualNode.children)); + for (const auto &child : virtualNode.children) { + if (!linkedVirtualNodes[child.id]) { + continue; + } + packLinkedVirtualNode(packLinkedVirtualNode, child.id); + } + }; + + writer.packFixStrLiteral("children"); + writer.packArray(static_cast(treeNode.children.size()) + + countLinkedVirtualChildren(virtualRootNode.children)); + for (const auto &child : treeNode.children) { + packNode(packNode, tree->getNode(child.id)); + } + for (const auto &child : virtualRootNode.children) { + if (linkedVirtualNodes.empty() || !linkedVirtualNodes[child.id]) { + continue; + } + packLinkedVirtualNode(packLinkedVirtualNode, child.id); + } + }; // Hatchet format: [tree, device_metadata]. Always emit 2 elements to match // the JSON serializer, even if device_metadata is empty. diff --git a/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp index 790a7d0d6a2c..8b74cc70ea87 100644 --- a/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp +++ b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp @@ -514,16 +514,15 @@ void CuptiProfiler::CuptiProfilerPimpl::handleGraphResourceCallbacks( } for (auto *data : profiler.dataSet) { auto currentContexts = data->getContexts(); - auto graphCaptureTag = GraphState::makeCaptureTag(graphId); std::vector contexts; - contexts.emplace_back(graphCaptureTag); + contexts.emplace_back(GraphState::captureTag); for (const auto &context : currentContexts) { contexts.push_back(context); } if (isMetricKernelNode) { auto flexibleMetricContexts = data->getContexts(false); std::vector flexibleMetricEntryContexts; - flexibleMetricEntryContexts.emplace_back(graphCaptureTag); + flexibleMetricEntryContexts.emplace_back(GraphState::captureTag); for (const auto &context : flexibleMetricContexts) { flexibleMetricEntryContexts.push_back(context); } From 9fbce0e6ee66506fb7bfc74e218047e605bd8348 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 20:06:22 -0400 Subject: [PATCH 13/39] Clarify linked flexible metric pruning --- third_party/proton/csrc/lib/Data/TreeData.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 14a71c1d513e..80d0601f0243 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -362,6 +362,9 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, treeNode.metricSet.linkedMetrics.end(); const auto &virtualNode = virtualTree->getNode(virtualNodeId); for (const auto &child : virtualNode.children) { + // Linked flexible metrics are stored on the child node, + // but serialized on the parent frame so we can omit the helper + // node from the dumped tree. if (treeNode.metricSet.linkedFlexibleMetrics.find(child.id) != treeNode.metricSet.linkedFlexibleMetrics.end()) { hasLinkedNode = true; @@ -736,6 +739,9 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, treeNode.metricSet.linkedMetrics.end(); const auto &virtualNode = virtualTree->getNode(virtualNodeId); for (const auto &child : virtualNode.children) { + // Linked flexible metrics are stored on the child node, but + // serialized on the parent frame so we can omit the helper node from + // the dumped tree. if (treeNode.metricSet.linkedFlexibleMetrics.find(child.id) != treeNode.metricSet.linkedFlexibleMetrics.end()) { hasLinkedNode = true; From 91ad8a5df3134de0438df212946992865aab887b Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 20:07:54 -0400 Subject: [PATCH 14/39] Inline graph capture tag display helpers --- third_party/proton/csrc/include/Profiler/Graph.h | 9 --------- third_party/proton/csrc/lib/Data/TraceData.cpp | 8 +++----- third_party/proton/csrc/lib/Data/TreeData.cpp | 11 ++++------- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/third_party/proton/csrc/include/Profiler/Graph.h b/third_party/proton/csrc/include/Profiler/Graph.h index d6604e9a116f..3cea6e70ae32 100644 --- a/third_party/proton/csrc/include/Profiler/Graph.h +++ b/third_party/proton/csrc/include/Profiler/Graph.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -46,14 +45,6 @@ struct GraphState { static constexpr const char *captureTag = ""; static constexpr const char *metricTag = ""; - static bool isCaptureTag(std::string_view name) { - return name == std::string_view(captureTag); - } - - static std::string_view getDisplayName(std::string_view name) { - return isCaptureTag(name) ? std::string_view(captureTag) : name; - } - struct NodeState { // The graph node id for this node uint64_t nodeId{}; diff --git a/third_party/proton/csrc/lib/Data/TraceData.cpp b/third_party/proton/csrc/lib/Data/TraceData.cpp index 84bec46008a2..2a9921b99a90 100644 --- a/third_party/proton/csrc/lib/Data/TraceData.cpp +++ b/third_party/proton/csrc/lib/Data/TraceData.cpp @@ -687,12 +687,11 @@ void reconstructGraphScopeEvents( isMetadataKernel = true; break; } - if (GraphState::isCaptureTag(context.name)) { + if (context.name == GraphState::captureTag) { seenCaptureTag = true; } if (seenCaptureTag) { - graphContexts.emplace_back( - std::string(GraphState::getDisplayName(context.name))); + graphContexts.emplace_back(context.name); } } if (isMetadataKernel) { @@ -980,8 +979,7 @@ void TraceData::dumpChromeTrace(std::ostream &os, size_t phase) const { std::vector virtualContexts; virtualContexts.reserve(resolvedContexts.size()); for (const auto &context : resolvedContexts) { - virtualContexts.emplace_back( - std::string(GraphState::getDisplayName(context.name))); + virtualContexts.emplace_back(context.name); } targetIdToVirtualContexts.emplace(targetEntryId, std::move(virtualContexts)); diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 80d0601f0243..45318435beab 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -340,9 +340,7 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, const auto &contextName = treeNode.name; auto contextId = treeNode.id; json *jsonNode = jsonNodes[contextId]; - (*jsonNode)["frame"] = { - {"name", GraphState::getDisplayName(contextName)}, - {"type", "function"}}; + (*jsonNode)["frame"] = {{"name", contextName}, {"type", "function"}}; (*jsonNode)["metrics"] = json::object(); auto &metricsJson = (*jsonNode)["metrics"]; appendMetrics(metricsJson, treeNode.metricSet.metrics); @@ -393,8 +391,7 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, treeNode.metricSet.linkedMetrics.find(virtualNodeId); outNode = json::object(); outNode["frame"] = { - {"name", GraphState::getDisplayName(virtualNode.name)}, - {"type", "function"}}; + {"name", virtualNode.name}, {"type", "function"}}; outNode["metrics"] = json::object(); if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { appendMetrics(outNode["metrics"], metricsIt->second); @@ -712,7 +709,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packFixStrLiteral("frame"); writer.packMap(2); writer.packFixStrLiteral("name"); - writer.packStr(GraphState::getDisplayName(treeNode.name)); + writer.packStr(treeNode.name); writer.packFixStrLiteral("type"); writer.packFixStrLiteral("function"); @@ -776,7 +773,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packFixStrLiteral("frame"); writer.packMap(2); writer.packFixStrLiteral("name"); - writer.packStr(GraphState::getDisplayName(virtualNode.name)); + writer.packStr(virtualNode.name); writer.packFixStrLiteral("type"); writer.packFixStrLiteral("function"); From 3c33f6408d1aa1642a7fe1cd9c38825923e91ca7 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 20:21:29 -0400 Subject: [PATCH 15/39] Mark linked virtual ancestors directly --- third_party/proton/csrc/lib/Data/TreeData.cpp | 76 +++++++++---------- 1 file changed, 35 insertions(+), 41 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 45318435beab..117a37d5462d 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -353,29 +353,26 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, if (!treeNode.metricSet.linkedMetrics.empty() || !treeNode.metricSet.linkedFlexibleMetrics.empty()) { linkedVirtualNodes.assign(virtualTree->size(), 0); - auto markLinkedVirtualNodes = [&](auto &&markLinkedVirtualNodes, - size_t virtualNodeId) -> bool { - bool hasLinkedNode = - treeNode.metricSet.linkedMetrics.find(virtualNodeId) != - treeNode.metricSet.linkedMetrics.end(); - const auto &virtualNode = virtualTree->getNode(virtualNodeId); - for (const auto &child : virtualNode.children) { - // Linked flexible metrics are stored on the child node, - // but serialized on the parent frame so we can omit the helper - // node from the dumped tree. - if (treeNode.metricSet.linkedFlexibleMetrics.find(child.id) != - treeNode.metricSet.linkedFlexibleMetrics.end()) { - hasLinkedNode = true; + auto markLinkedVirtualNode = [&](size_t virtualNodeId) { + while (virtualNodeId != Tree::TreeNode::DummyId && + !linkedVirtualNodes[virtualNodeId]) { + linkedVirtualNodes[virtualNodeId] = 1; + if (virtualNodeId == Tree::TreeNode::RootId) { + break; } - hasLinkedNode = - markLinkedVirtualNodes(markLinkedVirtualNodes, child.id) || - hasLinkedNode; + virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; } - linkedVirtualNodes[virtualNodeId] = hasLinkedNode; - return hasLinkedNode; }; - markLinkedVirtualNodes(markLinkedVirtualNodes, - Tree::TreeNode::RootId); + for (const auto &[linkedId, _] : treeNode.metricSet.linkedMetrics) { + markLinkedVirtualNode(linkedId); + } + for (const auto &[linkedId, _] : + treeNode.metricSet.linkedFlexibleMetrics) { + // Flexible metrics are keyed by the child helper, but + // serialized on the parent frame so the helper node can stay out + // of the dumped tree. + markLinkedVirtualNode(virtualTree->getNode(linkedId).parentId); + } } childrenArray.get_ref().reserve( treeNode.children.size() + virtualRootNode.children.size()); @@ -390,8 +387,7 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); outNode = json::object(); - outNode["frame"] = { - {"name", virtualNode.name}, {"type", "function"}}; + outNode["frame"] = {{"name", virtualNode.name}, {"type", "function"}}; outNode["metrics"] = json::object(); if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { appendMetrics(outNode["metrics"], metricsIt->second); @@ -729,28 +725,26 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, if (!treeNode.metricSet.linkedMetrics.empty() || !treeNode.metricSet.linkedFlexibleMetrics.empty()) { linkedVirtualNodes.assign(virtualTree->size(), 0); - auto markLinkedVirtualNodes = [&](auto &&markLinkedVirtualNodes, - size_t virtualNodeId) -> bool { - bool hasLinkedNode = - treeNode.metricSet.linkedMetrics.find(virtualNodeId) != - treeNode.metricSet.linkedMetrics.end(); - const auto &virtualNode = virtualTree->getNode(virtualNodeId); - for (const auto &child : virtualNode.children) { - // Linked flexible metrics are stored on the child node, but - // serialized on the parent frame so we can omit the helper node from - // the dumped tree. - if (treeNode.metricSet.linkedFlexibleMetrics.find(child.id) != - treeNode.metricSet.linkedFlexibleMetrics.end()) { - hasLinkedNode = true; + auto markLinkedVirtualNode = [&](size_t virtualNodeId) { + while (virtualNodeId != Tree::TreeNode::DummyId && + !linkedVirtualNodes[virtualNodeId]) { + linkedVirtualNodes[virtualNodeId] = 1; + if (virtualNodeId == Tree::TreeNode::RootId) { + break; } - hasLinkedNode = - markLinkedVirtualNodes(markLinkedVirtualNodes, child.id) || - hasLinkedNode; + virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; } - linkedVirtualNodes[virtualNodeId] = hasLinkedNode; - return hasLinkedNode; }; - markLinkedVirtualNodes(markLinkedVirtualNodes, Tree::TreeNode::RootId); + for (const auto &[linkedId, _] : treeNode.metricSet.linkedMetrics) { + markLinkedVirtualNode(linkedId); + } + for (const auto &[linkedId, _] : + treeNode.metricSet.linkedFlexibleMetrics) { + // Flexible metrics are keyed by the child helper, but + // serialized on the parent frame so the helper node can stay out of + // the dumped tree. + markLinkedVirtualNode(virtualTree->getNode(linkedId).parentId); + } } auto countLinkedVirtualChildren = [&](const auto &children) { uint32_t childCount = 0; From e1ea3c87f56cb060e8ec89c061e7d147c098477c Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 20:27:04 -0400 Subject: [PATCH 16/39] Document MsgPack metric entry counts --- third_party/proton/csrc/lib/Data/TreeData.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 117a37d5462d..cac59f1703cd 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -490,10 +490,13 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, const std::map> emptyMetrics; const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); - constexpr uint32_t kernelInclusiveCount = 2; - constexpr uint32_t kernelTotalCount = 4; - constexpr uint32_t cycleInclusiveCount = 2; - constexpr uint32_t cycleTotalCount = 4; + // Root metrics only carry inclusive aggregate fields. Non-root metrics also + // include device_id and device_type, so their serialized map entry counts are + // larger. + constexpr uint32_t kernelInclusiveCount = 2; // duration, count + constexpr uint32_t kernelTotalCount = 4; // + device_id, device_type + constexpr uint32_t cycleInclusiveCount = 2; // duration, normalized_duration + constexpr uint32_t cycleTotalCount = 4; // + device_id, device_type auto countMetricEntries = [&](const std::map> &metrics, From 649af3d06ea092e40184ede7b05511ec858f9675 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 20:28:57 -0400 Subject: [PATCH 17/39] Document TreeData serializer helpers --- third_party/proton/csrc/lib/Data/TreeData.cpp | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index cac59f1703cd..7fcb25202116 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -242,6 +242,8 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, output.push_back(json::object()); jsonNodes[TreeData::Tree::TreeNode::RootId] = &(output.back()); MetricSummary metricSummary; + // Append fixed-schema metrics to a JSON metrics object and update device + // metadata requirements while visiting them. auto appendMetrics = [&](json &metricsJson, const std::map> &metrics) { @@ -307,6 +309,8 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, } } }; + // Append user-defined flexible metrics, preserving scalar and vector value + // types in the JSON output. auto appendFlexibleMetrics = [&](json &metricsJson, const std::map &flexibleMetrics) { @@ -353,6 +357,8 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, if (!treeNode.metricSet.linkedMetrics.empty() || !treeNode.metricSet.linkedFlexibleMetrics.empty()) { linkedVirtualNodes.assign(virtualTree->size(), 0); + // Mark each linked target and its ancestors, producing the smallest + // virtual subtree needed to keep the linked target reachable. auto markLinkedVirtualNode = [&](size_t virtualNodeId) { while (virtualNodeId != Tree::TreeNode::DummyId && !linkedVirtualNodes[virtualNodeId]) { @@ -380,6 +386,7 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, childrenArray.push_back(json::object()); jsonNodes[child.id] = &childrenArray.back(); } + // Copy a marked virtual subtree into the current JSON node. auto appendLinkedVirtualNode = [&](auto &&appendLinkedVirtualNode, size_t virtualNodeId, json &outNode) -> void { @@ -498,6 +505,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, constexpr uint32_t cycleInclusiveCount = 2; // duration, normalized_duration constexpr uint32_t cycleTotalCount = 4; // + device_id, device_type + // Count the exact number of key/value entries needed for a MsgPack metrics + // map before writing it. auto countMetricEntries = [&](const std::map> &metrics, bool isRoot) -> uint32_t { @@ -531,6 +540,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } return metricEntries; }; + // Pack the four fields emitted for a concrete kernel metric. auto packKernelMetricValues = [&](const KernelMetric *kernelMetric) { uint64_t duration = std::get(kernelMetric->getValue(KernelMetric::Duration)); @@ -553,6 +563,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packStr(deviceTypeName); }; + // Pack all fixed-schema metrics for one frame. Root frames emit zero-valued + // inclusive placeholders for any metric type observed elsewhere. auto packMetrics = [&](const std::map> &metrics, bool isRoot) { @@ -640,6 +652,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } } }; + // Pack user-defined flexible metrics in MsgPack, preserving scalar and vector + // value types. auto packFlexibleMetrics = [&](const std::map &flexibleMetrics) { for (const auto &[_, flexibleMetric] : flexibleMetrics) { @@ -678,6 +692,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, flexibleMetric.getValues()[0]); } }; + // Count flexible metrics attached to child helpers that will be + // promoted into the parent frame's metrics map. auto countPromotedFlexibleMetricEntries = [&](const auto &children, const DataEntry::LinkedFlexibleMetricMap &linkedFlexibleMetrics) @@ -691,6 +707,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } return metricEntries; }; + // Pack the child helper entries into the parent frame. auto packPromotedFlexibleMetrics = [&](const auto &children, const DataEntry::LinkedFlexibleMetricMap &linkedFlexibleMetrics) { @@ -701,6 +718,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } } }; + // Pack a real tree node, followed by any linked virtual subtree that belongs + // under the same frame. auto packNode = [&](auto &&packNode, TreeData::Tree::TreeNode &treeNode) -> void { writer.packMap(3); @@ -728,6 +747,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, if (!treeNode.metricSet.linkedMetrics.empty() || !treeNode.metricSet.linkedFlexibleMetrics.empty()) { linkedVirtualNodes.assign(virtualTree->size(), 0); + // Mark each linked target and its ancestors, producing the smallest + // virtual subtree needed to keep the linked target reachable. auto markLinkedVirtualNode = [&](size_t virtualNodeId) { while (virtualNodeId != Tree::TreeNode::DummyId && !linkedVirtualNodes[virtualNodeId]) { @@ -749,6 +770,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, markLinkedVirtualNode(virtualTree->getNode(linkedId).parentId); } } + // Count marked linked children so MsgPack array headers can be emitted + // before recursively packing the child nodes. auto countLinkedVirtualChildren = [&](const auto &children) { uint32_t childCount = 0; if (linkedVirtualNodes.empty()) { @@ -762,6 +785,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, return childCount; }; + // Pack a marked virtual subtree as linked children of the current real + // frame. auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); From 2b8bfc286f99c7c31d1c252eed7dda4ebc807a94 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 20:33:47 -0400 Subject: [PATCH 18/39] Reuse linked virtual node marks --- third_party/proton/csrc/lib/Data/TreeData.cpp | 60 ++++++++++++++----- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 7fcb25202116..18a51b63d867 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -6,6 +6,7 @@ #include "Profiler/Graph.h" #include "Utility/Errors.h" #include "Utility/MsgPackWriter.h" +#include #include #include #include @@ -242,6 +243,8 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, output.push_back(json::object()); jsonNodes[TreeData::Tree::TreeNode::RootId] = &(output.back()); MetricSummary metricSummary; + std::vector linkedVirtualNodeMarks(virtualTree->size(), 0); + uint32_t linkedVirtualNodeMark = 0; // Append fixed-schema metrics to a JSON metrics object and update device // metadata requirements while visiting them. auto appendMetrics = [&](json &metricsJson, @@ -353,16 +356,28 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, childrenArray = json::array(); const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); - std::vector linkedVirtualNodes; + bool hasLinkedVirtualNodes = false; + uint32_t currentLinkedVirtualNodeMark = 0; if (!treeNode.metricSet.linkedMetrics.empty() || !treeNode.metricSet.linkedFlexibleMetrics.empty()) { - linkedVirtualNodes.assign(virtualTree->size(), 0); + hasLinkedVirtualNodes = true; + // Reuse the mark buffer across tree nodes. Bumping the generation + // avoids clearing a virtual-tree-sized buffer for each real node. + ++linkedVirtualNodeMark; + if (linkedVirtualNodeMark == 0) { + std::fill(linkedVirtualNodeMarks.begin(), + linkedVirtualNodeMarks.end(), 0); + linkedVirtualNodeMark = 1; + } + currentLinkedVirtualNodeMark = linkedVirtualNodeMark; // Mark each linked target and its ancestors, producing the smallest // virtual subtree needed to keep the linked target reachable. auto markLinkedVirtualNode = [&](size_t virtualNodeId) { while (virtualNodeId != Tree::TreeNode::DummyId && - !linkedVirtualNodes[virtualNodeId]) { - linkedVirtualNodes[virtualNodeId] = 1; + linkedVirtualNodeMarks[virtualNodeId] != + currentLinkedVirtualNodeMark) { + linkedVirtualNodeMarks[virtualNodeId] = + currentLinkedVirtualNodeMark; if (virtualNodeId == Tree::TreeNode::RootId) { break; } @@ -413,7 +428,8 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, linkedChildren.get_ref().reserve( virtualNode.children.size()); for (const auto &child : virtualNode.children) { - if (!linkedVirtualNodes[child.id]) { + if (linkedVirtualNodeMarks[child.id] != + currentLinkedVirtualNodeMark) { continue; } linkedChildren.push_back(json::object()); @@ -423,7 +439,8 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, }; for (const auto &child : virtualRootNode.children) { - if (linkedVirtualNodes.empty() || !linkedVirtualNodes[child.id]) { + if (!hasLinkedVirtualNodes || linkedVirtualNodeMarks[child.id] != + currentLinkedVirtualNodeMark) { continue; } childrenArray.push_back(json::object()); @@ -496,6 +513,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, metricSummary.hasKernelMetric = true; const std::map> emptyMetrics; const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); + std::vector linkedVirtualNodeMarks(virtualTree->size(), 0); + uint32_t linkedVirtualNodeMark = 0; // Root metrics only carry inclusive aggregate fields. Non-root metrics also // include device_id and device_type, so their serialized map entry counts are @@ -743,16 +762,28 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, packFlexibleMetrics(treeNode.metricSet.flexibleMetrics); packPromotedFlexibleMetrics(virtualRootNode.children, treeNode.metricSet.linkedFlexibleMetrics); - std::vector linkedVirtualNodes; + bool hasLinkedVirtualNodes = false; + uint32_t currentLinkedVirtualNodeMark = 0; if (!treeNode.metricSet.linkedMetrics.empty() || !treeNode.metricSet.linkedFlexibleMetrics.empty()) { - linkedVirtualNodes.assign(virtualTree->size(), 0); + hasLinkedVirtualNodes = true; + // Reuse the mark buffer across recursive packNode calls. Each node keeps + // its own generation id so child recursion cannot overwrite the parent's + // linked virtual subtree. + ++linkedVirtualNodeMark; + if (linkedVirtualNodeMark == 0) { + std::fill(linkedVirtualNodeMarks.begin(), linkedVirtualNodeMarks.end(), + 0); + linkedVirtualNodeMark = 1; + } + currentLinkedVirtualNodeMark = linkedVirtualNodeMark; // Mark each linked target and its ancestors, producing the smallest // virtual subtree needed to keep the linked target reachable. auto markLinkedVirtualNode = [&](size_t virtualNodeId) { while (virtualNodeId != Tree::TreeNode::DummyId && - !linkedVirtualNodes[virtualNodeId]) { - linkedVirtualNodes[virtualNodeId] = 1; + linkedVirtualNodeMarks[virtualNodeId] != + currentLinkedVirtualNodeMark) { + linkedVirtualNodeMarks[virtualNodeId] = currentLinkedVirtualNodeMark; if (virtualNodeId == Tree::TreeNode::RootId) { break; } @@ -774,11 +805,11 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, // before recursively packing the child nodes. auto countLinkedVirtualChildren = [&](const auto &children) { uint32_t childCount = 0; - if (linkedVirtualNodes.empty()) { + if (!hasLinkedVirtualNodes) { return childCount; } for (const auto &child : children) { - if (linkedVirtualNodes[child.id]) { + if (linkedVirtualNodeMarks[child.id] == currentLinkedVirtualNodeMark) { ++childCount; } } @@ -819,7 +850,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packFixStrLiteral("children"); writer.packArray(countLinkedVirtualChildren(virtualNode.children)); for (const auto &child : virtualNode.children) { - if (!linkedVirtualNodes[child.id]) { + if (linkedVirtualNodeMarks[child.id] != currentLinkedVirtualNodeMark) { continue; } packLinkedVirtualNode(packLinkedVirtualNode, child.id); @@ -833,7 +864,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, packNode(packNode, tree->getNode(child.id)); } for (const auto &child : virtualRootNode.children) { - if (linkedVirtualNodes.empty() || !linkedVirtualNodes[child.id]) { + if (!hasLinkedVirtualNodes || + linkedVirtualNodeMarks[child.id] != currentLinkedVirtualNodeMark) { continue; } packLinkedVirtualNode(packLinkedVirtualNode, child.id); From 51deaa2976633baf66676cda1fd24bce18633174 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 20:37:35 -0400 Subject: [PATCH 19/39] Avoid throwing in packUIntString --- third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp index 7fd81134d8ff..26b3b9723c26 100644 --- a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp +++ b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp @@ -1,10 +1,7 @@ #include "Utility/MsgPackWriter.h" -#include "Utility/Errors.h" - #include #include -#include #include #include #include @@ -99,11 +96,8 @@ void MsgPackWriter::packStr(std::string_view value) { void MsgPackWriter::packUIntString(uint64_t value) { char buffer[std::numeric_limits::digits10 + 1]; - auto [ptr, ec] = std::to_chars(buffer, buffer + sizeof(buffer), value); - if (ec != std::errc()) { - throw makeLogicError("Failed to encode integer as string"); - } - packStr(std::string_view(buffer, static_cast(ptr - buffer))); + auto result = std::to_chars(buffer, buffer + sizeof(buffer), value); + packStr(std::string_view(buffer, static_cast(result.ptr - buffer))); } void MsgPackWriter::packArray(uint32_t size) { From eed568d99881042a65f161c018d68c1ddd23ddfb Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 20:43:26 -0400 Subject: [PATCH 20/39] Skip empty linked MsgPack metric work --- third_party/proton/csrc/lib/Data/TreeData.cpp | 51 +++++++++++-------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 18a51b63d867..17cc2398ba54 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -511,7 +511,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, MetricSummary metricSummary; metricSummary.hasKernelMetric = true; - const std::map> emptyMetrics; const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); std::vector linkedVirtualNodeMarks(virtualTree->size(), 0); uint32_t linkedVirtualNodeMark = 0; @@ -752,20 +751,27 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packFixStrLiteral("metrics"); const bool isRoot = treeNode.id == TreeData::Tree::TreeNode::RootId; + const auto &linkedFlexibleMetrics = + treeNode.metricSet.linkedFlexibleMetrics; + const auto promotedFlexibleMetricEntries = + linkedFlexibleMetrics.empty() + ? 0 + : countPromotedFlexibleMetricEntries(virtualRootNode.children, + linkedFlexibleMetrics); writer.packMap( countMetricEntries(treeNode.metricSet.metrics, isRoot) + static_cast(treeNode.metricSet.flexibleMetrics.size()) + - countPromotedFlexibleMetricEntries( - virtualRootNode.children, - treeNode.metricSet.linkedFlexibleMetrics)); + promotedFlexibleMetricEntries); packMetrics(treeNode.metricSet.metrics, isRoot); packFlexibleMetrics(treeNode.metricSet.flexibleMetrics); - packPromotedFlexibleMetrics(virtualRootNode.children, - treeNode.metricSet.linkedFlexibleMetrics); + if (!linkedFlexibleMetrics.empty()) { + packPromotedFlexibleMetrics(virtualRootNode.children, + linkedFlexibleMetrics); + } bool hasLinkedVirtualNodes = false; uint32_t currentLinkedVirtualNodeMark = 0; if (!treeNode.metricSet.linkedMetrics.empty() || - !treeNode.metricSet.linkedFlexibleMetrics.empty()) { + !linkedFlexibleMetrics.empty()) { hasLinkedVirtualNodes = true; // Reuse the mark buffer across recursive packNode calls. Each node keeps // its own generation id so child recursion cannot overwrite the parent's @@ -793,8 +799,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, for (const auto &[linkedId, _] : treeNode.metricSet.linkedMetrics) { markLinkedVirtualNode(linkedId); } - for (const auto &[linkedId, _] : - treeNode.metricSet.linkedFlexibleMetrics) { + for (const auto &[linkedId, _] : linkedFlexibleMetrics) { // Flexible metrics are keyed by the child helper, but // serialized on the parent frame so the helper node can stay out of // the dumped tree. @@ -833,19 +838,25 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packFixStrLiteral("metrics"); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); - const auto &linkedMetrics = - (metricsIt != treeNode.metricSet.linkedMetrics.end()) - ? metricsIt->second - : emptyMetrics; - writer.packMap( - countMetricEntries(linkedMetrics, /*isRoot=*/false) + - countPromotedFlexibleMetricEntries( - virtualNode.children, treeNode.metricSet.linkedFlexibleMetrics)); - packMetrics(linkedMetrics, /*isRoot=*/false); + const auto promotedFlexibleMetricEntries = + linkedFlexibleMetrics.empty() + ? 0 + : countPromotedFlexibleMetricEntries(virtualNode.children, + linkedFlexibleMetrics); + writer.packMap((metricsIt != treeNode.metricSet.linkedMetrics.end() + ? countMetricEntries(metricsIt->second, + /*isRoot=*/false) + : 0) + + promotedFlexibleMetricEntries); + if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { + packMetrics(metricsIt->second, /*isRoot=*/false); + } // Linked flexible metrics are only attached to // children, so they are always packed into the parent frame. - packPromotedFlexibleMetrics(virtualNode.children, - treeNode.metricSet.linkedFlexibleMetrics); + if (!linkedFlexibleMetrics.empty()) { + packPromotedFlexibleMetrics(virtualNode.children, + linkedFlexibleMetrics); + } writer.packFixStrLiteral("children"); writer.packArray(countLinkedVirtualChildren(virtualNode.children)); From 8646a65152b2c7052764d103cad7d4f4e39ac960 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 20:51:13 -0400 Subject: [PATCH 21/39] Pack Hatchet frame headers directly --- .../csrc/include/Utility/MsgPackWriter.h | 1 + third_party/proton/csrc/lib/Data/TreeData.cpp | 22 ++----------------- .../proton/csrc/lib/Utility/MsgPackWriter.cpp | 19 ++++++++++++++++ 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/third_party/proton/csrc/include/Utility/MsgPackWriter.h b/third_party/proton/csrc/include/Utility/MsgPackWriter.h index 4d441a912785..f0c466b78dfb 100644 --- a/third_party/proton/csrc/include/Utility/MsgPackWriter.h +++ b/third_party/proton/csrc/include/Utility/MsgPackWriter.h @@ -22,6 +22,7 @@ class MsgPackWriter { void packInt(int64_t value); void packDouble(double value); void packStr(std::string_view value); + void packHatchetFrameHeader(std::string_view name); template void packFixStrLiteral(const char (&value)[N]) { static_assert(N > 0); constexpr uint32_t size = static_cast(N - 1); diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 17cc2398ba54..98c84242083c 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -740,16 +740,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, // under the same frame. auto packNode = [&](auto &&packNode, TreeData::Tree::TreeNode &treeNode) -> void { - writer.packMap(3); - - writer.packFixStrLiteral("frame"); - writer.packMap(2); - writer.packFixStrLiteral("name"); - writer.packStr(treeNode.name); - writer.packFixStrLiteral("type"); - writer.packFixStrLiteral("function"); - - writer.packFixStrLiteral("metrics"); + writer.packHatchetFrameHeader(treeNode.name); const bool isRoot = treeNode.id == TreeData::Tree::TreeNode::RootId; const auto &linkedFlexibleMetrics = treeNode.metricSet.linkedFlexibleMetrics; @@ -826,16 +817,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); - writer.packMap(3); - - writer.packFixStrLiteral("frame"); - writer.packMap(2); - writer.packFixStrLiteral("name"); - writer.packStr(virtualNode.name); - writer.packFixStrLiteral("type"); - writer.packFixStrLiteral("function"); - - writer.packFixStrLiteral("metrics"); + writer.packHatchetFrameHeader(virtualNode.name); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); const auto promotedFlexibleMetricEntries = diff --git a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp index 26b3b9723c26..ffe8da971b9a 100644 --- a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp +++ b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp @@ -9,6 +9,12 @@ namespace proton { namespace { +void appendRaw(std::vector &out, const uint8_t *data, size_t size) { + const auto offset = out.size(); + out.resize(offset + size); + std::memcpy(out.data() + offset, data, size); +} + template void writeBE(std::vector &out, T value) { using U = std::make_unsigned_t; U u = static_cast(value); @@ -94,6 +100,19 @@ void MsgPackWriter::packStr(std::string_view value) { std::memcpy(out.data() + offset, value.data(), size); } +void MsgPackWriter::packHatchetFrameHeader(std::string_view name) { + static constexpr uint8_t prefix[] = {0x83, // map(3) + 0xa5, 'f', 'r', 'a', 'm', 'e', + 0x82, // frame: map(2) + 0xa4, 'n', 'a', 'm', 'e'}; + static constexpr uint8_t suffix[] = { + 0xa4, 't', 'y', 'p', 'e', 0xa8, 'f', 'u', 'n', 'c', 't', + 'i', 'o', 'n', 0xa7, 'm', 'e', 't', 'r', 'i', 'c', 's'}; + appendRaw(out, prefix, sizeof(prefix)); + packStr(name); + appendRaw(out, suffix, sizeof(suffix)); +} + void MsgPackWriter::packUIntString(uint64_t value) { char buffer[std::numeric_limits::digits10 + 1]; auto result = std::to_chars(buffer, buffer + sizeof(buffer), value); From ac2f160568796d10eb63c72f2f0a8e27c5f5d92a Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 20:59:30 -0400 Subject: [PATCH 22/39] Cache virtual MsgPack frame headers --- .../csrc/include/Utility/MsgPackWriter.h | 1 + third_party/proton/csrc/lib/Data/TreeData.cpp | 19 ++++++++++++++++++- .../proton/csrc/lib/Utility/MsgPackWriter.cpp | 10 +++++++--- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/third_party/proton/csrc/include/Utility/MsgPackWriter.h b/third_party/proton/csrc/include/Utility/MsgPackWriter.h index f0c466b78dfb..c70749d8cb6e 100644 --- a/third_party/proton/csrc/include/Utility/MsgPackWriter.h +++ b/third_party/proton/csrc/include/Utility/MsgPackWriter.h @@ -23,6 +23,7 @@ class MsgPackWriter { void packDouble(double value); void packStr(std::string_view value); void packHatchetFrameHeader(std::string_view name); + void appendRaw(const std::vector &bytes); template void packFixStrLiteral(const char (&value)[N]) { static_assert(N > 0); constexpr uint32_t size = static_cast(N - 1); diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 98c84242083c..0489168ef336 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -178,6 +178,20 @@ class TreeData::Tree { TreeNode &getNode(size_t id) { return treeNodes.at(id); } + const std::vector &getMsgPackFrameHeader(size_t id) { + if (msgPackFrameHeaderCache.size() < treeNodes.size()) { + msgPackFrameHeaderCache.resize(treeNodes.size()); + } + auto &header = msgPackFrameHeaderCache[id]; + if (header.empty()) { + MsgPackWriter writer; + writer.reserve(treeNodes[id].name.size() + 32); + writer.packHatchetFrameHeader(treeNodes[id].name); + header = std::move(writer).take(); + } + return header; + } + void upsertFlexibleMetric(size_t contextId, const FlexibleMetric &flexibleMetric) { auto &node = getNode(contextId); @@ -234,6 +248,9 @@ class TreeData::Tree { size_t nextContextId = TreeNode::RootId + 1; // Node ids are dense and assigned sequentially, so index lookup is enough. std::deque treeNodes; + // Cached MsgPack frame boilerplate keyed by dense node id. The cache is + // derived from immutable node names and grows with the tree. + std::vector> msgPackFrameHeaderCache; }; json TreeData::buildHatchetJson(TreeData::Tree *tree, @@ -817,7 +834,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); - writer.packHatchetFrameHeader(virtualNode.name); + writer.appendRaw(virtualTree->getMsgPackFrameHeader(virtualNodeId)); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); const auto promotedFlexibleMetricEntries = diff --git a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp index ffe8da971b9a..e2362d615f4d 100644 --- a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp +++ b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp @@ -9,7 +9,7 @@ namespace proton { namespace { -void appendRaw(std::vector &out, const uint8_t *data, size_t size) { +void appendBytes(std::vector &out, const uint8_t *data, size_t size) { const auto offset = out.size(); out.resize(offset + size); std::memcpy(out.data() + offset, data, size); @@ -108,9 +108,13 @@ void MsgPackWriter::packHatchetFrameHeader(std::string_view name) { static constexpr uint8_t suffix[] = { 0xa4, 't', 'y', 'p', 'e', 0xa8, 'f', 'u', 'n', 'c', 't', 'i', 'o', 'n', 0xa7, 'm', 'e', 't', 'r', 'i', 'c', 's'}; - appendRaw(out, prefix, sizeof(prefix)); + appendBytes(out, prefix, sizeof(prefix)); packStr(name); - appendRaw(out, suffix, sizeof(suffix)); + appendBytes(out, suffix, sizeof(suffix)); +} + +void MsgPackWriter::appendRaw(const std::vector &bytes) { + appendBytes(out, bytes.data(), bytes.size()); } void MsgPackWriter::packUIntString(uint64_t value) { From cae3c76ebfe365a46a8b804417ef3172e647ae13 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Fri, 22 May 2026 21:53:37 -0400 Subject: [PATCH 23/39] Refactor formatting in Metric.h and TraceData.cpp for improved readability --- third_party/proton/csrc/include/Data/Metric.h | 13 ++++++------- third_party/proton/csrc/lib/Data/TraceData.cpp | 5 ++--- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/third_party/proton/csrc/include/Data/Metric.h b/third_party/proton/csrc/include/Data/Metric.h index 183094720f4e..13f4cfcab999 100644 --- a/third_party/proton/csrc/include/Data/Metric.h +++ b/third_party/proton/csrc/include/Data/Metric.h @@ -82,9 +82,8 @@ class Metric { if (values[valueId].index() != value.index()) { throw makeInvalidArgument( std::string("Metric value type mismatch for valueId ") + - std::to_string(valueId) + " (" + - std::string(getValueName(valueId)) + ")" + - ": current=" + getTypeNameForIndex(values[valueId].index()) + + std::to_string(valueId) + " (" + std::string(getValueName(valueId)) + + ")" + ": current=" + getTypeNameForIndex(values[valueId].index()) + ", new=" + getTypeNameForIndex(value.index())); } // Handle string and other values separately @@ -108,8 +107,8 @@ class Metric { std::string("Vector metric size mismatch for " "valueId ") + std::to_string(valueId) + " (" + - std::string(getValueName(valueId)) + "): current=" + - std::to_string(currentValue.size()) + + std::string(getValueName(valueId)) + + "): current=" + std::to_string(currentValue.size()) + ", new=" + std::to_string(otherValue.size())); } for (size_t i = 0; i < currentValue.size(); ++i) { @@ -120,8 +119,8 @@ class Metric { std::string("Metric aggregation not supported for " "valueId ") + std::to_string(valueId) + " (" + - std::string(getValueName(valueId)) + "): type=" + - getTypeNameForIndex(values[valueId].index())); + std::string(getValueName(valueId)) + + "): type=" + getTypeNameForIndex(values[valueId].index())); } } }, diff --git a/third_party/proton/csrc/lib/Data/TraceData.cpp b/third_party/proton/csrc/lib/Data/TraceData.cpp index 2a9921b99a90..f5c9049156ad 100644 --- a/third_party/proton/csrc/lib/Data/TraceData.cpp +++ b/third_party/proton/csrc/lib/Data/TraceData.cpp @@ -973,9 +973,8 @@ void TraceData::dumpChromeTrace(std::ostream &os, size_t phase) const { for (auto targetEntryId : targetEntryIds) { // Linked target ids are event ids, so resolve through the event first. auto &targetEvent = virtualTrace->getEvent(targetEntryId); - auto resolvedContexts = - virtualTrace->getContexts(targetEvent.contextId, - /*skipRoot=*/true); + auto resolvedContexts = virtualTrace->getContexts(targetEvent.contextId, + /*skipRoot=*/true); std::vector virtualContexts; virtualContexts.reserve(resolvedContexts.size()); for (const auto &context : resolvedContexts) { From 6ed04f796097748ff2e7ac8189a240af6a9035ae Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 10:38:10 -0400 Subject: [PATCH 24/39] Simplify cached MsgPack frame headers --- third_party/proton/csrc/include/Data/Metric.h | 7 +++++ .../csrc/include/Utility/MsgPackWriter.h | 4 +-- third_party/proton/csrc/lib/Data/TreeData.cpp | 28 +++++++++++++------ .../proton/csrc/lib/Utility/MsgPackWriter.cpp | 20 ------------- 4 files changed, 29 insertions(+), 30 deletions(-) diff --git a/third_party/proton/csrc/include/Data/Metric.h b/third_party/proton/csrc/include/Data/Metric.h index 13f4cfcab999..bba13937e1a2 100644 --- a/third_party/proton/csrc/include/Data/Metric.h +++ b/third_party/proton/csrc/include/Data/Metric.h @@ -175,6 +175,7 @@ class FlexibleMetric : public Metric { const std::string &getName() const override { return name; } + // Flexible metrics carry their name as per-instance state. std::string_view getValueName(int valueId) const override { return valueName; } @@ -222,10 +223,12 @@ class KernelMetric : public Metric { const std::string &getName() const override { return name; } + // Fast path for callers that already know they are working with KernelMetric. static constexpr std::string_view getValueName(kernelMetricKind valueId) { return VALUE_NAMES[valueId]; } + // Virtual access used through the Metric interface. std::string_view getValueName(int valueId) const override { return VALUE_NAMES[valueId]; } @@ -285,10 +288,12 @@ class PCSamplingMetric : public Metric { const std::string &getName() const override { return name; } + // Fast path for callers that already know they are working with PCSamplingMetric. static constexpr std::string_view getValueName(PCSamplingMetricKind valueId) { return VALUE_NAMES[valueId]; } + // Virtual access used through the Metric interface. std::string_view getValueName(int valueId) const override { return VALUE_NAMES[valueId]; } @@ -370,10 +375,12 @@ class CycleMetric : public Metric { const std::string &getName() const override { return name; } + // Fast path for callers that already know they are working with CycleMetric. static constexpr std::string_view getValueName(CycleMetricKind valueId) { return VALUE_NAMES[valueId]; } + // Virtual access used through the Metric interface. std::string_view getValueName(int valueId) const override { return VALUE_NAMES[valueId]; } diff --git a/third_party/proton/csrc/include/Utility/MsgPackWriter.h b/third_party/proton/csrc/include/Utility/MsgPackWriter.h index c70749d8cb6e..1e79c3251414 100644 --- a/third_party/proton/csrc/include/Utility/MsgPackWriter.h +++ b/third_party/proton/csrc/include/Utility/MsgPackWriter.h @@ -22,18 +22,18 @@ class MsgPackWriter { void packInt(int64_t value); void packDouble(double value); void packStr(std::string_view value); - void packHatchetFrameHeader(std::string_view name); void appendRaw(const std::vector &bytes); template void packFixStrLiteral(const char (&value)[N]) { static_assert(N > 0); constexpr uint32_t size = static_cast(N - 1); + // MsgPack fixstr stores the string length in 5 bits, so literals must fit + // in the 0..31 byte range. static_assert(size <= 31); out.push_back(static_cast(0xa0 | size)); const auto offset = out.size(); out.resize(offset + size); std::memcpy(out.data() + offset, value, size); } - void packUIntString(uint64_t value); void packArray(uint32_t size); void packMap(uint32_t size); diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 0489168ef336..584192f2249b 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -28,6 +28,21 @@ namespace proton { namespace { constexpr size_t kMaxRegisteredDeviceIds = 32; + +std::vector buildMsgPackHatchetFrameHeader(std::string_view name) { + MsgPackWriter writer; + writer.reserve(name.size() + 32); + writer.packMap(3); + writer.packFixStrLiteral("frame"); + writer.packMap(2); + writer.packFixStrLiteral("name"); + writer.packStr(name); + writer.packFixStrLiteral("type"); + writer.packFixStrLiteral("function"); + writer.packFixStrLiteral("metrics"); + return std::move(writer).take(); +} + struct MetricSummary { // Whether we observed at least one kernel metric. bool hasKernelMetric = false; @@ -184,10 +199,7 @@ class TreeData::Tree { } auto &header = msgPackFrameHeaderCache[id]; if (header.empty()) { - MsgPackWriter writer; - writer.reserve(treeNodes[id].name.size() + 32); - writer.packHatchetFrameHeader(treeNodes[id].name); - header = std::move(writer).take(); + header = buildMsgPackHatchetFrameHeader(treeNodes[id].name); } return header; } @@ -593,7 +605,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packStr(KernelMetric::getValueName(KernelMetric::Invocations)); writer.packUInt(invocations); writer.packStr(KernelMetric::getValueName(KernelMetric::DeviceId)); - writer.packUIntString(deviceId); + writer.packStr(std::to_string(deviceId)); writer.packStr(KernelMetric::getValueName(KernelMetric::DeviceType)); writer.packStr(deviceTypeName); }; @@ -653,9 +665,9 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, CycleMetric::getValueName(CycleMetric::NormalizedDuration)); writer.packDouble(normalizedDuration); writer.packStr(CycleMetric::getValueName(CycleMetric::DeviceId)); - writer.packUIntString(deviceId); + writer.packStr(std::to_string(deviceId)); writer.packStr(CycleMetric::getValueName(CycleMetric::DeviceType)); - writer.packUIntString(deviceType); + writer.packStr(std::to_string(deviceType)); } else { throw makeLogicError("MetricKind not supported"); } @@ -757,7 +769,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, // under the same frame. auto packNode = [&](auto &&packNode, TreeData::Tree::TreeNode &treeNode) -> void { - writer.packHatchetFrameHeader(treeNode.name); + writer.appendRaw(tree->getMsgPackFrameHeader(treeNode.id)); const bool isRoot = treeNode.id == TreeData::Tree::TreeNode::RootId; const auto &linkedFlexibleMetrics = treeNode.metricSet.linkedFlexibleMetrics; diff --git a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp index e2362d615f4d..591dfd35a93d 100644 --- a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp +++ b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp @@ -1,6 +1,5 @@ #include "Utility/MsgPackWriter.h" -#include #include #include #include @@ -100,29 +99,10 @@ void MsgPackWriter::packStr(std::string_view value) { std::memcpy(out.data() + offset, value.data(), size); } -void MsgPackWriter::packHatchetFrameHeader(std::string_view name) { - static constexpr uint8_t prefix[] = {0x83, // map(3) - 0xa5, 'f', 'r', 'a', 'm', 'e', - 0x82, // frame: map(2) - 0xa4, 'n', 'a', 'm', 'e'}; - static constexpr uint8_t suffix[] = { - 0xa4, 't', 'y', 'p', 'e', 0xa8, 'f', 'u', 'n', 'c', 't', - 'i', 'o', 'n', 0xa7, 'm', 'e', 't', 'r', 'i', 'c', 's'}; - appendBytes(out, prefix, sizeof(prefix)); - packStr(name); - appendBytes(out, suffix, sizeof(suffix)); -} - void MsgPackWriter::appendRaw(const std::vector &bytes) { appendBytes(out, bytes.data(), bytes.size()); } -void MsgPackWriter::packUIntString(uint64_t value) { - char buffer[std::numeric_limits::digits10 + 1]; - auto result = std::to_chars(buffer, buffer + sizeof(buffer), value); - packStr(std::string_view(buffer, static_cast(result.ptr - buffer))); -} - void MsgPackWriter::packArray(uint32_t size) { if (size <= 15) { out.push_back(static_cast(0x90 | size)); From 1c90e442be3848603f263585a47a09e8280528be Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 10:56:12 -0400 Subject: [PATCH 25/39] Remove TreeData child index cache --- third_party/proton/csrc/lib/Data/TreeData.cpp | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 584192f2249b..a3cd78a4d5ed 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -110,10 +109,6 @@ class TreeData::Tree { size_t id = DummyId; }; - // Keep a linear child list for small fanouts to avoid hash table overhead. - // Build the lookup index only once repeated child scans become expensive. - static constexpr size_t kChildIndexThreshold = 8; - TreeNode() = default; explicit TreeNode(size_t id, const std::string &name) : id(id), Context(name) {} @@ -125,25 +120,9 @@ class TreeData::Tree { void addChild(std::string_view childName, size_t id) { children.push_back({childName, id}); - if (childIndex) { - childIndex->emplace(childName, id); - return; - } - if (children.size() > kChildIndexThreshold) { - childIndex = - std::make_unique>(); - childIndex->reserve(children.size()); - for (const auto &child : children) { - childIndex->emplace(child.name, child.id); - } - } } size_t findChild(std::string_view childName) const { - if (childIndex) { - auto it = childIndex->find(childName); - return it != childIndex->end() ? it->second : DummyId; - } for (const auto &child : children) { if (child.name == childName) { return child.id; @@ -155,8 +134,6 @@ class TreeData::Tree { size_t parentId = DummyId; size_t id = DummyId; std::vector children = {}; - std::unique_ptr> childIndex = - {}; // Direct and linked metrics associated with this tree node. DataEntry::MetricSet metricSet{}; friend class Tree; From 46b3aa1781a28a4e888b536a48160e8613c8a847 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 12:50:45 -0400 Subject: [PATCH 26/39] Revert TraceData context copy changes --- third_party/proton/csrc/lib/Data/TraceData.cpp | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TraceData.cpp b/third_party/proton/csrc/lib/Data/TraceData.cpp index f5c9049156ad..450c30a32ed2 100644 --- a/third_party/proton/csrc/lib/Data/TraceData.cpp +++ b/third_party/proton/csrc/lib/Data/TraceData.cpp @@ -691,7 +691,7 @@ void reconstructGraphScopeEvents( seenCaptureTag = true; } if (seenCaptureTag) { - graphContexts.emplace_back(context.name); + graphContexts.push_back(context); } } if (isMetadataKernel) { @@ -973,15 +973,9 @@ void TraceData::dumpChromeTrace(std::ostream &os, size_t phase) const { for (auto targetEntryId : targetEntryIds) { // Linked target ids are event ids, so resolve through the event first. auto &targetEvent = virtualTrace->getEvent(targetEntryId); - auto resolvedContexts = virtualTrace->getContexts(targetEvent.contextId, - /*skipRoot=*/true); - std::vector virtualContexts; - virtualContexts.reserve(resolvedContexts.size()); - for (const auto &context : resolvedContexts) { - virtualContexts.emplace_back(context.name); - } - targetIdToVirtualContexts.emplace(targetEntryId, - std::move(virtualContexts)); + targetIdToVirtualContexts.emplace( + targetEntryId, virtualTrace->getContexts(targetEvent.contextId, + /*skipRoot=*/true)); } }); } From 0da7be0eee7d33f9fe6e0ac421189f893349342d Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 13:07:25 -0400 Subject: [PATCH 27/39] Restore TreeData child index map --- third_party/proton/csrc/lib/Data/TreeData.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index a3cd78a4d5ed..03bb07cf9947 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -120,20 +121,18 @@ class TreeData::Tree { void addChild(std::string_view childName, size_t id) { children.push_back({childName, id}); + childIndex.emplace(childName, id); } size_t findChild(std::string_view childName) const { - for (const auto &child : children) { - if (child.name == childName) { - return child.id; - } - } - return DummyId; + auto it = childIndex.find(childName); + return it != childIndex.end() ? it->second : DummyId; } size_t parentId = DummyId; size_t id = DummyId; std::vector children = {}; + std::unordered_map childIndex = {}; // Direct and linked metrics associated with this tree node. DataEntry::MetricSet metricSet{}; friend class Tree; From 0a9c196ffa4173272fa1e84606472041ba947a44 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 13:25:47 -0400 Subject: [PATCH 28/39] Refine Proton MsgPack byte appends --- .../csrc/include/Utility/MsgPackWriter.h | 2 +- third_party/proton/csrc/lib/Data/TreeData.cpp | 66 ++++++++++--------- .../proton/csrc/lib/Utility/MsgPackWriter.cpp | 12 ++-- 3 files changed, 39 insertions(+), 41 deletions(-) diff --git a/third_party/proton/csrc/include/Utility/MsgPackWriter.h b/third_party/proton/csrc/include/Utility/MsgPackWriter.h index 1e79c3251414..4ba9f9c6a251 100644 --- a/third_party/proton/csrc/include/Utility/MsgPackWriter.h +++ b/third_party/proton/csrc/include/Utility/MsgPackWriter.h @@ -22,7 +22,7 @@ class MsgPackWriter { void packInt(int64_t value); void packDouble(double value); void packStr(std::string_view value); - void appendRaw(const std::vector &bytes); + void appendBytes(const std::vector &bytes); template void packFixStrLiteral(const char (&value)[N]) { static_assert(N > 0); constexpr uint32_t size = static_cast(N - 1); diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 03bb07cf9947..dd8057572701 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -138,7 +137,10 @@ class TreeData::Tree { friend class Tree; }; - Tree() { treeNodes.emplace_back(TreeNode::RootId, TreeNode::RootId, "ROOT"); } + Tree() { + treeNodeMap.try_emplace(TreeNode::RootId, TreeNode::RootId, + TreeNode::RootId, "ROOT"); + } size_t addNode(const std::vector &contexts, size_t parentId) { for (const auto &context : contexts) { @@ -148,14 +150,14 @@ class TreeData::Tree { } size_t addNode(const Context &context, size_t parentId) { - auto &parent = getNode(parentId); + auto &parent = treeNodeMap.at(parentId); std::string_view contextName = context.name; auto existingChildId = parent.findChild(contextName); if (existingChildId != TreeNode::DummyId) return existingChildId; auto id = nextContextId++; - treeNodes.emplace_back(id, parentId, context); - parent.addChild(treeNodes.back().name, id); + auto [it, inserted] = treeNodeMap.try_emplace(id, id, parentId, context); + parent.addChild(it->second.name, id); return id; } @@ -167,22 +169,11 @@ class TreeData::Tree { return parentId; } - TreeNode &getNode(size_t id) { return treeNodes.at(id); } - - const std::vector &getMsgPackFrameHeader(size_t id) { - if (msgPackFrameHeaderCache.size() < treeNodes.size()) { - msgPackFrameHeaderCache.resize(treeNodes.size()); - } - auto &header = msgPackFrameHeaderCache[id]; - if (header.empty()) { - header = buildMsgPackHatchetFrameHeader(treeNodes[id].name); - } - return header; - } + TreeNode &getNode(size_t id) { return treeNodeMap.at(id); } void upsertFlexibleMetric(size_t contextId, const FlexibleMetric &flexibleMetric) { - auto &node = getNode(contextId); + auto &node = treeNodeMap.at(contextId); auto &flexibleMetrics = node.metricSet.flexibleMetrics; auto valueName = std::string(flexibleMetric.getValueName(0)); auto it = flexibleMetrics.find(valueName); @@ -210,22 +201,21 @@ class TreeData::Tree { } } - size_t size() const { return treeNodes.size(); } + size_t size() const { return nextContextId; } Tree structure() const { Tree cloned; - cloned.treeNodes.clear(); cloned.nextContextId = nextContextId; - for (const auto &node : treeNodes) { - cloned.treeNodes.emplace_back(node.id, node.parentId, node); + for (const auto &[id, node] : treeNodeMap) { + cloned.treeNodeMap.try_emplace(id, id, node.parentId, node); } - for (const auto &node : treeNodes) { - auto &clonedNode = cloned.getNode(node.id); + for (const auto &[id, node] : treeNodeMap) { + auto &clonedNode = cloned.treeNodeMap.at(id); clonedNode.children.reserve(node.children.size()); for (const auto &child : node.children) { - clonedNode.addChild(cloned.getNode(child.id).name, child.id); + clonedNode.addChild(cloned.treeNodeMap[child.id].name, child.id); } } @@ -234,11 +224,8 @@ class TreeData::Tree { private: size_t nextContextId = TreeNode::RootId + 1; - // Node ids are dense and assigned sequentially, so index lookup is enough. - std::deque treeNodes; - // Cached MsgPack frame boilerplate keyed by dense node id. The cache is - // derived from immutable node names and grows with the tree. - std::vector> msgPackFrameHeaderCache; + // tree node id -> tree node + std::unordered_map treeNodeMap; }; json TreeData::buildHatchetJson(TreeData::Tree *tree, @@ -519,6 +506,19 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); std::vector linkedVirtualNodeMarks(virtualTree->size(), 0); uint32_t linkedVirtualNodeMark = 0; + std::vector> treeMsgPackFrameHeaderCache(tree->size()); + std::vector> virtualMsgPackFrameHeaderCache( + virtualTree->size()); + auto getMsgPackFrameHeader = + [&](TreeData::Tree *sourceTree, + std::vector> &cache, + size_t id) -> const std::vector & { + auto &header = cache[id]; + if (header.empty()) { + header = buildMsgPackHatchetFrameHeader(sourceTree->getNode(id).name); + } + return header; + }; // Root metrics only carry inclusive aggregate fields. Non-root metrics also // include device_id and device_type, so their serialized map entry counts are @@ -745,7 +745,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, // under the same frame. auto packNode = [&](auto &&packNode, TreeData::Tree::TreeNode &treeNode) -> void { - writer.appendRaw(tree->getMsgPackFrameHeader(treeNode.id)); + writer.appendBytes( + getMsgPackFrameHeader(tree, treeMsgPackFrameHeaderCache, treeNode.id)); const bool isRoot = treeNode.id == TreeData::Tree::TreeNode::RootId; const auto &linkedFlexibleMetrics = treeNode.metricSet.linkedFlexibleMetrics; @@ -822,7 +823,8 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); - writer.appendRaw(virtualTree->getMsgPackFrameHeader(virtualNodeId)); + writer.appendBytes(getMsgPackFrameHeader( + virtualTree, virtualMsgPackFrameHeaderCache, virtualNodeId)); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); const auto promotedFlexibleMetricEntries = diff --git a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp index 591dfd35a93d..dfbb493dabe0 100644 --- a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp +++ b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp @@ -8,12 +8,6 @@ namespace proton { namespace { -void appendBytes(std::vector &out, const uint8_t *data, size_t size) { - const auto offset = out.size(); - out.resize(offset + size); - std::memcpy(out.data() + offset, data, size); -} - template void writeBE(std::vector &out, T value) { using U = std::make_unsigned_t; U u = static_cast(value); @@ -99,8 +93,10 @@ void MsgPackWriter::packStr(std::string_view value) { std::memcpy(out.data() + offset, value.data(), size); } -void MsgPackWriter::appendRaw(const std::vector &bytes) { - appendBytes(out, bytes.data(), bytes.size()); +void MsgPackWriter::appendBytes(const std::vector &bytes) { + const auto offset = out.size(); + out.resize(offset + bytes.size()); + std::memcpy(out.data() + offset, bytes.data(), bytes.size()); } void MsgPackWriter::packArray(uint32_t size) { From d21651540d7c08677d684e7cf0ed1ad9541fc6f2 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 13:42:17 -0400 Subject: [PATCH 29/39] Simplify Proton JSON linked virtual dumping --- third_party/proton/csrc/lib/Data/TreeData.cpp | 88 +++++-------------- 1 file changed, 24 insertions(+), 64 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index dd8057572701..5c043226f0d8 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -235,8 +235,6 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, output.push_back(json::object()); jsonNodes[TreeData::Tree::TreeNode::RootId] = &(output.back()); MetricSummary metricSummary; - std::vector linkedVirtualNodeMarks(virtualTree->size(), 0); - uint32_t linkedVirtualNodeMark = 0; // Append fixed-schema metrics to a JSON metrics object and update device // metadata requirements while visiting them. auto appendMetrics = [&](json &metricsJson, @@ -348,63 +346,29 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, childrenArray = json::array(); const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); - bool hasLinkedVirtualNodes = false; - uint32_t currentLinkedVirtualNodeMark = 0; - if (!treeNode.metricSet.linkedMetrics.empty() || - !treeNode.metricSet.linkedFlexibleMetrics.empty()) { - hasLinkedVirtualNodes = true; - // Reuse the mark buffer across tree nodes. Bumping the generation - // avoids clearing a virtual-tree-sized buffer for each real node. - ++linkedVirtualNodeMark; - if (linkedVirtualNodeMark == 0) { - std::fill(linkedVirtualNodeMarks.begin(), - linkedVirtualNodeMarks.end(), 0); - linkedVirtualNodeMark = 1; - } - currentLinkedVirtualNodeMark = linkedVirtualNodeMark; - // Mark each linked target and its ancestors, producing the smallest - // virtual subtree needed to keep the linked target reachable. - auto markLinkedVirtualNode = [&](size_t virtualNodeId) { - while (virtualNodeId != Tree::TreeNode::DummyId && - linkedVirtualNodeMarks[virtualNodeId] != - currentLinkedVirtualNodeMark) { - linkedVirtualNodeMarks[virtualNodeId] = - currentLinkedVirtualNodeMark; - if (virtualNodeId == Tree::TreeNode::RootId) { - break; - } - virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; - } - }; - for (const auto &[linkedId, _] : treeNode.metricSet.linkedMetrics) { - markLinkedVirtualNode(linkedId); - } - for (const auto &[linkedId, _] : - treeNode.metricSet.linkedFlexibleMetrics) { - // Flexible metrics are keyed by the child helper, but - // serialized on the parent frame so the helper node can stay out - // of the dumped tree. - markLinkedVirtualNode(virtualTree->getNode(linkedId).parentId); - } - } childrenArray.get_ref().reserve( treeNode.children.size() + virtualRootNode.children.size()); for (const auto &child : treeNode.children) { childrenArray.push_back(json::object()); jsonNodes[child.id] = &childrenArray.back(); } - // Copy a marked virtual subtree into the current JSON node. + if (treeNode.metricSet.linkedMetrics.empty() && + treeNode.metricSet.linkedFlexibleMetrics.empty()) { + return; + } + // JSON dumping is not the performance-critical path, so keep linked + // virtual serialization straightforward: walk the virtual tree and + // skip only empty leaves. Ancestors without metrics are still emitted + // when they are needed to keep metric-bearing descendants reachable. auto appendLinkedVirtualNode = [&](auto &&appendLinkedVirtualNode, size_t virtualNodeId, - json &outNode) -> void { + json &outChildren) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); - outNode = json::object(); - outNode["frame"] = {{"name", virtualNode.name}, {"type", "function"}}; - outNode["metrics"] = json::object(); + json metricsJson = json::object(); if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { - appendMetrics(outNode["metrics"], metricsIt->second); + appendMetrics(metricsJson, metricsIt->second); } // Linked flexible metrics are only attached to // children, so they always belong on the parent frame. @@ -412,32 +376,28 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, auto flexibleIt = treeNode.metricSet.linkedFlexibleMetrics.find(child.id); if (flexibleIt != treeNode.metricSet.linkedFlexibleMetrics.end()) { - appendFlexibleMetrics(outNode["metrics"], flexibleIt->second); + appendFlexibleMetrics(metricsJson, flexibleIt->second); } } - outNode["children"] = json::array(); - auto &linkedChildren = outNode["children"]; - linkedChildren.get_ref().reserve( - virtualNode.children.size()); + json linkedChildren = json::array(); for (const auto &child : virtualNode.children) { - if (linkedVirtualNodeMarks[child.id] != - currentLinkedVirtualNodeMark) { - continue; - } - linkedChildren.push_back(json::object()); appendLinkedVirtualNode(appendLinkedVirtualNode, child.id, - linkedChildren.back()); + linkedChildren); } + if (metricsJson.empty() && linkedChildren.empty()) { + return; + } + outChildren.push_back(json::object()); + auto &outNode = outChildren.back(); + outNode["frame"] = {{"name", virtualNode.name}, + {"type", "function"}}; + outNode["metrics"] = std::move(metricsJson); + outNode["children"] = std::move(linkedChildren); }; for (const auto &child : virtualRootNode.children) { - if (!hasLinkedVirtualNodes || linkedVirtualNodeMarks[child.id] != - currentLinkedVirtualNodeMark) { - continue; - } - childrenArray.push_back(json::object()); appendLinkedVirtualNode(appendLinkedVirtualNode, child.id, - childrenArray.back()); + childrenArray); } }); From 9027c35462fc2ad0533d5c4b59973e7759b84cb1 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 14:41:52 -0400 Subject: [PATCH 30/39] Split empty-node pruning from fast file write --- third_party/proton/csrc/lib/Data/TreeData.cpp | 122 ++++-------- third_party/proton/test/test_profile.py | 177 ++++++++++++------ 2 files changed, 148 insertions(+), 151 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 5c043226f0d8..c5d984292f73 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -346,29 +346,32 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, childrenArray = json::array(); const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); + const bool hasLinkedTargets = + !treeNode.metricSet.linkedMetrics.empty() || + !treeNode.metricSet.linkedFlexibleMetrics.empty(); childrenArray.get_ref().reserve( - treeNode.children.size() + virtualRootNode.children.size()); + treeNode.children.size() + + (hasLinkedTargets ? virtualRootNode.children.size() : 0)); for (const auto &child : treeNode.children) { childrenArray.push_back(json::object()); jsonNodes[child.id] = &childrenArray.back(); } - if (treeNode.metricSet.linkedMetrics.empty() && - treeNode.metricSet.linkedFlexibleMetrics.empty()) { + if (!hasLinkedTargets) { return; } - // JSON dumping is not the performance-critical path, so keep linked - // virtual serialization straightforward: walk the virtual tree and - // skip only empty leaves. Ancestors without metrics are still emitted - // when they are needed to keep metric-bearing descendants reachable. + // JSON dumping is not the performance-critical path, so use a direct + // recursive copy of the linked virtual tree. auto appendLinkedVirtualNode = [&](auto &&appendLinkedVirtualNode, size_t virtualNodeId, - json &outChildren) -> void { + json &outNode) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); - json metricsJson = json::object(); + outNode = json::object(); + outNode["frame"] = {{"name", virtualNode.name}, {"type", "function"}}; + outNode["metrics"] = json::object(); if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { - appendMetrics(metricsJson, metricsIt->second); + appendMetrics(outNode["metrics"], metricsIt->second); } // Linked flexible metrics are only attached to // children, so they always belong on the parent frame. @@ -376,28 +379,24 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, auto flexibleIt = treeNode.metricSet.linkedFlexibleMetrics.find(child.id); if (flexibleIt != treeNode.metricSet.linkedFlexibleMetrics.end()) { - appendFlexibleMetrics(metricsJson, flexibleIt->second); + appendFlexibleMetrics(outNode["metrics"], flexibleIt->second); } } - json linkedChildren = json::array(); + outNode["children"] = json::array(); + auto &linkedChildren = outNode["children"]; + linkedChildren.get_ref().reserve( + virtualNode.children.size()); for (const auto &child : virtualNode.children) { + linkedChildren.push_back(json::object()); appendLinkedVirtualNode(appendLinkedVirtualNode, child.id, - linkedChildren); + linkedChildren.back()); } - if (metricsJson.empty() && linkedChildren.empty()) { - return; - } - outChildren.push_back(json::object()); - auto &outNode = outChildren.back(); - outNode["frame"] = {{"name", virtualNode.name}, - {"type", "function"}}; - outNode["metrics"] = std::move(metricsJson); - outNode["children"] = std::move(linkedChildren); }; for (const auto &child : virtualRootNode.children) { + childrenArray.push_back(json::object()); appendLinkedVirtualNode(appendLinkedVirtualNode, child.id, - childrenArray); + childrenArray.back()); } }); @@ -464,8 +463,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, MetricSummary metricSummary; metricSummary.hasKernelMetric = true; const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); - std::vector linkedVirtualNodeMarks(virtualTree->size(), 0); - uint32_t linkedVirtualNodeMark = 0; std::vector> treeMsgPackFrameHeaderCache(tree->size()); std::vector> virtualMsgPackFrameHeaderCache( virtualTree->size()); @@ -725,61 +722,10 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, packPromotedFlexibleMetrics(virtualRootNode.children, linkedFlexibleMetrics); } - bool hasLinkedVirtualNodes = false; - uint32_t currentLinkedVirtualNodeMark = 0; - if (!treeNode.metricSet.linkedMetrics.empty() || - !linkedFlexibleMetrics.empty()) { - hasLinkedVirtualNodes = true; - // Reuse the mark buffer across recursive packNode calls. Each node keeps - // its own generation id so child recursion cannot overwrite the parent's - // linked virtual subtree. - ++linkedVirtualNodeMark; - if (linkedVirtualNodeMark == 0) { - std::fill(linkedVirtualNodeMarks.begin(), linkedVirtualNodeMarks.end(), - 0); - linkedVirtualNodeMark = 1; - } - currentLinkedVirtualNodeMark = linkedVirtualNodeMark; - // Mark each linked target and its ancestors, producing the smallest - // virtual subtree needed to keep the linked target reachable. - auto markLinkedVirtualNode = [&](size_t virtualNodeId) { - while (virtualNodeId != Tree::TreeNode::DummyId && - linkedVirtualNodeMarks[virtualNodeId] != - currentLinkedVirtualNodeMark) { - linkedVirtualNodeMarks[virtualNodeId] = currentLinkedVirtualNodeMark; - if (virtualNodeId == Tree::TreeNode::RootId) { - break; - } - virtualNodeId = virtualTree->getNode(virtualNodeId).parentId; - } - }; - for (const auto &[linkedId, _] : treeNode.metricSet.linkedMetrics) { - markLinkedVirtualNode(linkedId); - } - for (const auto &[linkedId, _] : linkedFlexibleMetrics) { - // Flexible metrics are keyed by the child helper, but - // serialized on the parent frame so the helper node can stay out of - // the dumped tree. - markLinkedVirtualNode(virtualTree->getNode(linkedId).parentId); - } - } - // Count marked linked children so MsgPack array headers can be emitted - // before recursively packing the child nodes. - auto countLinkedVirtualChildren = [&](const auto &children) { - uint32_t childCount = 0; - if (!hasLinkedVirtualNodes) { - return childCount; - } - for (const auto &child : children) { - if (linkedVirtualNodeMarks[child.id] == currentLinkedVirtualNodeMark) { - ++childCount; - } - } - return childCount; - }; + const bool hasLinkedTargets = !treeNode.metricSet.linkedMetrics.empty() || + !linkedFlexibleMetrics.empty(); - // Pack a marked virtual subtree as linked children of the current real - // frame. + // Pack the linked virtual subtree as children of the current real frame. auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); @@ -808,27 +754,25 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } writer.packFixStrLiteral("children"); - writer.packArray(countLinkedVirtualChildren(virtualNode.children)); + writer.packArray(static_cast(virtualNode.children.size())); for (const auto &child : virtualNode.children) { - if (linkedVirtualNodeMarks[child.id] != currentLinkedVirtualNodeMark) { - continue; - } packLinkedVirtualNode(packLinkedVirtualNode, child.id); } }; writer.packFixStrLiteral("children"); writer.packArray(static_cast(treeNode.children.size()) + - countLinkedVirtualChildren(virtualRootNode.children)); + (hasLinkedTargets + ? static_cast( + virtualRootNode.children.size()) + : 0)); for (const auto &child : treeNode.children) { packNode(packNode, tree->getNode(child.id)); } - for (const auto &child : virtualRootNode.children) { - if (!hasLinkedVirtualNodes || - linkedVirtualNodeMarks[child.id] != currentLinkedVirtualNodeMark) { - continue; + if (hasLinkedTargets) { + for (const auto &child : virtualRootNode.children) { + packLinkedVirtualNode(packLinkedVirtualNode, child.id); } - packLinkedVirtualNode(packLinkedVirtualNode, child.id); } }; diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py index c79e8bdedf1d..d41c48c73acf 100644 --- a/third_party/proton/test/test_profile.py +++ b/third_party/proton/test/test_profile.py @@ -136,8 +136,6 @@ def fn(): for i in range(10): with proton.scope(f"iter_{i}"): fn() - with proton.scope("iter_without_kernel"): - pass with proton.scope("test0"): g.replay() @@ -151,8 +149,6 @@ def fn(): for i in range(10): with proton.scope(f"new_iter_{i}"): fn() - with proton.scope("new_iter_without_kernel"): - pass with proton.scope("test2"): g.replay() @@ -181,28 +177,22 @@ def fn(): assert test0_frame["children"][0]["metrics"]["time (ns)"] > 0 else: # cuda backend supports "" annotation - def has_metric_payload(frame): - return bool(frame["metrics"]) or any(has_metric_payload(child) for child in frame["children"]) - for test_frame in [test0_frame, test1_frame, test2_frame]: - capture_frame = _find_frame_by_name(test_frame, "") - assert capture_frame is not None - iter_prefix = "new_iter" if test_frame == test2_frame else "iter" - expected_iter_names = {f"{iter_prefix}_{i}" for i in range(10)} - empty_iter_name = f"{iter_prefix}_without_kernel" - capture_children = capture_frame["children"] - capture_child_names = {child["frame"]["name"] for child in capture_children} - - assert empty_iter_name not in capture_child_names - assert expected_iter_names <= capture_child_names - for child in capture_children: - assert has_metric_payload(child) - if child["frame"]["name"] in expected_iter_names: - assert child["children"][0]["metrics"]["time (ns)"] > 0 + child = _find_frame_by_name(test_frame, "") + assert child is not None + # check all iterations + total_iters = 0 + for child in child["children"]: + iter_frame = "iter" if test_frame != test2_frame else "new_iter" + if iter_frame in child["frame"]["name"]: # TODO(Keren): remove empty frames + if "time (ns)" in child["children"][0]["metrics"]: + total_iters += 1 + # 0...9 iterations + assert total_iters == 10 @pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports metrics profiling in cudagraphs") -def test_cudagraph_deactivate_graph(tmp_path: pathlib.Path, device: str): +def test_cudagraph_metric_queue_handles_inactive_replay(tmp_path: pathlib.Path, device: str): stream = torch.cuda.Stream() torch.cuda.set_stream(stream) @@ -228,7 +218,7 @@ def profiled_kernel(x, y): profiled_kernel[(1, )](x, y) torch.cuda.synchronize() - temp_file = tmp_path / "test_cudagraph_deactivate_graph.hatchet" + temp_file = tmp_path / "test_cudagraph_metric_queue_handles_inactive_replay.hatchet" session = proton.start(str(temp_file.with_suffix("")), context="shadow", hook="triton") try: inactive_graph = torch.cuda.CUDAGraph() @@ -270,8 +260,64 @@ def profiled_kernel(x, y): assert profiled_frame["metrics"]["sum_metric"] == float(x.numel()) +@pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports cudagraph replay") +def test_cudagraph_not_captured_by_profiler(tmp_path: pathlib.Path, capfd, device: str): + stream = torch.cuda.Stream() + torch.cuda.set_stream(stream) + + @triton.jit + def foo(x, y, z): + tl.store(z, tl.load(y) + tl.load(x)) + + def fn(): + a = torch.ones((2, 2), device=device) + b = torch.ones((2, 2), device=device) + c = a + b + foo[(1, )](a, b, c) + + # Build/capture graph before profiler starts. + fn() + g = torch.cuda.CUDAGraph() + with cuda_graph_without_gc(g): + fn() + + temp_file = tmp_path / "test_cudagraph_not_captured_by_profiler.hatchet" + proton.start(str(temp_file.with_suffix("")), context="shadow") + with proton.scope("replay0"): + g.replay() + with proton.scope("replay1"): + g.replay() + proton.finalize() + + captured = capfd.readouterr() + assert captured.err.count("Cannot find graph for graphExecId:") == 1 + assert "start profiling before the graph is created" in captured.err + + with temp_file.open() as f: + data = json.load(f) + replay0_frame = None + replay1_frame = None + for child in data[0]["children"]: + if child["frame"]["name"] == "replay0": + replay0_frame = child + elif child["frame"]["name"] == "replay1": + replay1_frame = child + assert replay0_frame is not None + assert replay1_frame is not None + assert len(replay0_frame["children"]) >= 3 + assert len(replay1_frame["children"]) >= 3 + + def has_positive_time_metric(node): + if node["metrics"].get("time (ns)", 0) > 0: + return True + return any(has_positive_time_metric(child) for child in node["children"]) + + assert has_positive_time_metric(replay0_frame) + assert has_positive_time_metric(replay1_frame) + + @pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports cudagraph deactivation") -def test_cudagraph_deactivate_within_graph(tmp_path, device: str): +def test_cudagraph_deactivate(tmp_path, device: str): stream = torch.cuda.Stream() torch.cuda.set_stream(stream) @@ -290,7 +336,7 @@ def fn(session): c = a + b foo[(1, )](a, b, c) - temp_file = tmp_path / "test_cudagraph_deactivate_within_graph.hatchet" + temp_file = tmp_path / "test_cudagraph_deactivate.hatchet" session = proton.start(str(temp_file.with_suffix("")), context="shadow", hook="triton") # warmup @@ -339,7 +385,8 @@ def fn(session): @pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports cudagraph replay") -def test_cudagraph_not_captured_by_profiler(tmp_path: pathlib.Path, capfd, device: str): +@pytest.mark.parametrize("data_format", ["hatchet", "hatchet_msgpack"]) +def test_cudagraph_filters_unlinked_virtual_scopes(tmp_path: pathlib.Path, data_format: str, device: str): stream = torch.cuda.Stream() torch.cuda.set_stream(stream) @@ -347,51 +394,57 @@ def test_cudagraph_not_captured_by_profiler(tmp_path: pathlib.Path, capfd, devic def foo(x, y, z): tl.store(z, tl.load(y) + tl.load(x)) - def fn(): - a = torch.ones((2, 2), device=device) - b = torch.ones((2, 2), device=device) - c = a + b - foo[(1, )](a, b, c) + a = torch.ones((2, 2), device=device) + b = torch.ones((2, 2), device=device) + c = torch.empty_like(a) + + temp_file = tmp_path / f"test_cudagraph_filters_unlinked_virtual_scopes.{data_format}" + proton.start(str(temp_file.with_suffix("")), context="shadow") + + # Warmup to avoid one-time setup effects in replay output. + foo[(1, )](a, b, c) - # Build/capture graph before profiler starts. - fn() g = torch.cuda.CUDAGraph() with cuda_graph_without_gc(g): - fn() + with proton.scope("iter_with_kernel"): + foo[(1, )](a, b, c) + with proton.scope("iter_without_kernel"): + pass - temp_file = tmp_path / "test_cudagraph_not_captured_by_profiler.hatchet" - proton.start(str(temp_file.with_suffix("")), context="shadow") - with proton.scope("replay0"): - g.replay() - with proton.scope("replay1"): + with proton.scope("replay"): g.replay() - proton.finalize() - captured = capfd.readouterr() - assert captured.err.count("Cannot find graph for graphExecId:") == 1 - assert "start profiling before the graph is created" in captured.err + proton.finalize(output_format=data_format) - with temp_file.open() as f: - data = json.load(f) - replay0_frame = None - replay1_frame = None - for child in data[0]["children"]: - if child["frame"]["name"] == "replay0": - replay0_frame = child - elif child["frame"]["name"] == "replay1": - replay1_frame = child - assert replay0_frame is not None - assert replay1_frame is not None - assert len(replay0_frame["children"]) >= 3 - assert len(replay1_frame["children"]) >= 3 + if data_format == "hatchet_msgpack": + import msgpack - def has_positive_time_metric(node): - if node["metrics"].get("time (ns)", 0) > 0: - return True - return any(has_positive_time_metric(child) for child in node["children"]) + with temp_file.open("rb") as f: + data = msgpack.load(f, raw=False, strict_map_key=False) + else: + with temp_file.open() as f: + data = json.load(f) - assert has_positive_time_metric(replay0_frame) - assert has_positive_time_metric(replay1_frame) + replay_frame = next( + (child for child in data[0]["children"] if child["frame"]["name"] == "replay"), + None, + ) + assert replay_frame is not None + capture_frame = _find_frame_by_name(replay_frame, "") + assert capture_frame is not None + + capture_children = capture_frame["children"] + capture_child_names = {child["frame"]["name"] for child in capture_children} + assert "iter_with_kernel" in capture_child_names + assert "iter_without_kernel" not in capture_child_names + + iter_with_kernel_frame = next( + (child for child in capture_children if child["frame"]["name"] == "iter_with_kernel"), + None, + ) + assert iter_with_kernel_frame is not None + assert len(iter_with_kernel_frame["children"]) > 0 + assert iter_with_kernel_frame["children"][0]["metrics"]["time (ns)"] > 0 @pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports metrics profiling in cudagraphs") From 16d8c8e0e1732075df9963b2cac68e4083883697 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Wed, 27 May 2026 11:41:27 -0400 Subject: [PATCH 31/39] Apply Proton formatting fixes --- third_party/proton/csrc/include/Data/Metric.h | 3 ++- third_party/proton/csrc/lib/Data/TreeData.cpp | 14 ++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/third_party/proton/csrc/include/Data/Metric.h b/third_party/proton/csrc/include/Data/Metric.h index bba13937e1a2..e4da22a1ed91 100644 --- a/third_party/proton/csrc/include/Data/Metric.h +++ b/third_party/proton/csrc/include/Data/Metric.h @@ -288,7 +288,8 @@ class PCSamplingMetric : public Metric { const std::string &getName() const override { return name; } - // Fast path for callers that already know they are working with PCSamplingMetric. + // Fast path for callers that already know they are working with + // PCSamplingMetric. static constexpr std::string_view getValueName(PCSamplingMetricKind valueId) { return VALUE_NAMES[valueId]; } diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index c5d984292f73..c0e0fb353b7a 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -466,10 +466,9 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, std::vector> treeMsgPackFrameHeaderCache(tree->size()); std::vector> virtualMsgPackFrameHeaderCache( virtualTree->size()); - auto getMsgPackFrameHeader = - [&](TreeData::Tree *sourceTree, - std::vector> &cache, - size_t id) -> const std::vector & { + auto getMsgPackFrameHeader = [&](TreeData::Tree *sourceTree, + std::vector> &cache, + size_t id) -> const std::vector & { auto &header = cache[id]; if (header.empty()) { header = buildMsgPackHatchetFrameHeader(sourceTree->getNode(id).name); @@ -762,10 +761,9 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.packFixStrLiteral("children"); writer.packArray(static_cast(treeNode.children.size()) + - (hasLinkedTargets - ? static_cast( - virtualRootNode.children.size()) - : 0)); + (hasLinkedTargets ? static_cast( + virtualRootNode.children.size()) + : 0)); for (const auto &child : treeNode.children) { packNode(packNode, tree->getNode(child.id)); } From 291e12653be78bc7bf80a7dd3f194e9b902fade1 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 15:31:40 -0400 Subject: [PATCH 32/39] Remove Proton MsgPack frame header caches --- .../csrc/include/Utility/MsgPackWriter.h | 1 - third_party/proton/csrc/lib/Data/TreeData.cpp | 24 ++++--------------- .../proton/csrc/lib/Utility/MsgPackWriter.cpp | 6 ----- 3 files changed, 4 insertions(+), 27 deletions(-) diff --git a/third_party/proton/csrc/include/Utility/MsgPackWriter.h b/third_party/proton/csrc/include/Utility/MsgPackWriter.h index 4ba9f9c6a251..b37e3efbe954 100644 --- a/third_party/proton/csrc/include/Utility/MsgPackWriter.h +++ b/third_party/proton/csrc/include/Utility/MsgPackWriter.h @@ -22,7 +22,6 @@ class MsgPackWriter { void packInt(int64_t value); void packDouble(double value); void packStr(std::string_view value); - void appendBytes(const std::vector &bytes); template void packFixStrLiteral(const char (&value)[N]) { static_assert(N > 0); constexpr uint32_t size = static_cast(N - 1); diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index c0e0fb353b7a..69e1d558bff8 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -28,9 +28,8 @@ namespace { constexpr size_t kMaxRegisteredDeviceIds = 32; -std::vector buildMsgPackHatchetFrameHeader(std::string_view name) { - MsgPackWriter writer; - writer.reserve(name.size() + 32); +void packMsgPackHatchetFrameHeader(MsgPackWriter &writer, + std::string_view name) { writer.packMap(3); writer.packFixStrLiteral("frame"); writer.packMap(2); @@ -39,7 +38,6 @@ std::vector buildMsgPackHatchetFrameHeader(std::string_view name) { writer.packFixStrLiteral("type"); writer.packFixStrLiteral("function"); writer.packFixStrLiteral("metrics"); - return std::move(writer).take(); } struct MetricSummary { @@ -463,18 +461,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, MetricSummary metricSummary; metricSummary.hasKernelMetric = true; const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); - std::vector> treeMsgPackFrameHeaderCache(tree->size()); - std::vector> virtualMsgPackFrameHeaderCache( - virtualTree->size()); - auto getMsgPackFrameHeader = [&](TreeData::Tree *sourceTree, - std::vector> &cache, - size_t id) -> const std::vector & { - auto &header = cache[id]; - if (header.empty()) { - header = buildMsgPackHatchetFrameHeader(sourceTree->getNode(id).name); - } - return header; - }; // Root metrics only carry inclusive aggregate fields. Non-root metrics also // include device_id and device_type, so their serialized map entry counts are @@ -701,8 +687,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, // under the same frame. auto packNode = [&](auto &&packNode, TreeData::Tree::TreeNode &treeNode) -> void { - writer.appendBytes( - getMsgPackFrameHeader(tree, treeMsgPackFrameHeaderCache, treeNode.id)); + packMsgPackHatchetFrameHeader(writer, treeNode.name); const bool isRoot = treeNode.id == TreeData::Tree::TreeNode::RootId; const auto &linkedFlexibleMetrics = treeNode.metricSet.linkedFlexibleMetrics; @@ -728,8 +713,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); - writer.appendBytes(getMsgPackFrameHeader( - virtualTree, virtualMsgPackFrameHeaderCache, virtualNodeId)); + packMsgPackHatchetFrameHeader(writer, virtualNode.name); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); const auto promotedFlexibleMetricEntries = diff --git a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp index dfbb493dabe0..8d89dbe9c3f3 100644 --- a/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp +++ b/third_party/proton/csrc/lib/Utility/MsgPackWriter.cpp @@ -93,12 +93,6 @@ void MsgPackWriter::packStr(std::string_view value) { std::memcpy(out.data() + offset, value.data(), size); } -void MsgPackWriter::appendBytes(const std::vector &bytes) { - const auto offset = out.size(); - out.resize(offset + bytes.size()); - std::memcpy(out.data() + offset, bytes.data(), bytes.size()); -} - void MsgPackWriter::packArray(uint32_t size) { if (size <= 15) { out.push_back(static_cast(0x90 | size)); From 828e20f1187b590735b2f1f44e9e4f9f9a3ea8df Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 16:15:54 -0400 Subject: [PATCH 33/39] Clarify linked flexible metric JSON placement --- third_party/proton/csrc/lib/Data/TreeData.cpp | 94 +++++++++---------- third_party/proton/test/test_profile.py | 22 +++++ 2 files changed, 68 insertions(+), 48 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index 69e1d558bff8..f243e0a3c194 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -28,18 +29,6 @@ namespace { constexpr size_t kMaxRegisteredDeviceIds = 32; -void packMsgPackHatchetFrameHeader(MsgPackWriter &writer, - std::string_view name) { - writer.packMap(3); - writer.packFixStrLiteral("frame"); - writer.packMap(2); - writer.packFixStrLiteral("name"); - writer.packStr(name); - writer.packFixStrLiteral("type"); - writer.packFixStrLiteral("function"); - writer.packFixStrLiteral("metrics"); -} - struct MetricSummary { // Whether we observed at least one kernel metric. bool hasKernelMetric = false; @@ -359,42 +348,41 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, } // JSON dumping is not the performance-critical path, so use a direct // recursive copy of the linked virtual tree. - auto appendLinkedVirtualNode = [&](auto &&appendLinkedVirtualNode, - size_t virtualNodeId, - json &outNode) -> void { - const auto &virtualNode = virtualTree->getNode(virtualNodeId); - const auto metricsIt = - treeNode.metricSet.linkedMetrics.find(virtualNodeId); - outNode = json::object(); - outNode["frame"] = {{"name", virtualNode.name}, {"type", "function"}}; - outNode["metrics"] = json::object(); - if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { - appendMetrics(outNode["metrics"], metricsIt->second); - } - // Linked flexible metrics are only attached to - // children, so they always belong on the parent frame. - for (const auto &child : virtualNode.children) { - auto flexibleIt = - treeNode.metricSet.linkedFlexibleMetrics.find(child.id); - if (flexibleIt != treeNode.metricSet.linkedFlexibleMetrics.end()) { - appendFlexibleMetrics(outNode["metrics"], flexibleIt->second); - } - } - outNode["children"] = json::array(); - auto &linkedChildren = outNode["children"]; - linkedChildren.get_ref().reserve( - virtualNode.children.size()); - for (const auto &child : virtualNode.children) { - linkedChildren.push_back(json::object()); - appendLinkedVirtualNode(appendLinkedVirtualNode, child.id, - linkedChildren.back()); - } - }; + std::function appendLinkedVirtualNode = + [&](size_t virtualNodeId, json &outNode, json &parentMetricsJson) { + const auto &virtualNode = virtualTree->getNode(virtualNodeId); + const auto metricsIt = + treeNode.metricSet.linkedMetrics.find(virtualNodeId); + const auto flexibleIt = + treeNode.metricSet.linkedFlexibleMetrics.find(virtualNodeId); + outNode = json::object(); + outNode["frame"] = {{"name", virtualNode.name}, + {"type", "function"}}; + outNode["metrics"] = json::object(); + if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { + appendMetrics(outNode["metrics"], metricsIt->second); + } + // Linked flexible metrics are attached to helper nodes, + // but they belong on the helper's parent frame. + if (flexibleIt != + treeNode.metricSet.linkedFlexibleMetrics.end()) { + appendFlexibleMetrics(parentMetricsJson, flexibleIt->second); + } + outNode["children"] = json::array(); + auto &linkedChildren = outNode["children"]; + linkedChildren.get_ref().reserve( + virtualNode.children.size()); + for (const auto &child : virtualNode.children) { + linkedChildren.push_back(json::object()); + appendLinkedVirtualNode(child.id, linkedChildren.back(), + outNode["metrics"]); + } + }; for (const auto &child : virtualRootNode.children) { - childrenArray.push_back(json::object()); - appendLinkedVirtualNode(appendLinkedVirtualNode, child.id, - childrenArray.back()); + json linkedRootChildNode; + appendLinkedVirtualNode(child.id, linkedRootChildNode, metricsJson); + childrenArray.push_back(std::move(linkedRootChildNode)); } }); @@ -461,6 +449,16 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, MetricSummary metricSummary; metricSummary.hasKernelMetric = true; const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); + auto packHatchetFrameHeader = [&](std::string_view name) { + writer.packMap(3); + writer.packFixStrLiteral("frame"); + writer.packMap(2); + writer.packFixStrLiteral("name"); + writer.packStr(name); + writer.packFixStrLiteral("type"); + writer.packFixStrLiteral("function"); + writer.packFixStrLiteral("metrics"); + }; // Root metrics only carry inclusive aggregate fields. Non-root metrics also // include device_id and device_type, so their serialized map entry counts are @@ -687,7 +685,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, // under the same frame. auto packNode = [&](auto &&packNode, TreeData::Tree::TreeNode &treeNode) -> void { - packMsgPackHatchetFrameHeader(writer, treeNode.name); + packHatchetFrameHeader(treeNode.name); const bool isRoot = treeNode.id == TreeData::Tree::TreeNode::RootId; const auto &linkedFlexibleMetrics = treeNode.metricSet.linkedFlexibleMetrics; @@ -713,7 +711,7 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); - packMsgPackHatchetFrameHeader(writer, virtualNode.name); + packHatchetFrameHeader(virtualNode.name); const auto metricsIt = treeNode.metricSet.linkedMetrics.find(virtualNodeId); const auto promotedFlexibleMetricEntries = diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py index d41c48c73acf..696c1da7730c 100644 --- a/third_party/proton/test/test_profile.py +++ b/third_party/proton/test/test_profile.py @@ -31,6 +31,10 @@ def _find_frame_by_name(frame, name): return None +def _find_child_frame_by_name(frame, name): + return next((child for child in frame["children"] if child["frame"]["name"] == name), None) + + @contextmanager def cuda_graph_without_gc(*args, **kwargs): # A loaded Triton CompiledKernel may be finalized by Python's cyclic GC. @@ -1696,11 +1700,20 @@ def fn(): assert scope_a_frame is not None assert scope_a_frame["metrics"]["bytes"] == 160 assert "count" not in scope_a_frame["metrics"] + scope_a_metric_frame = _find_child_frame_by_name(scope_a_frame, "") + if scope_a_metric_frame is not None: + assert "bytes" not in scope_a_metric_frame["metrics"] assert scope_b_frame is not None assert scope_b_frame["metrics"]["sum"] == 40.0 assert "count" not in scope_b_frame["metrics"] + scope_b_metric_frame = _find_child_frame_by_name(scope_b_frame, "") + if scope_b_metric_frame is not None: + assert "sum" not in scope_b_metric_frame["metrics"] assert scope_d_frame is not None assert scope_d_frame["metrics"]["vec"] == [0, 10, 20, 30] + scope_d_metric_frame = _find_child_frame_by_name(scope_d_frame, "") + if scope_d_metric_frame is not None: + assert "vec" not in scope_d_metric_frame["metrics"] @pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports metrics profiling in cudagraphs") @@ -1840,7 +1853,13 @@ def run_on_device(device_id): assert foo_frame["metrics"]["flops"] == 40 assert foo_frame["metrics"]["device_id"] == str(device.index) assert scope_a_frame["metrics"]["bytes"] == 160 + scope_a_metric_frame = _find_child_frame_by_name(scope_a_frame, "") + if scope_a_metric_frame is not None: + assert "bytes" not in scope_a_metric_frame["metrics"] assert scope_b_frame["metrics"]["sum"] == 40.0 + scope_b_metric_frame = _find_child_frame_by_name(scope_b_frame, "") + if scope_b_metric_frame is not None: + assert "sum" not in scope_b_metric_frame["metrics"] assert len(data) > 1 cuda_devices = data[1].get("CUDA", {}) @@ -1957,6 +1976,9 @@ def fn(): assert scope_a_frame is not None assert foo_test_frame is not None assert scope_a_frame["metrics"]["bytes"] == test_iterations / 10 * 16 + scope_a_metric_frame = _find_child_frame_by_name(scope_a_frame, "") + if scope_a_metric_frame is not None: + assert "bytes" not in scope_a_metric_frame["metrics"] assert foo_test_frame["metrics"]["bytes"] == test_iterations / 10 * 16 assert foo_test_frame["metrics"]["flops"] == test_iterations / 10 * 4 From 7f8a47410035f497759106de6b9ccf2de03f413e Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 16:35:23 -0400 Subject: [PATCH 34/39] Avoid std::function in Proton MsgPack recursion --- third_party/proton/csrc/lib/Data/TreeData.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index f243e0a3c194..af296ff22f75 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -681,8 +681,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } } }; - // Pack a real tree node, followed by any linked virtual subtree that belongs - // under the same frame. auto packNode = [&](auto &&packNode, TreeData::Tree::TreeNode &treeNode) -> void { packHatchetFrameHeader(treeNode.name); @@ -707,7 +705,6 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, const bool hasLinkedTargets = !treeNode.metricSet.linkedMetrics.empty() || !linkedFlexibleMetrics.empty(); - // Pack the linked virtual subtree as children of the current real frame. auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); @@ -741,17 +738,18 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } }; + uint32_t linkedChildCount = + hasLinkedTargets ? static_cast(virtualRootNode.children.size()) + : 0; writer.packFixStrLiteral("children"); writer.packArray(static_cast(treeNode.children.size()) + - (hasLinkedTargets ? static_cast( - virtualRootNode.children.size()) - : 0)); + linkedChildCount); for (const auto &child : treeNode.children) { packNode(packNode, tree->getNode(child.id)); } if (hasLinkedTargets) { - for (const auto &child : virtualRootNode.children) { - packLinkedVirtualNode(packLinkedVirtualNode, child.id); + for (const auto &virtualChild : virtualRootNode.children) { + packLinkedVirtualNode(packLinkedVirtualNode, virtualChild.id); } } }; From f6b53dbddbe2baf78b470bb895a26f0eb8b7d413 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 17:09:01 -0400 Subject: [PATCH 35/39] Refactor flexible metrics handling in buildHatchetMsgPack --- third_party/proton/csrc/lib/Data/TreeData.cpp | 96 +++++++------------ 1 file changed, 36 insertions(+), 60 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index af296ff22f75..c2c190b6c41a 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -655,82 +655,54 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, flexibleMetric.getValues()[0]); } }; - // Count flexible metrics attached to child helpers that will be - // promoted into the parent frame's metrics map. - auto countPromotedFlexibleMetricEntries = - [&](const auto &children, - const DataEntry::LinkedFlexibleMetricMap &linkedFlexibleMetrics) - -> uint32_t { - uint32_t metricEntries = 0; - for (const auto &child : children) { - auto it = linkedFlexibleMetrics.find(child.id); - if (it != linkedFlexibleMetrics.end()) { - metricEntries += static_cast(it->second.size()); - } - } - return metricEntries; - }; - // Pack the child helper entries into the parent frame. - auto packPromotedFlexibleMetrics = - [&](const auto &children, - const DataEntry::LinkedFlexibleMetricMap &linkedFlexibleMetrics) { - for (const auto &child : children) { - auto it = linkedFlexibleMetrics.find(child.id); - if (it != linkedFlexibleMetrics.end()) { - packFlexibleMetrics(it->second); - } - } - }; auto packNode = [&](auto &&packNode, TreeData::Tree::TreeNode &treeNode) -> void { + // Write the header packHatchetFrameHeader(treeNode.name); const bool isRoot = treeNode.id == TreeData::Tree::TreeNode::RootId; - const auto &linkedFlexibleMetrics = - treeNode.metricSet.linkedFlexibleMetrics; - const auto promotedFlexibleMetricEntries = - linkedFlexibleMetrics.empty() - ? 0 - : countPromotedFlexibleMetricEntries(virtualRootNode.children, - linkedFlexibleMetrics); + // Write the concrete nodes' own metrics and flexible metrics writer.packMap( countMetricEntries(treeNode.metricSet.metrics, isRoot) + - static_cast(treeNode.metricSet.flexibleMetrics.size()) + - promotedFlexibleMetricEntries); + static_cast(treeNode.metricSet.flexibleMetrics.size())); packMetrics(treeNode.metricSet.metrics, isRoot); packFlexibleMetrics(treeNode.metricSet.flexibleMetrics); - if (!linkedFlexibleMetrics.empty()) { - packPromotedFlexibleMetrics(virtualRootNode.children, - linkedFlexibleMetrics); - } - const bool hasLinkedTargets = !treeNode.metricSet.linkedMetrics.empty() || - !linkedFlexibleMetrics.empty(); auto packLinkedVirtualNode = [&](auto &&packLinkedVirtualNode, size_t virtualNodeId) -> void { const auto &virtualNode = virtualTree->getNode(virtualNodeId); + auto &linkedMetrics = treeNode.metricSet.linkedMetrics; + auto &linkedFlexibleMetrics = treeNode.metricSet.linkedFlexibleMetrics; + // Write the header packHatchetFrameHeader(virtualNode.name); - const auto metricsIt = - treeNode.metricSet.linkedMetrics.find(virtualNodeId); - const auto promotedFlexibleMetricEntries = - linkedFlexibleMetrics.empty() - ? 0 - : countPromotedFlexibleMetricEntries(virtualNode.children, - linkedFlexibleMetrics); - writer.packMap((metricsIt != treeNode.metricSet.linkedMetrics.end() - ? countMetricEntries(metricsIt->second, - /*isRoot=*/false) - : 0) + - promotedFlexibleMetricEntries); + // Count linked metrics + auto metricEntries = 0u; + const auto metricsIt = linkedMetrics.find(virtualNodeId); + if (metricsIt != linkedMetrics.end()) { + metricEntries += + countMetricEntries(metricsIt->second, /*isRoot=*/false); + } + // Count linked flexible metrics exist in the child helpers + if (!linkedFlexibleMetrics.empty()) { + for (const auto &child : virtualNode.children) { + auto it = linkedFlexibleMetrics.find(child.id); + if (it != linkedFlexibleMetrics.end()) { + metricEntries += static_cast(it->second.size()); + } + } + } + // Pack + writer.packMap(metricEntries); if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { packMetrics(metricsIt->second, /*isRoot=*/false); } - // Linked flexible metrics are only attached to - // children, so they are always packed into the parent frame. if (!linkedFlexibleMetrics.empty()) { - packPromotedFlexibleMetrics(virtualNode.children, - linkedFlexibleMetrics); + for (const auto &child : virtualNode.children) { + auto it = linkedFlexibleMetrics.find(child.id); + if (it != linkedFlexibleMetrics.end()) { + packFlexibleMetrics(it->second); + } + } } - writer.packFixStrLiteral("children"); writer.packArray(static_cast(virtualNode.children.size())); for (const auto &child : virtualNode.children) { @@ -738,9 +710,13 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, } }; + const bool hasLinkedTargets = + !treeNode.metricSet.linkedMetrics.empty() || + !treeNode.metricSet.linkedFlexibleMetrics.empty(); uint32_t linkedChildCount = - hasLinkedTargets ? static_cast(virtualRootNode.children.size()) - : 0; + hasLinkedTargets + ? static_cast(virtualRootNode.children.size()) + : 0; writer.packFixStrLiteral("children"); writer.packArray(static_cast(treeNode.children.size()) + linkedChildCount); From 7febd865848ead683d0984e70749960242e9f3b9 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 17:21:48 -0400 Subject: [PATCH 36/39] Tighten Proton linked metric assertions --- third_party/proton/test/test_profile.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py index 696c1da7730c..feadbf8d9539 100644 --- a/third_party/proton/test/test_profile.py +++ b/third_party/proton/test/test_profile.py @@ -1701,19 +1701,16 @@ def fn(): assert scope_a_frame["metrics"]["bytes"] == 160 assert "count" not in scope_a_frame["metrics"] scope_a_metric_frame = _find_child_frame_by_name(scope_a_frame, "") - if scope_a_metric_frame is not None: - assert "bytes" not in scope_a_metric_frame["metrics"] + assert "bytes" not in scope_a_metric_frame["metrics"] assert scope_b_frame is not None assert scope_b_frame["metrics"]["sum"] == 40.0 assert "count" not in scope_b_frame["metrics"] scope_b_metric_frame = _find_child_frame_by_name(scope_b_frame, "") - if scope_b_metric_frame is not None: - assert "sum" not in scope_b_metric_frame["metrics"] + assert "sum" not in scope_b_metric_frame["metrics"] assert scope_d_frame is not None assert scope_d_frame["metrics"]["vec"] == [0, 10, 20, 30] scope_d_metric_frame = _find_child_frame_by_name(scope_d_frame, "") - if scope_d_metric_frame is not None: - assert "vec" not in scope_d_metric_frame["metrics"] + assert "vec" not in scope_d_metric_frame["metrics"] @pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports metrics profiling in cudagraphs") @@ -1854,12 +1851,10 @@ def run_on_device(device_id): assert foo_frame["metrics"]["device_id"] == str(device.index) assert scope_a_frame["metrics"]["bytes"] == 160 scope_a_metric_frame = _find_child_frame_by_name(scope_a_frame, "") - if scope_a_metric_frame is not None: - assert "bytes" not in scope_a_metric_frame["metrics"] + assert "bytes" not in scope_a_metric_frame["metrics"] assert scope_b_frame["metrics"]["sum"] == 40.0 scope_b_metric_frame = _find_child_frame_by_name(scope_b_frame, "") - if scope_b_metric_frame is not None: - assert "sum" not in scope_b_metric_frame["metrics"] + assert "sum" not in scope_b_metric_frame["metrics"] assert len(data) > 1 cuda_devices = data[1].get("CUDA", {}) @@ -1977,8 +1972,7 @@ def fn(): assert foo_test_frame is not None assert scope_a_frame["metrics"]["bytes"] == test_iterations / 10 * 16 scope_a_metric_frame = _find_child_frame_by_name(scope_a_frame, "") - if scope_a_metric_frame is not None: - assert "bytes" not in scope_a_metric_frame["metrics"] + assert "bytes" not in scope_a_metric_frame["metrics"] assert foo_test_frame["metrics"]["bytes"] == test_iterations / 10 * 16 assert foo_test_frame["metrics"]["flops"] == test_iterations / 10 * 4 From db814eafb080b32ee973cbe2d2ab141d82641aa1 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 26 May 2026 17:25:23 -0400 Subject: [PATCH 37/39] Revert Proton linked metric test additions --- third_party/proton/test/test_profile.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py index feadbf8d9539..d41c48c73acf 100644 --- a/third_party/proton/test/test_profile.py +++ b/third_party/proton/test/test_profile.py @@ -31,10 +31,6 @@ def _find_frame_by_name(frame, name): return None -def _find_child_frame_by_name(frame, name): - return next((child for child in frame["children"] if child["frame"]["name"] == name), None) - - @contextmanager def cuda_graph_without_gc(*args, **kwargs): # A loaded Triton CompiledKernel may be finalized by Python's cyclic GC. @@ -1700,17 +1696,11 @@ def fn(): assert scope_a_frame is not None assert scope_a_frame["metrics"]["bytes"] == 160 assert "count" not in scope_a_frame["metrics"] - scope_a_metric_frame = _find_child_frame_by_name(scope_a_frame, "") - assert "bytes" not in scope_a_metric_frame["metrics"] assert scope_b_frame is not None assert scope_b_frame["metrics"]["sum"] == 40.0 assert "count" not in scope_b_frame["metrics"] - scope_b_metric_frame = _find_child_frame_by_name(scope_b_frame, "") - assert "sum" not in scope_b_metric_frame["metrics"] assert scope_d_frame is not None assert scope_d_frame["metrics"]["vec"] == [0, 10, 20, 30] - scope_d_metric_frame = _find_child_frame_by_name(scope_d_frame, "") - assert "vec" not in scope_d_metric_frame["metrics"] @pytest.mark.skipif(not is_cuda(), reason="Only CUDA backend supports metrics profiling in cudagraphs") @@ -1850,11 +1840,7 @@ def run_on_device(device_id): assert foo_frame["metrics"]["flops"] == 40 assert foo_frame["metrics"]["device_id"] == str(device.index) assert scope_a_frame["metrics"]["bytes"] == 160 - scope_a_metric_frame = _find_child_frame_by_name(scope_a_frame, "") - assert "bytes" not in scope_a_metric_frame["metrics"] assert scope_b_frame["metrics"]["sum"] == 40.0 - scope_b_metric_frame = _find_child_frame_by_name(scope_b_frame, "") - assert "sum" not in scope_b_metric_frame["metrics"] assert len(data) > 1 cuda_devices = data[1].get("CUDA", {}) @@ -1971,8 +1957,6 @@ def fn(): assert scope_a_frame is not None assert foo_test_frame is not None assert scope_a_frame["metrics"]["bytes"] == test_iterations / 10 * 16 - scope_a_metric_frame = _find_child_frame_by_name(scope_a_frame, "") - assert "bytes" not in scope_a_metric_frame["metrics"] assert foo_test_frame["metrics"]["bytes"] == test_iterations / 10 * 16 assert foo_test_frame["metrics"]["flops"] == test_iterations / 10 * 4 From cfaa437bbdea0844f63fe2051a05d48613cb2369 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Thu, 28 May 2026 13:52:55 -0400 Subject: [PATCH 38/39] Constrain linked flexible JSON promotion --- third_party/proton/csrc/lib/Data/TreeData.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index c2c190b6c41a..f33108d7cd42 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -362,10 +362,12 @@ json TreeData::buildHatchetJson(TreeData::Tree *tree, if (metricsIt != treeNode.metricSet.linkedMetrics.end()) { appendMetrics(outNode["metrics"], metricsIt->second); } - // Linked flexible metrics are attached to helper nodes, - // but they belong on the helper's parent frame. + // Linked flexible metrics are attached to generated + // helper nodes, but they belong on the helper's parent frame. + // Other linked virtual nodes should not carry flexible metrics. if (flexibleIt != - treeNode.metricSet.linkedFlexibleMetrics.end()) { + treeNode.metricSet.linkedFlexibleMetrics.end() && + virtualNode.name == GraphState::metricTag) { appendFlexibleMetrics(parentMetricsJson, flexibleIt->second); } outNode["children"] = json::array(); From f029f51f96911c474d43f528fd924a6662ee588d Mon Sep 17 00:00:00 2001 From: Jokeren Date: Thu, 28 May 2026 14:20:20 -0400 Subject: [PATCH 39/39] Restore Proton MsgPack metric prepass --- third_party/proton/csrc/lib/Data/TreeData.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index f33108d7cd42..16a92c1c972c 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -449,7 +449,17 @@ TreeData::buildHatchetMsgPack(TreeData::Tree *tree, writer.reserve(16 * 1024 * 1024); // 16 MB MetricSummary metricSummary; - metricSummary.hasKernelMetric = true; + // Root metrics are serialized before descendants, so first scan the whole + // concrete tree for fixed-schema metric kinds. This lets the root emit the + // zero-valued Hatchet fields required for any metric kind present below it. + tree->template walk( + [&](TreeData::Tree::TreeNode &treeNode) { + metricSummary.observeMetrics(treeNode.metricSet.metrics); + for (const auto &[_, linkedMetrics] : + treeNode.metricSet.linkedMetrics) { + metricSummary.observeMetrics(linkedMetrics); + } + }); const auto &virtualRootNode = virtualTree->getNode(Tree::TreeNode::RootId); auto packHatchetFrameHeader = [&](std::string_view name) { writer.packMap(3);