Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions api/envoy/config/metrics/v3/metrics_service.proto
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,9 @@ message MetricsServiceConfig {
// Eventually (https://github.com/envoyproxy/envoy/issues/10968) if this value is not set, the
// sink will take updates from the :ref:`MetricsResponse <envoy_api_msg_service.metrics.v3.StreamMetricsResponse>`.
google.protobuf.BoolValue report_counters_as_deltas = 2;

// If true, metrics will have their tags emitted as labels on the metrics objects sent to the MetricsService,
// and the tag extracted name will be used instead of the full name, which may contain values used by the tag
// extractor or additional tags added during stats creation.
bool emit_tags_as_labels = 4;
}
5 changes: 5 additions & 0 deletions api/envoy/config/metrics/v4alpha/metrics_service.proto

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions docs/root/version_history/current.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,7 @@ Removed Config or Runtime
New Features
------------

* metric service: added support for sending metric tags as labels. This can be enabled by setting the :ref:`emit_tags_as_labels <envoy_v3_api_field_config.metrics.v3.MetricsServiceConfig.emit_tags_as_labels>` field to true.

Deprecated
----------

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion source/extensions/stat_sinks/metrics_service/config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ MetricsServiceSinkFactory::createStatsSink(const Protobuf::Message& config,

return std::make_unique<MetricsServiceSink<envoy::service::metrics::v3::StreamMetricsMessage,
envoy::service::metrics::v3::StreamMetricsResponse>>(
grpc_metrics_streamer,
grpc_metrics_streamer, sink_config.emit_tags_as_labels(),
PROTOBUF_GET_WRAPPED_OR_DEFAULT(sink_config, report_counters_as_deltas, false));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,8 @@ MetricsPtr MetricsFlusher::flush(Stats::MetricSnapshot& snapshot) const {
void MetricsFlusher::flushCounter(io::prometheus::client::MetricFamily& metrics_family,
const Stats::MetricSnapshot::CounterSnapshot& counter_snapshot,
int64_t snapshot_time_ms) const {
metrics_family.set_type(io::prometheus::client::MetricType::COUNTER);
metrics_family.set_name(counter_snapshot.counter_.get().name());
auto* metric = metrics_family.add_metric();
metric->set_timestamp_ms(snapshot_time_ms);
auto* metric = populateMetricsFamily(metrics_family, io::prometheus::client::MetricType::COUNTER,
snapshot_time_ms, counter_snapshot.counter_.get());
auto* counter_metric = metric->mutable_counter();
if (report_counters_as_deltas_) {
counter_metric->set_value(counter_snapshot.delta_);
Expand All @@ -96,10 +94,8 @@ void MetricsFlusher::flushCounter(io::prometheus::client::MetricFamily& metrics_

void MetricsFlusher::flushGauge(io::prometheus::client::MetricFamily& metrics_family,
const Stats::Gauge& gauge, int64_t snapshot_time_ms) const {
metrics_family.set_type(io::prometheus::client::MetricType::GAUGE);
metrics_family.set_name(gauge.name());
auto* metric = metrics_family.add_metric();
metric->set_timestamp_ms(snapshot_time_ms);
auto* metric = populateMetricsFamily(metrics_family, io::prometheus::client::MetricType::GAUGE,
snapshot_time_ms, gauge);
auto* gauge_metric = metric->mutable_gauge();
gauge_metric->set_value(gauge.value());
}
Expand All @@ -113,10 +109,9 @@ void MetricsFlusher::flushHistogram(io::prometheus::client::MetricFamily& summar
// performance.

// Add summary information for histograms.
summary_metrics_family.set_type(io::prometheus::client::MetricType::SUMMARY);
summary_metrics_family.set_name(envoy_histogram.name());
auto* summary_metric = summary_metrics_family.add_metric();
summary_metric->set_timestamp_ms(snapshot_time_ms);
auto* summary_metric =
populateMetricsFamily(summary_metrics_family, io::prometheus::client::MetricType::SUMMARY,
snapshot_time_ms, envoy_histogram);
auto* summary = summary_metric->mutable_summary();
const Stats::HistogramStatistics& hist_stats = envoy_histogram.intervalStatistics();
for (size_t i = 0; i < hist_stats.supportedQuantiles().size(); i++) {
Expand All @@ -126,10 +121,9 @@ void MetricsFlusher::flushHistogram(io::prometheus::client::MetricFamily& summar
}

// Add bucket information for histograms.
histogram_metrics_family.set_type(io::prometheus::client::MetricType::HISTOGRAM);
histogram_metrics_family.set_name(envoy_histogram.name());
auto* histogram_metric = histogram_metrics_family.add_metric();
histogram_metric->set_timestamp_ms(snapshot_time_ms);
auto* histogram_metric =
populateMetricsFamily(histogram_metrics_family, io::prometheus::client::MetricType::HISTOGRAM,
snapshot_time_ms, envoy_histogram);
auto* histogram = histogram_metric->mutable_histogram();
histogram->set_sample_count(hist_stats.sampleCount());
histogram->set_sample_sum(hist_stats.sampleSum());
Expand All @@ -139,6 +133,31 @@ void MetricsFlusher::flushHistogram(io::prometheus::client::MetricFamily& summar
bucket->set_cumulative_count(hist_stats.computedBuckets()[i]);
}
}

// Populates the shared fields of a MetricFamily (type, name) and adds a single
// Metric entry stamped with the snapshot time. When emit_labels_ is set, the
// family is named after the tag-extracted stat name and each stat tag is
// emitted as a Prometheus label; otherwise the full stat name is used and no
// labels are added.
//
// @param metrics_family the family proto to populate.
// @param type the Prometheus metric type (COUNTER/GAUGE/SUMMARY/HISTOGRAM).
// @param snapshot_time_ms snapshot timestamp, in milliseconds since epoch.
// @param metric the Envoy stat whose name/tags are copied into the proto.
// @return the newly added Metric, so the caller can fill in the value fields.
io::prometheus::client::Metric*
MetricsFlusher::populateMetricsFamily(io::prometheus::client::MetricFamily& metrics_family,
                                      io::prometheus::client::MetricType type,
                                      int64_t snapshot_time_ms, const Stats::Metric& metric) const {
  metrics_family.set_type(type);
  auto* prometheus_metric = metrics_family.add_metric();
  prometheus_metric->set_timestamp_ms(snapshot_time_ms);

  if (emit_labels_) {
    // TODO(snowp): Look into the perf implication of this. We need to take a lock on the symbol
    // table to stringify the StatNames, which could result in some lock contention. Consider
    // caching the conversion between stat handle to extracted tags.
    metrics_family.set_name(metric.tagExtractedName());
    for (const auto& tag : metric.tags()) {
      auto* label = prometheus_metric->add_label();
      label->set_name(tag.name_);
      label->set_value(tag.value_);
    }
  } else {
    metrics_family.set_name(metric.name());
  }

  return prometheus_metric;
}
} // namespace MetricsService
} // namespace StatSinks
} // namespace Extensions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ using GrpcMetricsStreamerImplPtr = std::unique_ptr<GrpcMetricsStreamerImpl>;

class MetricsFlusher {
public:
explicit MetricsFlusher(const bool report_counters_as_deltas)
: report_counters_as_deltas_(report_counters_as_deltas) {}
MetricsFlusher(bool report_counters_as_deltas, bool emit_labels)
: report_counters_as_deltas_(report_counters_as_deltas), emit_labels_(emit_labels) {}

MetricsPtr flush(Stats::MetricSnapshot& snapshot) const;

Expand All @@ -98,7 +98,13 @@ class MetricsFlusher {
const Stats::ParentHistogram& envoy_histogram,
int64_t snapshot_time_ms) const;

io::prometheus::client::Metric*
populateMetricsFamily(io::prometheus::client::MetricFamily& metrics_family,
io::prometheus::client::MetricType type, int64_t snapshot_time_ms,
const Stats::Metric& metric) const;

const bool report_counters_as_deltas_;
const bool emit_labels_;
};

/**
Expand All @@ -109,8 +115,9 @@ template <class RequestProto, class ResponseProto> class MetricsServiceSink : pu
// MetricsService::Sink
MetricsServiceSink(
const GrpcMetricsStreamerSharedPtr<RequestProto, ResponseProto>& grpc_metrics_streamer,
const bool report_counters_as_deltas)
: flusher_(report_counters_as_deltas), grpc_metrics_streamer_(grpc_metrics_streamer) {}
bool report_counters_as_deltas, bool emit_labels)
: flusher_(report_counters_as_deltas, emit_labels),
grpc_metrics_streamer_(grpc_metrics_streamer) {}

void flush(Stats::MetricSnapshot& snapshot) override {
grpc_metrics_streamer_->send(flusher_.flush(snapshot));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ class MetricsServiceSinkTest : public testing::Test {
TEST_F(MetricsServiceSinkTest, CheckSendCall) {
MetricsServiceSink<envoy::service::metrics::v3::StreamMetricsMessage,
envoy::service::metrics::v3::StreamMetricsResponse>
sink(streamer_, false);
sink(streamer_, false, false);

auto counter = std::make_shared<NiceMock<Stats::MockCounter>>();
counter->name_ = "test_counter";
Expand All @@ -137,7 +137,7 @@ TEST_F(MetricsServiceSinkTest, CheckSendCall) {
TEST_F(MetricsServiceSinkTest, CheckStatsCount) {
MetricsServiceSink<envoy::service::metrics::v3::StreamMetricsMessage,
envoy::service::metrics::v3::StreamMetricsResponse>
sink(streamer_, false);
sink(streamer_, false, false);

auto counter = std::make_shared<NiceMock<Stats::MockCounter>>();
counter->name_ = "test_counter";
Expand Down Expand Up @@ -168,7 +168,7 @@ TEST_F(MetricsServiceSinkTest, CheckStatsCount) {
TEST_F(MetricsServiceSinkTest, ReportCountersValues) {
MetricsServiceSink<envoy::service::metrics::v3::StreamMetricsMessage,
envoy::service::metrics::v3::StreamMetricsResponse>
sink(streamer_, false);
sink(streamer_, false, false);

auto counter = std::make_shared<NiceMock<Stats::MockCounter>>();
counter->name_ = "test_counter";
Expand All @@ -187,7 +187,7 @@ TEST_F(MetricsServiceSinkTest, ReportCountersValues) {
TEST_F(MetricsServiceSinkTest, ReportCountersAsDeltas) {
MetricsServiceSink<envoy::service::metrics::v3::StreamMetricsMessage,
envoy::service::metrics::v3::StreamMetricsResponse>
sink(streamer_, true);
sink(streamer_, true, false);

auto counter = std::make_shared<NiceMock<Stats::MockCounter>>();
counter->name_ = "test_counter";
Expand All @@ -202,6 +202,86 @@ TEST_F(MetricsServiceSinkTest, ReportCountersAsDeltas) {
sink.flush(snapshot_);
}

// Test the behavior of tag emission based on the emit_tags_as_label flag.
// Test the behavior of tag emission based on the emit_tags_as_labels flag.
TEST_F(MetricsServiceSinkTest, ReportMetricsWithTags) {
  auto counter = std::make_shared<NiceMock<Stats::MockCounter>>();
  counter->name_ = "full-counter-name";
  counter->value_ = 100;
  counter->used_ = true;
  counter->setTagExtractedName("tag-counter-name");
  counter->setTags({{"a", "b"}});
  snapshot_.counters_.push_back({1, *counter});

  auto gauge = std::make_shared<NiceMock<Stats::MockGauge>>();
  gauge->name_ = "full-gauge-name";
  gauge->value_ = 100;
  gauge->used_ = true;
  gauge->setTagExtractedName("tag-gauge-name");
  gauge->setTags({{"a", "b"}});
  snapshot_.gauges_.push_back({*gauge});

  auto histogram = std::make_shared<NiceMock<Stats::MockParentHistogram>>();
  histogram->name_ = "full-histogram-name";
  histogram->used_ = true;
  histogram->setTagExtractedName("tag-histogram-name");
  histogram->setTags({{"a", "b"}});
  snapshot_.histograms_.push_back({*histogram});

  {
    // When the emit_tags flag is false, we don't emit the tags and use the full name.
    MetricsServiceSink<envoy::service::metrics::v3::StreamMetricsMessage,
                       envoy::service::metrics::v3::StreamMetricsResponse>
        sink(streamer_, true, false);

    EXPECT_CALL(*streamer_, send(_)).WillOnce(Invoke([](MetricsPtr&& metrics) {
      // One family each for counter and gauge, two for the histogram
      // (summary + histogram buckets).
      EXPECT_EQ(4, metrics->size());

      EXPECT_EQ("full-counter-name", (*metrics)[0].name());
      EXPECT_EQ(0, (*metrics)[0].metric(0).label().size());

      EXPECT_EQ("full-gauge-name", (*metrics)[1].name());
      EXPECT_EQ(0, (*metrics)[1].metric(0).label().size());

      EXPECT_EQ("full-histogram-name", (*metrics)[2].name());
      EXPECT_EQ(0, (*metrics)[2].metric(0).label().size());

      EXPECT_EQ("full-histogram-name", (*metrics)[3].name());
      EXPECT_EQ(0, (*metrics)[3].metric(0).label().size());
    }));
    sink.flush(snapshot_);
  }

  io::prometheus::client::LabelPair expected_label_pair;
  expected_label_pair.set_name("a");
  expected_label_pair.set_value("b");

  // When the emit_tags flag is true, we emit the tags as labels and use the tag extracted name.
  MetricsServiceSink<envoy::service::metrics::v3::StreamMetricsMessage,
                     envoy::service::metrics::v3::StreamMetricsResponse>
      sink(streamer_, true, true);

  EXPECT_CALL(*streamer_, send(_)).WillOnce(Invoke([&expected_label_pair](MetricsPtr&& metrics) {
    EXPECT_EQ(4, metrics->size());

    // Each family must carry its own metric's label, so index the family under
    // test (previously these all checked (*metrics)[0], masking any label bug
    // in the gauge/histogram paths).
    EXPECT_EQ("tag-counter-name", (*metrics)[0].name());
    EXPECT_EQ(1, (*metrics)[0].metric(0).label().size());
    EXPECT_TRUE(TestUtility::protoEqual(expected_label_pair, (*metrics)[0].metric(0).label()[0]));

    EXPECT_EQ("tag-gauge-name", (*metrics)[1].name());
    EXPECT_EQ(1, (*metrics)[1].metric(0).label().size());
    EXPECT_TRUE(TestUtility::protoEqual(expected_label_pair, (*metrics)[1].metric(0).label()[0]));

    EXPECT_EQ("tag-histogram-name", (*metrics)[2].name());
    EXPECT_EQ(1, (*metrics)[2].metric(0).label().size());
    EXPECT_TRUE(TestUtility::protoEqual(expected_label_pair, (*metrics)[2].metric(0).label()[0]));

    EXPECT_EQ("tag-histogram-name", (*metrics)[3].name());
    EXPECT_EQ(1, (*metrics)[3].metric(0).label().size());
    EXPECT_TRUE(TestUtility::protoEqual(expected_label_pair, (*metrics)[3].metric(0).label()[0]));
  }));
  sink.flush(snapshot_);
}

} // namespace
} // namespace MetricsService
} // namespace StatSinks
Expand Down