Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
1215e11
stats: convert tag extractor regexs to Re2
rojkov Dec 18, 2020
f242af4
extend tag values to include dash and simplify regex for stat prefixes
rojkov Dec 28, 2020
b6865ad
drop optional matches since stat name is not muated until all extract…
rojkov Dec 28, 2020
230baaf
fix clang_tidy check
rojkov Dec 28, 2020
baee204
add reference results for the benchmark
rojkov Dec 28, 2020
12d7cc4
fix check_spelling_pedantic.py
rojkov Dec 28, 2020
c5561f6
fix clang_tidy check once again
rojkov Dec 28, 2020
bdf4545
update regex for tag values
rojkov Dec 29, 2020
b224e44
make regexes expandable with configurable patterns for sections
rojkov Dec 29, 2020
f2fb755
Merge remote-tracking branch 'upstream/master' into re2
rojkov Dec 29, 2020
78aefc7
simplify regex expansion
rojkov Dec 30, 2020
75b8697
Merge remote-tracking branch 'upstream/master' into re2
rojkov Dec 30, 2020
4c5e2d6
revert tag_extractor test to old tag values
rojkov Dec 31, 2020
6f66047
removed unused TagNameValues::addRegex()
rojkov Dec 31, 2020
b04be99
restore square brackets in comment
rojkov Dec 31, 2020
95aa344
Merge remote-tracking branch 'upstream/master' into re2
rojkov Dec 31, 2020
a833dcc
add comments to regexes
rojkov Jan 5, 2021
cc4d575
Merge remote-tracking branch 'upstream/master' into re2
rojkov Jan 5, 2021
9288659
add a comment explaining HTTP_CONN_MANAGER_PREFIX regex
rojkov Jan 5, 2021
1ecbac0
log tag extractor usage
rojkov Jan 8, 2021
f7c9045
put perf debug output under ifdef
rojkov Jan 8, 2021
d4d9553
make output more greppable with TagStats
rojkov Jan 11, 2021
7e4d7f2
Make expandRegex()'s comment more descriptive
rojkov Jan 11, 2021
618a203
amend comments and unify usage of symbol classes in regexes
rojkov Jan 12, 2021
0ad2460
simplify macros
rojkov Jan 12, 2021
910cb04
put destructor under explicit ifdef
rojkov Jan 14, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 55 additions & 45 deletions source/common/config/well_known_names.cc
Original file line number Diff line number Diff line change
@@ -1,8 +1,23 @@
#include "common/config/well_known_names.h"

#include "absl/strings/str_replace.h"

namespace Envoy {
namespace Config {

namespace {

std::string expandRegex(const std::string& regex) {
Comment thread
jmarantz marked this conversation as resolved.
return absl::StrReplaceAll(
regex, {{"<ADDRESS>",
R"((?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}_\d+|\[[_aAbBcCdDeEfF[:digit:]]+\]_\d+))"},
Comment thread
htuch marked this conversation as resolved.
Outdated
{"<CIPHER>", R"([0-9A-Za-z_-]+)"},
Comment thread
htuch marked this conversation as resolved.
Outdated
{"<NAME>", R"([^\.]+)"},
{"<ROUTE_CONFIG_NAME>", R"([\w-\.]+)"}});
}

} // namespace

TagNameValues::TagNameValues() {
// Note: the default regexes are defined below in the order that they will typically be matched
// (see the TagExtractor class definition for an explanation of the iterative matching process).
Expand All @@ -24,107 +39,102 @@ TagNameValues::TagNameValues() {
// - Typical * notation will be used to denote an arbitrary set of characters.

// *_rq(_<response_code>)
addRegex(RESPONSE_CODE, "_rq(_(\\d{3}))$", "_rq_");
addRe2(RESPONSE_CODE, R"(_rq(_(\d{3}))$)", "_rq_");

// *_rq_(<response_code_class>)xx
addRegex(RESPONSE_CODE_CLASS, "_rq_(\\d)xx$", "_rq_");
addRe2(RESPONSE_CODE_CLASS, R"(_rq_((\d))xx$)", "_rq_");

// http.[<stat_prefix>.]dynamodb.table.[<table_name>.]capacity.[<operation_name>.](__partition_id=<last_seven_characters_from_partition_id>)
addRegex(DYNAMO_PARTITION_ID,
"^http(?=\\.).*?\\.dynamodb\\.table(?=\\.).*?\\."
"capacity(?=\\.).*?(\\.__partition_id=(\\w{7}))$",
".dynamodb.table.");
addRe2(DYNAMO_PARTITION_ID,
R"(^http\.<NAME>\.dynamodb\.table\.<NAME>\.capacity\.<NAME>(\.__partition_id=(\w{7}))$)",
".dynamodb.table.");

// http.[<stat_prefix>.]dynamodb.operation.(<operation_name>.)<base_stat> or
// http.[<stat_prefix>.]dynamodb.table.[<table_name>.]capacity.(<operation_name>.)[<partition_id>]
addRegex(DYNAMO_OPERATION,
"^http(?=\\.).*?\\.dynamodb.(?:operation|table(?="
"\\.).*?\\.capacity)(\\.(.*?))(?:\\.|$)",
".dynamodb.");
addRe2(DYNAMO_OPERATION,

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think maybe the two options listed above are a holdover from when the stat extraction was serial and this regex would have to run either before or after the previous one. I wonder if this could be simplified now that every extractor runs on the original stat-name, rather than being chained.

This is not related to the RE2 conversion and I'm fine to leave this as a TODO. But I'm bringing it up now because it might make it easier to inspect the RE pattern.

R"(^http\.<NAME>\.dynamodb.(?:operation|table\.<NAME>\.capacity)(\.(<NAME>))(?:\.|$))",
".dynamodb.");

// mongo.[<stat_prefix>.]collection.[<collection>.]callsite.(<callsite>.)query.<base_stat>
addRegex(MONGO_CALLSITE,
R"(^mongo(?=\.).*?\.collection(?=\.).*?\.callsite\.((.*?)\.).*?query.\w+?$)",
".collection.");
addRe2(MONGO_CALLSITE, R"(^mongo\.<NAME>\.collection\.<NAME>\.callsite\.((<NAME>)\.)query\.)",
".collection.");

// http.[<stat_prefix>.]dynamodb.table.(<table_name>.) or
// http.[<stat_prefix>.]dynamodb.error.(<table_name>.)*
addRegex(DYNAMO_TABLE, R"(^http(?=\.).*?\.dynamodb.(?:table|error)\.((.*?)\.))", ".dynamodb.");
addRe2(DYNAMO_TABLE, R"(^http\.<NAME>\.dynamodb.(?:table|error)\.((<NAME>)\.))", ".dynamodb.");

// mongo.[<stat_prefix>.]collection.(<collection>.)query.<base_stat>
addRegex(MONGO_COLLECTION, R"(^mongo(?=\.).*?\.collection\.((.*?)\.).*?query.\w+?$)",
".collection.");
addRe2(MONGO_COLLECTION, R"(^mongo\.<NAME>\.collection\.((<NAME>)\.).*?query\.)", ".collection.");

// mongo.[<stat_prefix>.]cmd.(<cmd>.)<base_stat>
addRegex(MONGO_CMD, R"(^mongo(?=\.).*?\.cmd\.((.*?)\.)\w+?$)", ".cmd.");
Comment thread
htuch marked this conversation as resolved.
addRe2(MONGO_CMD, R"(^mongo\.<NAME>\.cmd\.((<NAME>)\.))", ".cmd.");

// cluster.[<route_target_cluster>.]grpc.[<grpc_service>.](<grpc_method>.)<base_stat>
addRegex(GRPC_BRIDGE_METHOD, R"(^cluster(?=\.).*?\.grpc(?=\.).*\.((.*?)\.)\w+?$)", ".grpc.");
// cluster.[<route_target_cluster>.]grpc.<grpc_service>.(<grpc_method>.)*
addRe2(GRPC_BRIDGE_METHOD, R"(^cluster\.<NAME>\.grpc\.<NAME>\.((<NAME>)\.))", ".grpc.");

// http.[<stat_prefix>.]user_agent.(<user_agent>.)<base_stat>
addRegex(HTTP_USER_AGENT, R"(^http(?=\.).*?\.user_agent\.((.*?)\.)\w+?$)", ".user_agent.");
// http.[<stat_prefix>.]user_agent.(<user_agent>.)*
addRe2(HTTP_USER_AGENT, R"(^http\.<NAME>\.user_agent\.((<NAME>)\.))", ".user_agent.");

// vhost.[<virtual host name>.]vcluster.(<virtual_cluster_name>.)<base_stat>
addRegex(VIRTUAL_CLUSTER, R"(^vhost(?=\.).*?\.vcluster\.((.*?)\.)\w+?$)", ".vcluster.");
Comment thread
htuch marked this conversation as resolved.
// vhost.[<virtual host name>.]vcluster.(<virtual_cluster_name>.)*
addRe2(VIRTUAL_CLUSTER, R"(^vhost\.<NAME>\.vcluster\.((<NAME>)\.))", ".vcluster.");

// http.[<stat_prefix>.]fault.(<downstream_cluster>.)<base_stat>
addRegex(FAULT_DOWNSTREAM_CLUSTER, R"(^http(?=\.).*?\.fault\.((.*?)\.)\w+?$)", ".fault.");
// http.[<stat_prefix>.]fault.(<downstream_cluster>.)*
addRe2(FAULT_DOWNSTREAM_CLUSTER, R"(^http\.<NAME>\.fault\.((<NAME>)\.))", ".fault.");

// listener.[<address>.]ssl.cipher.(<cipher>)
addRegex(SSL_CIPHER, R"(^listener(?=\.).*?\.ssl\.cipher(\.(.*?))$)");
addRe2(SSL_CIPHER, R"(^listener\..*?\.ssl\.cipher(\.(<CIPHER>))$)");

// cluster.[<cluster_name>.]ssl.ciphers.(<cipher>)
addRegex(SSL_CIPHER_SUITE, R"(^cluster(?=\.).*?\.ssl\.ciphers(\.(.*?))$)", ".ssl.ciphers.");
addRe2(SSL_CIPHER_SUITE, R"(^cluster\.<NAME>\.ssl\.ciphers(\.(<CIPHER>))$)", ".ssl.ciphers.");

// cluster.[<route_target_cluster>.]grpc.(<grpc_service>.)*
addRegex(GRPC_BRIDGE_SERVICE, R"(^cluster(?=\.).*?\.grpc\.((.*?)\.))", ".grpc.");
addRe2(GRPC_BRIDGE_SERVICE, R"(^cluster\.<NAME>\.grpc\.((<NAME>)\.))", ".grpc.");

// tcp.(<stat_prefix>.)<base_stat>
addRegex(TCP_PREFIX, R"(^tcp\.((.*?)\.)\w+?$)");
addRe2(TCP_PREFIX, R"(^tcp\.((<NAME>)\.))");

// udp.(<stat_prefix>.)<base_stat>
addRegex(UDP_PREFIX, R"(^udp\.((.*?)\.)\w+?$)");
addRe2(UDP_PREFIX, R"(^udp\.((<NAME>)\.))");

// auth.clientssl.(<stat_prefix>.)<base_stat>
addRegex(CLIENTSSL_PREFIX, R"(^auth\.clientssl\.((.*?)\.)\w+?$)");
// auth.clientssl.(<stat_prefix>.)*
addRe2(CLIENTSSL_PREFIX, R"(^auth\.clientssl\.((<NAME>)\.))");

// ratelimit.(<stat_prefix>.)<base_stat>
addRegex(RATELIMIT_PREFIX, R"(^ratelimit\.((.*?)\.)\w+?$)");
// ratelimit.(<stat_prefix>.)*
addRe2(RATELIMIT_PREFIX, R"(^ratelimit\.((<NAME>)\.))");

// cluster.(<cluster_name>.)*
addRe2(CLUSTER_NAME, "^cluster\\.(([^\\.]+)\\.).*");
addRe2(CLUSTER_NAME, R"(^cluster\.((<NAME>)\.))");

// listener.[<address>.]http.(<stat_prefix>.)*
addRegex(HTTP_CONN_MANAGER_PREFIX, R"(^listener(?=\.).*?\.http\.((.*?)\.))", ".http.");
addRe2(HTTP_CONN_MANAGER_PREFIX, R"(^listener\..*?\.http\.((<NAME>)\.))", ".http.");
Comment thread
jmarantz marked this conversation as resolved.

// http.(<stat_prefix>.)*
addRegex(HTTP_CONN_MANAGER_PREFIX, "^http\\.((.*?)\\.)");
addRe2(HTTP_CONN_MANAGER_PREFIX, R"(^http\.((<NAME>)\.))");

// listener.(<address>.)*
addRegex(LISTENER_ADDRESS,
R"(^listener\.(((?:[_.[:digit:]]*|[_\[\]aAbBcCdDeEfF[:digit:]]*))\.))");
addRe2(LISTENER_ADDRESS, R"(^listener\.((<ADDRESS>)\.))");

// vhost.(<virtual host name>.)*
addRegex(VIRTUAL_HOST, "^vhost\\.((.*?)\\.)");
addRe2(VIRTUAL_HOST, R"(^vhost\.((<NAME>)\.))");

// mongo.(<stat_prefix>.)*
addRegex(MONGO_PREFIX, "^mongo\\.((.*?)\\.)");
addRe2(MONGO_PREFIX, R"(^mongo\.((<NAME>)\.))");

// http.[<stat_prefix>.]rds.(<route_config_name>.)<base_stat>
addRegex(RDS_ROUTE_CONFIG, R"(^http(?=\.).*?\.rds\.((.*?)\.)\w+?$)", ".rds.");
addRe2(RDS_ROUTE_CONFIG, R"(^http\.<NAME>\.rds\.((<ROUTE_CONFIG_NAME>)\.)\w+?$)", ".rds.");

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, why not drop the base_stat at the end in this case?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

<ROUTE_CONFIG_NAME> can contain dots and there's no any other anchor after it except EOL. This makes the regex slower unfortunately.
Added a comment to explain it to my future self.


// listener_manager.(worker_<id>.)*
addRegex(WORKER_ID, R"(^listener_manager\.((worker_\d+)\.))", "listener_manager.worker_");
addRe2(WORKER_ID, R"(^listener_manager\.((worker_\d+)\.))", "listener_manager.worker_");
}

void TagNameValues::addRegex(const std::string& name, const std::string& regex,
const std::string& substr) {
descriptor_vec_.emplace_back(Descriptor{name, regex, substr, Regex::Type::StdRegex});
descriptor_vec_.emplace_back(Descriptor{name, expandRegex(regex), substr, Regex::Type::StdRegex});

@jmarantz jmarantz Dec 30, 2020

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I think TagNameValues::addRegex is not called now. Delete?

}

void TagNameValues::addRe2(const std::string& name, const std::string& regex,
const std::string& substr) {
descriptor_vec_.emplace_back(Descriptor{name, regex, substr, Regex::Type::Re2});
descriptor_vec_.emplace_back(Descriptor{name, expandRegex(regex), substr, Regex::Type::Re2});
}

} // namespace Config
Expand Down
4 changes: 2 additions & 2 deletions source/common/stats/tag_extractor_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ bool TagExtractorRe2Impl::extractTag(absl::string_view stat_name, std::vector<Ta
re2::StringPiece remove_subexpr, value_subexpr;

// The regex must match and contain one or more subexpressions (all after the first are ignored).
if (re2::RE2::FullMatch(re2::StringPiece(stat_name.data(), stat_name.size()), regex_,
&remove_subexpr, &value_subexpr) &&
if (re2::RE2::PartialMatch(re2::StringPiece(stat_name.data(), stat_name.size()), regex_,
&remove_subexpr, &value_subexpr) &&
!remove_subexpr.empty()) {

// value_subexpr is the optional second submatch. It is usually inside the first submatch
Expand Down
19 changes: 19 additions & 0 deletions test/common/stats/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,25 @@ envoy_cc_test(
],
)

envoy_cc_benchmark_binary(
name = "tag_extractor_impl_benchmark",
srcs = [
"tag_extractor_impl_speed_test.cc",
],
external_deps = [
"benchmark",
],
deps = [
"//source/common/stats:tag_producer_lib",
"@envoy_api//envoy/config/metrics/v3:pkg_cc_proto",
],
)

envoy_benchmark_test(
name = "tag_extractor_impl_benchmark_test",
benchmark_binary = "tag_extractor_impl_benchmark",
)

envoy_cc_test(
name = "thread_local_store_test",
srcs = ["thread_local_store_test.cc"],
Expand Down
110 changes: 110 additions & 0 deletions test/common/stats/tag_extractor_impl_speed_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// Note: this should be run with --compilation_mode=opt
// Running ./bazel-out/k8-opt/bin/test/common/stats/tag_extractor_impl_benchmark
// Run on (24 X 4300 MHz CPU s)
// CPU Caches:
// L1 Data 32 KiB (x12)
// L1 Instruction 32 KiB (x12)
// L2 Unified 1024 KiB (x12)
// L3 Unified 16896 KiB (x1)
// Load Average: 0.94, 0.75, 0.88
// ***WARNING*** CPU scaling is enabled, the benchmark real time
// measurements may be noisy and will incur extra overhead.
// ------------------------------------------------------------
// Benchmark Time CPU Iterations
// ------------------------------------------------------------
// BM_ExtractTags/0 1759 ns 1757 ns 397721
// BM_ExtractTags/1 498 ns 497 ns 1386765
// BM_ExtractTags/2 814 ns 813 ns 789388
// BM_ExtractTags/3 621 ns 620 ns 1109055
// BM_ExtractTags/4 1320 ns 1318 ns 536701
// BM_ExtractTags/5 882 ns 880 ns 817115
// BM_ExtractTags/6 327 ns 327 ns 2171259
// BM_ExtractTags/7 572 ns 571 ns 1205250
// BM_ExtractTags/8 1238 ns 1236 ns 558481
// BM_ExtractTags/9 1669 ns 1667 ns 414483
// BM_ExtractTags/10 310 ns 310 ns 2237065
// BM_ExtractTags/11 476 ns 476 ns 1465925
// BM_ExtractTags/12 1102 ns 1100 ns 631707
// BM_ExtractTags/13 1307 ns 1305 ns 513760
// BM_ExtractTags/14 1583 ns 1581 ns 447159
// BM_ExtractTags/15 957 ns 956 ns 729726
// BM_ExtractTags/16 822 ns 821 ns 869110
// BM_ExtractTags/17 821 ns 820 ns 839293
// BM_ExtractTags/18 783 ns 782 ns 898442
// BM_ExtractTags/19 330 ns 329 ns 2098821
// BM_ExtractTags/20 342 ns 342 ns 2044062
// BM_ExtractTags/21 389 ns 389 ns 1785110
// BM_ExtractTags/22 847 ns 846 ns 831652
// BM_ExtractTags/23 2022 ns 2019 ns 353368
// BM_ExtractTags/24 306 ns 305 ns 2226702
// BM_ExtractTags/25 277 ns 277 ns 2516796
// BM_ExtractTags/26 494 ns 494 ns 1363306

#include "envoy/config/metrics/v3/stats.pb.h"

#include "common/common/assert.h"
#include "common/config/well_known_names.h"
#include "common/stats/tag_producer_impl.h"

#include "benchmark/benchmark.h"

namespace Envoy {
namespace Stats {
namespace {

using Params = std::tuple<std::string, uint32_t>;

const std::vector<Params> params = {
{"listener.127.0.0.1_3012.http.http_prefix.downstream_rq_5xx", 3},
{"cluster.ratelimit.upstream_rq_timeout", 1},
{"listener.[__1]_0.ssl.cipher.AES256-SHA", 2},
{"cluster.ratelimit.ssl.ciphers.ECDHE-RSA-AES128-GCM-SHA256", 2},
{"listener.[2001_0db8_85a3_0000_0000_8a2e_0370_7334]_3543.ssl.cipher.AES256-SHA", 2},
{"listener.127.0.0.1_0.ssl.cipher.AES256-SHA", 2},
{"mongo.mongo_filter.op_reply", 1},
{"mongo.mongo_filter.cmd.foo_cmd.reply_size", 2},
{"mongo.mongo_filter.collection.bar_collection.query.multi_get", 2},
{"mongo.mongo_filter.collection.bar_collection.callsite.baz_callsite.query.scatter_get", 3},
{"ratelimit.foo_ratelimiter.over_limit", 1},
{"http.egress_dynamodb_iad.downstream_cx_total", 1},
{"http.egress_dynamodb_iad.dynamodb.operation.Query.upstream_rq_time", 2},
{"http.egress_dynamodb_iad.dynamodb.table.bar_table.upstream_rq_time", 2},
{"http.egress_dynamodb_iad.dynamodb.table.bar_table.capacity.Query.__partition_id=ABC1234", 4},
{"cluster.grpc_cluster.grpc.grpc_service_1.grpc_method_1.success", 3},
{"vhost.vhost_1.vcluster.vcluster_1.upstream_rq_2xx", 3},
{"vhost.vhost_1.vcluster.vcluster_1.upstream_rq_200", 3},
{"http.egress_dynamodb_iad.user_agent.ios.downstream_cx_total", 2},
{"auth.clientssl.clientssl_prefix.auth_ip_allowlist", 1},
{"tcp.tcp_prefix.downstream_flow_control_resumed_reading_total", 1},
{"udp.udp_prefix-with-dashes.downstream_flow_control_resumed_reading_total", 1},
{"http.fault_connection_manager.fault.fault_cluster.aborts_injected", 2},
{"http.rds_connection_manager.rds.route_config.123.update_success", 2},
{"listener_manager.worker_123.dispatcher.loop_duration_us", 1},
{"mongo_mongo_mongo_mongo.this_is_rather_long_string_which "
"does_not_match_and_consumes_a_lot_in_case_of_backtracking_imposed_by_greedy_pattern",
0},
{"another_long_but_matching_string_which_may_consume_resources_if_missing_end_of_line_lock_rq_"
"2xx",
1},
};

// NOLINTNEXTLINE(readability-identifier-naming)
void BM_ExtractTags(benchmark::State& state) {
TagProducerImpl tag_extractors{envoy::config::metrics::v3::StatsConfig()};
const auto idx = state.range(0);
const auto& p = params[idx];
absl::string_view str = std::get<0>(p);
const uint32_t tags_size = std::get<1>(p);

for (auto _ : state) {
Comment thread
jmarantz marked this conversation as resolved.
UNREFERENCED_PARAMETER(_);
TagVector tags;
tag_extractors.produceTags(str, tags);
RELEASE_ASSERT(tags.size() == tags_size, "");
}
}
BENCHMARK(BM_ExtractTags)->DenseRange(0, 26, 1);

} // namespace
} // namespace Stats
} // namespace Envoy
Loading