-
Notifications
You must be signed in to change notification settings - Fork 5.5k
stats: use RE2 and a better pattern to accelerate a single stats tag-extraction RE #8831
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
6ffa3ee
fa3573a
42ac3d0
e9b9c88
d692af2
fd34b44
e278b2f
7e416e8
74abaa4
a4f297c
c0a8983
eba2624
edf2bb2
626a092
b1bfbdc
46a3900
0fa1ca2
fad62de
1976786
8c79d6c
5c801a6
c445533
00a0ba1
5cf3b48
39783c4
b5bac7e
f66d732
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,8 +25,7 @@ bool regexStartsWithDot(absl::string_view regex) { | |
|
|
||
| TagExtractorImpl::TagExtractorImpl(const std::string& name, const std::string& regex, | ||
| const std::string& substr) | ||
| : name_(name), prefix_(std::string(extractRegexPrefix(regex))), substr_(substr), | ||
| regex_(Regex::Utility::parseStdRegex(regex)) {} | ||
| : name_(name), prefix_(std::string(extractRegexPrefix(regex))), substr_(substr) {} | ||
|
|
||
| std::string TagExtractorImpl::extractRegexPrefix(absl::string_view regex) { | ||
| std::string prefix; | ||
|
|
@@ -49,7 +48,7 @@ std::string TagExtractorImpl::extractRegexPrefix(absl::string_view regex) { | |
|
|
||
| TagExtractorPtr TagExtractorImpl::createTagExtractor(const std::string& name, | ||
| const std::string& regex, | ||
| const std::string& substr) { | ||
| const std::string& substr, bool use_re2) { | ||
|
jmarantz marked this conversation as resolved.
Outdated
|
||
|
|
||
| if (name.empty()) { | ||
| throw EnvoyException("tag_name cannot be empty"); | ||
|
|
@@ -59,19 +58,26 @@ TagExtractorPtr TagExtractorImpl::createTagExtractor(const std::string& name, | |
| throw EnvoyException(fmt::format( | ||
| "No regex specified for tag specifier and no default regex for name: '{}'", name)); | ||
| } | ||
| return TagExtractorPtr{new TagExtractorImpl(name, regex, substr)}; | ||
| if (use_re2) { | ||
| return std::make_unique<TagExtractorRe2Impl>(name, regex, substr); | ||
| } | ||
| return std::make_unique<TagExtractorStdRegexImpl>(name, regex, substr); | ||
| } | ||
|
|
||
| bool TagExtractorImpl::substrMismatch(absl::string_view stat_name) const { | ||
| return !substr_.empty() && stat_name.find(substr_) == absl::string_view::npos; | ||
| } | ||
|
|
||
| bool TagExtractorImpl::extractTag(absl::string_view stat_name, std::vector<Tag>& tags, | ||
| IntervalSet<size_t>& remove_characters) const { | ||
| TagExtractorStdRegexImpl::TagExtractorStdRegexImpl(const std::string& name, const std::string regex, | ||
| const std::string& substr) | ||
| : TagExtractorImpl(name, regex, substr), regex_(Regex::Utility::parseStdRegex(regex)) {} | ||
|
|
||
| bool TagExtractorStdRegexImpl::extractTag(absl::string_view stat_name, std::vector<Tag>& tags, | ||
| IntervalSet<size_t>& remove_characters) const { | ||
| PERF_OPERATION(perf); | ||
|
|
||
| if (substrMismatch(stat_name)) { | ||
| PERF_RECORD(perf, "re-skip-substr", name_); | ||
| PERF_RECORD(perf, "re-skip", name_); | ||
| return false; | ||
| } | ||
|
|
||
|
|
@@ -105,5 +111,50 @@ bool TagExtractorImpl::extractTag(absl::string_view stat_name, std::vector<Tag>& | |
| return false; | ||
| } | ||
|
|
||
| TagExtractorRe2Impl::TagExtractorRe2Impl(const std::string& name, const std::string regex, | ||
| const std::string& substr) | ||
| : TagExtractorImpl(name, regex, substr), regex_(regex) {} | ||
|
|
||
| bool TagExtractorRe2Impl::extractTag(absl::string_view stat_name, std::vector<Tag>& tags, | ||
| IntervalSet<size_t>& remove_characters) const { | ||
| PERF_OPERATION(perf); | ||
|
|
||
| if (substrMismatch(stat_name)) { | ||
| PERF_RECORD(perf, "re2-skip", name_); | ||
| return false; | ||
| } | ||
|
|
||
| // remove_subexpr is the first submatch. It represents the portion of the string to be removed. | ||
| re2::StringPiece remove_subexpr, value_subexpr; | ||
|
|
||
| // The regex must match and contain one or more subexpressions (all after the first are ignored). | ||
| if (re2::RE2::FullMatch(re2::StringPiece(stat_name.data(), stat_name.size()), regex_, | ||
| &remove_subexpr, &value_subexpr) && | ||
| !remove_subexpr.empty()) { | ||
|
|
||
| // value_subexpr is the optional second submatch. It is usually inside the first submatch | ||
| // (remove_subexpr) to allow the expression to strip off extra characters that should be removed | ||
| // from the string but also not necessary in the tag value ("." for example). If there is no | ||
| // second submatch, then the value_subexpr is the same as the remove_subexpr. | ||
| if (value_subexpr.empty()) { | ||
| value_subexpr = remove_subexpr; | ||
| } | ||
|
|
||
| tags.emplace_back(); | ||
| Tag& tag = tags.back(); | ||
| tag.name_ = name_; | ||
| tag.value_ = std::string(value_subexpr); | ||
|
|
||
| // Determines which characters to remove from stat_name to elide remove_subexpr. | ||
| std::string::size_type start = remove_subexpr.data() - stat_name.data(); | ||
| std::string::size_type end = remove_subexpr.data() + remove_subexpr.size() - stat_name.data(); | ||
| remove_characters.insert(start, end); | ||
| PERF_RECORD(perf, "re2-match", name_); | ||
| return true; | ||
|
Comment on lines
+132
to
+158
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did you consider options for avoiding duplicating a lot of the logic/comments between here and the std::regex code?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I messed around a bit and was only really able to profitably factor out the code to add a tag to the tags array. There were enough subtle difference in all the other parallels that it didn't seem worth it to refactor. LMK if you have other ideas. |
||
| } | ||
| PERF_RECORD(perf, "re2-miss", name_); | ||
| return false; | ||
| } | ||
|
|
||
| } // namespace Stats | ||
| } // namespace Envoy | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,132 @@ | ||
| // Note: this should be run with --compilation_mode=opt, and would benefit from | ||
| // a quiescent system with disabled cstate power management. | ||
|
|
||
| #include <regex> | ||
|
|
||
| #include "common/common/assert.h" | ||
|
|
||
| #include "absl/strings/string_view.h" | ||
| #include "benchmark/benchmark.h" | ||
| #include "re2/re2.h" | ||
|
|
||
| // NOLINT(namespace-envoy) | ||
|
|
||
| static const char* cluster_inputs[] = { | ||
| "cluster.no_trailing_dot", | ||
| "cluster.match.", | ||
| "cluster.match.normal", | ||
| "cluster.match.and.a.whole.lot.of.things.coming.after.the.matches.really.too.much.stuff", | ||
|
jmarantz marked this conversation as resolved.
|
||
| }; | ||
|
|
||
| static const char cluster_re_pattern[] = "^cluster\\.((.*?)\\.)"; | ||
| static const char cluster_re_alt_pattern[] = "^cluster\\.(([^\\.]+)\\.).*"; | ||
|
jmarantz marked this conversation as resolved.
Outdated
|
||
|
|
||
| static void BM_StdRegex(benchmark::State& state) { | ||
| std::regex re(cluster_re_pattern); | ||
| uint32_t passes = 0; | ||
| std::vector<std::string> inputs; | ||
| for (const char* cluster_input : cluster_inputs) { | ||
| inputs.push_back(cluster_input); | ||
| } | ||
|
|
||
| for (auto _ : state) { | ||
| for (const std::string& cluster_input : inputs) { | ||
| std::smatch match; | ||
| if (std::regex_search(cluster_input, match, re)) { | ||
| ASSERT(match.size() >= 3); | ||
| ASSERT(match[1] == "match."); | ||
| ASSERT(match[2] == "match"); | ||
| ++passes; | ||
| } | ||
| } | ||
| } | ||
| RELEASE_ASSERT(passes > 0, ""); | ||
| } | ||
| BENCHMARK(BM_StdRegex); | ||
|
|
||
| static void BM_StdRegexStringView(benchmark::State& state) { | ||
| std::regex re(cluster_re_pattern); | ||
| std::vector<absl::string_view> inputs; | ||
| for (const char* cluster_input : cluster_inputs) { | ||
| inputs.push_back(cluster_input); | ||
| } | ||
| uint32_t passes = 0; | ||
| for (auto _ : state) { | ||
| for (absl::string_view cluster_input : inputs) { | ||
| std::match_results<absl::string_view::iterator> smatch; | ||
| if (std::regex_search(cluster_input.begin(), cluster_input.end(), smatch, re)) { | ||
| ASSERT(smatch.size() >= 3); | ||
| ASSERT(smatch[1] == "match."); | ||
| ASSERT(smatch[2] == "match"); | ||
| ++passes; | ||
| } | ||
| } | ||
| } | ||
| RELEASE_ASSERT(passes > 0, ""); | ||
| } | ||
| BENCHMARK(BM_StdRegexStringView); | ||
|
|
||
| static void BM_StdRegexStringViewAltPattern(benchmark::State& state) { | ||
| std::regex re(cluster_re_alt_pattern); | ||
| std::vector<absl::string_view> inputs; | ||
| for (const char* cluster_input : cluster_inputs) { | ||
| inputs.push_back(cluster_input); | ||
| } | ||
| uint32_t passes = 0; | ||
| for (auto _ : state) { | ||
| for (absl::string_view cluster_input : inputs) { | ||
| std::match_results<absl::string_view::iterator> smatch; | ||
| if (std::regex_search(cluster_input.begin(), cluster_input.end(), smatch, re)) { | ||
| ASSERT(smatch.size() >= 3); | ||
| ASSERT(smatch[1] == "match."); | ||
| ASSERT(smatch[2] == "match"); | ||
| ++passes; | ||
| } | ||
| } | ||
| } | ||
| RELEASE_ASSERT(passes > 0, ""); | ||
| } | ||
| BENCHMARK(BM_StdRegexStringViewAltPattern); | ||
|
|
||
| static void BM_RE2(benchmark::State& state) { | ||
| re2::RE2 re(cluster_re_pattern); | ||
| uint32_t passes = 0; | ||
| for (auto _ : state) { | ||
| for (const char* cluster_input : cluster_inputs) { | ||
| re2::StringPiece match1, match2; | ||
| if (re2::RE2::PartialMatch(cluster_input, re, &match1, &match2)) { | ||
| ASSERT(match1 == "match."); | ||
| ASSERT(match2 == "match"); | ||
| ++passes; | ||
| } | ||
| } | ||
| } | ||
| RELEASE_ASSERT(passes > 0, ""); | ||
| } | ||
| BENCHMARK(BM_RE2); | ||
|
|
||
| static void BM_RE2_AltPattern(benchmark::State& state) { | ||
| re2::RE2 re(cluster_re_alt_pattern); | ||
| uint32_t passes = 0; | ||
| for (auto _ : state) { | ||
| for (const char* cluster_input : cluster_inputs) { | ||
| re2::StringPiece match1, match2; | ||
| if (re2::RE2::PartialMatch(cluster_input, re, &match1, &match2)) { | ||
| ASSERT(match1 == "match."); | ||
| ASSERT(match2 == "match"); | ||
| ++passes; | ||
| } | ||
| } | ||
| } | ||
| RELEASE_ASSERT(passes > 0, ""); | ||
| } | ||
| BENCHMARK(BM_RE2_AltPattern); | ||
|
|
||
| // Boilerplate main(), which discovers benchmarks in the same file and runs them. | ||
| int main(int argc, char** argv) { | ||
| benchmark::Initialize(&argc, argv); | ||
| if (benchmark::ReportUnrecognizedArguments(argc, argv)) { | ||
| return 1; | ||
| } | ||
| benchmark::RunSpecifiedBenchmarks(); | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.