Skip to content
Closed
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
5639cf6
add hand-rolled json sanitizer
jmarantz Mar 20, 2022
1233b00
fix corner case
jmarantz Mar 20, 2022
0509422
inline the char_esacpes_ values rather than unique_ptr<std::string>
jmarantz Mar 20, 2022
a8d4daa
improve internal names and add comments
jmarantz Mar 20, 2022
99a381f
Fix comment typo and explicitly initialize Escape.size_ rather than b…
jmarantz Mar 20, 2022
02216fa
cleanup comments
jmarantz Mar 20, 2022
319e09c
provide link to the utf-8 spec which shows us we can correctly saniti…
jmarantz Mar 20, 2022
696cc04
add fuzz test and corpus of cases pulled from the unit test.
jmarantz Mar 22, 2022
6251863
checkpoint
jmarantz Mar 23, 2022
b11e6de
multi-byte utf8 decoder
jmarantz Mar 23, 2022
ed5616c
utf8 decoding works
jmarantz Mar 23, 2022
b651e77
cleanup
jmarantz Mar 23, 2022
03a33b1
cleanup & partial format
jmarantz Mar 23, 2022
cc95823
typo
jmarantz Mar 23, 2022
f6d49a3
minor speed tweak
jmarantz Mar 23, 2022
747fccc
remove unused constants and update benchmark data
jmarantz Mar 23, 2022
412a1c4
add comment, fix tests.
jmarantz Mar 23, 2022
6bc7e43
tidy
jmarantz Mar 23, 2022
974220e
Move isValidUtf8 out of prod code, as it's not needed in prod; only i…
jmarantz Mar 24, 2022
2faf91c
add an assert and some more comments.
jmarantz Mar 24, 2022
7db4d85
re-sort constants so the bits line up better.
jmarantz Mar 24, 2022
318f05e
add some invalid-utf8 tests
jmarantz Mar 24, 2022
63906bd
test & fix behavior for invalid input.
jmarantz Mar 24, 2022
d3d1d03
speed hacks.
jmarantz Mar 25, 2022
6a3ef94
Add more invalid-utf8 tests.
jmarantz Mar 25, 2022
11b9e47
Use StrFormat rather than snprintf even though it's likely slower.
jmarantz Mar 25, 2022
918d9d1
added one more test surveying all 8-bit inputs.
jmarantz Mar 25, 2022
0cec2cd
refine the invalid-utf8 test to include special case unicode ranges t…
jmarantz Mar 28, 2022
d1cec3b
cleanup and fix corner cases.
jmarantz Mar 29, 2022
e2bceaa
add fuzzing utility.cc impl.
jmarantz Mar 29, 2022
dbb3861
tidy & gcc cleanups.
jmarantz Mar 29, 2022
98e485d
remove backslash at EOL in comment.
jmarantz Mar 29, 2022
1acc7d7
Merge branch 'main' into json-sanitizer
jmarantz Mar 30, 2022
1ac564a
checkpoint with tweaked utf8 converter and nloehman benchmark.
jmarantz Apr 1, 2022
d8b6393
format (not fully compliant)
jmarantz Apr 1, 2022
6e3055c
add missing file
jmarantz Apr 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions envoy/common/interval_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ template <typename Value> class IntervalSet {
* Clears the contents of the interval set.
*/
virtual void clear() PURE;

/**
* Determines whether the specified Value is in any of the intervals.
*
* @param value the value
* @return true if value is covered in the inteval set.
*/
virtual bool test(Value value) const PURE;
};

} // namespace Envoy
5 changes: 5 additions & 0 deletions source/common/common/utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,11 @@ template <typename Value> class IntervalSetImpl : public IntervalSet<Value> {
intervals_.insert(Interval(left, right));
}

bool test(Value value) const override {
const auto left_pos = intervals_.lower_bound(Interval(value, value + 1));

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this assuming an integer representation of intervals? Just trying to figure out why the +1 works.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interval is documented as being [left, right). I'd have to think about how this would be used with floats. Maybe it's better just to document this as integer-only.

return left_pos != intervals_.end() && value >= left_pos->first && value < left_pos->second;
}

std::vector<Interval> toVector() const override {
return std::vector<Interval>(intervals_.begin(), intervals_.end());
}
Expand Down
7 changes: 7 additions & 0 deletions source/common/json/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,10 @@ envoy_cc_library(
"//source/common/runtime:runtime_features_lib",
],
)

envoy_cc_library(
name = "json_sanitizer_lib",
srcs = ["json_sanitizer.cc"],
hdrs = ["json_sanitizer.h"],
deps = ["//source/common/common:assert_lib"],
)
239 changes: 239 additions & 0 deletions source/common/json/json_sanitizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
#include "source/common/json/json_sanitizer.h"

#include <utility>

#include "source/common/common/assert.h"

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"

namespace Envoy {
namespace Json {

namespace {

const uint8_t Literal = 0;
const uint8_t ControlEscapeSize = 2; // e.g. \b
const uint8_t UnicodeEscapeSize = 6; // e.g. \u1234
const uint8_t Utf8DecodeSentinel = 0xff;

} // namespace

JsonSanitizer::JsonSanitizer() {
// Single-char escape sequences for common control characters.
auto symbolic_escape = [this](char control_char, char symbolic) {
Escape& escape = char_escapes_[char2uint32(control_char)];
escape.size_ = ControlEscapeSize;
escape.chars_[0] = '\\';
escape.chars_[1] = symbolic;
};
symbolic_escape('\b', 'b');
symbolic_escape('\f', 'f');
symbolic_escape('\n', 'n');
symbolic_escape('\r', 'r');
symbolic_escape('\t', 't');
symbolic_escape('\\', '\\');
symbolic_escape('"', '"');

// Low characters (0-31) not listed above are encoded as unicode 4-digit hex.
auto unicode_escape = [this](uint32_t index) {
// We capture unicode Escapes both in a char-indexed array, for direct
// substitutions on literal inputs, and in a unicode-indexed hash-map,
// for lookup after utf8 decode.
std::string escape_str = absl::StrFormat("\\u%04x", index);
ASSERT(escape_str.size() == UnicodeEscapeSize);
Escape& escape = unicode_escapes_[index];
escape.size_ = escape_str.size();
RELEASE_ASSERT(escape.size_ <= sizeof(escape.chars_), "escaped string too large");
memcpy(escape.chars_, escape_str.data(), escape_str.size()); // NOLINT(safe-memcpy)*/

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at this approach in general, I think we have a lot of complexity in code being executed at runtime. Sure, it's going to build the same table every time, but it's a bit hard to reason about.

Can't we build an offline table in the build process and then just #include it? The advantage of that is it would also allow for easy inspection.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking we would instantiate one of these Json tables in the StatsHandler so it would only be on startup.

However I think it's fine to move this code to a generator and either generate during build, or generate manually and check it in as source, annotating the array with comments.

if (index < NumEscapes) {
char_escapes_[index] = escape;
}
};

// Add unicode escapes for control-characters below 32 that don't have symbolic escapes.
for (uint32_t i = 0; i < ' '; ++i) {
if (char_escapes_[i].size_ == 0) {
unicode_escape(i);
}
}

// Unicode-escaped ascii constants above SPACE (32).
for (char ch : {'<', '>', '\177'}) {
unicode_escape(char2uint32(ch));
}

// There's a range of 8-bit characters that are unicode escaped by the
// protobuf library, so we match behavior.
for (uint32_t i = 0x0080; i < 0x00a0; ++i) {
unicode_escape(i);
}

// The remaining unicode characters are mostly passed through literally. We'll
// initialize all of them and then override some below.
for (uint32_t i = 0x00a0; i < NumEscapes; ++i) {
char_escapes_[i].size_ = Literal;
}

// All the bytes matching pattern 11xxxxxx will be evaluated as utf-8.
for (uint32_t i = Utf8_2BytePattern; i <= 0xff; ++i) {
char_escapes_[i].size_ = Utf8DecodeSentinel;
}

// There are an assortment of unicode characters that protobufs quote, so we
// do likewise here to make differential testing/fuzzing feasible.
for (uint32_t i : {0x00ad, 0x0600, 0x0601, 0x0602, 0x0603, 0x06dd, 0x070f}) {
unicode_escape(i);
}
}

absl::string_view JsonSanitizer::sanitize(std::string& buffer, absl::string_view str) const {
// Fast-path to see whether any escapes or utf-encoding are needed. If str has
// only unescaped ascii characters, we can simply return it. So before doing
// anything too fancy, do a lookup in char_escapes_ for each character, and
// simply OR in the return sizes. We use 0 for the return-size when we are
// simply leaving the character as is, so anything non-zero means we need to
// initiate the slow path.
//
// Benchmarks show it's faster to just rip through the string with no
// conditionals, so we only check the ORed sizes after the loop. This avoids
// branches and allows simpler loop unrolling by the compiler.
uint32_t sizes_ored_together = 0;
for (char c : str) {
sizes_ored_together |= char_escapes_[char2uint32(c)].size_;
}
if (sizes_ored_together == 0) {
return str; // Happy path, should be executed most of the time.
}
return slowSanitize(buffer, str);
}

absl::string_view JsonSanitizer::slowSanitize(std::string& buffer, absl::string_view str) const {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you audited https://nlohmann.github.io/json/ for similar functions? I'm wondering if we we have anything that will simplify either sanitize or the UTF8 piece in our existing deps.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch -- I didn't know about nlohmann and now I see we are already pulling that in. I'll take a look.

Do you know why we started using protobufs as an IR to generate JSON in the first place, given that?

@jmarantz jmarantz Mar 31, 2022

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RE nlohmann a quick read suggests it has a similar usage model as protobufs: https://github.com/nlohmann/json#serialization--deserialization

ie to serialize you need to populate an object and then serialize it, rather than serializing directly. I think this pattern of creating intermediate objects in order to convert a string to a safer string is the core reason the protobuf version is so slow.

Note that even though I named this function slowSanitize it is still 10x faster than protobuf serialization due to not needing an intermediate representation.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The history here was that Protobuf for the longest time was the only reliable dependency here, and we were trying to reduce our dependency on rapidjson. The introduction of nlohmann JSON has changed this dynamic.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you run a quick benchmark to convince that the nlohmann JSON library is not suitable? I think Protobuf is slow for all kinds of reasons, it's never been highly tuned for performance on this path most likely, whereas the nlohmann library is designed for speed.

Basically, what we have in this PR represents a real speed up that seems to justify the complexity, but we should do the due diligence that there isn't something else within reasonable reach in one of our existing dependencies (the only ones I can think of are nlohmann JSON, maybe Abseil?) that does the job.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm! That wasn't too hard, and probably that's the right way to go. I think the fast-path wrapper I have should stay because for stats that will be used most of the time, it's still 20x faster than nlohmann, and it's pretty easy to reason about. But nlohmann is only 2x slower than the sanitizer I added here and it might not be worth it adding all the complexity:

---------------------------------------------------------------------------
Benchmark                                 Time             CPU   Iterations
---------------------------------------------------------------------------
BM_ProtoEncoderNoEscape                1330 ns         1330 ns       458178
BM_JsonSanitizerNoEscape               8.83 ns         8.82 ns     79050443
BM_NlohmannNoEscape                     187 ns          187 ns      3752300
BM_StaticJsonSanitizerNoEscape         9.52 ns         9.52 ns     73460589
BM_ProtoEncoderWithEscape              1483 ns         1483 ns       471766
BM_NlohmannWithEscape                   199 ns          199 ns      3489229
BM_JsonSanitizerWithEscape              103 ns          103 ns      6850721
BM_StaticJsonSanitizerWithEscape        102 ns          102 ns      6859187

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One more point about nlohmann -- it's not at all compatible in terms of its output with Protobufs. It makes all kinds of different (more rational) escaping decisions than protobufs, but it can't be differentially fuzzed against them.

std::string oct_escape_buf;
size_t past_escape = absl::string_view::npos;
const uint8_t* first = reinterpret_cast<const uint8_t*>(str.data());
const uint8_t* data = first;
absl::string_view escape_view;
for (uint32_t n = str.size(); n != 0; ++data, --n) {
const Escape& escape = char_escapes_[*data];
if (escape.size_ != Literal) {
uint32_t start_of_escape = data - first;
switch (escape.size_) {
case ControlEscapeSize:
case UnicodeEscapeSize:
escape_view = absl::string_view(escape.chars_, escape.size_);
break;
case Utf8DecodeSentinel: {
auto [unicode, consumed] = decodeUtf8(data, n);
if (consumed != 0) {
--consumed;
data += consumed;
n -= consumed;

// Having validated and constructed the unicode for the utf-8
// sequence we must determine whether to render it literally by
// simply leaving it alone, or whether we ought to render it
// as a unicode escape. We do this using a hash-map set up during
// the constructor with all desired unicode escapes, to mimic the
// behavior of the protobuf json serializer.
auto iter = unicode_escapes_.find(unicode);
if (iter == unicode_escapes_.end()) {
continue;
}
escape_view = absl::string_view(iter->second.chars_, iter->second.size_);
} else {
// Using StrFormat during decode seems slow, but this case should be
// rare.
oct_escape_buf = absl::StrFormat("\\%03o", *data);
escape_view = absl::string_view(oct_escape_buf);
}
break;
}
default:
ASSERT(false);
}

if (past_escape == absl::string_view::npos) {
// We only initialize buffer when we first learn we need to add an
// escape-sequence to the sanitized string.
if (start_of_escape == 0) {
// The first character is an escape, and 'buffer' has not been cleared yet,
// so we need to assign it rather than append to it.
buffer.assign(escape_view.data(), escape_view.size());
} else {
// We found our first escape, but this is not the first character in the
// string, so we combine the unescaped characters in the string we already
// looped over with the new escaped character.
buffer = absl::StrCat(str.substr(0, start_of_escape), escape_view);
}
} else if (start_of_escape == past_escape) {
// We are adding an escape immediately after another escaped character.
absl::StrAppend(&buffer, escape_view);
} else {
// We are adding a new escape but must first cover the characters
// encountered since the previous escape.
absl::StrAppend(&buffer, str.substr(past_escape, start_of_escape - past_escape),
escape_view);
}
past_escape = data - first + 1;
}
}
Comment on lines +41 to +180

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May be codes can be more simple by this way?

Suggested change
size_t past_escape = absl::string_view::npos;
for (uint32_t i = 0, n = str.size(); i < n; ++i) {
uint32_t index = static_cast<uint32_t>(static_cast<uint8_t>(str[i]));
const std::string* escape = char_escapes_[index].get();
if (escape != nullptr) {
if (past_escape == absl::string_view::npos) {
// We only initialize buffer when we first learn we need to add an
// escape-sequence to the sanitized string.
if (i == 0) {
buffer = *escape;
} else {
buffer = absl::StrCat(str.substr(0, i), *escape);
}
} else if (i == past_escape) {
absl::StrAppend(&buffer, *escape);
} else {
absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
}
past_escape = i + 1;
}
}
size_t past_escape = 0;
for (uint32_t i = 0, n = str.size(); i < n; ++i) {
const std::string* escape = char_escapes_[static_cast<uint8_t>(str[i])].get();
if (escape != nullptr) {
absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
past_escape = i + 1;
}
}

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the above suggestion is part of the code I propose, but it doesn't handle some of the other cases, such as 2 escaped chars in a row, or clearing the buffer between each call to sanitize that requires escaping. I'll add more comments to make each conditional clearer.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code could be simpler but I wanted to avoid having the user clear buffer on every call, because that does not optimize for the case where the string does not require escaping, which in the expected usage will be the common case.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sound reasonable. But string clear just sets string length to zero and first char to '\0', will this brings significant performance impact? 🤔

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing the code to what you suggested definitely made things materially slower, but adding the clear() at the start was only marginally slower (within margin of noise), I guess due to compilers/CPUs doing branch prediction well (clear() will need to do different things depending on whether the std::string has inlined the bytes or note).

I'll mess around a little more with that.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Get it. If we can simplify the code without paying a performance price, it is surely best to do so. Otherwise, just ignore my suggestion. 😸


// If no escape-sequence was needed, we just return the input.
if (past_escape == absl::string_view::npos) {
return str;
}

// Otherwise we append on any unescaped chunk at the end of the input, and
// return buffer as the result.
if (past_escape < str.size()) {
absl::StrAppend(&buffer, str.substr(past_escape, str.size() - past_escape));
}
return buffer;
}

std::pair<uint32_t, uint32_t> JsonSanitizer::decodeUtf8(const uint8_t* bytes, uint32_t size) {
uint32_t unicode = 0;
uint32_t consumed = 0;

// See table in https://en.wikipedia.org/wiki/UTF-8, "Encoding" section.
//
// See also https://en.cppreference.com/w/cpp/locale/codecvt_utf8 which is
// marked as deprecated. There is also support in Windows libraries and Boost,
// which can be discovered on StackOverflow. I could not find a usable OSS
// implementation. However it's easily derived from the spec on Wikipedia.
//
// Note that the code below could be optimized a bit, e.g. by factoring out
// repeated lookups of the same index in the bytes array and using SSE
// instructions for the multi-word bit hacking.
//
// See also http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ which might be a lot
// faster, though less readable. As coded, though, it looks like it would read
// past the end of the input if the input is malformed.
if (size >= 2 && (bytes[0] & Utf8_2ByteMask) == Utf8_2BytePattern &&
(bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern) {
unicode = bytes[0] & ~Utf8_2ByteMask;
unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask);
consumed = 2;
} else if (size >= 3 && (bytes[0] & Utf8_3ByteMask) == Utf8_3BytePattern &&
(bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern &&
(bytes[2] & Utf8_ContinueMask) == Utf8_ContinuePattern) {
unicode = bytes[0] & ~Utf8_3ByteMask;
unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask);
unicode = (unicode << Utf8_Shift) | (bytes[2] & ~Utf8_ContinueMask);
consumed = 3;
} else if (size >= 4 && (bytes[0] & Utf8_4ByteMask) == Utf8_4BytePattern &&
(bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern &&
(bytes[2] & Utf8_ContinueMask) == Utf8_ContinuePattern &&
(bytes[3] & Utf8_ContinueMask) == Utf8_ContinuePattern) {
unicode = bytes[0] & ~Utf8_4ByteMask;
unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask);
unicode = (unicode << Utf8_Shift) | (bytes[2] & ~Utf8_ContinueMask);
unicode = (unicode << Utf8_Shift) | (bytes[3] & ~Utf8_ContinueMask);
consumed = 4;
}
return UnicodeSizePair(unicode, consumed);
}
Comment on lines +136 to +252

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May this Branchless UTF-8 Decoder can get a better performance?

@jmarantz jmarantz Mar 24, 2022

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh that's a cool article. I don't think the branchless one is ideal for this use-case as (my guess) is our distribution is expected to weigh heavily toward 1 and 2 byte codes. Also I think we'd get ASAN violations if we don't branch on the length before reading extra bytes.

The DFA one that points to looks interesting though (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/), and is more along the lines I was thinking. I may play around with that but I think the 60x perf gain from this API approach is probably good enough for now :) However as coded I think that can also read past the end the buffer if it is given invalid input. The coding I have here will reject an invalid encoding before reading past memory.


} // namespace Json
} // namespace Envoy
93 changes: 93 additions & 0 deletions source/common/json/json_sanitizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#pragma once

#include <string>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"

namespace Envoy {
namespace Json {

// Hand-rolled JSON sanitizer that has exactly the same behavior as serializing
// through protobufs, but is more than 10x faster. From
// test/common/json/json_sanitizer_speed_test.cc:
//
// ---------------------------------------------------------------------------
// Benchmark Time CPU Iterations
// ---------------------------------------------------------------------------
// BM_ProtoEncoderNoEscape 1123 ns 1123 ns 545345
// BM_JsonSanitizerNoEscape 8.77 ns 8.77 ns 79517538
// BM_StaticJsonSanitizerNoEscape 9.52 ns 9.52 ns 73570603
// BM_ProtoEncoderWithEscape 1326 ns 1326 ns 528576
// BM_JsonSanitizerWithEscape 96.3 ns 96.3 ns 7289627
// BM_StaticJsonSanitizerWithEscape 97.5 ns 97.5 ns 7157098
//
class JsonSanitizer {
public:
static constexpr uint32_t Utf8_2ByteMask = 0b11100000;
static constexpr uint32_t Utf8_3ByteMask = 0b11110000;
static constexpr uint32_t Utf8_4ByteMask = 0b11111000;

static constexpr uint32_t Utf8_2BytePattern = 0b11000000;
static constexpr uint32_t Utf8_3BytePattern = 0b11100000;
static constexpr uint32_t Utf8_4BytePattern = 0b11110000;

static constexpr uint32_t Utf8_ContinueMask = 0b11000000;
static constexpr uint32_t Utf8_ContinuePattern = 0b10000000;

static constexpr uint32_t Utf8_Shift = 6;

// Constructing the sanitizer fills in a table with all escape-sequences,
// indexed by character. To make this perform well, you should instantiate the
// sanitizer in a context that lives across a large number of sanitizations.
JsonSanitizer();

/**
* Sanitizes a string so it is suitable for JSON. The buffer is
* used if any of the characters in str need to be escaped.
*
* @param buffer a string in which an escaped string can be written, if needed. It
* is not necessary for callers to clear the buffer first; it be cleared
* by this method if the input needs to be escaped.
* @param str the string to be translated
* @return the translated string_view.
*/
absl::string_view sanitize(std::string& buffer, absl::string_view str) const;

/** The Unicode code-point and the number of utf8-bytes consumed */
using UnicodeSizePair = std::pair<uint32_t, uint32_t>;

/**
* Decodes a byte-stream of UTF8, returning the resulting unicode and the
* number of bytes consumed as a pair.
*
* @param bytes The data with utf8 bytes.
* @param size The number of bytes available in data
* @return UnicodeSizePair(unicode, consumed) -- if the decode fails consumed will be 0.
*/
static UnicodeSizePair decodeUtf8(const uint8_t* bytes, uint32_t size);

private:
// static constexpr uint32_t NumEscapes = 1 << 11; // 2^11=2048 codes possible in 2-byte utf8.
static constexpr uint32_t NumEscapes = 256;

// Character-indexed array of translation strings. If an entry is nullptr then
// the character does not require substitution. This strategy is dependent on
// the property of UTF-8 where all two-byte characters have the high-order bit
// set for both bytes, and don't require escaping for JSON. Thus we can
// consider each character in isolation for escaping. Reference:
// https://en.wikipedia.org/wiki/UTF-8.
struct Escape {
uint8_t size_{0};
char chars_[7]; // No need to initialize char data, as we are not null-terminating.
};

static uint32_t char2uint32(char c) { return static_cast<uint32_t>(static_cast<uint8_t>(c)); }
absl::string_view slowSanitize(std::string& buffer, absl::string_view str) const;

Escape char_escapes_[NumEscapes];
absl::flat_hash_map<uint32_t, Escape> unicode_escapes_;
};

} // namespace Json
} // namespace Envoy
15 changes: 15 additions & 0 deletions test/common/common/utility_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -931,6 +931,21 @@ TEST(IntervalSet, testIntervalTargeted) {
EXPECT_EQ("[15, 20), [25, 30), [35, 40), [41, 43)", test(41, 43));
}

TEST(IntervalSet, testTest) {
IntervalSetImpl<uint32_t> set;
set.insert(4, 6);
EXPECT_FALSE(set.test(0));
set.insert(0, 2);
EXPECT_TRUE(set.test(0));
EXPECT_TRUE(set.test(1));
EXPECT_FALSE(set.test(2));
EXPECT_FALSE(set.test(3));
EXPECT_TRUE(set.test(4));
EXPECT_TRUE(set.test(5));
EXPECT_FALSE(set.test(6));
EXPECT_FALSE(set.test(7));
}

TEST(WelfordStandardDeviation, AllEntriesTheSame) {
WelfordStandardDeviation wsd;
wsd.update(10);
Expand Down
Loading