Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
5639cf6
add hand-rolled json sanitizer
jmarantz Mar 20, 2022
1233b00
fix corner case
jmarantz Mar 20, 2022
0509422
inline the char_escapes_ values rather than unique_ptr<std::string>
jmarantz Mar 20, 2022
a8d4daa
improve internal names and add comments
jmarantz Mar 20, 2022
99a381f
Fix comment typo and explicitly initialize Escape.size_ rather than b…
jmarantz Mar 20, 2022
02216fa
cleanup comments
jmarantz Mar 20, 2022
319e09c
provide link to the utf-8 spec which shows us we can correctly saniti…
jmarantz Mar 20, 2022
696cc04
add fuzz test and corpus of cases pulled from the unit test.
jmarantz Mar 22, 2022
6251863
checkpoint
jmarantz Mar 23, 2022
b11e6de
multi-byte utf8 decoder
jmarantz Mar 23, 2022
ed5616c
utf8 decoding works
jmarantz Mar 23, 2022
b651e77
cleanup
jmarantz Mar 23, 2022
03a33b1
cleanup & partial format
jmarantz Mar 23, 2022
cc95823
typo
jmarantz Mar 23, 2022
f6d49a3
minor speed tweak
jmarantz Mar 23, 2022
747fccc
remove unused constants and update benchmark data
jmarantz Mar 23, 2022
412a1c4
add comment, fix tests.
jmarantz Mar 23, 2022
6bc7e43
tidy
jmarantz Mar 23, 2022
974220e
Move isValidUtf8 out of prod code, as it's not needed in prod; only i…
jmarantz Mar 24, 2022
2faf91c
add an assert and some more comments.
jmarantz Mar 24, 2022
7db4d85
re-sort constants so the bits line up better.
jmarantz Mar 24, 2022
318f05e
add some invalid-utf8 tests
jmarantz Mar 24, 2022
63906bd
test & fix behavior for invalid input.
jmarantz Mar 24, 2022
d3d1d03
speed hacks.
jmarantz Mar 25, 2022
6a3ef94
Add more invalid-utf8 tests.
jmarantz Mar 25, 2022
11b9e47
Use StrFormat rather than snprintf even though it's likely slower.
jmarantz Mar 25, 2022
918d9d1
added one more test surveying all 8-bit inputs.
jmarantz Mar 25, 2022
0cec2cd
refine the invalid-utf8 test to include special case unicode ranges t…
jmarantz Mar 28, 2022
d1cec3b
cleanup and fix corner cases.
jmarantz Mar 29, 2022
e2bceaa
add fuzzing utility.cc impl.
jmarantz Mar 29, 2022
dbb3861
tidy & gcc cleanups.
jmarantz Mar 29, 2022
98e485d
remove backslash at EOL in comment.
jmarantz Mar 29, 2022
1acc7d7
Merge branch 'main' into json-sanitizer
jmarantz Mar 30, 2022
1ac564a
checkpoint with tweaked utf8 converter and nloehman benchmark.
jmarantz Apr 1, 2022
d8b6393
format (not fully compliant)
jmarantz Apr 1, 2022
6e3055c
add missing file
jmarantz Apr 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions envoy/common/interval_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ template <typename Value> class IntervalSet {
* Clears the contents of the interval set.
*/
virtual void clear() PURE;

/**
 * Determines whether the specified Value is in any of the intervals.
 *
 * @param value the value to look up.
 * @return true if value is covered by the interval set.
 */
virtual bool test(Value value) const PURE;
};

} // namespace Envoy
5 changes: 5 additions & 0 deletions source/common/common/utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,11 @@ template <typename Value> class IntervalSetImpl : public IntervalSet<Value> {
intervals_.insert(Interval(left, right));
}

bool test(Value value) const override {
const auto left_pos = intervals_.lower_bound(Interval(value, value + 1));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this assuming an integer representation of intervals? Just trying to figure out why the +1 works.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interval is documented as being [left, right). I'd have to think about how this would be used with floats. Maybe it's better just to document this as integer-only.

return left_pos != intervals_.end() && value >= left_pos->first && value < left_pos->second;
}

std::vector<Interval> toVector() const override {
return std::vector<Interval>(intervals_.begin(), intervals_.end());
}
Expand Down
7 changes: 7 additions & 0 deletions source/common/json/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,10 @@ envoy_cc_library(
"//source/common/runtime:runtime_features_lib",
],
)

envoy_cc_library(
name = "json_sanitizer_lib",
srcs = ["json_sanitizer.cc"],
hdrs = ["json_sanitizer.h"],
deps = ["//source/common/common:assert_lib"],
)
5 changes: 5 additions & 0 deletions source/common/json/json_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,11 @@ ObjectSharedPtr Factory::loadFromString(const std::string& json) {
return handler.getRoot();
}

std::string Factory::serialize(absl::string_view str) {
  // Wrap the input in an nlohmann JSON value and dump it; this produces a
  // double-quoted JSON string literal with all required escaping applied.
  return nlohmann::json(str).dump();
}

} // namespace Nlohmann
} // namespace Json
} // namespace Envoy
4 changes: 4 additions & 0 deletions source/common/json/json_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include "envoy/json/json_object.h"

#include "absl/strings/string_view.h"

namespace Envoy {
namespace Json {
namespace Nlohmann {
Expand All @@ -15,6 +17,8 @@ class Factory {
* Constructs a Json Object from a string.
*/
static ObjectSharedPtr loadFromString(const std::string& json);
  static std::string serialize(absl::string_view str);
};

} // namespace Nlohmann
Expand Down
255 changes: 255 additions & 0 deletions source/common/json/json_sanitizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
#include "source/common/json/json_sanitizer.h"

#include <utility>

#include "source/common/common/assert.h"

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"

namespace Envoy {
namespace Json {

namespace {

const uint8_t Literal = 0;
const uint8_t ControlEscapeSize = 2; // e.g. \b
const uint8_t UnicodeEscapeSize = 6; // e.g. \u1234
const uint8_t Utf8DecodeSentinel = 0xff;

} // namespace

JsonSanitizer::JsonSanitizer() {
// Single-char escape sequences for common control characters.
auto symbolic_escape = [this](char control_char, char symbolic) {
Escape& escape = char_escapes_[char2uint32(control_char)];
escape.size_ = ControlEscapeSize;
escape.chars_[0] = '\\';
escape.chars_[1] = symbolic;
};
symbolic_escape('\b', 'b');
symbolic_escape('\f', 'f');
symbolic_escape('\n', 'n');
symbolic_escape('\r', 'r');
symbolic_escape('\t', 't');
symbolic_escape('\\', '\\');
symbolic_escape('"', '"');

// Low characters (0-31) not listed above are encoded as unicode 4-digit hex.
auto unicode_escape = [this](uint32_t index) {
// We capture unicode Escapes both in a char-indexed array, for direct
// substitutions on literal inputs, and in a unicode-indexed hash-map,
// for lookup after utf8 decode.
std::string escape_str = absl::StrFormat("\\u%04x", index);
ASSERT(escape_str.size() == UnicodeEscapeSize);
Escape& escape = unicode_escapes_[index];
escape.size_ = escape_str.size();
RELEASE_ASSERT(escape.size_ <= sizeof(escape.chars_), "escaped string too large");
memcpy(escape.chars_, escape_str.data(), escape_str.size()); // NOLINT(safe-memcpy)*/
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at this approach in general, I think we have a lot of complexity in code being executed at runtime. Sure, it's going to build the same table every time, but it's a bit hard to reason about.

Can't we build an offline table in the build process and then just #include it? The advantage of that is it would also allow for easy inspection.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking we would instantiate one of these Json tables in the StatsHandler so it would only be on startup.

However I think it's fine to move this code to a generator and either generate during build, or generate manually and check it in as source, annotating the array with comments.

if (index < NumEscapes) {
char_escapes_[index] = escape;
}
};

// Add unicode escapes for control-characters below 32 that don't have symbolic escapes.
for (uint32_t i = 0; i < ' '; ++i) {
if (char_escapes_[i].size_ == 0) {
unicode_escape(i);
}
}

// Unicode-escaped ascii constants above SPACE (32).
for (char ch : {'<', '>', '\177'}) {
unicode_escape(char2uint32(ch));
}

// There's a range of 8-bit characters that are unicode escaped by the
// protobuf library, so we match behavior.
for (uint32_t i = 0x0080; i < 0x00a0; ++i) {
unicode_escape(i);
}

// The remaining unicode characters are mostly passed through literally. We'll
// initialize all of them and then override some below.
for (uint32_t i = 0x00a0; i < NumEscapes; ++i) {
char_escapes_[i].size_ = Literal;
}

// All the bytes matching pattern 11xxxxxx will be evaluated as utf-8.
for (uint32_t i = Utf8_2BytePattern; i <= 0xff; ++i) {
char_escapes_[i].size_ = Utf8DecodeSentinel;
}

// There are an assortment of unicode characters that protobufs quote, so we
// do likewise here to make differential testing/fuzzing feasible.
for (uint32_t i : {0x00ad, 0x0600, 0x0601, 0x0602, 0x0603, 0x06dd, 0x070f}) {
unicode_escape(i);
}
}

absl::string_view JsonSanitizer::sanitize(std::string& buffer, absl::string_view str) const {
  // Fast path: make one pass over the input, OR-ing together the escape size
  // of every byte. A size of 0 means "emit this byte literally", so a zero
  // accumulator after the loop proves nothing needs escaping or utf-8
  // handling and the input can be returned unmodified.
  //
  // Benchmarks show that an unconditional rip through the string beats
  // testing each byte with a branch: deferring the single check until after
  // the loop keeps the loop body branch-free and easy for the compiler to
  // unroll. Only when some byte needs work do we fall into the slow path.
  uint32_t needs_slow_path = 0;
  for (const char ch : str) {
    needs_slow_path |= char_escapes_[char2uint32(ch)].size_;
  }
  // The zero case is expected to be by far the most common for stats names.
  return (needs_slow_path == 0) ? str : slowSanitize(buffer, str);
}

absl::string_view JsonSanitizer::slowSanitize(std::string& buffer, absl::string_view str) const {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you audited https://nlohmann.github.io/json/ for similar functions? I'm wondering if we we have anything that will simplify either sanitize or the UTF8 piece in our existing deps.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch -- I didn't know about nlohmann and now I see we are already pulling that in. I'll take a look.

Do you know why we started using protobufs as an IR to generate JSON in the first place, given that?

Copy link
Contributor Author

@jmarantz jmarantz Mar 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RE nlohmann a quick read suggests it has a similar usage model as protobufs: https://github.com/nlohmann/json#serialization--deserialization

ie to serialize you need to populate an object and then serialize it, rather than serializing directly. I think this pattern of creating intermediate objects in order to convert a string to a safer string is the core reason the protobuf version is so slow.

Note that even though I named this function slowSanitize it is still 10x faster than protobuf serialization due to not needing an intermediate representation.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The history here was that Protobuf for the longest time was the only reliable dependency here, and we were trying to reduce our dependency on rapidjson. The introduction of nlohmann JSON has changed this dynamic.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you run a quick benchmark to convince that the nlohmann JSON library is not suitable? I think Protobuf is slow for all kinds of reasons, it's never been highly tuned for performance on this path most likely, whereas the nlohmann library is designed for speed.

Basically, what we have in this PR represents a real speed up that seems to justify the complexity, but we should do the due diligence that there isn't something else within reasonable reach in one of our existing dependencies (the only ones I can think of are nlohmann JSON, maybe Abseil?) that does the job.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm! That wasn't too hard, and probably that's the right way to go. I think the fast-path wrapper I have should stay because for stats that will be used most of the time, it's still 20x faster than nlohmann, and it's pretty easy to reason about. But nlohmann is only 2x slower than the sanitizer I added here and it might not be worth it adding all the complexity:

---------------------------------------------------------------------------
Benchmark                                 Time             CPU   Iterations
---------------------------------------------------------------------------
BM_ProtoEncoderNoEscape                1330 ns         1330 ns       458178
BM_JsonSanitizerNoEscape               8.83 ns         8.82 ns     79050443
BM_NlohmannNoEscape                     187 ns          187 ns      3752300
BM_StaticJsonSanitizerNoEscape         9.52 ns         9.52 ns     73460589
BM_ProtoEncoderWithEscape              1483 ns         1483 ns       471766
BM_NlohmannWithEscape                   199 ns          199 ns      3489229
BM_JsonSanitizerWithEscape              103 ns          103 ns      6850721
BM_StaticJsonSanitizerWithEscape        102 ns          102 ns      6859187

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One more point about nlohmann -- it's not at all compatible in terms of its output with Protobufs. It makes all kinds of different (more rational) escaping decisions than protobufs, but it can't be differentially fuzzed against them.

std::string oct_escape_buf;
size_t past_escape = absl::string_view::npos;
const uint8_t* first = reinterpret_cast<const uint8_t*>(str.data());
const uint8_t* data = first;
absl::string_view escape_view;
for (uint32_t n = str.size(); n != 0; ++data, --n) {
const Escape& escape = char_escapes_[*data];
if (escape.size_ != Literal) {
uint32_t start_of_escape = data - first;
switch (escape.size_) {
case ControlEscapeSize:
case UnicodeEscapeSize:
escape_view = absl::string_view(escape.chars_, escape.size_);
break;
case Utf8DecodeSentinel: {
auto [unicode, consumed] = decodeUtf8(data, n);
if (consumed != 0) {
--consumed;
data += consumed;
n -= consumed;

// Having validated and constructed the unicode for the utf-8
// sequence we must determine whether to render it literally by
// simply leaving it alone, or whether we ought to render it
// as a unicode escape. We do this using a hash-map set up during
// the constructor with all desired unicode escapes, to mimic the
// behavior of the protobuf json serializer.
auto iter = unicode_escapes_.find(unicode);
if (iter == unicode_escapes_.end()) {
continue;
}
escape_view = absl::string_view(iter->second.chars_, iter->second.size_);
} else {
// Using StrFormat during decode seems slow, but this case should be
// rare.
oct_escape_buf = absl::StrFormat("\\%03o", *data);
escape_view = absl::string_view(oct_escape_buf);
}
break;
}
default:
ASSERT(false);
}

if (past_escape == absl::string_view::npos) {
// We only initialize buffer when we first learn we need to add an
// escape-sequence to the sanitized string.
if (start_of_escape == 0) {
// The first character is an escape, and 'buffer' has not been cleared yet,
// so we need to assign it rather than append to it.
buffer.assign(escape_view.data(), escape_view.size());
} else {
// We found our first escape, but this is not the first character in the
// string, so we combine the unescaped characters in the string we already
// looped over with the new escaped character.
buffer = absl::StrCat(str.substr(0, start_of_escape), escape_view);
}
} else if (start_of_escape == past_escape) {
// We are adding an escape immediately after another escaped character.
absl::StrAppend(&buffer, escape_view);
} else {
// We are adding a new escape but must first cover the characters
// encountered since the previous escape.
absl::StrAppend(&buffer, str.substr(past_escape, start_of_escape - past_escape),
escape_view);
}
past_escape = data - first + 1;
}
}

// If no escape-sequence was needed, we just return the input.
if (past_escape == absl::string_view::npos) {
return str;
}

// Otherwise we append on any unescaped chunk at the end of the input, and
// return buffer as the result.
if (past_escape < str.size()) {
absl::StrAppend(&buffer, str.substr(past_escape, str.size() - past_escape));
}
return buffer;
}

// Decodes one UTF-8 code point from the front of [bytes, bytes + size).
//
// @param bytes the start of the candidate multi-byte UTF-8 sequence.
// @param size the number of bytes available at `bytes`.
// @return {unicode, consumed} on success, where `consumed` is 2, 3, or 4;
//         {0, 0} if the prefix is not a valid multi-byte UTF-8 sequence
//         (bad lead or continuation byte, truncated input, overlong
//         encoding, surrogate code point, or out-of-range code point).
std::pair<uint32_t, uint32_t> JsonSanitizer::decodeUtf8(const uint8_t* bytes, uint32_t size) {
  uint32_t unicode = 0;
  uint32_t consumed = 0;

  // See table in https://en.wikipedia.org/wiki/UTF-8, "Encoding" section.
  //
  // See also https://en.cppreference.com/w/cpp/locale/codecvt_utf8 which is
  // marked as deprecated. There is also support in Windows libraries and Boost,
  // which can be discovered on StackOverflow. I could not find a usable OSS
  // implementation. However it's easily derived from the spec on Wikipedia.
  //
  // Note that the code below could be optimized a bit, e.g. by factoring out
  // repeated lookups of the same index in the bytes array and using SSE
  // instructions for the multi-word bit hacking.
  //
  // See also http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ which might be a lot
  // faster, though less readable. As coded, though, it looks like it would read
  // past the end of the input if the input is malformed.
  if (size >= 2 && (bytes[0] & Utf8_2ByteMask) == Utf8_2BytePattern &&
      (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern) {
    unicode = bytes[0] & ~Utf8_2ByteMask;
    unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask);
    // Reject overlong encodings: valid 2-byte sequences start at 0x80.
    if (unicode < 0x80) {
      return UnicodeSizePair(0, 0);
    }
    consumed = 2;
  } else if (size >= 3 && (bytes[0] & Utf8_3ByteMask) == Utf8_3BytePattern &&
             (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern &&
             (bytes[2] & Utf8_ContinueMask) == Utf8_ContinuePattern) {
    unicode = bytes[0] & ~Utf8_3ByteMask;
    unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask);
    unicode = (unicode << Utf8_Shift) | (bytes[2] & ~Utf8_ContinueMask);
    // Reject overlong encodings (valid 3-byte sequences start at 0x800) and
    // the UTF-16 surrogate range U+D800..U+DFFF, which RFC 3629 section 3
    // forbids from appearing in UTF-8.
    if (unicode < 0x800 || (unicode >= 0xd800 && unicode <= 0xdfff)) {
      return UnicodeSizePair(0, 0);
    }
    consumed = 3;
  } else if (size >= 4 && (bytes[0] & Utf8_4ByteMask) == Utf8_4BytePattern &&
             (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern &&
             (bytes[2] & Utf8_ContinueMask) == Utf8_ContinuePattern &&
             (bytes[3] & Utf8_ContinueMask) == Utf8_ContinuePattern) {
    unicode = bytes[0] & ~Utf8_4ByteMask;
    unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask);
    unicode = (unicode << Utf8_Shift) | (bytes[2] & ~Utf8_ContinueMask);
    unicode = (unicode << Utf8_Shift) | (bytes[3] & ~Utf8_ContinueMask);

    // 4-byte starts at 0x10000
    //
    // Note from https://en.wikipedia.org/wiki/UTF-8:
    // The earlier RFC2279 allowed UTF-8 encoding through code point U+7FFFFFF.
    // But the current RFC3629 section 3 limits UTF-8 encoding through code
    // point U+10FFFF, to match the limits of UTF-16.
    if (unicode < 0x10000 || unicode > 0x10ffff) {
      return UnicodeSizePair(0, 0);
    }
    consumed = 4;
  }
  return UnicodeSizePair(unicode, consumed);
}

} // namespace Json
} // namespace Envoy
Loading