-
Notifications
You must be signed in to change notification settings - Fork 5.3k
json: add hand-rolled json sanitizer #20428
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5639cf6
1233b00
0509422
a8d4daa
99a381f
02216fa
319e09c
696cc04
6251863
b11e6de
ed5616c
b651e77
03a33b1
cc95823
f6d49a3
747fccc
412a1c4
6bc7e43
974220e
2faf91c
7db4d85
318f05e
63906bd
d3d1d03
6a3ef94
11b9e47
918d9d1
0cec2cd
d1cec3b
e2bceaa
dbb3861
98e485d
1acc7d7
1ac564a
d8b6393
6e3055c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,255 @@ | ||
| #include "source/common/json/json_sanitizer.h" | ||
|
|
||
| #include <utility> | ||
|
|
||
| #include "source/common/common/assert.h" | ||
|
|
||
| #include "absl/strings/str_cat.h" | ||
| #include "absl/strings/str_format.h" | ||
|
|
||
| namespace Envoy { | ||
| namespace Json { | ||
|
|
||
namespace {

// Sentinel escape-size meaning "emit the byte as-is; no escaping needed".
constexpr uint8_t Literal = 0;
// Length of a two-character symbolic escape, e.g. "\b".
constexpr uint8_t ControlEscapeSize = 2;
// Length of a six-character unicode escape, e.g. "\u1234".
constexpr uint8_t UnicodeEscapeSize = 6;
// Sentinel escape-size meaning "this byte starts a utf-8 multi-byte sequence
// and must be run through the utf-8 decoder".
constexpr uint8_t Utf8DecodeSentinel = 0xff;

} // namespace
|
|
||
| JsonSanitizer::JsonSanitizer() { | ||
| // Single-char escape sequences for common control characters. | ||
| auto symbolic_escape = [this](char control_char, char symbolic) { | ||
| Escape& escape = char_escapes_[char2uint32(control_char)]; | ||
| escape.size_ = ControlEscapeSize; | ||
| escape.chars_[0] = '\\'; | ||
| escape.chars_[1] = symbolic; | ||
| }; | ||
| symbolic_escape('\b', 'b'); | ||
| symbolic_escape('\f', 'f'); | ||
| symbolic_escape('\n', 'n'); | ||
| symbolic_escape('\r', 'r'); | ||
| symbolic_escape('\t', 't'); | ||
| symbolic_escape('\\', '\\'); | ||
| symbolic_escape('"', '"'); | ||
|
|
||
| // Low characters (0-31) not listed above are encoded as unicode 4-digit hex. | ||
| auto unicode_escape = [this](uint32_t index) { | ||
| // We capture unicode Escapes both in a char-indexed array, for direct | ||
| // substitutions on literal inputs, and in a unicode-indexed hash-map, | ||
| // for lookup after utf8 decode. | ||
| std::string escape_str = absl::StrFormat("\\u%04x", index); | ||
| ASSERT(escape_str.size() == UnicodeEscapeSize); | ||
| Escape& escape = unicode_escapes_[index]; | ||
| escape.size_ = escape_str.size(); | ||
| RELEASE_ASSERT(escape.size_ <= sizeof(escape.chars_), "escaped string too large"); | ||
| memcpy(escape.chars_, escape_str.data(), escape_str.size()); // NOLINT(safe-memcpy)*/ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking at this approach in general, I think we have a lot of complexity in code being executed at runtime. Sure, it's going to build the same table every time, but it's a bit hard to reason about. Can't we build an offline table in the build process and then just
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was thinking we would instantiate one of these Json tables in the StatsHandler so it would only be on startup. However I think it's fine to move this code to a generator and either generate during build, or generate manually and check it in as source, annotating the array with comments. |
||
| if (index < NumEscapes) { | ||
| char_escapes_[index] = escape; | ||
| } | ||
| }; | ||
|
|
||
| // Add unicode escapes for control-characters below 32 that don't have symbolic escapes. | ||
| for (uint32_t i = 0; i < ' '; ++i) { | ||
| if (char_escapes_[i].size_ == 0) { | ||
| unicode_escape(i); | ||
| } | ||
| } | ||
|
|
||
| // Unicode-escaped ascii constants above SPACE (32). | ||
| for (char ch : {'<', '>', '\177'}) { | ||
| unicode_escape(char2uint32(ch)); | ||
| } | ||
|
|
||
| // There's a range of 8-bit characters that are unicode escaped by the | ||
| // protobuf library, so we match behavior. | ||
| for (uint32_t i = 0x0080; i < 0x00a0; ++i) { | ||
| unicode_escape(i); | ||
| } | ||
|
|
||
| // The remaining unicode characters are mostly passed through literally. We'll | ||
| // initialize all of them and then override some below. | ||
| for (uint32_t i = 0x00a0; i < NumEscapes; ++i) { | ||
| char_escapes_[i].size_ = Literal; | ||
| } | ||
|
|
||
| // All the bytes matching pattern 11xxxxxx will be evaluated as utf-8. | ||
| for (uint32_t i = Utf8_2BytePattern; i <= 0xff; ++i) { | ||
| char_escapes_[i].size_ = Utf8DecodeSentinel; | ||
| } | ||
|
|
||
| // There are an assortment of unicode characters that protobufs quote, so we | ||
| // do likewise here to make differential testing/fuzzing feasible. | ||
| for (uint32_t i : {0x00ad, 0x0600, 0x0601, 0x0602, 0x0603, 0x06dd, 0x070f}) { | ||
| unicode_escape(i); | ||
| } | ||
| } | ||
|
|
||
| absl::string_view JsonSanitizer::sanitize(std::string& buffer, absl::string_view str) const { | ||
| // Fast-path to see whether any escapes or utf-encoding are needed. If str has | ||
| // only unescaped ascii characters, we can simply return it. So before doing | ||
| // anything too fancy, do a lookup in char_escapes_ for each character, and | ||
| // simply OR in the return sizes. We use 0 for the return-size when we are | ||
| // simply leaving the character as is, so anything non-zero means we need to | ||
| // initiate the slow path. | ||
| // | ||
| // Benchmarks show it's faster to just rip through the string with no | ||
| // conditionals, so we only check the ORed sizes after the loop. This avoids | ||
| // branches and allows simpler loop unrolling by the compiler. | ||
| uint32_t sizes_ored_together = 0; | ||
| for (char c : str) { | ||
| sizes_ored_together |= char_escapes_[char2uint32(c)].size_; | ||
| } | ||
| if (sizes_ored_together == 0) { | ||
| return str; // Happy path, should be executed most of the time. | ||
| } | ||
| return slowSanitize(buffer, str); | ||
| } | ||
|
|
||
| absl::string_view JsonSanitizer::slowSanitize(std::string& buffer, absl::string_view str) const { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have you audited https://nlohmann.github.io/json/ for similar functions? I'm wondering if we we have anything that will simplify either sanitize or the UTF8 piece in our existing deps.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch -- I didn't know about nlohmann and now I see we are already pulling that in. I'll take a look. Do you know why we started using protobufs as an IR to generate JSON in the first place, given that?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. RE nlohmann a quick read suggests it has a similar usage model as protobufs: https://github.com/nlohmann/json#serialization--deserialization ie to serialize you need to populate an object and then serialize it, rather than serializing directly. I think this pattern of creating intermediate objects in order to convert a string to a safer string is the core reason the protobuf version is so slow. Note that even though I named this function
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The history here was that Protobuf for the longest time was the only reliable dependency here, and we were trying to reduce our dependency on rapidjson. The introduction of nlohmann JSON has changed this dynamic.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you run a quick benchmark to convince that the nlohmann JSON library is not suitable? I think Protobuf is slow for all kinds of reasons, it's never been highly tuned for performance on this path most likely, whereas the nlohmann library is designed for speed. Basically, what we have in this PR represents a real speed up that seems to justify the complexity, but we should do the due diligence that there isn't something else within reasonable reach in one of our existing dependencies (the only ones I can think of are nlohmann JSON, maybe Abseil?) that does the job.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm! That wasn't too hard, and probably that's the right way to go. I think the fast-path wrapper I have should stay because for stats that will be used most of the time, it's still 20x faster than nlohmann, and it's pretty easy to reason about. But nlohmann is only 2x slower than the sanitizer I added here and it might not be worth it adding all the complexity:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One more point about nlohmann -- it's not at all compatible in terms of its output with Protobufs. It makes all kinds of different (more rational) escaping decisions than protobufs, but it can't be differentially fuzzed against them. |
||
| std::string oct_escape_buf; | ||
| size_t past_escape = absl::string_view::npos; | ||
| const uint8_t* first = reinterpret_cast<const uint8_t*>(str.data()); | ||
| const uint8_t* data = first; | ||
| absl::string_view escape_view; | ||
| for (uint32_t n = str.size(); n != 0; ++data, --n) { | ||
| const Escape& escape = char_escapes_[*data]; | ||
| if (escape.size_ != Literal) { | ||
| uint32_t start_of_escape = data - first; | ||
| switch (escape.size_) { | ||
| case ControlEscapeSize: | ||
| case UnicodeEscapeSize: | ||
| escape_view = absl::string_view(escape.chars_, escape.size_); | ||
| break; | ||
| case Utf8DecodeSentinel: { | ||
| auto [unicode, consumed] = decodeUtf8(data, n); | ||
| if (consumed != 0) { | ||
| --consumed; | ||
| data += consumed; | ||
| n -= consumed; | ||
|
|
||
| // Having validated and constructed the unicode for the utf-8 | ||
| // sequence we must determine whether to render it literally by | ||
| // simply leaving it alone, or whether we ought to render it | ||
| // as a unicode escape. We do this using a hash-map set up during | ||
| // the constructor with all desired unicode escapes, to mimic the | ||
| // behavior of the protobuf json serializer. | ||
| auto iter = unicode_escapes_.find(unicode); | ||
| if (iter == unicode_escapes_.end()) { | ||
| continue; | ||
| } | ||
| escape_view = absl::string_view(iter->second.chars_, iter->second.size_); | ||
| } else { | ||
| // Using StrFormat during decode seems slow, but this case should be | ||
| // rare. | ||
| oct_escape_buf = absl::StrFormat("\\%03o", *data); | ||
| escape_view = absl::string_view(oct_escape_buf); | ||
| } | ||
| break; | ||
| } | ||
| default: | ||
| ASSERT(false); | ||
| } | ||
|
|
||
| if (past_escape == absl::string_view::npos) { | ||
| // We only initialize buffer when we first learn we need to add an | ||
| // escape-sequence to the sanitized string. | ||
| if (start_of_escape == 0) { | ||
| // The first character is an escape, and 'buffer' has not been cleared yet, | ||
| // so we need to assign it rather than append to it. | ||
| buffer.assign(escape_view.data(), escape_view.size()); | ||
| } else { | ||
| // We found our first escape, but this is not the first character in the | ||
| // string, so we combine the unescaped characters in the string we already | ||
| // looped over with the new escaped character. | ||
| buffer = absl::StrCat(str.substr(0, start_of_escape), escape_view); | ||
| } | ||
| } else if (start_of_escape == past_escape) { | ||
| // We are adding an escape immediately after another escaped character. | ||
| absl::StrAppend(&buffer, escape_view); | ||
| } else { | ||
| // We are adding a new escape but must first cover the characters | ||
| // encountered since the previous escape. | ||
| absl::StrAppend(&buffer, str.substr(past_escape, start_of_escape - past_escape), | ||
| escape_view); | ||
| } | ||
| past_escape = data - first + 1; | ||
| } | ||
| } | ||
|
|
||
| // If no escape-sequence was needed, we just return the input. | ||
| if (past_escape == absl::string_view::npos) { | ||
| return str; | ||
| } | ||
|
|
||
| // Otherwise we append on any unescaped chunk at the end of the input, and | ||
| // return buffer as the result. | ||
| if (past_escape < str.size()) { | ||
| absl::StrAppend(&buffer, str.substr(past_escape, str.size() - past_escape)); | ||
| } | ||
| return buffer; | ||
| } | ||
|
|
||
| std::pair<uint32_t, uint32_t> JsonSanitizer::decodeUtf8(const uint8_t* bytes, uint32_t size) { | ||
| uint32_t unicode = 0; | ||
| uint32_t consumed = 0; | ||
|
|
||
| // See table in https://en.wikipedia.org/wiki/UTF-8, "Encoding" section. | ||
| // | ||
| // See also https://en.cppreference.com/w/cpp/locale/codecvt_utf8 which is | ||
| // marked as deprecated. There is also support in Windows libraries and Boost, | ||
| // which can be discovered on StackOverflow. I could not find a usable OSS | ||
| // implementation. However it's easily derived from the spec on Wikipedia. | ||
| // | ||
| // Note that the code below could be optimized a bit, e.g. by factoring out | ||
| // repeated lookups of the same index in the bytes array and using SSE | ||
| // instructions for the multi-word bit hacking. | ||
| // | ||
| // See also http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ which might be a lot | ||
| // faster, though less readable. As coded, though, it looks like it would read | ||
| // past the end of the input if the input is malformed. | ||
| if (size >= 2 && (bytes[0] & Utf8_2ByteMask) == Utf8_2BytePattern && | ||
| (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern) { | ||
| unicode = bytes[0] & ~Utf8_2ByteMask; | ||
| unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask); | ||
| if (unicode < 0x80) { | ||
| return UnicodeSizePair(0, 0); | ||
| } | ||
| consumed = 2; | ||
| } else if (size >= 3 && (bytes[0] & Utf8_3ByteMask) == Utf8_3BytePattern && | ||
| (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern && | ||
| (bytes[2] & Utf8_ContinueMask) == Utf8_ContinuePattern) { | ||
| unicode = bytes[0] & ~Utf8_3ByteMask; | ||
| unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask); | ||
| unicode = (unicode << Utf8_Shift) | (bytes[2] & ~Utf8_ContinueMask); | ||
| if (unicode < 0x800) { // 3-byte starts at 0x800 | ||
| return UnicodeSizePair(0, 0); | ||
| } | ||
| consumed = 3; | ||
| } else if (size >= 4 && (bytes[0] & Utf8_4ByteMask) == Utf8_4BytePattern && | ||
| (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern && | ||
| (bytes[2] & Utf8_ContinueMask) == Utf8_ContinuePattern && | ||
| (bytes[3] & Utf8_ContinueMask) == Utf8_ContinuePattern) { | ||
| unicode = bytes[0] & ~Utf8_4ByteMask; | ||
| unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask); | ||
| unicode = (unicode << Utf8_Shift) | (bytes[2] & ~Utf8_ContinueMask); | ||
| unicode = (unicode << Utf8_Shift) | (bytes[3] & ~Utf8_ContinueMask); | ||
|
|
||
| // 4-byte starts at 0x10000 | ||
| // | ||
| // Note from https://en.wikipedia.org/wiki/UTF-8: | ||
| // The earlier RFC2279 allowed UTF-8 encoding through code point U+7FFFFFF. | ||
| // But the current RFC3629 section 3 limits UTF-8 encoding through code | ||
| // point U+10FFFF, to match the limits of UTF-16. | ||
| if (unicode < 0x10000 || unicode > 0x10ffff) { | ||
| return UnicodeSizePair(0, 0); | ||
| } | ||
| consumed = 4; | ||
| } | ||
| return UnicodeSizePair(unicode, consumed); | ||
| } | ||
|
|
||
| } // namespace Json | ||
| } // namespace Envoy | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this assuming an integer representation of intervals? Just trying to figure out why the `+1` works.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Interval is documented as being [left, right). I'd have to think about how this would be used with floats. Maybe it's better just to document this as integer-only.