Skip to content
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
5639cf6
add hand-rolled json sanitizer
jmarantz Mar 20, 2022
1233b00
fix corner case
jmarantz Mar 20, 2022
0509422
inline the char_esacpes_ values rather than unique_ptr<std::string>
jmarantz Mar 20, 2022
a8d4daa
improve internal names and add comments
jmarantz Mar 20, 2022
99a381f
Fix comment typo and explicitly initialize Escape.size_ rather than b…
jmarantz Mar 20, 2022
02216fa
cleanup comments
jmarantz Mar 20, 2022
319e09c
provide link to the utf-8 spec which shows us we can correctly saniti…
jmarantz Mar 20, 2022
696cc04
add fuzz test and corpus of cases pulled from the unit test.
jmarantz Mar 22, 2022
6251863
checkpoint
jmarantz Mar 23, 2022
b11e6de
multi-byte utf8 decoder
jmarantz Mar 23, 2022
ed5616c
utf8 decoding works
jmarantz Mar 23, 2022
b651e77
cleanup
jmarantz Mar 23, 2022
03a33b1
cleanup & partial format
jmarantz Mar 23, 2022
cc95823
typo
jmarantz Mar 23, 2022
f6d49a3
minor speed tweak
jmarantz Mar 23, 2022
747fccc
remove unused constants and update benchmark data
jmarantz Mar 23, 2022
412a1c4
add comment, fix tests.
jmarantz Mar 23, 2022
6bc7e43
tidy
jmarantz Mar 23, 2022
974220e
Move isValidUtf8 out of prod code, as it's not needed in prod; only i…
jmarantz Mar 24, 2022
2faf91c
add an assert and some more comments.
jmarantz Mar 24, 2022
7db4d85
re-sort constants so the bits line up better.
jmarantz Mar 24, 2022
318f05e
add some invalid-utf8 tests
jmarantz Mar 24, 2022
63906bd
test & fix behavior for invalid input.
jmarantz Mar 24, 2022
d3d1d03
speed hacks.
jmarantz Mar 25, 2022
6a3ef94
Add more invalid-utf8 tests.
jmarantz Mar 25, 2022
11b9e47
Use StrFormat rather than snprintf even though it's likely slower.
jmarantz Mar 25, 2022
918d9d1
added one more test surveying all 8-bit inputs.
jmarantz Mar 25, 2022
0cec2cd
refine the invalid-utf8 test to include special case unicode ranges t…
jmarantz Mar 28, 2022
d1cec3b
cleanup and fix corner cases.
jmarantz Mar 29, 2022
e2bceaa
add fuzzing utility.cc impl.
jmarantz Mar 29, 2022
dbb3861
tidy & gcc cleanups.
jmarantz Mar 29, 2022
98e485d
remove backslash at EOL in comment.
jmarantz Mar 29, 2022
1acc7d7
Merge branch 'main' into json-sanitizer
jmarantz Mar 30, 2022
1ac564a
checkpoint with tweaked utf8 converter and nloehman benchmark.
jmarantz Apr 1, 2022
d8b6393
format (not fully compliant)
jmarantz Apr 1, 2022
6e3055c
add missing file
jmarantz Apr 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions source/common/json/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,15 @@ envoy_cc_library(
"//source/common/runtime:runtime_features_lib",
],
)

envoy_cc_library(
name = "json_sanitizer_lib",
srcs = ["json_sanitizer.cc"],
hdrs = ["json_sanitizer.h"],
external_deps = [
"json",
],
deps = [
"//source/common/common:assert_lib",
],
)
94 changes: 94 additions & 0 deletions source/common/json/json_sanitizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#include "source/common/json/json_sanitizer.h"

#include "source/common/common/assert.h"

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"

namespace Envoy {
namespace Json {

JsonSanitizer::JsonSanitizer() {
// Single-char escape sequences for common control characters.
auto symbolic_escape = [this](char escape_char, char symbolic) {
Escape& escape = char_escapes_[static_cast<uint32_t>(escape_char)];
escape.size_ = 2;
escape.chars_[0] = '\\';
escape.chars_[1] = symbolic;
};
symbolic_escape('\b', 'b');
symbolic_escape('\f', 'f');
symbolic_escape('\n', 'n');
symbolic_escape('\r', 'r');
symbolic_escape('\t', 't');
symbolic_escape('\\', '\\');
symbolic_escape('"', '"');

// Low characters (0-31) not listed above are encoded as unicode 4-digit hex, plus
// a few other specific ones.
auto unicode_escape = [this](uint32_t index) {
Escape& escape = char_escapes_[static_cast<uint32_t>(index)];
if (escape.size_ == 0) {
std::string escape_str = absl::StrFormat("\\u%04x", index);
escape.size_ = escape_str.size();
RELEASE_ASSERT(escape.size_ <= sizeof(escape.chars_), "escaped string too large");
memcpy(escape.chars_, escape_str.data(), escape_str.size()); // NOLINT(safe-memcpy)
}
};
unicode_escape('<');
unicode_escape('>');
unicode_escape(127);

// Control-characters below 32.
for (uint32_t i = 0; i < ' '; ++i) {
unicode_escape(i);
}
}

absl::string_view JsonSanitizer::sanitize(std::string& buffer, absl::string_view str) const {
size_t past_escape = absl::string_view::npos;
for (uint32_t i = 0, n = str.size(); i < n; ++i) {
uint32_t index = static_cast<uint32_t>(static_cast<uint8_t>(str[i]));
const Escape& escape = char_escapes_[index];
if (escape.size_ != 0) {
absl::string_view escape_view(escape.chars_, escape.size_);
if (past_escape == absl::string_view::npos) {
// We only initialize buffer when we first learn we need to add an
// escape-sequence to the sanitized string.
if (i == 0) {
// The first character is an escape, and 'buffer' has not been cleared yet,
// so we need to assign it rather than append to it.
buffer.assign(escape_view.data(), escape_view.size());
} else {
// We found our first escape, but this is not the first character in the
// string, so we combine the unescaped characters in the string we already
// looped over with the new escaped character.
buffer = absl::StrCat(str.substr(0, i), escape_view);
}
} else if (i == past_escape) {
// We are adding an escape immediately after another escaped character.
absl::StrAppend(&buffer, escape_view);
} else {
// We are adding a new escape but must first cover the characters
// encountered since the previous escape.
absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), escape_view);
}
past_escape = i + 1;
}
}
Comment on lines +41 to +180

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May be codes can be more simple by this way?

Suggested change
size_t past_escape = absl::string_view::npos;
for (uint32_t i = 0, n = str.size(); i < n; ++i) {
uint32_t index = static_cast<uint32_t>(static_cast<uint8_t>(str[i]));
const std::string* escape = char_escapes_[index].get();
if (escape != nullptr) {
if (past_escape == absl::string_view::npos) {
// We only initialize buffer when we first learn we need to add an
// escape-sequence to the sanitized string.
if (i == 0) {
buffer = *escape;
} else {
buffer = absl::StrCat(str.substr(0, i), *escape);
}
} else if (i == past_escape) {
absl::StrAppend(&buffer, *escape);
} else {
absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
}
past_escape = i + 1;
}
}
size_t past_escape = 0;
for (uint32_t i = 0, n = str.size(); i < n; ++i) {
const std::string* escape = char_escapes_[static_cast<uint8_t>(str[i])].get();
if (escape != nullptr) {
absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
past_escape = i + 1;
}
}

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the above suggestion is part of the code I propose, but it doesn't handle some of the other cases, such as 2 escaped chars in a row, or clearing the buffer between each call to sanitize that requires escaping. I'll add more comments to make each conditional clearer.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code could be simpler but I wanted to avoid having the user clear buffer on every call, because that does not optimize for the case where the string does not require escaping, which in the expected usage will be the common case.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sound reasonable. But string clear just sets string length to zero and first char to '\0', will this brings significant performance impact? 🤔

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing the code to what you suggested definitely made things materially slower, but adding the clear() at the start was only marginally slower (within margin of noise), I guess due to compilers/CPUs doing branch prediction well (clear() will need to do different things depending on whether the std::string has inlined the bytes or note).

I'll mess around a little more with that.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Get it. If we can simplify the code without paying a performance price, it is surely best to do so. Otherwise, just ignore my suggestion. 😸


// If no escape-sequence was needed, we just return the input.
if (past_escape == absl::string_view::npos) {
return str;
}

// Otherwise we append on any unescaped chunk at the end of the input, and
// return buffer as the result.
if (past_escape < str.size()) {
absl::StrAppend(&buffer, str.substr(past_escape, str.size() - past_escape));
}
return buffer;
}

} // namespace Json
} // namespace Envoy
56 changes: 56 additions & 0 deletions source/common/json/json_sanitizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#pragma once

#include <string>

#include "absl/strings/string_view.h"

namespace Envoy {
namespace Json {

// Hand-rolled JSON sanitizer that has exactly the same behavior as serializing
// through protobufs, but is more than 10x faster. From
// test/common/json/json_sanitizer_speed_test.cc:
//
// ---------------------------------------------------------------------
// Benchmark Time CPU Iterations
// ---------------------------------------------------------------------
// BM_ProtoEncoderNoEscape 1102 ns 1102 ns 542926
// BM_JsonSanitizerNoEscape 12.0 ns 12.0 ns 57347682
// BM_ProtoEncoderWithEscape 1377 ns 1377 ns 507363
// BM_JsonSanitizerWithEscape 77.3 ns 77.3 ns 8729626
//
class JsonSanitizer {
public:
// Constructing the sanitizer fills in a table with all escape-sequences,
// indexed by character. To make this perform well, you should instantiate the
// sanitizer in a context that lives across a large number of sanitizations.
JsonSanitizer();

/**
* Sanitizes a string so it is suitable for JSON. The buffer is
* used if any of the characters in str need to be escaped.
*
* @param buffer a string in which an escaped string can be written, if needed. It
* is not necessary for callers to clear the buffer first; it be cleared
* by this method if the input needs to be escaped.
* @param str the string to be translated
* @return the translated string_view.
*/
absl::string_view sanitize(std::string& buffer, absl::string_view str) const;

private:
// Character-indexed array of translation strings. If an entry is nullptr then
// the character does not require substitution. This strategy is dependent on
// the property of UTF-8 where all two-byte characters have the high-order bit
// set for both bytes, and don't require escaping for JSON. Thus we can
// consider each character in isolation for escaping. Reference:
// https://en.wikipedia.org/wiki/UTF-8.
struct Escape {
uint8_t size_{0};
char chars_[7]; // No need to initialize char data, as we are not null-terminating.
};
Escape char_escapes_[256];
};

} // namespace Json
} // namespace Envoy
19 changes: 19 additions & 0 deletions test/common/json/BUILD
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
load(
"//bazel:envoy_build_system.bzl",
"envoy_cc_benchmark_binary",
"envoy_cc_fuzz_test",
"envoy_cc_test",
"envoy_package",
Expand Down Expand Up @@ -36,3 +37,21 @@ envoy_cc_test(
"//test/test_common:utility_lib",
],
)

envoy_cc_test(
name = "json_sanitizer_test",
srcs = ["json_sanitizer_test.cc"],
deps = [
"//source/common/json:json_sanitizer_lib",
"//source/common/protobuf:utility_lib",
],
)

envoy_cc_benchmark_binary(
name = "json_sanitizer_speed_test",
srcs = ["json_sanitizer_speed_test.cc"],
deps = [
"//source/common/json:json_sanitizer_lib",
"//source/common/protobuf:utility_lib",
],
)
53 changes: 53 additions & 0 deletions test/common/json/json_sanitizer_speed_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "source/common/json/json_sanitizer.h"
#include "source/common/protobuf/utility.h"

#include "benchmark/benchmark.h"

// NOLINT(namespace-envoy)

constexpr absl::string_view pass_through_encoding = "Now is the time for all good men";
constexpr absl::string_view escaped_encoding = "Now <is the \"time\"> for all good men";

// NOLINTNEXTLINE(readability-identifier-naming)
static void BM_ProtoEncoderNoEscape(benchmark::State& state) {
const std::string str = std::string(pass_through_encoding);

for (auto _ : state) { // NOLINT
Envoy::MessageUtil::getJsonStringFromMessageOrDie(Envoy::ValueUtil::stringValue(str), false,
true);
}
}
BENCHMARK(BM_ProtoEncoderNoEscape);

// NOLINTNEXTLINE(readability-identifier-naming)
static void BM_JsonSanitizerNoEscape(benchmark::State& state) {
Envoy::Json::JsonSanitizer sanitizer;
std::string buffer;

for (auto _ : state) { // NOLINT
sanitizer.sanitize(buffer, pass_through_encoding);
}
}
BENCHMARK(BM_JsonSanitizerNoEscape);

// NOLINTNEXTLINE(readability-identifier-naming)
static void BM_ProtoEncoderWithEscape(benchmark::State& state) {
const std::string str = std::string(escaped_encoding);

for (auto _ : state) { // NOLINT
Envoy::MessageUtil::getJsonStringFromMessageOrDie(Envoy::ValueUtil::stringValue(str), false,
true);
}
}
BENCHMARK(BM_ProtoEncoderWithEscape);

// NOLINTNEXTLINE(readability-identifier-naming)
static void BM_JsonSanitizerWithEscape(benchmark::State& state) {
Envoy::Json::JsonSanitizer sanitizer;
std::string buffer;

for (auto _ : state) { // NOLINT
sanitizer.sanitize(buffer, escaped_encoding);
}
}
BENCHMARK(BM_JsonSanitizerWithEscape);
96 changes: 96 additions & 0 deletions test/common/json/json_sanitizer_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#include "source/common/json/json_sanitizer.h"
#include "source/common/protobuf/utility.h"

#include "gtest/gtest.h"

namespace Envoy {
namespace Json {
namespace {

class JsonSanitizerTest : public testing::Test {
protected:
absl::string_view sanitizeAndCheckAgainstProtobufJson(absl::string_view str) {
absl::string_view hand_sanitized = sanitizer_.sanitize(buffer_, str);
std::string proto_sanitized = MessageUtil::getJsonStringFromMessageOrDie(
ValueUtil::stringValue(std::string(str)), false, true);
EXPECT_EQ(proto_sanitized, absl::StrCat("\"", hand_sanitized, "\"")) << "str=" << str;
return hand_sanitized;
}

void expectUnchanged(absl::string_view str) {
EXPECT_EQ(str, sanitizeAndCheckAgainstProtobufJson(str));
}

JsonSanitizer sanitizer_;
std::string buffer_;
};

TEST_F(JsonSanitizerTest, Empty) { expectUnchanged(""); }

TEST_F(JsonSanitizerTest, NoEscape) {
expectUnchanged("abcdefghijklmnopqrstuvwxyz");
expectUnchanged("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
expectUnchanged("1234567890");
expectUnchanged(" `~!@#$%^&*()_+-={}|[]");
}

TEST_F(JsonSanitizerTest, SlashChars) {
EXPECT_EQ("\\b", sanitizeAndCheckAgainstProtobufJson("\b"));
EXPECT_EQ("\\f", sanitizeAndCheckAgainstProtobufJson("\f"));
EXPECT_EQ("\\n", sanitizeAndCheckAgainstProtobufJson("\n"));
EXPECT_EQ("\\r", sanitizeAndCheckAgainstProtobufJson("\r"));
EXPECT_EQ("\\t", sanitizeAndCheckAgainstProtobufJson("\t"));
EXPECT_EQ("\\\\", sanitizeAndCheckAgainstProtobufJson("\\"));
EXPECT_EQ("\\\"", sanitizeAndCheckAgainstProtobufJson("\""));
}

TEST_F(JsonSanitizerTest, ControlChars) {
EXPECT_EQ("\\u0001", sanitizeAndCheckAgainstProtobufJson("\001"));
EXPECT_EQ("\\u0002", sanitizeAndCheckAgainstProtobufJson("\002"));
EXPECT_EQ("\\b", sanitizeAndCheckAgainstProtobufJson("\010"));
EXPECT_EQ("\\t", sanitizeAndCheckAgainstProtobufJson("\011"));
EXPECT_EQ("\\n", sanitizeAndCheckAgainstProtobufJson("\012"));
EXPECT_EQ("\\u000b", sanitizeAndCheckAgainstProtobufJson("\013"));
EXPECT_EQ("\\f", sanitizeAndCheckAgainstProtobufJson("\014"));
EXPECT_EQ("\\r", sanitizeAndCheckAgainstProtobufJson("\015"));
EXPECT_EQ("\\u000e", sanitizeAndCheckAgainstProtobufJson("\016"));
EXPECT_EQ("\\u000f", sanitizeAndCheckAgainstProtobufJson("\017"));
EXPECT_EQ("\\u0010", sanitizeAndCheckAgainstProtobufJson("\020"));
EXPECT_EQ("\\u003c", sanitizeAndCheckAgainstProtobufJson("<"));
EXPECT_EQ("\\u003e", sanitizeAndCheckAgainstProtobufJson(">"));
}

TEST_F(JsonSanitizerTest, SevenBitAscii) {
// Cover all the 7-bit ascii values, calling sanitize so that it checks
// our hand-rolled sanitizer vs protobuf. We ignore the return-value of
// sanitize(); we are just calling for it to test against protobuf.
for (uint32_t i = 0; i < 128; ++i) {
char c = i;
sanitizeAndCheckAgainstProtobufJson(absl::string_view(&c, 1));
}
}

TEST_F(JsonSanitizerTest, Utf8) {
// reference; https://www.charset.org/utf-8
auto unicode = [](std::vector<uint8_t> chars) -> std::string {
return std::string(reinterpret_cast<const char*>(&chars[0]), chars.size());
};

expectUnchanged(unicode({0xc2, 0xa2})); // Cent.
expectUnchanged(unicode({0xc2, 0xa9})); // Copyright.
expectUnchanged(unicode({0xc3, 0xa0})); // 'a' with accent grave.
}

TEST_F(JsonSanitizerTest, Interspersed) {
EXPECT_EQ("a\\bc", sanitizeAndCheckAgainstProtobufJson("a\bc"));
EXPECT_EQ("a\\b\\fc", sanitizeAndCheckAgainstProtobufJson("a\b\fc"));
EXPECT_EQ("\\bac", sanitizeAndCheckAgainstProtobufJson("\bac"));
EXPECT_EQ("\\b\\fac", sanitizeAndCheckAgainstProtobufJson("\b\fac"));
EXPECT_EQ("ac\\b", sanitizeAndCheckAgainstProtobufJson("ac\b"));
EXPECT_EQ("ac\\b", sanitizeAndCheckAgainstProtobufJson("ac\b"));
EXPECT_EQ("\\ra\\f", sanitizeAndCheckAgainstProtobufJson("\ra\f"));
}

} // namespace
} // namespace Json
} // namespace Envoy
1 change: 1 addition & 0 deletions tools/spelling/spelling_dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1099,6 +1099,7 @@ rver
rxhash
sandboxed
sanitization
sanitizations
sanitizer
satisfiable
scalability
Expand Down